#!/usr/bin/perl
#
# Replace the Nth word of the text content of ./test.html with a marker
# string, leaving the surrounding markup untouched.
#
# Usage: script [word_index]
#   word_index  zero-based index into the list of words found in text
#               tokens (default 0). A negative index counts from the end
#               of the list, as with any Perl array subscript.
#
# The script tokenises the document with HTML::TokeParser, tracks the
# running offset of every token's raw source text, and records where each
# word inside a text token starts. It then prints the original HTML, the
# chosen word with its (start, length), and the HTML after replacement.
# Dies with a non-zero status if the file cannot be read, the index is
# invalid, or the token stream cannot be mapped back onto the document.

use strict;
use warnings;
use HTML::TokeParser;
use Data::Dumper;

my $html_file = './test.html';

# Slurp the whole document. Three-arg open with a lexical handle, and fail
# loudly if the file is missing instead of silently parsing an empty string.
open(my $fh, '<', $html_file)
    or die "Cannot open $html_file: $!\n";
my $html = do { local $/; <$fh> };
close($fh);
$html = '' unless defined $html;

my $word_to_repl = $ARGV[0] || 0;
chomp $word_to_repl;
die "Usage: $0 [word_index]\n"
    unless $word_to_repl =~ /\A-?\d+\z/;

my $p = HTML::TokeParser->new( \$html );

# Text position info for TokeParser. The key is the token type and the
# value is the index, within the token array, of the unmanipulated source
# text -- which is what we want to measure and inspect.
my $text_pos = {
    'S'  => 4,
    'E'  => 2,
    'T'  => 1,
    'C'  => 1,
    'D'  => 1,
    'PI' => 2,
};

my $base_count = 0;     # offset in $html at which the current token starts
my @word_list  = ();    # { word => ..., start => ... } for every word found

while (my $token = $p->get_token) {
    my $token_type = $token->[0] || '';
    my $token_pos  = $text_pos->{$token_type} || '';

    # Die hard if we see a token type we do not know how to measure, as
    # every later offset would be wrong as a result anyway.
    if (!$token_type || !$token_pos) {
        die "Ouch.. parsing error!\n";
    }

    my $segment = $token->[$token_pos];

    if ($token_type eq 'T') {
        # Got text: scan it for words, recording positional counts.
        #
        # Group 1 matches an HTML character reference (&name; / &#nnn; /
        # &#xhh;) so that it is consumed and skipped as a unit.
        # Group 2 matches a word: a run of \w characters, optionally
        # joined by apostrophes so contractions (eg don't) stay whole.
        # Single-letter words ("I", "a") match too.
        while ($segment =~ m/ (&\#?\w+;) | (\w+(?:'\w+)*) /gx) {
            next if defined $1;

            # Start offset is base_count plus where the word itself
            # began inside this text segment.
            push @word_list, {
                word  => $2,
                start => $base_count + $-[2],
            };
        }
    }

    # Advance base_count by the length of this token's source text.
    $base_count += length($segment);
}

print "Original HTML:\n";
print "----------------------------------\n";
print "$html\n\n";

my $word_href = $word_list[$word_to_repl];
die "No word at index $word_to_repl (found " . scalar(@word_list) . " words)\n"
    unless defined $word_href;

my $start  = $word_href->{start};
my $word   = $word_href->{word};
my $offset = length($word);

# Guard against offset drift: if the token texts did not add up to the
# original document, refuse to overwrite the wrong part of it.
die "Offset mismatch for [$word] at $start; refusing to replace\n"
    unless substr($html, $start, $offset) eq $word;

print "Replacing [$word] at ($start,$offset)\n\n";

substr($html, $start, $offset, 'POOP');

print "New HTML:\n";
print "----------------------------------\n";
print "$html\n\n";