#!/usr/bin/perl

use strict;
use warnings;
use HTML::TokeParser;
use Data::Dumper;


my $html_file = './test.html';

my $html = '';
open(F,"<$html_file");
while (<F>) {
        $html .= $_;
}
close(F);

my $word_to_repl = $ARGV[0] || 0;
chomp $word_to_repl;

my $p = HTML::TokeParser->new( \$html );

# setup text position info for TokeParser.  The char is
# the token type and the int is the position in the resulting
# array of the unmanipulated text--which is what we want to
# inspect.
my $text_pos = {'S'     => 4,
                'E'     => 2,
                'T'     => 1,
                'C'     => 1,
                'D'     => 1,
                'PI'    => 2 };

my $base_count = 0;
my @word_list = ();
while (my $token = $p->get_token) {
        my $token_type = $token->[0] || '';
        my $token_pos  = $text_pos->{$token_type} || '';

        # die hard if we have any sort of parsing error, as everything
        # is likely screwed as a result, anyway.
        if (!$token_type || !$token_pos) {
                print "Ouch.. parsing error!\n";
                exit 0;
        }

        if ($token_type eq 'T') {
                # got text, run a regex with positional counts
                my $text = $token->[$token_pos];

                # regex grabs all words out of $text.  It *also* grabs HTML &nnnn; type
                # special chars complete with the & and ; so I can skip them.  The
                # "\w+\'?\w+" bit allows me to grab contracted words (eg don't), but causes
                # a failure in finding single letter words ("I" and "a").
                while ($text =~ m/(\&?\b\w+\'?\w+?\b\;?)/g) {
                        # skip if this is a &nnnn; style HTML char
                        if ($1 !~ /^\&/) {
                                # start byte is the summation of base_count and where
                                # this regex started off.
                                my $start = $base_count + $-[0];
                                push @word_list, { word => $1, start => $start };
                        }
                }
        }

        # increment base_count with the length of this segment
        $base_count += length($token->[$token_pos]);
}


print "Original HTML:\n";
print "----------------------------------\n";
print "$html\n\n";

my $word_href   = $word_list[$word_to_repl];
my $start       = $word_href->{start};
my $word        = $word_href->{word};
my $offset      = length($word);

print "Replacing [$word] at ($start,$offset)\n\n";
substr($html,$start,$offset,'POOP');

print "New HTML:\n";
print "----------------------------------\n";
print "$html\n\n";