# Walk an HTML document and visit every text token that is NOT inside a
# <script>, <style> or <cssheader> element.
#
# Input: an HTML file named in @ARGV, or HTML on STDIN.  The input is read
# with a single slurp-mode <>, so only the first input file is parsed.
#
# NOTE(review): HTML::TokeParser must already be loaded for this to run; no
# "use HTML::TokeParser;" is visible in this section -- confirm it exists.
my $src;
{   # Read the entire HTML input stream as one contiguous string.
    local $/ = undef;
    $src = <>;
}

# <> yields undef when nothing could be read (e.g. an unreadable file);
# hand the parser an empty document instead of an undefined value.
$src = '' unless defined $src;

my $htm      = HTML::TokeParser->new( \$src );
my $inscript = 0;    # nesting depth of currently open ignored elements
my $ignore   = join '|', qw/script style cssheader/;

# Compile the tag matcher once instead of re-interpolating it per token.
my $ignore_re = qr/\A(?:$ignore)\z/;

while ( my $tkn = $htm->get_token ) {
    # Token layout: element 0 is the token type ('S' start tag, 'E' end tag,
    # 'T' text, ...); for 'S' and 'E' tokens element 1 is the tag name.
    my ( $type, $tag ) = @$tkn;

    if ( $type eq 'S' and $tag =~ $ignore_re ) {
        # Entering a script/style/css element: skip everything inside it.
        $inscript++;
        next;
    }
    elsif ( $type eq 'E' and $tag =~ $ignore_re ) {
        # Leaving an ignored element.  Never go below zero: a stray end tag
        # with no matching start tag would otherwise leave the counter
        # negative (which is true), silently suppressing all later text.
        $inscript-- if $inscript > 0;
        next;
    }
    elsif ( $type eq 'T' and !$inscript ) {
        # We have text that is not part of scripting or styling,
        # so do something with this text...
    }
}