in reply to Slow regexp
# Count, for every HTML tag in test.html, how many words occur while that
# tag is open (directly inside it or inside nested tags), then dump the
# per-tag totals with Data::Dumper.
#
# When test.html looks like the sample document that follows this script,
# it prints the Data::Dumper structure shown after that sample.
use strict;
use warnings;
use HTML::TokeParser;
use Data::Dumper;

# Void elements never receive an end tag, so counting them as "open" would
# leave them open for the rest of the document and credit every later word
# to them.  The original special-cased only "meta" (with an unanchored
# regex that also hit any tag merely containing "meta").
my %is_void = map { $_ => 1 } qw(
    area base br col embed hr img input link meta param source track wbr
);

my %open_depth;    # tag name => number of currently open instances
my %word_count;    # tag name => words seen while the tag was open

my $p = HTML::TokeParser->new("test.html") or die "Can't open: $!";

while ( my $token = $p->get_token ) {
    my ( $type, $payload ) = @{$token};

    if ( $type eq "S" ) {
        # NOTE(review): depending on parser options, "<br/>" may be reported
        # with the tag name "br/" -- strip a trailing slash before the
        # void-element lookup so both spellings are skipped.  TODO confirm.
        ( my $bare_name = $payload ) =~ s{/\z}{};
        next if $is_void{$bare_name};
        $open_depth{$payload}++;
    }
    elsif ( $type eq "E" ) {
        # Ignore stray end tags: letting the depth go negative would hide
        # the next properly opened instance of the same tag.
        $open_depth{$payload}-- if $open_depth{$payload};
    }
    elsif ( $type eq "T" ) {
        my @words = ( $payload =~ /\b(\w+)/g );
        next unless @words;

        # Credit the words of this text chunk to every tag currently open.
        for my $tag ( keys %open_depth ) {
            $word_count{$tag} += @words if $open_depth{$tag} > 0;
        }
    }
}

print Dumper( \%word_count );
<html lang='en-US'>
<head>
<title>Stuff</title>
<meta name='author' content='Jojo' />
</head>
<body>
<h2>I like potatoes!</h2>
<h1>Me not!</h1>
</body>
</html>
it will print
$VAR1 = { 'h1' => 2, 'body' => 5, 'head' => 1, 'html' => 6, 'title' => 1, 'h2' => 3 };
|
|---|