in reply to Re: Perl Array Question, combining HTML::HeadParser and regex
in thread Perl Array Question, combining HTML::HeadParser and regex
#!perl
# Author handle on the post: poj
#
# Walk the directory tree under $dir and, for every *.htm / *.html file,
# write one CSV row to $dfile containing:
#   - the file name,
#   - the <head> values listed in @TAGS (extracted with HTML::HeadParser),
#   - every snippet of text (up to 25 characters of context on each side)
#     that surrounds one of the search words in @WORDS.
use strict;
use warnings;
use File::Find;
use HTTP::Headers;
use HTML::HeadParser;
use Text::CSV;

# config
my $dfile = 'all_tags.csv';
my $dir   = 'Test';
my @TAGS  = (
    'Content-Base',
    'Title',
    'X-Meta-author',
    'X-Meta-description',
    'X-Meta-keywords',
    'X-Meta-name',
);

# match words
my @WORDS = qw(
    press founder professor Dr. Ph.D M.D called receives joins timing
    find two self bottom true amazing forget night next day
);
my $words = join '|', map { quotemeta $_ } @WORDS;

# The modifiers must be attached to the qr// itself: a compiled qr// object
# keeps its own flags when it is interpolated into another pattern, so the
# /i, /s and /x that used to sit on the m//g in parse_HTML_Header never
# applied (the search was case-sensitive, '.' stopped at newlines and the
# spaces in the pattern were literal).
#
# The literal spaces happened to act as word delimiters; that is expressed
# explicitly here with lookarounds so a word is still matched as a whole
# word, but also at the start/end of the text or next to punctuation.
# Lookarounds are used instead of \b because some words end in '.'.
my $regex = qr/ .{0,25} (?<!\w) (?:$words) (?!\w) .{0,25} /isx;

# output
# binary => 1 lets fields contain newlines and non-ASCII characters; without
# it Text::CSV refuses to combine such fields and the row is lost.
my $csv = Text::CSV->new({ binary => 1, eol => $/ })
    or die "Cannot use Text::CSV: " . Text::CSV->error_diag();
open my $fh1, ">:encoding(utf8)", $dfile or die "Error opening $dfile: $!";
$csv->print($fh1, ['Search Words', @WORDS]);                  # header
$csv->print($fh1, ['Filename', @TAGS, 'Search Results']);     # header

# input
find({ wanted => \&HTML_Files, no_chdir => 1 }, $dir);
close $fh1 or die "Error closing $dfile: $!";
exit;

# File::Find callback. With no_chdir => 1, $_ holds the same full path as
# $File::Find::name. Only plain files whose name ends in .htm or .html are
# parsed; a directory that happens to carry such a name is skipped.
sub HTML_Files {
    parse_HTML_Header($File::Find::name) if /\.html?$/ && -f $_;
}

# Parse one HTML file and append its row to the CSV output.
#
# Argument: path of the file to read.
# Dies if the file cannot be opened; warns (and skips the row) if the CSV
# row cannot be written.
sub parse_HTML_Header {
    my $ifile = shift;
    print "parsing $ifile\n";

    # NOTE(review): the input is read without an encoding layer while the
    # output is written as UTF-8; non-ASCII input may need decoding first.
    open my $fh0, '<', $ifile or die "Error opening $ifile: $!\n";
    my $text = do { local $/; <$fh0> };    # slurp the whole file
    close $fh0;

    # In list context m//g returns the captured snippet of every match.
    my @matches = ($text =~ /($regex)/g);
    #print join "\n",@matches;

    my $h = HTTP::Headers->new;
    my $p = HTML::HeadParser->new($h);
    $p->parse($text);

    # Scalar assignment yields one value per tag, which keeps the CSV
    # columns aligned; only a missing header becomes '', so a value of
    # "0" is preserved.
    my @cols = map {
        my $value = $h->header($_);
        defined $value ? $value : '';
    } @TAGS;

    $csv->print($fh1, [$ifile, @cols, @matches])
        or warn "Error writing CSV row for $ifile: " . $csv->error_diag() . "\n";
}
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^3: Perl Array Question, combining HTML::HeadParser and regex
by Anonymous Monk on Feb 01, 2016 at 15:50 UTC | |
|
Re^3: Perl Array Question, combining HTML::HeadParser and regex
by Anonymous Monk on Feb 01, 2016 at 14:36 UTC | |
by poj (Abbot) on Feb 01, 2016 at 17:13 UTC |