in reply to Parsing web content with File::Find, HTML::HeadParser and using a regex
I think your problem is here, outputting the same line for each header tag:
for ($h->header_field_names){ $csv->print ($fh1, [map {$File::Find::name, $h->header($_), "=CHAR(13)"} @fields]); . .
Here's a cleaned up version of your script (untested)
#!perl
# Walk a directory tree, parse the <head> section of every .htm/.html file
# and write one CSV row per file containing the selected header fields.
# Snippets of text surrounding a search term are appended after each row.
use strict;
use warnings;
use File::Find;
use HTTP::Headers;
use HTML::HeadParser;
use Text::CSV;

# config
my $dfile  = 'all_tags.csv';    # output CSV file
my $dir    = 'Test';            # root directory to scan
my $SEARCH = 'CEO';             # literal text to look for in each file
my @TAGS   = (
    'Content-Base',
    'Title',
    'X-Meta-author',
    'X-Meta-description',
    'X-Meta-keywords',
    'X-Meta-name',
);

# output
my $csv = Text::CSV->new({eol => $/});
open my $fh1, ">:encoding(utf8)", $dfile
    or die "Error opening $dfile: $!";
$csv->print($fh1, ['Filename', @TAGS]);    # header

# input
# no_chdir => 1 keeps the working directory fixed, so $_ inside the
# callback holds the same path as $File::Find::name.
find({wanted => \&HTML_Files, no_chdir => 1}, $dir);

# Buffered write errors only surface at close, so check it.
close $fh1 or die "Error closing $dfile: $!";
exit;

# File::Find callback: hand every plain file whose name ends in
# .htm or .html to parse_HTML_Header().
sub HTML_Files {
    # -f skips directories that happen to be named like an HTML file.
    if (-f $_ and /\.html?$/) {
        parse_HTML_Header($File::Find::name);
    }
    return;
}

# Parse one HTML file and append its data to the CSV output.
#
# Parameters:
#   $ifile - path of the HTML file to read.
#
# Writes one CSV row (file name followed by one cell per entry in @TAGS)
# to $fh1, then one raw line per occurrence of $SEARCH with up to 25
# characters of context on either side.
# Dies if the file cannot be opened.
sub parse_HTML_Header {
    my $ifile = shift;
    print "parsing $ifile\n";

    # Slurp the whole file.
    # NOTE(review): the file is read without a decoding layer; confirm the
    # input encoding if entities or non-ASCII text come out wrong.
    open my $fh0, '<', $ifile or die "Error opening $ifile: $!\n";
    my $text = do { local $/; <$fh0> };
    close $fh0;

    my $h = HTTP::Headers->new;
    my $p = HTML::HeadParser->new($h);
    $p->parse($text);

    # header() in list context returns an empty list for a missing field
    # and several elements for a repeated one, which would shift the CSV
    # columns.  Scalar assignment forces exactly one value per tag.
    my @cols = map {
        my $value = $h->header($_);
        defined $value ? $value : '';
    } @TAGS;
    $csv->print($fh1, [$ifile, @cols]);

    # quotemeta makes the search term literal; it also escapes whitespace,
    # so a term containing spaces still matches under /x.
    my $string = quotemeta $SEARCH;
    while ($text =~ m/ ( .{0,25} $string.{0,25} ) /gisx) {
        # NOTE(review): these lines bypass Text::CSV and are written raw.
        print {$fh1} $1, "\n";
    }
    return;
}
Did you solve this HTML::HeadParser challenges ?
poj
|
|---|