I've written a perl script that reads multiple html files in a folder and writes specific headers grabbed to one file line by line, 1 for each html file. The script works fine on HTML examples that I've created (very basic) but writes nothing when I use real html files from the web. I'm thinking that certain characters in the print function that are getting parsed are confusing the parser. I'm looking in to using HTML::Tidy to clean the HTML in the folders before I run the script but I'm not sure if that will help. Here's my code:
use strict; use warnings; use File::Find; use HTTP::Headers; use HTML::HeadParser; use Text::CSV; my $csv1 = Text::CSV->new ( { binary => 1 } ) or die Text::CSV->error_ +diag(); my $csv2 = Text::CSV->new ( { binary => 1 } ) or die Text::CSV->error_ +diag(); $csv1->eol ("\n"); $csv2->eol ("\n"); my $csv3= "\n"; my $dfile = 'all_tags.csv'; open my $fh1, ">:encoding(utf8)", "$dfile" or die "Error opening $dfil +e: $!"; #open my $fh2, ">:encoding(utf8)", "$dfile" or die "Error opening $dfi +le: $!"; my $dir = 'Test'; find ({wanted =>\&HTML_Files, no_chdir => 1}, $dir); ## find({wanted => \&listfiles, no_chdir => 1}, "."); sub listfiles{ # print $File::Find::name,"\n"; } ## #print "directory is"; #print $dir; close $fh1 or die "Error closing $dfile: $!"; exit; sub HTML_Files { Parse_HTML_Header($File::Find::name) if /\.html?$/; } sub Parse_HTML_Header { my $ifile = $File::Find::name; print $ifile; open(my $fh0, '<', $ifile) or die "Error opening $ifile: $!\n"; my $text = ''; { $/ = undef; $text = <$fh0>; } close $fh0; my $h = HTTP::Headers->new; my $p = HTML::HeadParser->new($h); $p->parse($text); my $csv = Text::CSV->new({eol => $/}); my @fields = ('Title', 'X-Meta-Author', 'X-Meta-Description', 'X-Me +ta-name'); #print $fh1, "Hi"; #for ($h->header_field_names) {$csv->print($fh1, [map { $h->header( +$_)} @fields]); for ($h->header_field_names) # # {$csv->print ($fh1, [map {$File::Find::name, $h->header($_), "=CH +AR(13)"} @fields]); # print $fh1 "\n"; # } }
any ideas or suggesting of other pm s to look at are greatly appreciated.
In reply to HTML::HeadParser challenges by Anonymous Monk
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |