I have a perl script that I'm debugging and looking for some assistance. My script looks in a folder and pulls out header and meta tags and writes to one file. I just added a piece where it also does a reggae search and is supposed to write to the same file/line as the header/tag parse. I'm getting all of the info in the file that I need but it's a mess. I'm getting duplicate entries of the file name across cell when I open the CSV in excel and the regex print to file after find appear but above and below the file entry. I will continue to troubleshoot but thought that I would post with hopes that someone a little more skilled than me at perl (I'm an html and java guy) can point me in the right direction. Thanks in advance. Please see my ugly but somewhat working code below
use strict; use warnings; use File::Find; use HTTP::Headers; use HTML::HeadParser; use Text::CSV; my $csv1 = Text::CSV->new ( { binary => 1 } ) or die Text::CSV->error_ +diag(); my $csv2 = Text::CSV->new ( { binary => 1 } ) or die Text::CSV->error_ +diag(); $csv1->eol ("\n"); $csv2->eol ("\n"); my $csv3= "\n"; my $dfile = 'all_tags.csv'; open my $fh1, ">:encoding(utf8)", "$dfile" or die "Error opening $dfil +e: $!"; #open my $fh2, ">:encoding(utf8)", "$dfile" or die "Error opening $dfi +le: $!"; my $dir = 'Test'; find ({wanted =>\&HTML_Files, no_chdir => 1}, $dir); ## find({wanted => \&listfiles, no_chdir => 1}, "."); sub listfiles{ # print $File::Find::name,"\n"; } ## #print "directory is"; #print $dir; close $fh1 or die "Error closing $dfile: $!"; exit; sub HTML_Files { Parse_HTML_Header($File::Find::name) if /\.html?$/; } sub Parse_HTML_Header { my $ifile = $File::Find::name; print $ifile; open(my $fh0, '<', $ifile) or die "Error opening $ifile: $!\n"; my $text = ''; { $/ = undef; $text = <$fh0>; } close $fh0; my $h = HTTP::Headers->new; my $p = HTML::HeadParser->new($h); $p->parse($text); my $csv = Text::CSV->new({eol => $/}); my @fields = ('Content-Base', 'Title', 'X-Meta-author', 'X-Meta-des +cription', 'X-Meta-keywords', 'X-Meta-name',); #print $fh1, "Hi"; #for ($h->header_field_names) {$csv->print($fh1, [map { $h->header( +$_)} @fields]); for ($h->header_field_names) # {$csv->print ($fh1, [map {$File::Find::name, $h->header($_), "=CH +AR(13)"} @fields]); #print $fh1 "\n"; # my $string = quotemeta 'CEO'; my $slurp; { local $/ = undef; #no_chdir => 1; #chdir("/Users/tlialin/desktop/01282016/"); print $ifile; open my $textfile, '<', $ifile or die $!; $slurp = <$textfile>; close $textfile; } while( $slurp =~ m/ ( .{0,25} $string.{0,25} ) /gisx ) { #print "Found $1\n"; print $fh1 $1; print $fh1 "\n"; } # } }
In reply to Parsing web content with File::Find, HTML::HeadParser and using a regex by Anonymous Monk
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |