Anonymous Monk has asked for the wisdom of the Perl Monks concerning the following question:
I have a perl script that I'm debugging and looking for some assistance. My script looks in a folder and pulls out header and meta tags and writes to one file. I just added a piece where it also does a reggae search and is supposed to write to the same file/line as the header/tag parse. I'm getting all of the info in the file that I need but it's a mess. I'm getting duplicate entries of the file name across cell when I open the CSV in excel and the regex print to file after find appear but above and below the file entry. I will continue to troubleshoot but thought that I would post with hopes that someone a little more skilled than me at perl (I'm an html and java guy) can point me in the right direction. Thanks in advance. Please see my ugly but somewhat working code below
use strict; use warnings; use File::Find; use HTTP::Headers; use HTML::HeadParser; use Text::CSV; my $csv1 = Text::CSV->new ( { binary => 1 } ) or die Text::CSV->error_ +diag(); my $csv2 = Text::CSV->new ( { binary => 1 } ) or die Text::CSV->error_ +diag(); $csv1->eol ("\n"); $csv2->eol ("\n"); my $csv3= "\n"; my $dfile = 'all_tags.csv'; open my $fh1, ">:encoding(utf8)", "$dfile" or die "Error opening $dfil +e: $!"; #open my $fh2, ">:encoding(utf8)", "$dfile" or die "Error opening $dfi +le: $!"; my $dir = 'Test'; find ({wanted =>\&HTML_Files, no_chdir => 1}, $dir); ## find({wanted => \&listfiles, no_chdir => 1}, "."); sub listfiles{ # print $File::Find::name,"\n"; } ## #print "directory is"; #print $dir; close $fh1 or die "Error closing $dfile: $!"; exit; sub HTML_Files { Parse_HTML_Header($File::Find::name) if /\.html?$/; } sub Parse_HTML_Header { my $ifile = $File::Find::name; print $ifile; open(my $fh0, '<', $ifile) or die "Error opening $ifile: $!\n"; my $text = ''; { $/ = undef; $text = <$fh0>; } close $fh0; my $h = HTTP::Headers->new; my $p = HTML::HeadParser->new($h); $p->parse($text); my $csv = Text::CSV->new({eol => $/}); my @fields = ('Content-Base', 'Title', 'X-Meta-author', 'X-Meta-des +cription', 'X-Meta-keywords', 'X-Meta-name',); #print $fh1, "Hi"; #for ($h->header_field_names) {$csv->print($fh1, [map { $h->header( +$_)} @fields]); for ($h->header_field_names) # {$csv->print ($fh1, [map {$File::Find::name, $h->header($_), "=CH +AR(13)"} @fields]); #print $fh1 "\n"; # my $string = quotemeta 'CEO'; my $slurp; { local $/ = undef; #no_chdir => 1; #chdir("/Users/tlialin/desktop/01282016/"); print $ifile; open my $textfile, '<', $ifile or die $!; $slurp = <$textfile>; close $textfile; } while( $slurp =~ m/ ( .{0,25} $string.{0,25} ) /gisx ) { #print "Found $1\n"; print $fh1 $1; print $fh1 "\n"; } # } }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Parsing web content with File::Find, HTML::HeadParser and using a regex
by poj (Abbot) on Jan 31, 2016 at 19:59 UTC | |
|
Re: Parsing web content with File::Find, HTML::HeadParser and using a regex
by Anonymous Monk on Feb 01, 2016 at 01:31 UTC |