####
Filename Content-Base Title X-Meta-author X-Meta-description X-Meta-keywords X-Meta-name
Test/1.html Aberdeen%20Animal%20trait%20analysis , News Search | Ask.com
####
# Write every case-insensitive occurrence of 'CEO', together with up to
# 25 characters of context on either side, to $fh1; each hit is followed
# by a comma.  /s lets the context run across line breaks.
my $string = quotemeta 'CEO';
for my $hit ( $text =~ m/ ( .{0,25} $string.{0,25} ) /gisx ) {
    print {$fh1} $hit, ",";
}
####
Test/Ames_Animal trait analysis.html.result.txt_parsed_for_news.txt.html Ames Animal trait analysis , News Search | Ask.com
Test/Ames_Biobank.html.result.txt_parsed_for_clinic.txt.html
both adults and infants. Dr. Kocher has requested who hrough a separate study. Dr. Lazaridis' samples alon colon and rectal cancer. Dr. Nelson has requested sto rointestinal microbiome. Dr. Nelson and her colleague in a new research study. Dr. Ames is recruiting parti ers.
In addition Dr. Thibodeau has expanded t sh; who have PKD.
Dr. Harris' goal is to bette h another study.
Dr. Heit has also asked for ients who've had a clot. Dr. Heit's goal is to identi To study microvesicles Dr. Jayachandran is requesti pice caregivers.
Dr. Kaur is researching whet 18">Nilufer Taner M.D. Ph.D. is studying geneti 0027660">Janet E. Olson Ph.D. and Awards
##
# Collect each case-insensitive match of 'CEO' plus at most 25 characters
# before and after it (newlines included, because of /s), then print the
# matches to $fh1, each one followed by a comma.
my $string = quotemeta 'CEO';
my @snippets = $text =~ m/ ( .{0,25} $string.{0,25} ) /gisx;
for my $snippet (@snippets) {
    print {$fh1} $snippet, ",";
}
####
#!perl
use strict;
use warnings;
use File::Find;
use HTTP::Headers;
use HTML::HeadParser;
use Text::CSV;
# config
my $dfile = 'all_tags.csv';    # CSV report, written relative to the cwd
my $dir   = 'Test';            # directory tree searched for HTML files

# <head> fields, under the names HTML::HeadParser stores them in the
# HTTP::Headers object.  One CSV column each.
my @TAGS = (
    'Content-Base',    'Title',
    'X-Meta-author',   'X-Meta-description',
    'X-Meta-keywords', 'X-Meta-name',
);

# Keywords searched for in the full document text.  One CSV column each.
# (The list previously contained 'seed investment' twice; the duplicate
# would only have produced a duplicate column.)
my @TAGS2 = (
    'CEO',             'founder',
    'professor',       'Dr.',
    'Ph.D',            'M.D.',
    'company called',  'startup called',
    'joins',           'receives funding',
    'SBIR',            'receiving the grant',
    'seed investment', 'seed fund',
    'appointed',       'chosen',
    'secures',         'award',
    'awarded',
);

# Number of characters of context kept on either side of a keyword hit.
my $CONTEXT = 25;

# One precompiled pattern per keyword.  \Q...\E makes the keyword literal
# (so the dots in 'Dr.' / 'M.D.' are not wildcards), /i ignores case and
# /s lets the surrounding context span line breaks.
my %context_re_for
    = map { ( $_ => qr/(.{0,$CONTEXT}\Q$_\E.{0,$CONTEXT})/is ) } @TAGS2;

# output
# binary => 1 is required for fields that contain newlines or non-ASCII
# characters; without it Text::CSV refuses to write such rows.
my $csv = Text::CSV->new({ binary => 1, eol => $/ })
    or die "Cannot create Text::CSV object: " . Text::CSV->error_diag;
open my $fh1, ">:encoding(utf8)", $dfile
    or die "Error opening $dfile: $!";

# Single header row: file name, <head> fields, then one column per keyword.
$csv->print($fh1, ['Filename', @TAGS, @TAGS2])
    or die "Error writing header to $dfile: " . $csv->error_diag;

# input
find({ wanted => \&HTML_Files, no_chdir => 1 }, $dir);
close $fh1 or die "Error closing $dfile: $!";
exit;

# File::Find callback.  With no_chdir => 1, $_ holds the same full path as
# $File::Find::name.  Only plain files ending in .htm or .html are parsed.
sub HTML_Files {
    return unless /\.html?\z/ && -f $File::Find::name;
    parse_HTML_Header($File::Find::name);
    return;
}

# Parse one HTML file and append one CSV row for it to $fh1.
#
# Parameter:
#   $ifile - path of the HTML file to read.
#
# The row holds the file name, the value of every field in @TAGS, and for
# every keyword in @TAGS2 all context snippets found in the document,
# joined with ' | ' (empty when the keyword does not occur).
#
# Dies if the file cannot be opened; warns if the row cannot be written.
sub parse_HTML_Header {
    my $ifile = shift;
    print "parsing $ifile\n";

    # NOTE(review): the file is read as raw bytes while the report is
    # written as UTF-8 -- confirm the input encoding and add a matching
    # layer here if the pages are not plain ASCII/Latin-1.
    open my $fh0, '<', $ifile or die "Error opening $ifile: $!\n";
    my $text = do { local $/; <$fh0> };
    close $fh0;

    my $h = HTTP::Headers->new;
    my $p = HTML::HeadParser->new($h);
    $p->parse($text);

    # Force scalar context: in list context header() returns an empty
    # list for a missing field (and several values for a repeated one),
    # which would shift every following column.
    my @cols = map { scalar $h->header($_) } @TAGS;

    my @cols2;
    for my $keyword (@TAGS2) {
        my $re   = $context_re_for{$keyword};
        my @hits = $text =~ /$re/g;

        # Collapse line breaks and runs of blanks so that each snippet
        # stays on one line inside its CSV cell.
        s/\s+/ /g for @hits;
        push @cols2, join ' | ', @hits;
    }

    $csv->print($fh1, [$ifile, @cols, @cols2])
        or warn "Error writing row for $ifile: " . $csv->error_diag . "\n";
    return;
}