Aberdeen%20Genetic%20purity , News Search

##

Filename	Content-Base	Title	X-Meta-author	X-Meta-description	X-Meta-keywords	X-Meta-name
Test/1.html	Aberdeen%20Animal%20trait%20analysis , News Search | Ask.com					

##

##

# Escape the keyword so it is matched literally inside the pattern.
my $string = quotemeta 'CEO';

# Collect every occurrence of the keyword together with up to 25 characters
# of surrounding text on each side, then write each snippet to the output
# handle followed by a comma.
for my $snippet ( $text =~ m/ ( .{0,25} $string.{0,25} ) /gisx ) {
    print $fh1 $snippet, ",";
}

##

##

Test/Ames_Animal trait analysis.html.result.txt_parsed_for_news.txt.html	Ames Animal trait analysis , News Search | Ask.com						
Test/Ames_Biobank.html.result.txt_parsed_for_clinic.txt.html							
both adults and infants. Dr. Kocher has requested who	hrough a separate study. Dr. Lazaridis' samples	 alon	colon and rectal cancer. Dr. Nelson has requested sto	rointestinal microbiome. Dr. Nelson and her colleague	in a new research study. Dr. Ames is recruiting parti	ers.	
In addition	 Dr. Thibodeau has expanded t	sh; who have PKD.					
Dr. Harris' goal is to bette	h another study.						
							
Dr. Heit has also asked for 	ients who've had a clot. Dr. Heit's goal is to identi	 To study microvesicles	 Dr. Jayachandran is requesti	pice caregivers.			
							
Dr. Kaur is researching whet	18">Nilufer Taner	 M.D.	 Ph.D.	 is studying geneti	0027660">Janet E. Olson	 Ph.D.	 and Awards							
    ##

# Escape the keyword so it is matched literally inside the pattern.
my $string = quotemeta 'CEO';

# Collect every occurrence of the keyword together with up to 25 characters
# of surrounding text on each side, then write each snippet to the output
# handle followed by a comma.
for my $snippet ( $text =~ m/ ( .{0,25} $string.{0,25} ) /gisx ) {
    print $fh1 $snippet, ",";
}

####

#!perl
use strict;
use warnings;
use File::Find;
use HTTP::Headers;
use HTML::HeadParser;
use Text::CSV;

# config
my $dfile = 'all_tags.csv';    # CSV report written by this script
my $dir   = 'Test';            # directory tree searched for HTML files

# Header fields collected by HTML::HeadParser; each becomes one CSV column.
my @TAGS = ('Content-Base', 'Title',
            'X-Meta-author', 'X-Meta-description',
            'X-Meta-keywords', 'X-Meta-name',);

# Keywords searched for in the full document text; each becomes one CSV
# column holding the text surrounding every occurrence.
# ('seed investment' used to be listed twice, which would have produced a
# duplicate column; it is listed once now.)
my @TAGS2 = ('CEO', 'founder',
             'professor', 'Dr.',
             'Ph.D', 'M.D.',
             'company called', 'startup called',
             'joins', 'receives funding',
             'SBIR', 'receiving the grant',
             'seed investment', 'seed fund',
             'appointed', 'chosen',
             'secures', 'award',
             'awarded',
            );

# Number of characters of context captured on each side of a keyword.
my $CONTEXT_CHARS = 25;

# One precompiled, case-insensitive pattern per keyword.  \Q...\E escapes
# the keyword so that '.' in 'Dr.' or 'Ph.D' is matched literally, and /s
# lets the context run across line breaks.
my %context_re_for;
for my $keyword (@TAGS2) {
    $context_re_for{$keyword}
        = qr/(.{0,$CONTEXT_CHARS}\Q$keyword\E.{0,$CONTEXT_CHARS})/is;
}

# output
# binary => 1 is required so that fields containing non-ASCII characters
# or embedded line breaks can be written.
my $csv = Text::CSV->new({ binary => 1, eol => $/ })
    or die "Cannot create Text::CSV object: " . Text::CSV->error_diag() . "\n";
open my $fh1, ">:encoding(utf8)", $dfile
    or die "Error opening $dfile: $!";

# A single header row: file name, header-tag columns, keyword columns.
$csv->print($fh1, ['Filename', @TAGS, @TAGS2])
    or die "Error writing $dfile: " . $csv->error_diag() . "\n";

# input
find ({wanted =>\&HTML_Files, no_chdir => 1}, $dir);
close $fh1 or die "Error closing $dfile: $!";
exit;

# File::Find callback.  Because find() is called with no_chdir => 1, $_
# holds the same path as $File::Find::name.  Only plain files whose name
# ends in .htm or .html are processed.
sub HTML_Files {
    return unless /\.html?$/;
    return unless -f $File::Find::name;
    parse_HTML_Header($File::Find::name);
    return;
}

# Parse one HTML file and append one row to the CSV report.
#
# Parameter:
#   $ifile - path of the HTML file to read.
#
# The row holds the file name, the value of every header in @TAGS (empty
# string when the header is absent) and, for every keyword in @TAGS2, all
# context snippets found in the document joined with ' | ' (empty string
# when the keyword does not occur).
#
# Dies when the file cannot be opened or the row cannot be written.
sub parse_HTML_Header {

    my $ifile = shift;
    print "parsing $ifile\n";

    # NOTE(review): the file is read without an input encoding layer while
    # the report is written as UTF-8; confirm the encoding of the input
    # files if non-ASCII text looks wrong in the output.
    open my $fh0, '<', $ifile or die "Error opening $ifile: $!\n";
    my $text = do { local $/; <$fh0> };    # slurp the whole file
    close $fh0;

    my $h = HTTP::Headers->new;
    my $p = HTML::HeadParser->new($h);
    $p->parse($text);

    # header() is called in scalar context on purpose: in list context a
    # missing header yields an empty list, which would shift every later
    # column one place to the left.
    my @cols = map {
        my $value = $h->header($_);
        defined $value ? $value : '';
    } @TAGS;

    # One cell per keyword, containing every context snippet for it.
    my @cols2;
    for my $keyword (@TAGS2) {
        my $re       = $context_re_for{$keyword};
        my @snippets = $text =~ /$re/g;

        # Collapse line breaks and runs of whitespace so a snippet stays
        # on one line inside its cell.
        s/\s+/ /g for @snippets;

        push @cols2, join ' | ', @snippets;
    }

    $csv->print($fh1, [$ifile, @cols, @cols2])
        or die "Error writing $dfile: " . $csv->error_diag() . "\n";

    return;
}