in reply to Re: Problem parsing through downloaded web pages
in thread Problem parsing through downloaded web pages

Links from file:
<tr bgcolor="#cccccc"> <th align="center"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">1</font></th><td align="left"><a href="http://securities.stanford.edu/1014/TCHC00"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">21st Century Holding Co.</font></a></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://biz.yahoo.com/p/T/TCHC.html">NASDAQ</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://finance.yahoo.com/q?s=TCHC&d=t">TCHC</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">06/27/2000</font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">S.D. New York</font></td> </tr>
<tr> <th align="center"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">2</font></th><td align="left"><a href="http://securities.stanford.edu/1009/TMRT99"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">2TheMart.com</font></a></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://biz.yahoo.com/p/T/TMRT.html">OTC-BB</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://finance.yahoo.com/q?s=TMRT&d=t">TMRT</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">09/13/1999</font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">C.D. California</font></td> </tr>
<tr bgcolor="#cccccc"> <th align="center"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">3</font></th><td align="left"><a href="http://securities.stanford.edu/1024/SIXQPK02-01"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">360Networks, Inc.</font></a></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://biz.yahoo.com/p/T/TSIXQ.html">OTC-BB</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://finance.yahoo.com/q?s=TSIXQ&d=t">TSIXQ</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">06/21/2002</font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">S.D. New York</font></td> </tr>
<tr> <th align="center"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">4</font></th><td align="left"><a href="http://securities.stanford.edu/1009/COMS97"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">3Com Corporation 97</font></a></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://biz.yahoo.com/p/C/COMS.html">NASDAQ</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://finance.yahoo.com/q?s=COMS&d=t">COMS</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">12/05/1997</font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">N.D. California</font></td> </tr>
Code:
use strict;
use warnings;
use LWP;
use HTML::LinkExtor;
use utf8;

# The mother function.
#
# GetLink($url)
#   Issues an HTTP GET for $url and returns the response body exactly as
#   HTTP::Response->content hands it back.
#   Dies with "<url>: <response message>\n" when the response is not a
#   success.
sub GetLink {
    my ($thelink) = @_;

    my $theagent    = LWP::UserAgent->new;
    my $therequest  = HTTP::Request->new(GET => $thelink);
    my $theresponse = $theagent->request($therequest);

    $theresponse->is_success
        or die "$thelink: ", $theresponse->message, "\n";

    return $theresponse->content;
}

# Retrieve each entry into the Stanford database.
# Read in a preferences and/or file with a firm code (ticker, permno, etc..)
my %firmArray;    # not populated yet; kept for the planned per-firm records

# Master list of firms: an HTML table whose rows link to the per-case pages.
open my $firms_fh, '<', 'firms.htm'
    or die "The master list file didn't open: $!";

# Case-page URL as it appears in the master list.  The dots in the host
# name are escaped so they only match literal dots.
my $case_url_re = qr{(http://securities\.stanford\.edu/\w*/\w*-?\w*-?\w*)};

# Running count of fetched pages; also used to name the output files.
my $tempIndex = 0;

while (my $line = <$firms_fh>) {

    # LAYOUT NEEDED VARIABLES
    # Placeholders for the fields to be extracted later.  Each list is
    # initialised element by element: a single scalar on the right-hand
    # side would set only the first variable and leave the rest undef.
    my ($ticker, $name, $court, $docketNumber)                = ('') x 4;
    my ($dateFiled, $classStart, $classEnd, $plaintiffFirms)  = (0) x 4;
    my ($tenB5, $SEA1933, $SEA1934)                           = (0) x 3;
    my ($settle, $insurance, $fAndM)                          = (0) x 3;

    # FIND AND HIT EACH URL
    # /g in a while loop visits every case link on the line, not only the
    # first one.
    while ($line =~ /$case_url_re/g) {
        # Copy the capture immediately; the matches below would clobber $1.
        my $theFirmObs = $1;
        $tempIndex++;

        my $theFile = GetLink($theFirmObs);    # Use the mother function
        print "$tempIndex: $theFirmObs\n";     # DEBUG

        # Kill the header and left column to make it easier?
        # Output file: including name based on code given, date, etc.
        # name it .htm
        open my $page_fh, '>', "$tempIndex.htm"
            or die "The outfile didn't work: $!";
        print {$page_fh} $theFile;
        close $page_fh
            or die "Couldn't finish writing $tempIndex.htm: $!";

        # Second output file.  Nothing is written to it yet (the prints
        # below are still commented out), so it is created empty.
        open my $notes_fh, '>', "${tempIndex}z.htm"
            or die "The second outfile didn't work: $!";

        if ($theFile =~ m%Conclusion:%) {
            print "went in A\n";
            # print {$notes_fh} "conclusion ";
        }
        if ($theFile =~ m%Summary:%) {
            print "went in B\n";
            # print {$notes_fh} "Summary ";
        }
        if ($theFile =~ m%the%) {
            print "went in 1\n";
            # print {$notes_fh} "$tempIndex: the\n";
        }

        close $notes_fh
            or die "Couldn't close ${tempIndex}z.htm: $!";

        # Debugging cap: stop after the fourth page has been processed.
        if ($tempIndex > 3) { die "done now.\n"; }
    }
}

close $firms_fh;