in reply to Problem parsing through downloaded web pages

Please do post some code and example text. Without that, we can only guess at what you have, how you're getting it, what it is, and what you expect it to be.

  • Comment on Re: Problem parsing through downloaded web pages

Replies are listed 'Best First'.
Re^2: Problem parsing through downloaded web pages
by malomar66 (Acolyte) on Jan 05, 2007 at 07:07 UTC
    Links from file:
    <tr bgcolor="#cccccc"> <th align="center"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">1</font></th><td align="left"><a href="http://securities.stanford.edu/1014/TCHC00"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">21st Century Holding Co.</font></a></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://biz.yahoo.com/p/T/TCHC.html">NASDAQ</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://finance.yahoo.com/q?s=TCHC&d=t">TCHC</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">06/27/2000</font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">S.D. New York</font></td> </tr>
    <tr> <th align="center"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">2</font></th><td align="left"><a href="http://securities.stanford.edu/1009/TMRT99"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">2TheMart.com</font></a></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://biz.yahoo.com/p/T/TMRT.html">OTC-BB</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://finance.yahoo.com/q?s=TMRT&d=t">TMRT</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">09/13/1999</font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">C.D. California</font></td> </tr>
    <tr bgcolor="#cccccc"> <th align="center"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">3</font></th><td align="left"><a href="http://securities.stanford.edu/1024/SIXQPK02-01"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">360Networks, Inc.</font></a></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://biz.yahoo.com/p/T/TSIXQ.html">OTC-BB</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://finance.yahoo.com/q?s=TSIXQ&d=t">TSIXQ</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">06/21/2002</font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">S.D. New York</font></td> </tr>
    <tr> <th align="center"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">4</font></th><td align="left"><a href="http://securities.stanford.edu/1009/COMS97"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">3Com Corporation 97</font></a></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://biz.yahoo.com/p/C/COMS.html">NASDAQ</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times"><a href="http://finance.yahoo.com/q?s=COMS&d=t">COMS</a></font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">12/05/1997</font></td><td align="left"><font size="2" color="#330066" FACE="Garmond,Helvetica,Times">N.D. California</font></td> </tr>
    Code:
    use strict;
    use warnings;
    use utf8;

    use LWP;
    use HTML::LinkExtor;

    # Debugging aid: the script stops once more than this many pages
    # have been fetched.
    my $DEBUG_FETCH_LIMIT = 3;

    # GetLink($url)
    #
    # The "mother function": performs an HTTP GET on $url and returns the
    # response body exactly as HTTP::Response::content provides it.
    # Dies with the URL and the response's status message when the
    # request is not successful.
    sub GetLink {
        my ($thelink) = @_;

        my $theagent    = LWP::UserAgent->new;
        my $therequest  = HTTP::Request->new( GET => $thelink );
        my $theresponse = $theagent->request($therequest);

        $theresponse->is_success
            or die "$thelink: ", $theresponse->message, "\n";

        return $theresponse->content;
    }

    # Retrieve each entry in the Stanford database.
    # Read in a preferences and/or file with a firm code (ticker, permno, etc.)
    my %firmArray;    # placeholder; nothing is stored in it yet

    # Three-argument open with a lexical handle; include $! so a failure
    # says why the file could not be opened.
    open my $firms_fh, '<', 'firms.htm'
        or die "The master list file didn't open: $!";

    # Running count of fetched pages; also used to name the output files.
    my $tempIndex = 0;

    while ( my $line = <$firms_fh> ) {

        # FIND AND HIT EACH URL
        # The /g loop visits every matching URL on the line, so several
        # table rows sharing one physical line are all processed.  The
        # dots in the host name are escaped so they match literally.
        while ( $line =~ m{(http://securities\.stanford\.edu/\w*/\w*-?\w*-?\w*)}g ) {
            my $theFirmObs = $1;

            # LAYOUT NEEDED VARIABLES
            # Placeholders for fields still to be parsed out of each page.
            # Each list is initialised element by element; assigning a
            # single scalar to a my(...) list only sets the first variable.
            my ( $ticker, $name, $court, $docketNumber ) = ('') x 4;
            my ( $dateFiled, $classStart, $classEnd, $plaintiffFirms ) = (0) x 4;
            my ( $tenB5,  $SEA1933,   $SEA1934 ) = (0) x 3;
            my ( $settle, $insurance, $fAndM )   = (0) x 3;

            $tempIndex++;
            my $theFile = GetLink($theFirmObs);    # Use the mother function
            print "$tempIndex: $theFirmObs\n";     # DEBUG

            # Kill the header and left column to make it easier?
            # Output file: named after the running index, with an .htm
            # extension.
            open my $page_fh, '>', "$tempIndex.htm"
                or die "The outfile didn't work: $!";
            print {$page_fh} $theFile;
            close $page_fh
                or die "Closing $tempIndex.htm failed: $!";

            # Second output file; it is created (and truncated) here but
            # nothing is written to it yet.
            open my $notes_fh, '>', "${tempIndex}z.htm"
                or die "The second outfile didn't work: $!";

            if ( $theFile =~ m{Conclusion:} ) {
                print "went in A\n";
                # print {$notes_fh} "conclusion ";
            }
            if ( $theFile =~ m{Summary:} ) {
                print "went in B\n";
                # print {$notes_fh} "Summary ";
            }
            if ( $theFile =~ m{the} ) {
                print "went in 1\n";
                # print {$notes_fh} "$tempIndex: the\n";
            }

            close $notes_fh
                or die "Closing ${tempIndex}z.htm failed: $!";

            # Debug stop so a test run does not fetch the whole list.
            if ( $tempIndex > $DEBUG_FETCH_LIMIT ) {
                die "done now.\n";
            }
        }
    }

    close $firms_fh;