#!/usr/bin/perl -w
# (Lead-in text of the forum post that preceded this script: "and the output")
#
# Fetch one PerlMonks thread page and list its reply rows.
#
# The page is tokenised with HTML::TokeParser.  Every <tr bgcolor=eeeeee>
# start tag is treated as a candidate reply row; a candidate is accepted only
# if it and the 14 tokens after it have exactly the shape described by
# @ROW_SHAPE below.  For each accepted row the raw reply <A ...> tag and the
# monk name are printed, then the same entries are printed again sorted by
# monk name (one entry per monk; a later row by the same monk replaces the
# earlier one in %monks).
use strict;
use LWP::Simple;
use HTML::TokeParser;

my $url = "http://perlmonks.org/index.pl?node_id=110166";

# Expected token sequence of one reply row.  Each entry is
# [ token type, optional test applied to element 1 of the token ], where
# element 1 is the tag name for "S"/"E" tokens and the text for "T" tokens.
# A plain string is compared with eq, a qr// object is matched.
my @ROW_SHAPE = (
    [ 'S', 'tr'   ],    #  0 <TR BGCOLOR=eeeeee>
    [ 'S', 'td'   ],    #  1 <TD colspan=2>
    [ 'S', 'font' ],    #  2 <font size=2>
    [ 'S', 'a'    ],    #  3 <A HREF="/index.pl?node_id=110171&lastnode_id=110166">
    [ 'T'         ],    #  4 Re: Name Space
    [ 'E', 'a'    ],    #  5 </A>
    [ 'S', 'br'   ],    #  6 <BR>
    [ 'T', qr/by/ ],    #  7 by
    [ 'S', 'a'    ],    #  8 <A HREF="/index.pl?node_id=1936&lastnode_id=110166">
    [ 'T'         ],    #  9 japhy
    [ 'E', 'a'    ],    # 10 </A>
    [ 'T', qr/on \w{3} \d{2}, \d{4} at/ ],    # 11 on Sep 04, 2001 at 13:42
    [ 'E', 'font' ],    # 12 </font>
    [ 'E', 'td'   ],    # 13 </TD>
    [ 'E', 'tr'   ],    # 14 </tr>
);

# Return true when the array ref of tokens has exactly the @ROW_SHAPE layout.
sub row_matches {
    my ($tokens) = @_;

    # A short read (end of document reached) can never be a full row.
    return 0 unless @$tokens == @ROW_SHAPE;

    for my $i ( 0 .. $#ROW_SHAPE ) {
        my ( $type, $want ) = @{ $ROW_SHAPE[$i] };
        my $tok = $tokens->[$i];

        return 0 unless $tok->[0] eq $type;
        next unless defined $want;

        if ( ref $want eq 'Regexp' ) {
            return 0 unless $tok->[1] =~ $want;
        }
        else {
            return 0 unless $tok->[1] eq $want;
        }
    }
    return 1;
}

# Attempt to download the page into memory.  get() returns undef on failure.
my $rawHTML = get($url);
die "LWP::Simple could not fetch $url" unless defined $rawHTML;

my %monks;    # monk name => "<A ...>name</A>" for the sorted listing
my $tp = HTML::TokeParser->new( \$rawHTML )
    or die "HTML::TokeParser->new failed: $!";

# And now -- a generic HTML::TokeParser loop
while ( my $t = $tp->get_token ) {
    next
        unless $$t[0] eq "S"
        and $$t[1] eq "tr"
        and exists $$t[2]->{bgcolor}
        and $$t[2]->{bgcolor} eq "eeeeee";

    # Read ahead until we hold a full row's worth of tokens or run out.
    my @t = ($t);
    while ( @t < @ROW_SHAPE ) {
        my $next = $tp->get_token;
        last unless defined $next;
        push @t, $next;
    }

    if ( row_matches( \@t ) ) {
        print $t[3][4],    # a href (original text of the start tag)
            $t[9][1],      # monk name
            "</A>|\n";
        $monks{ $t[9][1] } = "$t[3][4]" . "$t[9][1]</A>";
    }
    elsif ( @t > 1 ) {

        # Not a reply row: give the lookahead back so that a matching <tr>
        # inside it is not skipped.  The leading <tr> itself is dropped,
        # which guarantees the loop makes progress.
        $tp->unget_token( @t[ 1 .. $#t ] );
    }
}    # endof while (my $t = $tp->get_token)

undef $rawHTML;    # no more raw html
undef $tp;         # destroy the HTML::TokeParser object (don't need it no more)

print "<H1> or sorted </H1>\n";
for my $key ( sort keys %monks ) {
    print $monks{$key}, "|\n";
}

__END__
## one token per line
<TR BGCOLOR=eeeeee>
<TD colspan=2>
<font size=2>
<A HREF="/index.pl?node_id=110171&lastnode_id=110166">
Re: Name Space
</A>
<BR>
by
<A HREF="/index.pl?node_id=1936&lastnode_id=110166">
japhy
</A>
on Sep 04, 2001 at 13:42
</font>
</TD>
</tr>
Right, but like I said, I'm deliberately matching only replies of depth 1, which all do conform (only 2nd-level replies have the ul bug, and if I were parsing them, I'd just have the improper HTML in there regardless). I saw what you did ;D
___crazyinsomniac_______________________________________
Disclaimer: Don't blame. It came from inside the void
perl -e "$q=$_;map({chr unpack qq;H*;,$_}split(q;;,q*H*));print;$q/$q;"
In reply to Re: Re: (crazyinsomniac) Re: Extract info from HTML
by crazyinsomniac
in thread Extract info from HTML
by George_Sherston
| For: | Use: |
| & | &amp;amp; |
| < | &amp;lt; |
| > | &amp;gt; |
| [ | &amp;#91; |
| ] | &amp;#93; |