in reply to Skipping HTML tags with HTML::TokeParser
#!/usr/bin/perl -w
#
# ignore_some.pl - report the href of every <a> start tag in a chunk of HTML,
# except for the link that follows an "<!--ignore-->" comment.
#
# The HTML is tokenized with HTML::TokeParser.  Each token is an array
# reference whose first element is the token type; the script looks at
# 'S' (start tag), 'E' (end tag) and 'C' (comment) tokens.

use strict;
use warnings;
use HTML::TokeParser;

my $sample_HTML = "<a href=\"http://www.foobar1.com/\">link 1</a> "
    . "<!--ignore--><a href=\"http://www.foobar2.com/\">link 2</a> "
    . "<a href=\"http://www.foobar3.com/\">link 3</a> ";

my $p = HTML::TokeParser->new( \$sample_HTML );

my $link_count = 1;

while ( my $token = $p->get_token() ) {
    my $type = $token->[0];

    if ( $type eq 'S' && $token->[1] eq 'a' ) {
        # Start of an anchor: report its href attribute.
        my $text = $token->[2]->{'href'};
        print "Found link $link_count: $text\n";
        $link_count++;
    }
    elsif ( $type eq 'C' && $token->[1] eq '<!--ignore-->' ) {
        # Discard everything up to and including the next </a>.  The stop
        # condition needs BOTH tests to hold (an end tag, and for 'a');
        # stopping on either one alone would end the skip at the wrong
        # token.  The loop also ends when get_token() runs out of input,
        # so an ignore marker with no closing </a> cannot hang the script.
        while ( my $skipped = $p->get_token() ) {
            last if $skipped->[0] eq 'E' && $skipped->[1] eq 'a';
        }
    }
}

__END__

__output___
%perl ignore_some.pl
Found link 1: http://www.foobar1.com/
Found link 2: http://www.foobar3.com/
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Skipping HTML tags using the "get_token" method with HTML::TokeParser module
by Anonymous Monk on Mar 14, 2002 at 10:04 UTC | |
by ichimunki (Priest) on Mar 14, 2002 at 19:14 UTC |