in reply to Re: Problem with parsing HTML with Regex's
in thread Problem with parsing HTML with Regex's
use strict;
use warnings;

use Data::Dumper;
use YAPE::HTML;

# Demo: collect link targets from HTML with a real tokenizer (YAPE::HTML)
# instead of regular expressions.  Gathers the href of every <a> and <link>
# tag and the src of every <img> tag, then dumps the three lists.

# Sample markup covering quoted, single-quoted and unquoted attribute values,
# mixed-case tag/attribute names and varying attribute order.  The pieces are
# joined with single spaces, with a leading and a trailing space.
my $content = join ' ', '',
    q{<img src="img.link1">},
    q{<img src="img.link2" alt="foo">},
    q{<img height=20 width=25 src=img.link3 >},
    q{<IMG src='img.link4'>},
    q{<link href="css.link1">},
    q{<a class=foo href="normal.link1">},
    q{<A href=normal.link2 class="foo" >},
    q{<a href="normal.link3">},
    q{<a class=foo href='normal.link4'>},
    q{<a Href='normal.link5'>},
    '';

my $parser = YAPE::HTML->new($content);

my ( @a, @link, @img );

# For each tag of interest: which attribute carries the URL and which list
# receives it.  Keys are lower-case; the recorded output below shows that
# <IMG> and Href are picked up by these lower-case lookups as well.
my %rule_for = (
    a    => { attr => 'href', into => \@a },
    link => { attr => 'href', into => \@link },
    img  => { attr => 'src',  into => \@img },
);

# here is the tokenizing part
CHUNK:
while ( my $chunk = $parser->next ) {
    next CHUNK unless $chunk->type eq 'tag';

    # Tags we do not care about have no rule and are skipped.
    my $rule = $rule_for{ $chunk->tag } or next CHUNK;

    my $attr = $rule->{attr};
    next CHUNK unless $chunk->has_attr($attr);

    push @{ $rule->{into} }, $chunk->get_attr($attr);
}

print Dumper \@img, \@link, \@a;

__END__
$VAR1 = [
          'img.link1',
          'img.link2',
          'img.link3',
          'img.link4'
        ];
$VAR2 = [
          'css.link1'
        ];
$VAR3 = [
          'normal.link1',
          'normal.link2',
          'normal.link3',
          'normal.link4',
          'normal.link5'
        ];
|
|---|