#!/usr/bin/perl -w
#
# Demonstrates two ways of pulling link targets out of an HTML document and
# filtering them against a case-insensitive "omit" list:
#   1. a real parser (HTML::TreeBuilder), and
#   2. a pair of regular expressions.
# Every URL found is reported as either "Skip this url" or "Good URL".
use strict;
use warnings;
use HTML::TreeBuilder;

# HTML document to scan.
# NOTE(review): the here-document is empty in this copy of the file; the
# sample markup appears to have been lost -- confirm and restore it.
my $data = <<'EOF';
EOF

# This list of URLs to omit is case-insensitive.
# Keys are stored lower-cased, so every lookup must lower-case the candidate.
my %omit = map { lc($_) => 1 } qw(
    http://www.cnn.com/WEATHER/index.html
    ftp://DEBIAN.SECSUP.ORG/PUB/LINUX/DEBIAN/README
);

print "\nParsing with HTML::TreeBuilder...\n";

my $tree = HTML::TreeBuilder->new;
$tree->parse($data);
$tree->eof();

# extract_links('a') returns a reference to an array of
# [ $url, $element, $attribute, $tag ] records; only the URL is needed here.
for my $link ( @{ $tree->extract_links('a') } ) {
    my ($real_url) = @$link;
    report_url( $real_url, \%omit );
}

# Release the parse tree explicitly, as the original script does.
$tree = $tree->delete;

print "\nParsing with regex...\n";

# NOTE(review): both patterns were empty in this copy ("m{()}ig" and "m{}i").
# An empty capture matches the empty string at every position, and an empty
# pattern silently reuses the last successful pattern, so no URL was ever
# extracted. The patterns below are a reconstruction -- confirm them against
# the original source.
#
# Step 1: collect every opening <a ...> tag that carries an href attribute.
my @tags = ( $data =~ m{(<a\s[^>]*?\bhref\s*=[^>]*>)}ig );

foreach my $url (@tags) {

    # Step 2: pull the (optionally quoted) href value out of the tag.
    # The list assignment is evaluated in scalar context by "or", yielding
    # the number of captures: 0 (false) when the match fails.
    my ($real_url) = ( $url =~ m{\bhref\s*=\s*["']?([^"'\s>]+)}i )
        or die "URL '$url' failed pattern match";

    report_url( $real_url, \%omit );
}

# report_url($url, \%omit)
#
# Prints "Skip this url: ..." when the lower-cased URL is a key of %omit,
# otherwise prints "Good URL: ...".
#
#   $url   - URL string exactly as extracted from the document
#   $omit  - reference to a hash whose keys are lower-cased URLs to skip
#
# Returns nothing.
sub report_url {
    my ( $url, $omit ) = @_;

    if ( $omit->{ lc($url) } ) {
        print "Skip this url: $url\n";
        return;
    }

    print "Good URL: $url\n";
    return;
}