use strict;
use HTML::TreeBuilder 2.97;
use LWP::UserAgent;
# get_headlines($url)
#
# Fetches the page at $url and collects every <a> element that has an
# href attribute and whose only child is a single <b> element.
#
# Returns a list of two-element array refs: [ href, link text ].
# Dies if no (true) URL is passed.  If the HTTP request does not succeed,
# a warning with the status line is emitted and an empty list is returned.
# A warning is also emitted when fewer than 6 matching links are found.
sub get_headlines {
    my ($url) = @_;
    $url or die "What URL?";

    my $agent    = LWP::UserAgent->new;
    my $request  = HTTP::Request->new( GET => $url );
    my $response = $agent->request($request);

    if ( !$response->is_success ) {
        my $status = $response->status_line;
        warn "Couldn't get $url: ", $status, "\n";
        return;
    }

    # Feed the whole document to the parser, then signal end of input.
    my $tree = HTML::TreeBuilder->new();
    $tree->parse( $response->content );
    $tree->eof;

    # Extra criterion for look_down: the element must carry an href and
    # contain exactly one child, which must be a <b> element (a plain
    # text child is not a reference, so it is rejected by the ref test).
    my $wraps_single_bold = sub {
        my ($element) = @_;
        return unless $element->attr('href');
        my @children = $element->content_list;
        return @children == 1
            && ref( $children[0] )
            && $children[0]->tag eq 'b';
    };

    my @out = map { [ $_->attr('href'), $_->as_text ] }
        $tree->look_down( '_tag', 'a', $wraps_single_bold );

    if ( @out < 6 ) {
        warn "Odd, fewer than 6 stories in $url!";
    }

    # Explicitly dispose of the parse tree before handing back the results.
    $tree->delete;
    return @out;
}
# For each news section code, fetch its headline links and print a short
# report: the section name with the number of stories, one line per link
# (href and link text), and a blank line to close the section.
for my $section (qw[tc sc hl wl en]) {
    my $section_url = "http://dailynews.yahoo.com/h/$section/";
    my @links       = get_headlines($section_url);
    my $story_count = @links;

    print $section, ": ", $story_count, " stories\n";
    for my $link (@links) {
        my ( $href, $text ) = ( $link->[0], $link->[1] );
        print " ", $href, " : ", $text, "\n";
    }
    print "\n";
}
####
C:\cygwin64\home\Fred\pages2\hunt>perl lib2.pl
Couldn't get http://dailynews.yahoo.com/h/tc/: 500 Can't connect to dailynews.ya
hoo.com:80 (Bad hostname)
tc: 0 stories
Couldn't get http://dailynews.yahoo.com/h/sc/: 500 Can't connect to dailynews.ya
hoo.com:80 (Bad hostname)
sc: 0 stories
Couldn't get http://dailynews.yahoo.com/h/hl/: 500 Can't connect to dailynews.ya
hoo.com:80 (Bad hostname)
hl: 0 stories
Couldn't get http://dailynews.yahoo.com/h/wl/: 500 Can't connect to dailynews.ya
hoo.com:80 (Bad hostname)
wl: 0 stories
Couldn't get http://dailynews.yahoo.com/h/en/: 500 Can't connect to dailynews.ya
hoo.com:80 (Bad hostname)
en: 0 stories