#!perl
use strict;
use HTML::TreeBuilder 2.97;
use LWP::UserAgent;
# get_headlines($url)
#
# Fetches the page at $url and collects every <a> element whose "class"
# attribute matches /title/.
#
# Returns a list of [ href, link_text ] array refs, one per matching link.
# If the HTTP request fails, warns with the status line and returns an
# empty list. Dies with "What URL?" when no (true) URL is passed.
# Also warns when fewer than 6 matching links were found on the page.
sub get_headlines {
    my $url = $_[0] || die "What URL?";

    my $response = LWP::UserAgent->new->request(
        HTTP::Request->new( GET => $url )
    );
    unless ($response->is_success) {
        warn "Couldn't get $url: ", $response->status_line, "\n";
        return;
    }

    # Parse characters rather than raw bytes: the link text is later written
    # through a UTF-8 encoding layer, so feeding undecoded bytes to the
    # parser would double-encode every non-ASCII headline.
    my $html = $response->decoded_content;
    # decoded_content returns undef when decoding fails; fall back to the
    # raw body so we still make a best-effort attempt at parsing.
    $html = $response->content unless defined $html;

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($html);
    $tree->eof;

    my @out;
    foreach my $link (
        $tree->look_down(
            '_tag', 'a',
            sub {
                # Anchors without a class attribute yield undef here.
                my $class = $_[0]->attr('class');
                return( defined($class) && $class =~ /title/ );
            }
        )
    ) {
        push @out, [ $link->attr('href'), $link->as_text ];
    }

    warn "Odd, fewer than 6 stories in $url!" if @out < 6;

    # Explicitly free the tree once we have extracted what we need.
    $tree->delete;
    return @out;
}
# Driver: for each news section, fetch its headlines and append a summary
# line plus one line per headline to yahoo.txt (written as UTF-8).
my $out_file = 'yahoo.txt';
open my $out, '>:encoding(UTF-8)', $out_file
    or die "Can't open $out_file for writing: $!";

foreach my $section (qw[tech science health world entertainment]) {
    my @links = get_headlines(
        "https://uk.news.yahoo.com/$section/"
    );

    print {$out} $section, ": ", scalar(@links), " stories\n";
    foreach my $link (@links) {
        # Each entry is [ href, text ]; only the headline text is written.
        print {$out} " ", $link->[1], "\n";
    }
    print {$out} "\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close $out or die "Can't close $out_file: $!";