package WWW::CAKEmusic::News;
use strict;
# Debug
use warnings;
# use diagnostics;
# use LWP::Debug qw( + );
# use Data::Dumper;
use Carp;
use WWW::Mechanize;
use HTML::TokeParser::Simple;
# Where the news lives, and how the generated feeds describe themselves.
use constant {
    URL        => 'http://www.cakemusic.com/news.html',
    FEED_TITLE => 'CAKE News',
    FEED_DESC  => 'Recent news from the band, CAKE.',
};
# Lower-case three-letter month abbreviation => two-digit month number.
# (A date module could do this too, but the table is all that is needed.)
my %months;
@months{ qw( jan feb mar apr may jun jul aug sep oct nov dec ) }
    = map { sprintf '%02d', $_ } 1 .. 12;
# Constructor. Creates an empty object and immediately calls fetch() on it,
# so the object handed back has already gone through fetch().
sub new {
    my ( $class ) = @_;

    my $self = bless {}, $class;
    $self->fetch;

    return $self;
}
# Fetch the news page and hand it over to parse().
#
# The page content is stored in $self->{ _raw }. Croaks with the HTTP status
# line when the request does not succeed. Returns whatever parse() returns.
sub fetch {
    my $self = shift;

    # autocheck is switched off so that a failed request reaches the croak
    # below instead of dying inside get() with Mechanize's own message.
    my $agent = WWW::Mechanize->new( autocheck => 0 );
    $agent->get( URL );
    croak 'Error fetching ' . URL . ': ' . $agent->response->status_line
        unless $agent->success;

    $self->{ _raw } = $agent->content;
    return $self->parse;
}
# Parse the page held in $self->{ _raw } into a list of news items.
#
# Only the region from the first line containing "Newsflash for" up to (not
# including) the first line containing "<DL>" is looked at. Within it, every
# text token containing "Newsflash for" opens a new item, every <li> start
# tag opens a new description for the current item, and any other text is
# appended to the current description.
#
# Stores an array reference in $self->{ items } and returns it. Each element
# is a hash reference with these keys:
#   title        - the heading text exactly as found in the page
#   date         - 'YYYY-MM-DD' taken from the heading, or '' when the
#                  heading does not have the expected shape
#   descriptions - array reference of strings, one per <li>
sub parse {
    my $self = shift;

    my $capturing = 0;
    my $content   = '';
    my $raw       = defined $self->{ _raw } ? $self->{ _raw } : '';

    # remove all irrelevant parts of the content
    # NOTE(review): lines are glued together with no separator, so a word
    # ending one line runs into the next unless the line carries its own
    # whitespace (e.g. a trailing CR) - confirm against the live page.
    for my $line ( split /\n/, $raw ) {
        $capturing = 1 if not $capturing and $line =~ /Newsflash for/;
        last if $line =~ /<DL>/;
        next if not $capturing or $line =~ /^$/;
        ( my $squeezed = $line ) =~ s/\s+/ /g;
        $content .= $squeezed;
    }

    my @items;

    # Nothing captured means nothing to tokenise.
    if ( length $content ) {
        my $parser = HTML::TokeParser::Simple->new( \$content );

        # this will make sure text tokens won't be split
        $parser->unbroken_text( 1 );

        while ( my $token = $parser->get_token ) {
            if ( $token->is_text ) {
                my $text = $token->as_is;
                next unless $text =~ /\S/;

                # new day of news
                if ( $text =~ /Newsflash for/ ) {
                    push @items, {
                        title        => $text,
                        date         => _heading_date( $text ),
                        descriptions => [],
                    };
                }
                # just plain text; it is dropped when no <li> has been opened
                # yet, because writing to element -1 of an empty array is fatal
                elsif ( @items and @{ $items[ -1 ]{ descriptions } } ) {
                    $items[ -1 ]{ descriptions }[ -1 ] .= $text;
                }
            }
            # each news item is a list-item
            elsif ( $token->is_start_tag( 'li' ) ) {
                # a <li> ahead of the first heading has no item to belong to
                push @{ $items[ -1 ]{ descriptions } }, '' if @items;
            }
        }
    }

    $self->{ items } = \@items;
    return $self->{ items };
}

# Turn a heading such as "Newsflash for March 3, 2005" into '2005-03-03'.
# Warns and returns '' when the heading does not match the expected shape or
# starts with an unknown month name.
sub _heading_date {
    my ( $heading ) = @_;

    ( my $when = $heading ) =~ s/^Newsflash for (?:Week of)?\s*//i;

    # month word, day number, one more character, then anything ending in a
    # four-character year
    if ( $when =~ /^(...)\S* (\d+). .*(.{4})$/ ) {
        my ( $month, $day, $year ) = ( lc( $1 ), $2, $3 );
        return join( '-', $year, $months{ $month }, sprintf( '%02d', $day ) )
            if exists $months{ $month };
    }

    carp "Cannot derive a date from news heading '$heading'";
    return '';
}
# Render the parsed news as an RSS feed, using XML::RSS.
#
# Every description of every item in $self->{ items } becomes its own feed
# entry. Entries that come from the same item share its title and date; their
# links are kept apart by appending "#<date>-<n>" to the page URL, where n
# counts the item's descriptions starting at 1.
#
# Returns the feed as a string.
sub as_rss {
    my $self = shift;

    # loaded on demand so that XML::RSS is only needed for RSS output
    require XML::RSS;

    my $feed = XML::RSS->new;
    $feed->channel(
        title       => FEED_TITLE,
        link        => URL,
        description => FEED_DESC,
    );

    for my $item ( @{ $self->{ items } } ) {
        for my $index ( 1 .. scalar @{ $item->{ descriptions } } ) {
            $feed->add_item(
                title       => $item->{ title },
                link        => URL . '#' . $item->{ date } . '-' . $index,
                description => $item->{ descriptions }->[ $index - 1 ],
                dc          => { date => $item->{ date } },
            );
        }
    }

    return $feed->as_string;
}
# Render the parsed news as an Atom feed, using XML::Atom.
#
# One entry is added per description; its title is the title of the item the
# description belongs to and its content is the description text.
#
# Returns the feed as XML.
sub as_atom {
    my ( $self ) = @_;

    require XML::Atom::Feed;
    require XML::Atom::Entry;

    my $atom = XML::Atom::Feed->new;
    $atom->title( FEED_TITLE );

    foreach my $day ( @{ $self->{ items } } ) {
        my $heading = $day->{ title };

        foreach my $text ( @{ $day->{ descriptions } } ) {
            my $entry = XML::Atom::Entry->new;
            $entry->title( $heading );
            $entry->content( $text );
            $atom->add_entry( $entry );
        }
    }

    return $atom->as_xml;
}
package main;

# Command-line entry point. The first argument names the output format and is
# mapped onto the as_<format> method of WWW::CAKEmusic::News.
my $format = defined $ARGV[ 0 ] ? $ARGV[ 0 ] : '';
my $output = "as_$format";

# Reject a missing or unknown format before the constructor runs, because
# the constructor already downloads the news page.
die "Usage: $0 rss|atom\n"
    unless $format =~ /\A\w+\z/ and WWW::CAKEmusic::News->can( $output );

my $news = WWW::CAKEmusic::News->new;
print $news->$output;

=head1 NAME

cakenews - grab the latest CAKE news in Atom or RSS format

=head1 SYNOPSIS

    cakenews.pl rss > cake.rss
    cakenews.pl atom > cake.atom

=head1 DESCRIPTION

This script grabs the contents of CAKE's news page, parses the entries, and
outputs them as either an RSS feed or an Atom feed.

=head1 NOTICE

Please do not abuse CAKE's server with this script. Consider using
WWW::Mechanize::Cached if you want to use this on a regular basis.

=cut