Beefy Boxes and Bandwidth Generously Provided by pair Networks
Problems? Is your data what you think it is?
 
PerlMonks  

cakenews.pl - Syndicate CAKE news

by LTjake (Prior)
on Dec 24, 2003 at 03:49 UTC ( [id://316780]=sourcecode: print w/replies, xml ) Need Help??
Category: Web Stuff
Author/Contact Info /msg LTjake
Description: This script grabs the contents of CAKE's news page, parses the entries, and outputs them as either an RSS feed or an Atom feed.
package WWW::CAKEmusic::News;

use strict;

# Debug
use warnings;
# use diagnostics;
# use LWP::Debug qw( + );
# use Data::Dumper;

use Carp;

use WWW::Mechanize;
use HTML::TokeParser::Simple;

# Address of the scraped news page, plus the title and description
# used for the generated feed.
use constant {
    URL        => 'http://www.cakemusic.com/news.html',
    FEED_TITLE => 'CAKE News',
    FEED_DESC  => 'Recent news from the band, CAKE.',
};

# Lower-case three-letter month abbreviation => zero-padded month number.
# (A date module could do this too; a small lookup table is enough here.)
my %months = do {
    my $number = 0;
    map { $_ => sprintf( '%02d', ++$number ) }
        qw( jan feb mar apr may jun jul aug sep oct nov dec );
};

# Constructor: creates an empty object blessed into the requested class
# and immediately calls fetch() on it before handing it back.
sub new {
    my ( $class ) = @_;

    my $self = bless {}, $class;
    $self->fetch;

    return $self;
}

# Fetch the news page.
#
# Requests URL with WWW::Mechanize, stores the response body in
# $self->{ _raw } and then calls parse(). Croaks with the HTTP status
# line when the request is not successful.
sub fetch {
    my $self = shift;

    # autocheck is turned off explicitly so that a failed request reaches
    # the croak below (with our own message) instead of dying inside get().
    my $agent = WWW::Mechanize->new( autocheck => 0 );

    $agent->get( URL );

    croak 'Error fetching ' . URL . ': ' . $agent->response->status_line
        unless $agent->success;

    $self->{ _raw } = $agent->content;
    $self->parse;
}

# Parse the data.
#
# Reads the raw page from $self->{ _raw } and stores an array reference in
# $self->{ items }. Each element is a hash with:
#   title        - the text of a "Newsflash for ..." headline
#   date         - YYYY-MM-DD derived from the headline; left unset when the
#                  headline does not match the expected date pattern
#   descriptions - array ref with one string per <li> that follows the headline
sub parse {
    my $self    = shift;
    my $raw     = defined $self->{ _raw } ? $self->{ _raw } : '';
    my $capture = 0;
    my $content = '';

    # remove all irrelevant parts of the content: keep everything from the
    # first "Newsflash for" line up to (not including) the first <DL> line
    foreach my $line ( split( /\n/, $raw ) ) {
        $capture++ if $line =~ /Newsflash for/ and not $capture;
        last if $line =~ /<DL>/;
        next if not $capture or $line =~ /^$/;
        $line =~ s/\s+/ /g;
        $content .= $line;
    }

    # nothing to parse: no headline was found on the page
    return $self->{ items } = [] unless length $content;

    my $parser = HTML::TokeParser::Simple->new( \$content );

    # this will make sure text tokens won't be split
    $parser->unbroken_text(1);

    my @items;

    while( my $token = $parser->get_token ) {

        # a lexical is used rather than $_ so the caller's $_ is left alone
        my $text = $token->as_is;

        next unless $text =~ /\S/;

        if( $token->is_text ) {

            # new day of news
            if( $text =~ /Newsflash for/ ) {
                my %item = ( title => $text, descriptions => [] );

                # reduce the headline to its date part, e.g. "Dec 20, 2003"
                ( my $when = $text ) =~ s/^\s*Newsflash for (?:Week of)?\s*//i;
                $when =~ s/\s+$//;

                # month prefix, day number, and the last four characters
                # (the year); only trusted when the whole pattern matched
                my ( $month, $day, $year )
                    = $when =~ /^(...)\S* (\d+). .*(.{4})$/;

                if( defined $month and exists $months{ lc $month } ) {
                    $item{ date } = join( '-',
                        $year, $months{ lc $month }, sprintf( '%02d', $day ) );
                }

                push @items, \%item;
            }

            # just plain text
            else {

                # text seen before any headline or before any <li> has no
                # description to be appended to; indexing [ -1 ] on an empty
                # array would be a fatal error, so such text is skipped
                next unless @items and @{ $items[ -1 ]->{ descriptions } };

                $items[ -1 ]->{ descriptions }->[ -1 ] .= $text;
            }
        }

        # each news item is a list-item
        elsif( $token->is_start_tag( 'li' ) ) {

            # a list item before the first headline belongs to no day
            next unless @items;

            push @{ $items[ -1 ]->{ descriptions } }, '';
        }
    }

    $self->{ items } = \@items;
}

# use XML::RSS to make an RSS feed
#
# Emits one RSS item per entry in each parsed item's descriptions list. All
# entries of the same day share that day's title and dc:date; the link is
# URL plus a "#<date>-<n>" fragment, where <n> is the 1-based position of
# the entry within its day. Returns the feed as a string.
sub as_rss {
    my $self = shift;

    require XML::RSS;
    my $feed = XML::RSS->new;

    $feed->channel(
        title       => FEED_TITLE,
        link        => URL,
        description => FEED_DESC
    );

    for my $item ( @{ $self->{ items } } ) {
        my $descriptions = $item->{ descriptions };

        # an item without a usable date still gets a link, just without
        # the date part, and no dc:date element is requested for it
        my $has_date = defined $item->{ date };
        my $date     = $has_date ? $item->{ date } : '';

        for my $index ( 1 .. scalar @{ $descriptions } ) {
            my %entry = (
                title       => $item->{ title },
                link        => URL . '#' . $date . '-' . $index,
                description => $descriptions->[ $index - 1 ],
            );
            $entry{ dc } = { date => $date } if $has_date;

            $feed->add_item( %entry );
        }
    }

    return $feed->as_string;
}

# use XML::Atom to make an Atom feed
#
# Builds a feed titled FEED_TITLE with one entry per description; each
# entry's title is the headline of the day it belongs to and its content
# is the description text. Returns the feed serialised with as_xml.
sub as_atom {
    my ( $self ) = @_;

    require XML::Atom::Feed;
    require XML::Atom::Entry;

    my $atom = XML::Atom::Feed->new;
    $atom->title( FEED_TITLE );

    foreach my $day ( @{ $self->{ items } } ) {
        my $headline = $day->{ title };

        foreach my $body ( @{ $day->{ descriptions } } ) {
            my $entry = XML::Atom::Entry->new;

            $entry->title( $headline );
            $entry->content( $body );

            $atom->add_entry( $entry );
        }
    }

    return $atom->as_xml;
}

package main;

# Command-line entry point. The first argument selects the output format
# and is mapped onto the as_<format> method of WWW::CAKEmusic::News
# (as_rss or as_atom).
my $format = defined $ARGV[ 0 ] ? $ARGV[ 0 ] : '';
my $output = "as_$format";

# Validate the format before constructing the object: new() downloads the
# news page right away, which would be wasted on a request we cannot serve.
die "Usage: $0 rss|atom\n"
    unless $format =~ /\A\w+\z/ and WWW::CAKEmusic::News->can( $output );

my $news = WWW::CAKEmusic::News->new;

print $news->$output;

=head1 NAME

cakenews - grab the latest CAKE news in Atom or RSS format

=head1 SYNOPSIS

    cakenews.pl rss > cake.rss
    cakenews.pl atom > cake.atom

=head1 DESCRIPTION

This script grabs the contents of CAKE's news page, parses the entries,
and outputs them as either an RSS feed or an Atom feed.

=head1 NOTICE

Please do not abuse CAKE's server with this script. Consider using
WWW::Mechanize::Cached if you want to use this on a regular basis.

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://316780]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others having an uproarious good time at the Monastery: (3)
As of 2024-04-19 05:35 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found