I have done something similar with a cron job that checks a database table for sites to cache. It then fills a second table with the cached content. I don't know how useful it is, but I had a good time doing it and maybe it will be helpful/interesting to you.
It is broken down into three parts:
- db layout
- caching script
- a frontend rss generator for rss links
When called from a browser, the RSS frontend outputs an RSS feed describing the cached RSS sites and linking to each of them (RSS serving RSS links... I always thought that would have an interesting application, but I've yet to really use it).
The DB layout
rsssites:

 Column  |          Type          |                         Modifiers
---------+------------------------+------------------------------------------------------------
 sid     | integer                | not null default nextval('public.rsssites_sid_seq'::text)
 title   | character varying(255) | not null
 url     | character varying(255) | not null
 active  | boolean                |
 baseurl | character varying(255) |
Indexes: rsssites_pkey primary key btree (sid),
         rsssites_sid_key unique btree (sid)
rsscontent:

 Column | Type                   | Modifiers
--------+------------------------+-----------
 cid    | integer                |
 title  | character varying(255) | not null
 url    | character varying(255) | not null
Foreign Key constraints: rss_site FOREIGN KEY (cid) REFERENCES rsssites(sid)
                         ON UPDATE CASCADE ON DELETE CASCADE
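If you want to recreate the tables, a one-off setup script along these lines should be close enough (just a sketch done through DBI; DBNAME/DBUSER/DBPASS are placeholders as in the scripts below, the redundant unique index on sid is omitted, and the insert at the end only shows how a feed gets registered, with made-up values):

#!/usr/bin/perl
# One-off setup sketch: creates the two tables described above and
# registers an example feed to cache. Example values only.
use strict;
use warnings;
use DBI;
my $dbh = DBI->connect('DBI:Pg:dbname=DBNAME','DBUSER','DBPASS',{RaiseError => 1});
# serial gives us the rsssites_sid_seq sequence and the not null default
$dbh->do(q{
    create table rsssites (
        sid     serial primary key,
        title   varchar(255) not null,
        url     varchar(255) not null,
        active  boolean,
        baseurl varchar(255)
    )
});
$dbh->do(q{
    create table rsscontent (
        cid    integer constraint rss_site references rsssites(sid)
                   on update cascade on delete cascade,
        title  varchar(255) not null,
        url    varchar(255) not null
    )
});
# register a feed for the caching script to pick up (example values)
$dbh->do('insert into rsssites (title,url,active,baseurl) values (?,?,?,?)',
    undef, 'Example feed', 'http://www.example.com/index.rss', 'true', 'http://www.example.com/');
$dbh->disconnect;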
The Caching script
#!/usr/bin/perl
use strict;
use warnings;
#############################################################################
# Takes rss files from across the internet and sticks them into the database.
# Best run from cron.
#############################################################################
# if a previous run is still hanging around, kill it and clean up its pid file
if( -f '/var/run/retrieve_rss.pid' ){
system('kill -9 `cat /var/run/retrieve_rss.pid`');
unlink '/var/run/retrieve_rss.pid';
}
open(my $pidfile, '>', '/var/run/retrieve_rss.pid') or die "Cannot write pid file: $!";
print {$pidfile} $$,"\n";
close($pidfile);
my $DEBUG = defined $ARGV[0] ? $ARGV[0] : 0;
### initial setup
use LWP::Simple;
use XML::RSS;
use DBI;
my $dbh = DBI->connect('DBI:Pg:dbname=DBNAME','DBUSER','DBPASS',{AutoCommit => 0});
my $sth = $dbh->prepare('select sid,url from rsssites where active is true');
$sth->execute();
while(my ($site_id,$site_url) = $sth->fetchrow_array()){
eval { get_links($site_id,$site_url) };
print $@,"\n" if $@;
}
### disconnect
$sth->finish();
$dbh->disconnect;
### done
unlink '/var/run/retrieve_rss.pid';
1;
### this is for rss sites
sub get_links {
my ($id,$url) = @_;
### DEBUG
print "Getting links for $url\n" if $DEBUG;
my $document = get($url) || return;
# clean the string (this fixes some broken rss)
$document =~ s/\015\012?/\012/g;
$document =~ s/&(?!(?:[a-zA-Z0-9]+|#\d+);)/&amp;/g;
# parse a string
my $rss = new XML::RSS(Style => 'Debug') || return;
$rss->parse($document) || return;
# clear out the db, check for a failure, rollback, and move on...
unless( clear_db($id) ){
$dbh->rollback;
return;
}
# prepare the insert statement once rather than on every pass through the loop
my $insert = $dbh->prepare('insert into rsscontent (cid,title,url) values (?,?,?)');
foreach my $item (@{$rss->{'items'}}) {
my $title = $item->{'title'};
my $link  = $item->{'link'};
chomp($title,$link);
### remove unsightly site specific links
next if ($title =~ /Customize this feed/i);
### stick it into the database
$insert->execute($id,$title,$link);
# check to see if an error has been raised and rollback if true
if($dbh->errstr){
$dbh->rollback;
print "Rolling back line [$title][$link]: ",$dbh->errstr,"\n" if $DEBUG;
return;
}
}
$insert->finish();
# check to see if an error has been raised...
# if so, rollback, if not, commit
unless($dbh->errstr){
print "Committing for $url\n" if $DEBUG;
$dbh->commit;
}else{
print "Rolling back $url: $dbh->errstr\n" if $DEBUG;
$dbh->rollback;
}
return;
}
sub clear_db {
my ($sid) = @_;
if( defined($sid) ){
$dbh->do("delete from rsscontent where cid = $sid");
unless($DBI::errstr){
print "Successfully cleared content for $sid\n" if $DEBUG;
return 1;
}else{
print "Failed to clear content for $sid: $DBI::errstr\n" i
+f $DEBUG;
return 0;
}
}
return 0;
}
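As the comment says, it really is best run from cron. A crontab entry along these lines (the path and schedule are only examples) refreshes all of the active feeds every half hour and throws away the output:

# refresh the cached feeds every 30 minutes
*/30 * * * * /usr/local/bin/retrieve_rss.pl >/dev/null 2>&1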
RSS frontend
#!/usr/bin/perl
use strict;
use warnings;
use DBI;
use XML::RSS;
use CGI qw(:standard);
my $cgi = new CGI();
my $dbh = DBI->connect('DBI:Pg:dbname=DBNAME','DBUSER','DBPASS');
unless( $dbh ){
print $cgi->header(),$cgi->start_html('Oops'),$cgi->h1('We have a problem'),$cgi->end_html();
exit;
}
my $rss = new XML::RSS();
if(defined( $cgi->param('site') ) ){
# bind the site id rather than interpolating it straight into the SQL
my $site_data = $dbh->selectrow_hashref('select title,baseurl from rsssites where sid = ?', undef, $cgi->param('site') );
$rss->channel(
title => $site_data->{title},
link => $site_data->{baseurl},
description => $site_data->{title}
);
my $query = 'select title,url from rsscontent where cid = ?';
my $sth = $dbh->prepare($query);
$sth->execute( $cgi->param('site') );
while (my ($title,$link) = $sth->fetchrow_array()){
$title = $cgi->escapeHTML($title);
$rss->add_item(
title => $title,
link => $link,
);
}
}else{ ### no site param
$rss->channel(
title => 'RSS caching system',
link => 'http://www.localhost/cgi-bin/rss',
description => 'RSS interface to cached news',
);
$rss->image(
title => 'Localhost',
url => 'http://www.localhost/images/favicon.png',
link => 'http://www.localhost',
);
$rss->textinput(
title => 'Localhost search',
description => 'Use the text input below to search Localhost',
name => 'search_term',
link => 'http://www.localhost/search'
);
my $query = 'select sid,title from rsssites';
my $sth = $dbh->prepare($query);
$sth->execute();
while (my ($sid,$title) = $sth->fetchrow_array()){
$rss->add_item(
title => $title,
link => "http://www.localhost/cgi-bin/rss?site=$sid"
);
}
}
$dbh->disconnect();
print $cgi->header( -type=>'text/xml' );
print $rss->as_string;
1;
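To show the round trip, here is a quick sketch of a client for the frontend using the same modules as above; it fetches the top-level feed (no site param) and prints each cached site with its per-site feed URL. The URL is just the placeholder from the script, so point it wherever the CGI actually lives:

#!/usr/bin/perl
# List the cached sites advertised by the RSS frontend.
use strict;
use warnings;
use LWP::Simple;
use XML::RSS;
my $xml = get('http://www.localhost/cgi-bin/rss') or die "Could not fetch the feed\n";
my $rss = XML::RSS->new();
$rss->parse($xml);
foreach my $item (@{$rss->{'items'}}) {
    print "$item->{'title'}\t$item->{'link'}\n";
}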
cp
----
"Never be afraid to try something new. Remember, amateurs built the ark. Professionals built the Titanic." |