#!/usr/bin/perl use strict; use warnings; ############################################################################# # Takes rss files from across the internet and sticks them into the database. # Best ran from cron ############################################################################# if( -f '/var/run/retrieve_rss.pid'){ system('kill -9 `cat /var/run/retrieve_rss.pid`'); system('rm /var/run/retrieve_rss.pid'); } open(PIDFILE,'>/var/run/retrieve_rss.pid'); print PIDFILE $$,"\n"; close(PIDFILE); my $DEBUG = defined $ARGV[0] ? $ARGV[0] : 0; ### initial setup use LWP::Simple; use XML::RSS; use DBI; my $dbh = DBI->connect('DBI:Pg:dbname=DBNAME','DBUSER','DBPASS',{AutoCommit => '0'}); my $sth = $dbh->prepare('select sid,url from rsssites where active is true'); $sth->execute(); while(my ($site_id,$site_url) = $sth->fetchrow_array()){ eval 'get_links($site_id,$site_url)'; print $@,"\n" if $@; } ### disconnect $sth->finish(); $dbh->disconnect; ### done unlink '/var/run/retrieve_rss.pid'; 1; ### this is for rss sites sub get_links { my ($id,$url) = @_; ### DEBUG print "Getting links for $url\n" if $DEBUG; my $document = get($url) || return; # clean the string (this fixes some broken rss) $document =~ s/\015\012?/\012/g || 1; $document =~ s/&(?!(?:[a-zA-Z0-9]+|#\d+);)/&/g || 1; # parse a string my $rss = new XML::RSS(Style => 'Debug') || return; $rss->parse($document) || return; # clear out the db, check for a failure, rollback, and move on... unless( clear_db($id) ){ $dbh->rollback; return; } foreach my $item (@{$rss->{'items'}}) { my ($title,$link); $title = $item->{'title'}; $link = $item->{'link'}; chomp($title,$link); ### remove unsightly site specific links next if ($title =~ /Customize this feed/i); ### stick it into the database my $sth = $dbh->prepare('insert into rsscontent (cid,title,url) values (?,?,?)'); $sth->execute($id,$title,$link); $sth->finish(); # check to see if an error has been raised and rollback if true if($dbh->errstr){ $dbh->rollback; print "Rolling back line [$title][$link]: $dbh->errstr\n" if $DEBUG; return; } } # check to see if an error has been raised... # if so, rollback, if not, commit unless($dbh->errstr){ print "Committing for $url\n" if $DEBUG; $dbh->commit; }else{ print "Rolling back $url: $dbh->errstr\n" if $DEBUG; $dbh->rollback; } return; } sub clear_db { my ($sid) = @_; if( defined($sid) ){ $dbh->do("delete from rsscontent where cid = $sid"); unless($DBI::errstr){ print "Successfully cleared content for $sid\n" if $DEBUG; return 1; }else{ print "Failed to clear content for $sid: $DBI::errstr\n" if $DEBUG; return 0; } } return 0; }