#!/usr/bin/env perl
# sf-digest.pl by Tim Rayner, 2002
# (timrayner@btinternet.com)
# This code may be modified and distributed on
# the same terms as the Perl source code.
#
# Script to periodically query a web page, extract
# a table and identify new rows in that table.
# New results are cached in $savefile; if a second
# file ($archivefile) has not been modified
# within the time $cacheperiod, the new table rows
# (i.e. those in $savefile but not in
# $archivefile) are mailed to the $mailto email address.
# The results are then written out to
# $archivefile to be omitted from future emails.
#
# The reason for this convoluted approach is that we
# aim to capture table rows which may not
# be present on the web page for very long (in some
# cases, only a matter of hours). However,
# we want to avoid sending emails more than once a
# day (change $cacheperiod to alter this
# behaviour). We also want to maintain a persistent
# cache of results to overcome difficulties
# connecting to the web page (originally
# sourceforge.net, hardly a paragon of reliability).
#
# Use the command 'sf-digest.pl now' (as opposed to
# simply 'sf-digest.pl') to override
# the result cache mechanism and mail all the
# current results now.
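#
# The script is intended to be run from a scheduler such as
# cron. A possible crontab entry (hypothetical path; adjust to
# your installation) would be:
#
#   0 * * * *  /home/user/bin/sf-digest.pl
#
# Running it hourly lets the cache pick up short-lived rows,
# while the $cacheperiod check still limits mail to one digest
# per day.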
#
use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;
use HTML::TableExtract;
use MIME::Lite;
#############################################
############ User Config Section ############
#############################################
#
# Address to send mail to. You will want to change this.
#
my $mailto='user@host';
#
# Address to send mail from. Ensure that your SMTP
# server or local MTA will accept this value.
#
my $mailfrom='user@host';
#
# Subject line of the sent email:
#
my $mailsubject='SourceForge Update';
#
# Table columns to extract (in order that they will
# appear in the email).
# N.B. Keep 'Request ID' as column 1; changing this
# will break the script.
#
my $tableheaders=['Request ID','Date','Summary'];
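#
# For illustration only, a hypothetical extra column could be
# appended after the mandatory 'Request ID' column, e.g.:
#   my $tableheaders=['Request ID','Date','Summary','Assigned To'];
# (the names must match the column headers on the tracker page).
#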
#
# Lines at start and end of the email body text:
#
my $mailbodyhead="<html>Here are today's new SourceForge requests:<br><br>".
    "Follow the link to see the full description or to add comments to an item. <br>".
    "You can use the monitor option to receive any comments added to an item.<br>";
my $mailbodytail="<br>You can also go to the SourceForge GO Curator Requests Tracker<br>".
    " to see the complete list of submissions:<br>".
    qq!<a href="http://sourceforge.net/tracker/?atid=440764&group_id=36855&func=browse">http://sourceforge.net/tracker/?atid=440764&group_id=36855&func=browse</a><br><br>!.
    "Signed,<br><br>the sf-digest daemon.</html>";
#
# Save files used to store the most recently downloaded data
# ($savefile) and the results already mailed out ($archivefile).
# You will probably want to change these paths to suit your setup.
#
my $savefile='/home/user/.sf-digest-latest.txt';
my $archivefile='/home/user/.sf-digest-archive.txt';
#
# SMTP server via which to send mail. If left empty,
# the local sendmail command is used instead.
#
my $smtp_server='';
#
# Period for which results are cached prior to emailing
# them (in seconds). Initially set to 1 day minus 15
# minutes (85500 seconds)
#
my $cacheperiod=(1*24*60*60)-(15*60);
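#
# For example, a twice-daily digest could use roughly half a day:
#   my $cacheperiod=(12*60*60)-(15*60);
#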
#####################################################
### URL vars - You shouldn't need to touch these, ###
### unless SourceForge does. ###
#####################################################
#
#
# Web page to check (a row offset, incremented in steps
# of $delta, is appended to this URL later)
#
my $url = 'http://sourceforge.net/tracker/index.php?func=browse&group_id=36855&atid=440764&offset=';
#
# Offset between pages
#
my $delta = 50;
#
# Total limit on number of request IDs to download
# from the web page. This is a safety feature, and
# as such should not need changing. Change this if
# the project ever balloons out of control :-)
#
my $limit=1000;
#
# HTML tags embedded in the email:
# $idurl = $idurl_start.'<request ID>'.$idurl_end.'<request ID>'."</a>";
# (see below).
#
# - part one of requestID URL:
my $idurl_start='<a href="http://sourceforge.net/tracker/index.php?func=detail&aid=';
#
# - second part of requestID URL:
my $idurl_end='&group_id=36855&atid=440764">';
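#
# Assembled link for a hypothetical request ID 612345:
#   <a href="http://sourceforge.net/tracker/index.php?func=detail&aid=612345&group_id=36855&atid=440764">612345</a>
#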
#############################################
########## End User Config Section ##########
#############################################
sub gettable{
# download the table data, return a hashref
# with column 1 as key and the other columns
# as values, joined in a tab-delimited string
my $url=shift();
my $delta=shift();
my $limit=shift();
my $tableheaders=shift();
my %results;
    # Page through the results, capping the offset at $limit
    # to prevent an endless loop
OFFSET: for (my $offset=0;$offset<$limit;$offset=$offset+$delta){
my $pageurl=$url.$offset;
my $ua = LWP::UserAgent->new(timeout => 10);
my $request = HTTP::Request->new('GET',$pageurl);
my $response = $ua->request($request);
if ($response->is_success){
            my $te = HTML::TableExtract->new( headers => $tableheaders );
            $te->parse($response->content);
            last OFFSET unless $te->table_states; # no more tables to parse
foreach my $ts ($te->table_states) {
foreach my $rowref ($ts->rows) {
my @row=@{$rowref};
                    # Skip filler rows: cells holding a single
                    # character and the '<-- Previous 50'
                    # navigation link (SourceForge-specific)
                    next if (($row[0]=~ /^\S$/) || ($row[0]=~ /\<-- Previous 50/));
# Format data and push into %results
                    my $idurl=$idurl_start.$row[0].$idurl_end.$row[0]."</a>";
                    $results{$row[0]}= join("\t", $idurl, @row[1..$#row]);
}
}
} else {
print "Error: ".$response->status_line."\n";
last OFFSET;
}
}
return \%results;
}
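#
# gettable() returns a hashref keyed on request ID, e.g.
# (hypothetical data):
#   '612345' => '<a href="...aid=612345...">612345</a>' . "\t2002-09-30\tExample summary"
#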
sub readfile{
# Read in the old results file,
# return old results hashref
my $file=shift;
my %oldresults;
    open (SAVEFILE,"<$file") or do {
        warn ("No save file \'$file\' found; it will be created when results are written.\n");
        return undef;
    };
    while (my $line=<SAVEFILE>){
        chomp $line;
        # Each line is "<request ID>\t<tab-joined row data>"
        $oldresults{$1}=$2 if $line=~/(\w*)\t(.*)/;
    }
    close SAVEFILE;
    return \%oldresults;
}
sub writefile{
# write new results to save file
my $file=shift();
my %results=%{shift()};
    open (SAVEFILE,">$file") or die ("Could not open save file for writing: $!\n");
    foreach my $key (sort keys %results){
        print SAVEFILE ("$key\t$results{$key}\n");
    }
    close SAVEFILE;
}
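#
# Save-file format: one line per request,
# "<request ID>\t<tab-joined row data>", exactly as stored
# in the results hash built by gettable().
#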
############
### Main ###
############
# Set the cache period to zero if we're called with
# the 'now' directive (i.e. 'sf-digest.pl now')
if ($ARGV[0] && ($ARGV[0] eq 'now')){$cacheperiod = 0;}
# Get old and new table data; overwrite old save
# file with new data
my $resref=&readfile($savefile);
my %allresults = $resref ? %{$resref} : ();
my %newresults=%{&gettable($url,$delta,$limit,$tableheaders)};
# Merge the hashes to prevent false positive
# upon SourceForge timeouts,
# write everything out to the save file
@allresults{keys %newresults} = values %newresults;
&writefile($savefile,\%allresults);
# We can either quit now or send the new message.
# If the archive file is older than $cacheperiod (by default
# 1 day minus 15 minutes), or if the archive file does not
# exist (i.e. first run), we send the message.
if ((! -f $archivefile) || (((stat($archivefile))[9]) <= (time-$cacheperiod))){
# Read in the archive file
my $archiveref=&readfile($archivefile);
    my %archiveresults = $archiveref ? %{$archiveref} : ();
# Construct main mail body text;
#omit entries found in the archived table data
my @mailbody;
foreach my $id (sort keys %allresults){
        # Strip out non-ASCII characters
        # (certain mail reader programs prefer this)
        $allresults{$id}=~ s/[^[:ascii:]]//g;
        push (@mailbody, "$allresults{$id}\n") unless $archiveresults{$id};
}
# Construct the rest of the mail and send it
if (@mailbody){ # Don't send if there are no changes
# Finish off the mail
unshift (@mailbody, $mailbodyhead);
push (@mailbody, $mailbodytail);
my $body= join("<br>", @mailbody);
my $mail=MIME::Lite->new(
From => $mailfrom,
To => $mailto,
Subject => $mailsubject,
Type => 'text/html',
Encoding => 'quoted-printable',
Data => $body,
);
if ($smtp_server){ # Finally, send the mail
$mail->send('smtp',$smtp_server);
}else{
$mail->send();
}
}
# Merge all data and spew it into the archive
@archiveresults{keys %allresults} = values %allresults;
&writefile($archivefile,\%archiveresults);
}