Well, this is a VERY localized script - but I thought it was cool, it seems to be working quite well, and I am a Palm user who does not surf with it. This script grabs the daily headlines from www2.alberta.com/news and (via cron) updates my Palm Pilot with the file before I leave.
I am ready to be critiqued :-)
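For the curious, the crontab entry that kicks it off looks something like this (abnews.pl is just what I happen to call the script - adjust the path and name to taste):

0 4 * * * /home/slycer/abnews.pl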
#!/usr/bin/perl -w
###########################################
# This will hopefully become a full-      #
# fledged perl script that downloads      #
# news from alberta.com and converts it   #
# to txt or some other format readable    #
# by Palms.  The idea is to run this      #
# nightly using cron, and schedule syncs  #
# with the Palm Pilot.                    #
#  - Slycer 10/19/2000                    #
###########################################
use strict;
use LWP::Simple;
my $content;                        #the content of the alberta.com index
my $workingdir = "/home/slycer";    #the dir to write all the files to
my $url = "www2.alberta.com/news";  #the url of alberta.com news
my $outlist = "abcom.htm";          #the outfile for the above
my $outhtm = "news.htm";            #the outfile for all the urls below
my (@urls,                          #the list of good urls
    @end_ref,                       #the ending line of the stories
    @start_ref);                    #the starting line of the stories
my %storieshash;                    #the hash that contains the text
my $nfile = "news.txt";             #the news file that we read/write to (plain text)
my $lines = 83;                     #the amount of CRAP between stories
my $conprog = "/usr/bin/txttopdb";  #the script to convert to pdb format
my $final = "stripped.txt";         #the final file to convert
my $pdbfile = "news.pdb";           #the name of the pdb file
my $title = "DailyNews";            #the Title (as displayed in the Palm)
chdir $workingdir or die "Could not chdir to $workingdir: $!";
#####First go get the index file from alberta.com######
unless (defined ($content = get "http://$url")) {
    die "Unable to access abcom website";   #LWP::Simple's get doesn't set $!
}
open (OUT, ">$outlist") or die "Could not create $outlist: $!";
print OUT $content;
close OUT;
#####Make the URLs usable#####
open (CLGR, "<$outlist") or die "Could not open $outlist: $!";
while (<CLGR>) {
    if (/fs/) {                 #story links all contain "fs"
        s/.+href="//i;          #strip everything up to the href value
        s/".+//;                #strip the closing quote and the rest
        chomp;
        $_ = "http://$url/$_";  #turn the relative link into a full url
        print "$_\n";
        push (@urls, $_);
    }
}
close (CLGR);
###Done creating our urls - now go get the stuff - uses the subroutine###
fetch();
###Convert it all to text - don't bother reinventing the wheel###
my $text = `lynx -dump -nolist $outhtm`;  #lynx renders the html to plain text
open (NEWS, ">$nfile") or die "Could not write $nfile: $!";
print NEWS $text;
close (NEWS);
###Now that we have a plain text file - strip the CRAP out
strip();
###Call the txttopdb script (from freshmeat) to convert to Palm format
#for some reason it has trouble finding the files,
#so we join the dir and file names...
$final = join '/', $workingdir, $final;
$pdbfile = join '/', $workingdir, $pdbfile;
system($conprog, '-t', $title, $final, $pdbfile) == 0
    or warn "$conprog failed: $?";
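#For what it's worth, the command that ends up running with my
#settings is roughly:
#  /usr/bin/txttopdb -t DailyNews /home/slycer/stripped.txt /home/slycer/news.pdb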
###lastly - do some cleanup
unlink ($outlist, $outhtm, $nfile, $final) or warn "Could not erase the files: $!";
###Subs from here on down###
#######
#FETCH#
#######
sub fetch {
    open (HTM, ">$outhtm") or die "Could not create $outhtm: $!";
    chomp (@urls);
    foreach (@urls) {
        print "getting $_\n";
        unless (defined ($content = get $_)) {
            warn "could not get $_\n";
            next;               #otherwise we'd print the previous story again
        }
        print HTM $_;           #the url first, so we know where it came from
        print HTM $content;
    }
    close (HTM);
}
#######
#STRIP#
#######
sub strip {
    ###This part finds the END of each story###
    $end_ref[0] = 0;              #pretend story zero ends at line 0
    open (NEWS, "<$nfile")        #the news file in plain text..
        or die "Could not open news file: $!";
    my $i = 1;
    while (<NEWS>) {
        $storieshash{$i} = $_;
        if (/opyright/) {         #all of them end in "Copyright"
            push (@end_ref, $i);  #the line number that marks the end of the story
        }
        ++$i;
    }
    close (NEWS);
    ###This next part figures out how many lines sit between the END of one
    ###story (marked by "copyright") and the beginning of the next ($lines
    ###later).  Yes, this is aimed strictly at alberta.com.
    foreach (@end_ref) { push (@start_ref, $_ + $lines) }
    shift @end_ref;               #get rid of the zero
    pop @start_ref;               #get rid of the last story
    open (FINAL, ">$final") or die "Could not write $final: $!";
    foreach (@end_ref) {
        print FINAL "\n Story starts here\n";
        for ($i = shift(@start_ref); $i <= $_; $i++) {
            print FINAL $storieshash{$i};
        }
    }
    close FINAL;
}
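Update: since the regex link-munging above is the part most likely to draw fire, here is roughly how the same URL harvesting could be done with HTML::LinkExtor instead. This is an untested sketch - it assumes $content and $url from the script above, and that story links still contain "fs":

use HTML::LinkExtor;
my @story_urls;
my $parser = HTML::LinkExtor->new(
    sub {
        my ($tag, %attr) = @_;
        return unless $tag eq 'a' and defined $attr{href};
        push (@story_urls, "$attr{href}") if $attr{href} =~ /fs/;
    },
    "http://$url/"   #base url, so relative hrefs come back absolute
);
$parser->parse($content);
$parser->eof;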