#!/usr/bin/perl -w
###########################################
# This will hopefully become a full       #
# fledged perl script that downloads      #
# news from alberta.com and converts      #
# to txt or some other format readable by #
# palms. The idea is to run this nightly  #
# using cron, and schedule syncs with the #
# Palm Pilot                              #
# - Slycer 10/19/2000                     #
###########################################
use strict;
use LWP::Simple;

my $content;                        #the content of the alberta.com index
my $workingdir = "/home/slycer";    #the dir to write all the files to
my $url = "www2.alberta.com/news";  #the url of alberta.com news
my $outlist = "abcom.htm";          #the outfile for the above
my $outhtm  = "news.htm";           #the outfile for all the urls below
my (@urls,                          #the list of good urls
    @end_ref,                       #the ending lines of the stories
    @start_ref);                    #the starting lines of the stories
my %storieshash;                    #the hash that contains the text
my $nfile   = "news.txt";           #the news file that we read/write to (plain text)
my $lines   = 83;                   #the amount of CRAP between stories
my $conprog = "/usr/bin/txttopdb";  #the script to convert to pdb format
my $final   = "stripped.txt";       #the final file to convert
my $pdbfile = "news.pdb";           #the name of the pdb file
my $title   = "DailyNews";          #the Title (as displayed in the Palm)

chdir $workingdir;

#####First go get the index file from alberta.com#####
open (OUT, ">$outlist") || warn "Could not create alberta.com news file: $!";
unless (defined ($content = get "http://$url")) {
    die "Unable to access abcom website: $!";
}
print OUT $content;
close OUT;

#####Make the URLs usable#####
open (CLGR, "<$outlist") || warn "Could not open $outlist: $!";
while (<CLGR>) {
    if (/fs/) {                     #story links on alberta.com contain "fs"
        s/.+href=\"//i;             #drop everything up to and including href="
        s/\".+//;                   #drop the closing quote and the rest of the line
        chomp;
        $_ = "http://$url/$_";      #turn the relative link into a full URL
        print "$_\n";
        push (@urls, $_);
    }
}
close (CLGR);

###Done creating our urls - now go get the stuff - uses the subroutine###
&fetch;

###Convert it all to text - don't bother reinventing the wheel###
my $text = `lynx -dump -nolist $outhtm`;
open (NEWS, ">$nfile") || warn "Could not create $nfile: $!";
print NEWS $text;
close (NEWS);

###Now that we have a plain text file - strip the CRAP out###
&strip;

###Call the txttopdb script (from freshmeat) to convert to Palm format###
#for some reason it has trouble finding the files!!
#so we will join the dir and file names...
$final   = join '/', $workingdir, $final;
$pdbfile = join '/', $workingdir, $pdbfile;
`$conprog -t $title $final $pdbfile`;

###Lastly - do some cleanup###
unlink ($outlist, $outhtm, $nfile, $final) || warn "Could not erase the files: $!";

###Subs from here on down###

#######
#FETCH#
#######
sub fetch {
    open (HTM, ">$outhtm") || warn "Could not create $outhtm: $!";
    chomp (@urls);
    foreach (@urls) {
        print "getting $_\n";
        unless (defined ($content = get $_)) {
            warn "could not get $_\n";
            next;                   #skip this story rather than re-printing
                                    #the previous story's content
        }
        print HTM $_;               #keep the URL in the dump for reference
        print HTM $content;
    }
    close (HTM);
}

#######
#STRIP#
#######
sub strip {
    ###This part finds out the ENDS of each story###
    $end_ref[0] = 0;                #seed value - the first story's start works
                                    #out to line 0 + $lines below
    open (NEWS, "<$nfile")          #the news file in plain text..
        || warn "Could not open news file: $!";
    my $i = 1;
    while (<NEWS>) {
        $storieshash{$i} = $_;
        if (/opyright/) {           #all of them end in "copyright"
            push (@end_ref, $i);    #the line number that marks the end
                                    #of the story
        }
        ++$i;
    }
    ###This next part figures out how many lines sit between the END of one
    ###story (marked by "copyright") and the beginning of the next ($lines
    ###later). Yes, this is aimed strictly at alberta.com
    foreach (@end_ref) { push (@start_ref, $_ + $lines) }
    shift @end_ref;                 #get rid of the zero
    pop @start_ref;                 #get rid of the last story
    open (FINAL, ">$final") || warn "Could not create $final: $!";
    foreach (@end_ref) {
        print FINAL "\n Story starts here\n";
        for ($i = shift(@start_ref); $i <= $_; $i++) {
            print FINAL $storieshash{$i};
        }
    }
    close FINAL;
}