1: #This was originally for jcwren's programming contest,
2: #but after suggestions from friends, it really doesn't
3: #match the critera anymore, so I didn't feel comfortable
4: #posting in under the [id://43857|original thread], but
5: #I'm still sufficiently happy with it that I wanted to
6: #post it. This is the first time I've used LWP, and the
7: #contest was a great way to learn about a great, versatile module.<br>
8:
9: #This script reads in CNNi's headlines, the "interactive headlines" (linked to larger write ups), and weather.
10:
11: #As always, I look forward to your comments, criticisms and suggestions.
12:
13: #updated 20000108 -- added some parameters and self_url() trickery.
14: # see lines 101 and 201-237
15: <code>
16: use strict;
17: use LWP::Simple;
18: use LWP::UserAgent;
19: use CGI::Pretty qw(:all);
20: use Date::Manip; &Date_Init ("TZ=US/Mountain");
21: my %state_abbr = (
22: "ALABAMA" => 'AL',
23: "ALASKA" => 'AK',
24: "ARIZONA" => 'AZ',
25: "ARKANSAS" => 'AR',
26: "CALIFORNIA" => 'CA',
27: "COLORADO" => 'CO',
28: "CONNECTICUT" => 'CT',
29: "DELAWARE" => 'DE',
30: "D.C." => 'DC',
31: "FLORIDA" => 'FL',
32: "GEORGIA" => 'GA',
33: "HAWAII" => 'HI',
34: "IDAHO" => 'ID',
35: "ILLINOIS" => 'IL',
36: "INDIANA" => 'IN',
37: "IOWA" => 'IA',
38: "KANSAS" => 'KS',
39: "KENTUCKY" => 'KY',
40: "LOUISIANA" => 'LA',
41: "MAINE" => 'ME',
42: "MARYLAND" => 'MD',
43: "MASSACHUSETTS" => 'MA',
44: "MICHIGAN" => 'MI',
45: "MINNESOTA" => 'MN',
46: "MISSISSIPPI" => 'MS',
47: "MISSOURI" => 'MO',
48: "MONTANA" => 'MT',
49: "NEBRASKA" => 'NE',
50: "NEVADA" => 'NV',
51: "NEW HAMPSHIRE" => 'NH',
52: "NEW JERSEY" => 'NJ',
53: "NEW MEXICO" => 'NM',
54: "NEW YORK" => 'NY',
55: "NORTH CAROLINA" => 'NC',
56: "NORTH DAKOTA" => 'ND',
57: "OHIO" => 'OH',
58: "OKLAHOMA" => 'OK',
59: "OREGON" => 'OR',
60: "PENNSYLVANIA" => 'PA',
61: "PUERTO RICO" => 'PR',
62: "RHODE ISLAND" => 'RI',
63: "SOUTH CAROLINA" => 'SC',
64: "SOUTH DAKOTA" => 'SD',
65: "TENNESSEE" => 'TN',
66: "TEXAS" => 'TX',
67: "UTAH" => 'UT',
68: "VERMONT" => 'VT',
69: "VIRGIN ISLANDS" => 'VI',
70: "VIRGINIA" => 'VA',
71: "WASHINGTON" => 'WA',
72: "WEST VIRGINIA" => 'WV',
73: "WISCONSIN" => 'WI',
74: "WYOMING" => 'WY',
75: );
76:
77:
78:
79: my $newsurl ="http://headlinenews.cnn.com/QUICKNEWS/virtual/swf.headline.txt";
80: my $weatherurl ="http://headlinenews.cnn.com/QUICKNEWS/virtual/swf.weather.txt";
81: my $interurl ="http://headlinenews.cnn.com/QUICKNEWS/virtual/swf.interactive.txt";
82: my $interactive_ticker;
83: my $news_ticker = retrieve_news();
84: my $weather_ticker = retrieve_weather();
85: if (param("showinteractive")!~/no/i){
86: $interactive_ticker= retrieve_interactive();
87: }
88: my @state_weather =split /&/, $weather_ticker;
89:
90: my $citystate;
91: my $lowstate; my $lowtemp;
92: my $histate; my $hitemp;
93:
94: my ($URL, @lines, @entry, $content);
95: my ($moddate);
96: my (%headlines);
97:
98: write_header("CNN wrapup");
99: %headlines = log_news ($news_ticker);
100: write_news( $interactive_ticker, %headlines),"<P>";
101: if (param("showweather")!~/no/i){
102: write_weather(@state_weather);
103: }
104: write_footer();
105:
106: ####################
107: #
108: # Prints out a nice HTML header.
109: #
110: ####################
111: sub write_header {
112: my $title=shift;
113: print header();
114: print start_html(-Title => $title, -BGCOLOR=>"#000000",-TEXT=>"#00FF00", -LINK=>"#33FF00", -VLINK=>"00CC00", -ALINK=>"FFFFFF");
115: print start_form(); # start_multipart_form() if file upload
116: my $das_url =&self_url();
117: print "<META HTTP-EQUIV='Refresh' CONTENT='600;URL=$das_url'>";
118: }
119:
120: ####################
121: #
122: # End of page stuff.
123: #
124: ####################
125: sub write_footer {
126: my $newsdate=get_moddate($newsurl);$newsdate= scalar localtime $newsdate; $newsdate = UnixDate($newsdate, "%m/%d/%Y %H:%M");
127: my $weatherdate= get_moddate($weatherurl);$weatherdate= scalar localtime $weatherdate ;$weatherdate = UnixDate($weatherdate, "%m/%d/%Y %H:%M");
128: my $presenttime=ParseDate("now");
129: print "<center><p><A HREF='$newsurl'>Headlines</A> last updated ", $newsdate,
130: "\n | <A HREF='$weatherurl'>Weather</A> last updated ", $weatherdate,
131: "\n | This page last updated ", UnixDate($presenttime, "%m/%d/%Y %H:%M"),"</CENTER>",
132: end_form(), end_html() }
133:
134: ####################
135: #
136: # wrapper for LWP->get() call.
137: #
138: ####################
139: sub retrieve_weather {
140: return "&" . get ($weatherurl);
141: }
142:
143: ####################
144: #
145: # wrapper for LWP->get() call.
146: #
147: ####################
148: sub retrieve_news {
149: return get ($newsurl);
150: }
151:
152: ####################
153: #
154: # wrapper for LWP->get() call.
155: #
156: ####################
157: sub retrieve_interactive{
158: return get ($interurl);
159: }
160:
161: ####################
162: #
163: # prints out the interactive and state headlines in a table
164: #
165: ####################
166:
167: sub write_news{
168: my ($l_inter, %l_headlines) = @_;
169: my $statectr;
170: my $new_tagstart;
171: my $new_tagend;
172:
173: #start the table.
174: print '<TABLE align=CENTER BORDER="1">';
175: print '<TR><TD ALIGN=CENTER colspan=3 valign=center><h4> Breaking news</h4></TD></TR>';
176:
177: # split up the interactive headlines.
178: @lines = split /&intheadline\d*\=/, $l_inter;
179:
180: # get last modified date of the $interurl file. format nicely.
181: $moddate = scalar localtime get_moddate ($interurl);
182: $moddate= UnixDate($moddate, "%m/%d/%Y %H:%M");
183:
184: # split the headline information URL & headline text & put into table.
185: foreach (@lines) {
186: my $hl_found=0;
187: @entry = split /&inturl\d*\=/;
188: @entry[1] =~ s/^\s//;
189: @entry[1] =~ s/\s\s/ /g;
190: @entry[0] =~ s/interactivecount\=\d*//;
191: if (@entry[0]){
192: print "<TR><TD colspan =3 align=center><A HREF=http:\/\/headlinenews.cnn.com@entry[1] target=new>@entry[0]</a><\/TD><\/TR>\n";
193: }
194: }
195:
196:
197: # begin next part of table.
198: print '<TR><TD colspan=3 align=center><h4>Headlines from around the country</h4>';
199:
200: # print out the state HTML anchors in two lines.
201: # note there's no checking to see if one exists or not :(
202: foreach my $thisstate ( sort keys %state_abbr) {
203: $statectr ++;
204: print "<A HREF=#",$state_abbr{$thisstate},">", $state_abbr{$thisstate}, "</a> ";
205: ($statectr==26) && print "<BR>";
206: }
207: # print an HTML anchor to the Weather.
208: print '<BR><A HREF=#WEATHER>Weather</A></TD></TR>';
209:
210: # sorting headlines on state, then timestamp, then headline text.
211: PRINTHEADLINE: foreach my $thisheadline (sort {($l_headlines{$a}{"State"} cmp $l_headlines{$b}{"State"})or
212: ($l_headlines{$b}{"Timestamp"} cmp $l_headlines{$a}{"Timestamp"}) or
213: ($l_headlines{$a}{"Headline"} cmp $l_headlines{$b}{"Headline"}) } keys %l_headlines) {
214: my $broken_headline ="";# $l_headlines{$thisheadline}{"Headline"};
215:
216: if (UnixDate ($l_headlines{$thisheadline}{"Timestamp"}, "%m/%d/%Y") eq UnixDate (&ParseDate("Today"),"%m/%d/%Y")) {
217: $new_tagstart='<FONT COLOR="#FFFFFF">' ;$new_tagend= '</FONT>';
218: } else {
219: if (param ("newonly")=~/yes/i) {
220: next PRINTHEADLINE
221: }
222: $new_tagstart='' ;$new_tagend= '';
223: }
224:
225: if (param("search")!~/^$/) {
226: my $searchin = param("search");
227: $searchin =~ s/(\?)/\\w/gi;
228: $searchin =~ s/(\*)/\.$1?/gi; #line noise? ha!
229: $searchin = "\\b". $searchin . "\\b";
230: if (defined(param("searchmethod"))) {
231: if (param("searchmethod")=~/showonly/ ) {
232: if ($l_headlines{$thisheadline}{"Headline"}!~/$searchin/i) {next PRINTHEADLINE}
233: }
234: }
235: $l_headlines{$thisheadline}{"Headline"}=~ s/($searchin)/<FONT COLOR="#FF0000">$1<\/FONT>/i;
236: }
237:
238: if (param("definitions")=~/yes/i){
239: map {$broken_headline .= "<A HREF='http://www.m-w.com/cgi-bin/dictionary?va=$_'>$_</A> " }split (/ /,$l_headlines{$thisheadline}{"Headline"});
240: } else {
241: $broken_headline=$l_headlines{$thisheadline}{"Headline"}
242: }
243:
244: #
245: # Can you figure out why the anchor's printed out in the headline cell, rather than
246: # the state or at the beginning of the row?
247: #
248: print "<TR>",
249: "<TD>",$new_tagstart,$l_headlines{$thisheadline}{"State"},$new_tagend,"<\/TD>",
250: "<TD>",$new_tagstart,$l_headlines{$thisheadline}{"Timestamp"},"<\/TD>",$new_tagend,
251: "<TD>","<A NAME='\#",$new_tagstart,$state_abbr{$l_headlines{$thisheadline}{"State"}},"'> ",$new_tagstart,$broken_headline,"<\/TD>",$new_tagend,
252: "<\/TR>\n";
253: }
254: # end table.
255: print "<\/TABLE>";
256: print &self_url();
257: }
258: ####################
259: #
260: # put the weather into a table.
261: #
262: #
263: ####################
264: sub write_weather {
265: my %cityinfo;
266: my $ctr=0; # used for columnating.
267: #
268: # split into HoH format
269: # {INDEX}{INFOTYPE}{INFODATA}
270: # e.g. {1}{CITY}{BIRMINGHAM, AL}
271: # {1}{TEMP}{-5}
272: # etc...
273: #
274: foreach my $temp (@_) {
275: $temp=~/([a-zA-z]*)(\d*)\=(.*)/;
276: $cityinfo{$2}{$1}=$3;
277: }
278:
279: # print out the HTML Anchor.
280: print "<A NAME=WEATHER>";
281:
282: # begin table.
283: print "<TABLE align=CENTER border=1>\n";
284: print "<TR><TD colspan=12 align=center>Weather from all 50 states<\/TD><\/TR>\n";
285: print "<TR><TD>City Name<\/TD><TD colspan=2> Current<br>Conditions<\/TD><TD>Current<br>Temperature<\/TD>",
286: "<TD>City Name<\/TD><TD colspan=2> Current<br>Conditions<\/TD><TD>Current<br>Temperature<\/TD>",
287: "<TD>City Name<\/TD><TD colspan=2> Current<br>Conditions<\/TD><TD>Current<br>Temperature<\/TD>\n";
288:
289: $ctr = 0;
290: foreach my $thiscity (sort { $cityinfo{$a}{"city"}cmp $cityinfo{$b}{"city"}} keys %cityinfo ) {
291: if ($cityinfo{$thiscity}{"weatherurl"} =~m#http://www.cnn.com/WEATHER/(c)(.)/#i) {
292: $citystate=", Canada"
293: } else {
294: # use the second saved part of the regexp to populate the state. or district.
295: # stupid districts.
296: $cityinfo{$thiscity}{"weatherurl" } =~m#http://www.cnn.com/WEATHER/(..)/(.*)/#i;
297: $citystate= ", $2 ";
298: }
299: # most of the time, the picture for a weather condition is just the
300: # condition name, stripped of whitespace, plus '.GIF'.
301: $cityinfo{$thiscity}{"weatherpic"} = $cityinfo{$thiscity}{"condition"}. ".gif";
302: $cityinfo{$thiscity}{"weatherpic"} =~ s/[ |\/]/./g;
303:
304: # but not always. stupid special exceptions.
305: $cityinfo{$thiscity}{"weatherpic"} =~ s/haze/hazy/g;
306: $cityinfo{$thiscity}{"weatherpic"} =~ s/foggy/fog/g;
307: $cityinfo{$thiscity}{"weatherpic"} =~ s/lgt\.snow\.fog/snow.fog/g;
308: $cityinfo{$thiscity}{"weatherpic"} =~ s/snow\.and\.fog/snow.fog/g;
309: $cityinfo{$thiscity}{"weatherpic"} =~ s/frz\.rain/freezing.rain/g;
310: $cityinfo{$thiscity}{"weatherpic"} =~ s/mist/misty/g;
311: $cityinfo{$thiscity}{"weatherpic"} =~ s/lgt.snowshower/light.snow/g;
312:
313:
314: # three columns to a row.
315: if (!(($ctr-1) % 3)) {print "<\/TR>"}
316:
317: # print out one row of data.
318: $cityinfo{$thiscity}{weatherurl} && print "<TD><A HREF='", $cityinfo{$thiscity}{weatherurl},"' target=new>", $cityinfo{$thiscity}{"city"} ,"$citystate<\/A><\/TD>",
319: "<TD ALIGN='justify'>" , $cityinfo{$thiscity}{"condition"},
320: "<\/TD><TD><IMG SRC='http://www.cnn.com/WEATHER/images/9712/conds/" , $cityinfo{$thiscity}{"weatherpic"},"'>",
321: "<\/TD>",
322: "<TD ALIGN='CENTER'>" , $cityinfo{$thiscity}{"temperature"}, "<\/TD>";
323: print "\n";
324:
325: # log information for highest/ lowest temps.
326: if (($cityinfo{$thiscity}{"temperature"} < $lowtemp )or ($lowtemp==undef)){
327: $lowstate = "<A HREF='". $cityinfo{$thiscity}{weatherurl}."' target=new>". $cityinfo{$thiscity}{"city"} ."$citystate<\/A>";
328: $lowtemp = $cityinfo{$thiscity}{"temperature"}
329: }
330: if ($cityinfo{$thiscity}{"temperature"} > $hitemp){
331: $histate = "<A HREF='". $cityinfo{$thiscity}{weatherurl}."' target=new>". $cityinfo{$thiscity}{"city"} ."$citystate<\/A>";
332: $hitemp = $cityinfo{$thiscity}{"temperature"} ;
333: }
334: $ctr++;
335: }
336: # reaching the end of the table.
337: # print out highest & lowest of the moment.
338: print "<TR><TD colspan=12 align=center>Extremes of the moment!<\/TD><\/TR>\n";
339: print "<TR><TD>High<\/TD><TD colspan=4>$histate<\/TD><TD>$hitemp<\/TD><TD>Lo<\/TD><TD colspan=4>$lowstate<\/TD><TD>$lowtemp<\/TD><\/TR>\n";
340: print "<\/TABLE>\n";
341: }
342:
343: ####################
344: #
345: # checks for new news & writes it to a headline file.
346: #
347: ####################
348: sub log_news {
349: my %l_data;
350: my @lines;
351: my $hl_found;
352: #
353: # no headlines? that's OK!
354: # (no die statement)
355: open HL, "headlines.txt";
356: @lines = <HL> ; # headlines.txt format : state\ttimestamp\theadline
357: close HL;
358:
359: #
360: # start by reading all of the existing headlines from "headlines.txt";
361: #
362: foreach (@lines) {
363: @entry = split /\t/;
364: my $nextkey = scalar keys %l_data;
365: if (@entry[0]) {
366: $l_data{$nextkey}{"State"}=@entry[0];
367: $l_data{$nextkey}{"Timestamp"}=@entry[1];
368: $l_data{$nextkey}{"Headline"}=@entry[2];
369: chomp $l_data{$nextkey}{"Headline"};
370: }
371: }
372:
373: #
374: # OK, read in the new headlines, and if they don't match any existing ones,
375: # read em into the hash.
376: #
377: @lines = split /&state\d*\=/, shift;
378: $moddate = scalar localtime get_moddate ($newsurl);
379: $moddate= UnixDate($moddate, "%m/%d/%Y %H:%M");
380:
381: foreach (@lines) {
382: $hl_found=0;
383: @entry = split /&headline\d*\=/;
384: @entry[1] =~ s/&headlineload\=done//;
385: @entry[1]=~s/^\s//;
386: @entry[1]=~s/\s\s/ /g;
387: @entry[0] =~ s/headlinecount\=\d*//;
388: @entry[0]= trim(@entry[0]);
389: my $nextkey = scalar keys %l_data;
390: foreach my $thisheadline (keys %l_data) {
391: if ($l_data{$thisheadline}{"Headline"} eq @entry[1]){$hl_found=1};
392: last if $hl_found;
393: }
394: if ((not $hl_found) && (@entry[0])){
395: $l_data{$nextkey}{"State"}=@entry[0];
396: $l_data{$nextkey}{"Timestamp"}=$moddate ;
397: $l_data{$nextkey}{"Headline"}=@entry[1];
398: }
399: }
400:
401:
402: # Now that all that's been done,
403: # sort & write all the headlines to "headlines.txt".
404: open HL, ">headlines.txt";
405: foreach my $thisheadline (sort {($l_data{$a}{"State"} cmp $l_data{$b}{"State"})or
406: ($l_data{$a}{"Timestamp"} cmp $l_data{$b}{"Timestamp"}) or
407: ($l_data{$a}{"Headline"} cmp $l_data{$b}{"Headline"}) } keys %l_data){
408: print (HL $l_data{$thisheadline}{"State"},"\t");
409: print (HL $l_data{$thisheadline}{"Timestamp"},"\t");
410: print (HL $l_data{$thisheadline}{"Headline"},"\n");
411: }
412: close HL;
413: return %l_data;
414: }
415:
416:
417: ####################
418: #
419: # wrapper for LWP::Useragent code.
420: # returns last modified date of a url.
421: # in : the url to date
422: # out : last modified date, or 0 if unable to determine date
423: #
424: ####################
425: sub get_moddate {
426: my $ans;
427: my $lurl=shift;
428: my $ua = LWP::UserAgent->new();
429:
430: $ans = $ua->request(HTTP::Request->new("HEAD", $lurl));
431: if ($ans->is_success) {
432: return $ans->last_modified || 0 ;
433: } else {
434: print STDERR "$lurl: Error [", $ans->code, "] ", $ans->message, "!\n";
435: }
436: }
437:
438: ####################
439: #
440: # emulates trim function found
441: # in other languages. Thank you, cookbook! :)
442: # in : a string
443: # out : a string *cough*
444: #
445: ####################
446: sub trim {
447:
448: my @out = @_;
449: for (@out) {
450: s/^\s+//;
451: s/\s+$//;
452: }
453: return wantarray ? @out : $out[0];
454: }
455: </code>