neuroball has asked for the wisdom of the Perl Monks concerning the following question:
I write today to you with a request. I would like some (or all) of you to evaluate my news retriever script. It is working right now and I don't have any problems with it in any way.
What I would like to learn from this is the following: Is their something that is totally flawed, should I break my script into smaller modules and includes them. Are their better (faster/easier/smoother) ways of doing things? What I practically would like from you guys and girls is an honest code review.
Btw. This is production code that fills a database with news articles which are extracted from XML code. The script needs around 5 minutes to parse through 1200 articles/stories on a hp dc320(230??), whereby most of the time is spend receiving the information over the internet.
thanks
/oliver/
Btw. Code resulting from this script and it's twin, the news producer can be viewed at this rather bland news web site.
#!/usr/bin/perl # One tab stop in this script equals two spaces # modules for easier debugging and # better coding standards use warnings; use diagnostics; use strict; # Data::Dumper is used during manual debugging # E.g.: print Dumper ($var); #use Data::Dumper; # modules actually needed by the script use Fcntl ':flock'; use XML::Simple; use LWP::UserAgent; use URI::Escape; use File::Basename; use DBI; use Mail::Mailer; # prevent script from running more than once at a time # by locking file so that it can only be accessed # by the currently running script # DIE doesn't print to the screen on Windows INIT { open LH, $0 or die "INIT: Can't open $0 for locking!\nError: $!\n"; flock LH, LOCK_EX|LOCK_NB or die "INIT: $0 is already running somewhere!\n"; } # check for debug switch and set DEBUGGING constant my $debug_switch; BEGIN { foreach (@ARGV) { if (/^-d$/i) { $debug_switch = 1; } } } use constant DEBUGGING => $debug_switch || 0; ###################################################################### +######### # Variables follow ###################################################################### +######### # URL that returns list of YBX categories my $category_list_http = qw !http://api.yellowbrix.com/api/?service=ca +tegory_list&method=xml&id=verizon&password=xxxxxx!; # URL to xml file on yellowbrix server. CATEGORY must be provided my $category_http_start = qw !http://api.yellowbrix.com/api/?service=h +eadlines&method=xml&id=verizon&password=xxxxxx&category=!; my $category_http_end = qw !&enabletext=1!; # URL to weather on yellowbrix server. weather query must be appended. my $weather_http = qw !http://api.yellowbrix.com/api/?service=weather& +method=xml&id=verizon&password=xxxxxx&query=!; # database connection variables my $sql_server = 'localhost'; my $sql_db = 'harvest'; my $sql_username = 'harvest'; my $sql_password = 'xxxxxxxx'; my $sql_dbh; # mySQL escape character my $sql_escape_char = '\\'; # months linked to their respective calendar number my %months = ( "Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12 ); # locations for which we will check the weather my @weather_location = ( "New York, NY", "Chicago, IL", "Los Angeles, CA" ); # set number of warnings displayed to zero my $warnings = 0; ###################################################################### +######### # subs follow ###################################################################### +######### # Catch 'warn' and 'die' signals and # redirect them to our own anonymous functions $SIG{__WARN__} = sub { $warnings++; print STDOUT @_; sendMail ('warn', @_); if ($warnings > 10) { die "Too many warnings... I am killing the script!"; } }; $SIG{__DIE__} = sub { print STDOUT @_; sendMail ('die', @_); exit; }; # send e-mail to people in $mail_info 'to' field # informing them about a script problem sub sendMail { my $reason = shift; my $message = shift; if ($reason =~ /e$/) { $reason .= 'd'; } else { $reason .= 'ed'; } my $mail_info = { 'From' => 'webharvester@verizon.net', 'To' => [ 'x1@verizon.com', 'x2@verizon.com', 'x3@verizon.com'], 'Subject' => "SMB Harvesting Script $reason!" }; eval { my $mailer = Mail::Mailer->new('smtp', Server => 'xxxxxx.im.gt +e.com'); $mailer->open( $mail_info ); print $mailer <<"EOF_MAIL"; All, I am sending you this e-mail because I ${reason}. $message yours truly, $0 EOF_MAIL $mailer->close or die "Mail: Can't close mailer: $!\n"; }; if ($@) { die "Mail: Couldn't send mail: $@\n"; } } # receive an http address and return content or http-code sub getHTTPAddress ($) { my $http_address = shift; print "getHTTPAddress: $http_address -> " if DEBUGGING; my $ua; my $request; my $response; $ua = LWP::UserAgent->new; $request = HTTP::Request->new(GET => $http_address); $response = $ua->request($request); if ($response->is_success) { print "success\n" if DEBUGGING; return $response->content; } else { print "getHTTPAddress: Couldn't download $http_address!\n". "Error: $response->{code} - $response->{message}.\n" if DE +BUGGING; die "getHTTPAddress: Couldn't download $http_address!\n". "Error: $response->{code} - $response->{message}.\n"; } } # open connection to mysql database sub sqlConnectToDB { $sql_dbh = DBI->connect ("dbi:mysql:$sql_db:$sql_server", $sql_use +rname, $sql_password) or die "sqlConnectToDB: Cannot connect to dbi:mysql:$sql_db:$s +ql_server!\n"; } # send a sql statement to the mysql database # E.g. DELETE, INSERT, ... No Data is returned sub sqlStatement ($) { my $statement = shift; print $statement."\n" if DEBUGGING; my $sql_sth = $sql_dbh->prepare ($statement) or die "sqlStatement: Couldn't prepare DB with '$statement'.\n +Error: ".$sql_dbh->errstr."\n"; $sql_sth->execute() or die "sqlStatement: Couldn't execute DB with '$statement'.\n +Error: ".$sql_dbh->errstr."\n"; } # send a sql request to the mysql database # E.g. SELECT, ... Data is returned as an array reference sub sqlRequest ($) { my $statement = shift; print $statement."\n" if DEBUGGING; my $sql_sth = $sql_dbh->prepare ($statement) or die "sqlRequest: Couldn't prepare DB with '$statement'.\nEr +ror: ".$sql_dbh->errstr."\n"; $sql_sth->execute() or die "sqlRequest: Couldn't execute DB with '$statement'.\nEr +ror: ".$sql_dbh->errstr."\n"; return $sql_sth->fetchall_arrayref; } # parse weather xml and return a hash with the values sub parseWeather ($) { my $weather_xml = shift; my %data; if ($weather_xml =~ /^\D\D\D/) { my $weather_ref = XMLin ($weather_xml); %data = ( name => $weather_ref->{'Forecast'}->{'cityname'}, temp => $weather_ref->{'Forecast'}->{'current_weather'}->{ +'current_temp'}, condition => $weather_ref->{'Forecast'}->{'current_weather +'}->{'current_condition'}, image => "http://bizinfo.verizon.net/images/verizon/weathe +r/small/". basename ($weather_ref->{'Forecast'}->{'current_weathe +r'}->{'current_image_url'}), ); return %data; } } # parse article xml and return hash of values # # <Story> # <headline>...</headline> # <url>...</url> # <pubsource>...</pubsource> # <pubdate>...</pubdate> # <arrival_time>...</arrival_time> # <category>...</category> # <Summary> # <text>...</text> (number of text-tags is bigger than zero # <text>...</text> # <text>...</text> # <tracking_url>...</tracking_url> # </Summary> # <expiration_date>...</expiration_date> # <text>...</text> # <tracking_url>...</tracking_url> # </Story> sub parseArticle ($$) { my $article = shift; my $category_id = shift; my %data; $article->{url} =~ /story_id=(\d+)&/; $data{story_id} = $1; $data{arrival_date} = $article->{arrival_time}; $data{category} = $category_id; $data{pub_source} = $article->{pubsource}; $data{xml_url} = ''; $data{ybh_url} = $article->{url}; $data{img_url} = $article->{image_url}; $data{headline} = $article->{headline}; $data{summary_url} = $article->{Summary}->{tracking_url}; if (ref $article->{Summary}->{text} eq "ARRAY") { $data{summary} = join ("<p/>", @{$article->{Summary}->{text}}) +; } else { $data{summary} = $article->{Summary}->{text}; } if (!defined $data{img_url}) { $data{img_url} = ""; } $data{pub_source} =~ s/(')/$sql_escape_char'/gs; $data{headline} =~ s/(')/$sql_escape_char'/gs; $data{summary} =~ s/(')/$sql_escape_char'/gs; $data{pub_source} =~ s/(")/$sql_escape_char"/gs; $data{headline} =~ s/(")/$sql_escape_char"/gs; $data{summary} =~ s/(")/$sql_escape_char"/gs; $data{arrival_date} =~ /(\w+)\s(\d+),\s(\d+):(\d+)\s(\w+)/; my ($month, $day, $hour, $minute, $median) = ($1, $2, $3, $4, $5); my $year = (localtime)[5] + 1900; $data{display_date} = "$months{$month}/$day/$year $hour:$minute $m +edian"; if ($median eq 'PM') { $hour += 12; } $data{arrival_date} = "$year-$months{$month}-$day $hour:$minute:00 +"; return %data; } ###################################################################### +######### # main script follows ###################################################################### +######### print scalar (localtime)."\n"; # set environmental variables for MailTools my $old_maildomain = $ENV{"MAILDOMAIN"}; my $old_mailaddress = $ENV{"MAILADDRESS"}; $ENV{"MAILDOMAIN"} = 'verizon.net'; $ENV{"MAILADDRESS"} = 'webharvester@verizon.net'; # Connect to database, delete old articles, and select all categories sqlConnectToDB; sqlStatement ("TRUNCATE TABLE articles"); my @categories = @{ sqlRequest ("SELECT name FROM category") }; # run through the following loop for each of the retrieved categories foreach (@categories) { # retrieve the category name my $category_name = $_->[0]; my $category_filename = $category_name; # replace all non-word characters with an underscore $category_filename =~ s/\W/_/gs; # retrieve xml file with all articles my $headline_xml = getHTTPAddress ($category_http_start.uri_escape + ($category_name).$category_http_end); # check for non-existence of HTML error if ($headline_xml =~ /^\D\D\D/) { # repair DTD of the XML $headline_xml =~ s/\s>/>/; # parse XML my $headline_ref = XMLin ($headline_xml); # check for the number of stories received in through the XML +file my @stories; if (ref $headline_ref->{Story} ne 'ARRAY') { # received just one story warn "Not enough articles in $category_name!\n". "\tOnly 1 article was received.\n"; push (@stories, $headline_ref->{Story}); } else { @stories = @{$headline_ref->{Story}}; # received less than 3 stories warn "Not enough articles in $category_name!\n". "\tOnly $#stories articles were received.\n" unless ($#stories > 2); } # get category id from database my $category_id = sqlRequest ("SELECT id FROM category WHERE n +ame = '$category_name'"); # run through loop for each of the stories foreach (@stories) { # parse article information from XML and # add new entry to articles database table my %article_data = parseArticle($_, $category_id->[0]->[0] +); sqlStatement ('INSERT articles ('. 'story_id,'. 'arrival_date,'. 'display_date,'. 'category,'. 'pub_source,'. 'xml_url,'. 'ybh_url,'. 'img_url,'. 'headline,'. 'summary,'. 'summary_url'. ') VALUES ('. "'$article_data{story_id}' +,". "'$article_data{arrival_da +te}',". "'$article_data{display_da +te}',". "'$article_data{category}' +,". "'$article_data{pub_source +}',". "'$article_data{xml_url}', +". "'$article_data{ybh_url}', +". "'$article_data{img_url}', +". "'$article_data{headline}' +,". "'$article_data{summary}', +". "'$article_data{summary_ur +l}'". ')'); } } } # delete old weather information sqlStatement ("TRUNCATE TABLE city_weather"); # run through loop for each city in the weather location array foreach (@weather_location) { # receive XML file my $weather_xml = getHTTPAddress ($weather_http.$_); # check for non-existence of HTML error if ($weather_xml =~ /^\D\D\D/) { # parse weather data from XML file and add it to the database my %weather_data = parseWeather ($weather_xml); sqlStatement ('INSERT city_weather ('. 'name,'. 'temp,'. 'weather,'. 'img_url'. ') VALUES ('. "'$weather_data{name}',". "'$weather_data{temp}',". "'$weather_data{condition}',". "'$weather_data{image}'". ')'); } } # disconnect from database $sql_dbh->disconnect; # cleanup the MailTools environmental variables if ($old_maildomain) { $ENV{"MAILDOMAIN"} = $old_maildomain; } else { delete $ENV{"MAILDOMAIN"}; } if ($old_mailaddress) { $ENV{"MAILADDRESS"} = $old_mailaddress; } else { delete $ENV{"MAILADDRESS"}; } print scalar (localtime)."\n"; __END__
Edit by tye, fix READMORE tag
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Evaluate my news retriever... (pretty please!)
by jarich (Curate) on Dec 15, 2003 at 05:54 UTC | |
|
Re: Evaluate my news retriever... (pretty please!)
by Zaxo (Archbishop) on Dec 15, 2003 at 04:41 UTC |