The code below scrapes over 100 years' worth of temperature data (permission has been given to access the source website and collect this large amount of data). The script takes around 8 hours to complete. How can I improve its speed and minimize the total run time?
Thanks,
Dave
#!/usr/bin/perl
#
# Scrape daily temperature data from the weather-station form, one form
# submission per calendar day from 1896-01-01 up to (but not including)
# today, and write one line per day to 'tdata.txt':
#
#   <date> <temp 1> <temp 2> <temp 3> <30-yr avg high> <30-yr avg low>
#
# The three daily values are the first three "Temperature :" matches on the
# result page (the original author's comment names them max, min, average).
# Any value that is missing or reported as '(N/A)' is written as 99.99 so
# every row keeps six columns.
#
# Run time is dominated by the one HTTP round trip made per date, so the
# per-request pause below is the main knob that can be tuned in this script.

use strict;      # ensures proper variable declarations, etc.
use warnings;    # allows warnings to be issued

use Date::Simple::D8 (':all');
use Time::HiRes ();
use WWW::Mechanize;

# Placeholder written for values that are missing or reported as '(N/A)'.
my $MISSING = '99.99';

# Pause between requests, in seconds. The built-in sleep() truncates its
# argument to whole seconds, so the original "sleep .1" did not pause at
# all; Time::HiRes::sleep honours the fraction. Set this to 0 to get the
# original (no pause) timing back.
my $PAUSE_SECONDS = 0.1;

#print "Create a new directory: ";
#my $dir = <STDIN>;
#chomp ($dir);
#mkdir "$dir";

# Open files for writing
open my $data_fh,  '>', 'tdata.txt'        or die "open: $!\n";
open my $stats_fh, '>', 'yearly_stats.txt' or die "open: $!\n";
print "Scraped data will be written to file 'tdata.txt'\n";

# Create date list: every day from the start date up to yesterday.
my $today = Date::Simple::D8->today();
my $start = Date::Simple::D8->new('18960101');
my $end   = Date::Simple::D8->new("$today");
my @dates;
while ( $start < $end ) {
    push @dates, $start;
    $start = $start->next;
}

# Initiate browsing agent
print "Initiating the browsing agent...\n";
my $url  = "http://bub2.meteo.psu.edu/wxstn/wxstn.htm";
my $mech = WWW::Mechanize->new( keep_alive => 1 );
print "Accessing URL...\n";
$mech->get($url);
print "Collecting data...\n";

# Start the scraping
for my $date (@dates) {
    $mech->submit_form(
        form_number => 1,
        fields      => { dtg => $date },
    );

    # Download the resulting page, text only, and scrape for data
    my $page = $mech->content( format => 'text' );

    # Daily values. The original pattern captured exactly two digits, which
    # skips single-digit and negative readings and truncates three-digit
    # ones; accept an optional sign and any number of digits instead.
    # NOTE(review): the result page layout is not visible here - confirm
    # that the first three matches are always the daily values.
    my @daily = ( $page =~ /Temperature\s+:\s+(-?\d+)/g );

    # Daily 30-year max and min normals
    my ($thirty_yr_hi) = $page =~ /30-Year Average High Temperature\s+:\s+(\S*)/;
    my ($thirty_yr_lo) = $page =~ /30-Year Average Low Temperature\s+:\s+(\S*)/;

    # Assemble the output row, substituting the placeholder where needed
    my $row = join( ' ',
        $date,
        map { _value_or_missing($_) }
            @daily[ 0 .. 2 ], $thirty_yr_hi, $thirty_yr_lo )
        . "\n";

    # Print the row to screen and to file
    print $row;
    print {$data_fh} $row;

    # Pause... then go back a page so the form is current again
    Time::HiRes::sleep($PAUSE_SECONDS) if $PAUSE_SECONDS > 0;
    $mech->back();
}    # Exit the scraping loop

# Close the written files; buffered write errors surface at close.
close $data_fh  or die "close: $!\n";
close $stats_fh or die "close: $!\n";
print "Success!\n";

# Return the scraped value, or the missing-data placeholder when the value
# was not found on the page (undef) or was reported as '(N/A)'.
sub _value_or_missing {
    my ($value) = @_;
    return $MISSING if !defined $value || $value eq '(N/A)';
    return $value;
}
In reply to Need to Improve Scraping Speed by cheech
| For: | Use: | ||
| & | &amp;amp; | ||
| < | &amp;lt; | ||
| > | &amp;gt; | ||
| [ | &amp;#91; | ||
| ] | &amp;#93; |