in reply to Help needed with a Perl script
For those interested, here is the script itself (at least it starts with -Tw, yay!):
#!/usr/local/bin/perl -Tw
####################################
# Content Harvester
#
# v1.1 - Jamie Unwin, Kieran Topping
# Guardian Unlimited, Guardian Newspapers Limited 2000
#
# Automatically harvest distributed Guardian Unlimited content
#
# ===================================================
# IMPORTANT NOTES - PLEASE READ
# ===================================================
# This script is provided "as is" and as an EXAMPLE only.
#
# This script will need to be modified in order to fit into your particular
# environment, and to add an appropriate level of error checking.
#
# Guardian Unlimited cannot offer technical support for implementing this
# script.
#
# Modification and execution should only be attempted by the Webmaster or
# Sys-Admin of your site, and then only if they have experience and
# responsibility in the following fields:
#
#   * Perl
#   * LWP module
#   * Webserver & (your particular) operating system.
#
# No responsibility can be accepted by Guardian Unlimited for any damage
# caused to your website or computer systems arising from use of this script.
#
# If in doubt, DO NOT EXECUTE THIS SCRIPT.
#
# See http://www.guardianunlimited.co.uk/distribution
# for further conditions of use
#
################################################

# load required modules
use strict;          # this turns on strict error checking
use LWP::UserAgent;  # this loads the LWP module (used to retrieve a web page)
use HTTP::Request;   # loaded explicitly because HTTP::Request->new is called below

################################################
# Global variables
################################################

# --This scalar will need to be edited to suit your particular environment--
# Path to the document root of your web space
# (on your local file system)
#
my $doc_root = '/www/htdocs';

# --This scalar will need to be edited to suit your particular environment--
# Path to local directory relative to your document root
# (this is where the retrieved pages will be stored)
#
my $content_directory = $doc_root . '/content';

# --This hash will need to be edited to suit your particular environment--
# URLs of the content you wish to retrieve.
#
# The format is - 'local filename' => 'remote url'
#
# The 'local filename' is the name that the file will have on your webspace.
# This is chosen by you, and is specified relative to the content directory.
# e.g. 'guardian_news.html'
#
# The 'remote url' is the URL of the content you wish to retrieve.
# You can obtain these URLs by following the instructions at
# http://www.guardianunlimited.co.uk/distribution
# These will look like
# http://www.guardianunlimited.co.uk/Distribution/[...].html
#
my %content_to_retrieve = (
    'guardian_news.html'     => 'http://www.guardianunlimited.co.uk/Distribution/[...].html',
    'guardian_tv_radio.html' => 'http://www.guardianunlimited.co.uk/Distribution/[...].html',
);

##############################################
# Main
##############################################

# create a user agent (this is like a browser)
my $ua = LWP::UserAgent->new;
$ua->agent('ContentHarvester/1.1 (GU)');

# loop through each piece of content to be harvested
foreach my $local_filename (keys %content_to_retrieve) {
    my $remote_url = $content_to_retrieve{$local_filename};

    # get the page (retrieve content)
    my $request  = HTTP::Request->new('GET', $remote_url);
    my $response = $ua->request($request);

    # check we got the page; the script stops at the first failed download.
    # The method call is concatenated rather than embedded in the string:
    # Perl does not interpolate method calls inside double quotes, so
    # "$response->error_as_HTML" would print the object's stringification.
    unless ($response->is_success) {
        die $remote_url . ', ' . $response->error_as_HTML . "\n";
    }

    # only read the body once we know the request succeeded
    my $content = $response->content;

    # save the file to the local file system
    # (three-argument open with a lexical handle: the path can never be
    # parsed as an open mode, and the handle is scoped to this iteration)
    my $local_path = "$content_directory/$local_filename";
    open(my $content_fh, '>', $local_path)
        or die "Can't store the retrived file locally, $local_path, $!\n";

    # a newline is appended after the retrieved content, as before
    print {$content_fh} $content . "\n"
        or die "Can't write to $local_path, $!\n";

    # buffered write errors (e.g. a full disk) only surface at close
    close($content_fh)
        or die "Can't close $local_path, $!\n";
}
|
|---|