As a side project to sharpen my non-existent LWP skills I started writing a small web spider. It requests an HTML page and stores the file in a directory hierarchy based on the domain (e.g. archive/www/perlmonks/org/index). It has worked for the simple test URLs I've given it so far, but every time I look at this section of code I keep thinking there has to be a much better way to do it. Am I on the right track with this, or should I be going about it in a totally different way? Any suggestions to improve the code's performance or clarity would be greatly appreciated.
#!/usr/bin/perl -w
use strict;
use warnings;

use File::Path qw(mkpath);
use File::Spec;
use LWP::UserAgent;

my $ua = LWP::UserAgent->new;

my %config = (
    agent   => "cjf/0.0.1",    # User agent
    archive => "archive",      # data storage root directory
);

$ua->agent($config{agent});

request_page("http://www.perlmonks.org/");

# request_page($url)
#
# Issues a GET request for $url (asking for text/html) and passes the
# response body to archive_page(), keyed by the URL with its scheme removed.
# On an unsuccessful response it prints the status line and exits the
# program.
sub request_page {
    my $url = shift;

    my $req = HTTP::Request->new(GET => $url);
    $req->header('Accept' => 'text/html');

    my $res = $ua->request($req);
    if (!$res->is_success) {
        print "Error: " . $res->status_line . "\n";
        exit;
    }

    # Drop the scheme so that the first '/'-separated field is the host.
    # https is stripped as well; otherwise "https:" would be mistaken for
    # the host name by archive_page().
    $url =~ s{^https?://}{}i;
    print "$url has been retrieved\n";

    archive_page($url, $res->content);
}

# archive_page($url, $data)
#
# Writes $data to a file below $config{archive}. $url must already have its
# scheme removed (e.g. "www.perlmonks.org/foo/bar"). The host name is split
# on '.' and the path on '/', and every component except the last becomes a
# directory, in the order it appears in the URL:
#
#   www.perlmonks.org/         -> archive/www/perlmonks/org/index
#   www.perlmonks.org/foo/bar  -> archive/www/perlmonks/org/foo/bar
#   www.perlmonks.org/foo/bar/ -> archive/www/perlmonks/org/foo/bar/index
#
# Missing directories (including the archive root) are created. The current
# working directory is never changed, so the function can be called any
# number of times with a relative archive root.
#
# Dies if a directory or the output file cannot be created or written.
sub archive_page {
    my ($url, $data) = @_;

    my ($domain, $path) = split m{/}, $url, 2;
    $domain = '' unless defined $domain;
    $path   = '' unless defined $path;

    my @directories = split /\./, $domain;

    # A limit of -1 keeps a trailing empty field, which is how a path ending
    # in '/' is told apart from one naming a file.
    my @segments = split m{/}, $path, -1;
    my $filename = @segments ? pop @segments : '';
    push @directories, @segments;

    # An empty path, a trailing slash, or a final '.'/'..' has no usable file
    # name; store such pages as "index".
    $filename = 'index' if $filename =~ /\A\.{0,2}\z/;

    # Empty components (from "//") cannot be created, and '.'/'..' come from
    # a remote URL and must not be allowed to climb out of the archive root.
    @directories = grep { $_ ne '' && $_ ne '.' && $_ ne '..' } @directories;

    my $directory = File::Spec->catdir($config{archive}, @directories);
    if (!-d $directory) {
        # mkpath creates all missing intermediate directories; trap its
        # exception so the failure is reported in this script's own style.
        eval { mkpath($directory); 1 }
            or die "Can't mkdir $directory: $@";
        -d $directory
            or die "Can't mkdir $directory: $!\n";
    }

    my $file = File::Spec->catfile($directory, $filename);

    open my $out, '>', $file
        or die "Can't create $file file: $!\n";
    binmode $out;    # the body is raw bytes; avoid newline translation
    print {$out} $data
        or die "Can't write to $file: $!\n";
    close $out
        or die "Can't close $file: $!\n";

    print "$url has been archived\n";
}
In reply to Storing data in a directory hierarchy by cjf
| For: | Use: | ||
| & | &amp; | ||
| < | &lt; | ||
| > | &gt; | ||
| [ | &#91; | ||
| ] | &#93; |