I know, I know: there's a module doing this on CPAN. But sometimes using Perl for a Cool Purpose is just sheer fun. So, below is a script that downloads my entire use.perl journal:
use warnings;
use strict;
use LWP::Simple;
$|++;
# Archive every entry of a use.perl.org journal as a local HTML file.
#
# The journal's listing page is fetched and scanned line by line. A line
# that links to an entry (".../journal/<number>") is expected to carry the
# entry title inside <B>...</B>, and the line after it the entry date right
# after an <EM> tag. Each entry page is then fetched and saved, relative to
# the current directory, as "<date>__<title>.html".
#
# The script dies on the first fetch, parse or file error.
#
# When updates are done, entries can be restricted by their number
# in order to disregard ones already archived.
# For example, if the last entry number was 12345, then only entries
# with number > 12345 should be downloaded
# (this filtering is not implemented below).
my $main_url     = "http://use.perl.org/journal.pl?op=list&uid=4197";
my $journal_path = "use.perl.org/~spur/journal";

# LWP::Simple::get() returns undef on failure and does not set $!, so the
# error message carries no system error text. Test definedness rather than
# truth so that an empty document is not mistaken for a failed fetch.
my $content = get($main_url);
defined $content
    or die "Could not fetch $main_url";
print "Fetched $main_url\n";
print "Processing...\n\n";

# Runs of newlines are collapsed, so indices below count non-blank-separated
# chunks of the page rather than physical line numbers.
my @lines = split /\n+/, $content;

for my $i (0 .. $#lines) {
    # \Q...\E: the path contains '.', which must match literally.
    next unless $lines[$i] =~ m{\Q$journal_path\E/(\d+)};
    my $entry_url = "http://" . $journal_path . "/$1";

    # look for the post title (non-greedy, so a second <B>...</B> on the
    # same line is not swallowed into the title)
    $lines[$i] =~ m{<B>(.+?)</B>}
        or die "Error line $i: title not found\n";
    my $title = $1;

    # we don't like spaces in file names
    $title =~ tr/ /_/;

    # nor these characters
    $title =~ s/[":?<>\/\\]//g;

    # look for the post date (on the next line); guard against the entry
    # link sitting on the very last line of the page
    my $next      = $i + 1;
    my $date_line = $next <= $#lines ? $lines[$next] : '';
    $date_line =~ /<EM>([\d.]+)/
        or die "Error in line $next: date not found\n";
    my $date = $1;

    my $target_name = $date . "__" . $title . ".html";

    my $entry_content = get($entry_url);
    defined $entry_content
        or die "Could not fetch $entry_url";
    print "Fetched $entry_url\n";

    # Three-argument open with a lexical handle: the file name can never be
    # parsed as a mode, and every step of the write is checked. Buffered
    # write errors only surface at close, hence the checked close.
    open(my $target_fh, '>', $target_name)
        or die "Could not open $target_name for writing: $!";
    print {$target_fh} $entry_content
        or die "Could not write to $target_name: $!";
    close($target_fh)
        or die "Could not close $target_name: $!";
    print "Wrote to $target_name\n";
}

print "Done.\n";