This is yet another HTTP downloader. I wrote it because existing downloaders did not provide some features I needed, specifically downloading a list of URLs and letting me choose the filename each one is saved under (instead of having the program generate a filename automatically).
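For instance, a listfile (the URLs and filenames here are made up) might look like this:

    http://example.com/img/0001.jpg   pics/sunrise.jpg
    http://example.com/img/0002.jpg   pics/sunset.jpg
    http://example.com/notes.txt      notes-copy.txt

and a run that saves under a mirror directory, recording finished downloads for easy resuming, could be invoked as

    wgetas -i list.txt -P mirror -e done.lst -f

where list.txt, mirror and done.lst are placeholder names.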
#!perl -T

=head1 NAME

wgetas - download many small files by HTTP, saving to filename of your choice

=head1 SYNOPSIS

B<wgetas -i> I<listfile>

=head1 DESCRIPTION

Downloads a list of small files through HTTP.  The URLs to download and
their corresponding filenames are read from a text file.  If a file to
download to already exists, the corresponding download is skipped
silently.  This allows easily resuming an interrupted list of downloads.

=head1 OPTIONS

=over

=item B<-i> I<listfile>

Gives a file that lists the URLs to download and the filenames to save
them as: each line has an HTTP URL and a filename separated by
whitespace.  The filename cannot contain whitespace.

=item B<-P> I<prefixdir>

Gives a directory to download files into: filenames in the listfile are
relative to this.  Default is the working directory.

=item B<-m> I<tempdir>

Directory to put temporary files in.  These temporary files then have
to be renamed to the final location, so this directory must be on the
same filesystem as where the downloads go.

=item B<-U> I<user-agent>

User-Agent header to send in the HTTP requests.

=item B<-w> I<seconds>

Approximate number of seconds to wait between requests.

=item B<-e> I<outlistfile>

Gives a file that lists the files we have already tried to download.
The list will be read on startup and any filenames found there will be
excluded from downloading.  On a successful download, as well as on a
download that has failed in a permanent way, the filename where we would
download is written out to this file.

=item B<-f>

Continue downloading after some permanent HTTP errors.  These currently
include 404 Not Found and 403 Forbidden.  Allowed only if the B<-e>
option is also set.

=back

=head1 NOTES

The downloaded files must not be large, because they're read into
memory, though this could be changed easily.

The files are downloaded in bursts of 5 files, and we're waiting about
5 minutes (about 1 minute for each file) between bursts.

This tool is not very general purpose: it does only what I currently need.

=head1 BUGS

Please report bugs to the author by email: L<ambrus@math.bme.hu>

=cut
use warnings;
use 5.014;
use EV;
use AnyEvent;
use AnyEvent::HTTP;
use File::Temp qw/tempfile/;
use File::Spec::Functions;
use Fcntl ();
use URI;
use URI::file;
use Getopt::Long;
use Time::HiRes ();

our $VERSION = 0.002;

$EV::DIED = sub { warn $@; exit 1; };
$AnyEvent::HTTP::PERSISTENT_TIMEOUT = 10;

sub montime { Time::HiRes::clock_gettime(Time::HiRes::CLOCK_MONOTONIC()); }

our $dirprefix = ".";
our $listfile;
our $dirtemp;
our $useragent = "wgetas";
our $sleepnorm = 60;
our $nburst = 5;
our $outlistfile;
our $keepgoing;
our($versionopt, $helpopt);

# Untaint @ARGV (the script runs under perl -T).
for (@ARGV) {
    /\A(.*)\z/s or die "internal error: untaint argv";
    $_ = $1;
}

Getopt::Long::Configure qw"bundling gnu_compat prefix_pattern=(--|-)";
GetOptions(
    "i|input_file=s" => \$listfile,
    "P|directory-prefix=s" => \$dirprefix,
    "m|directory-temp=s" => \$dirtemp,
    "w|wait=f" => \$sleepnorm,
    "U|user-agent=s" => \$useragent,
    "b|burst=i" => \$nburst,
    "e|outlistfile=s" => \$outlistfile,
    "f|keep-going!" => \$keepgoing,
    "version" => \$versionopt,
    "help" => \$helpopt,
);
if ($versionopt) {
    die "wgetas $VERSION\n";
} elsif ($helpopt) {
    my $helpstr = q(Usage: wgetas -i listfile
        Downloads a list of small files through HTTP.  Each line of the
        listfile has an URL and a filename to save as.  Downloads for
        existing files are skipped.
        For more documentation and options, try perldoc wgetas.
        );
    $helpstr =~ s/\n\s*/\n/g;
    die $helpstr;
}
@ARGV and die "error: too many command-line arguments";
length($listfile) or die "error: missing argument -i listfile";
length($dirtemp) or $dirtemp = $dirprefix;

our @spec;
{
    open my $U, "<", $listfile
        or die "error: cannot open listfile ($listfile): $!";
    chomp(@spec = <$U>);
}

our %exclude;
our $OUTLIST;
if (length($outlistfile)) {
    if (!open $OUTLIST, "+<", $outlistfile) {
        if ($!{ENOENT}) {
            open $OUTLIST, "+>", $outlistfile or
                die "error: cannot create outlistfile ($outlistfile): $!";
        } else {
            die "error: cannot open outlistfile ($outlistfile): $!";
        }
    }
    flock $OUTLIST, Fcntl::LOCK_EX | Fcntl::LOCK_NB or
        die "error: cannot lock outlistfile ($outlistfile): $!";
    while (defined(my $l = <$OUTLIST>)) {
        chomp($l);
        $exclude{$l} = 1;
    }
    eof($OUTLIST) or die "error reading outlistfile: $!";
} else {
    $keepgoing and die "error: -f option not allowed because -e option is unset";
}

our %fname;

# Rate limiting: each download accrues sleep credit, which is paid in one
# go after every burst of $nburst files (or immediately outside a burst).
our $sleepcycle = 0;
our $sleepnormcredit = 0;
our $sleepextracredit = 0;
sub paysleep {
    my($burst) = @_;
    my $s = 0;
    if (
        !$burst || $nburst <= $sleepcycle ||
        $sleepnorm < $sleepextracredit
    ) {
        $s = $sleepnormcredit + $sleepextracredit;
        $sleepnormcredit = $sleepextracredit = 0;
        $sleepcycle = 0;
    }
    my $ct = AnyEvent->condvar;
    my $t = AnyEvent->timer(after => $s, cb => sub { $ct->send() });
    $ct->recv;
}

PAGE: for my $speci (keys @spec) {
    my $spec = $spec[$speci];
    my $specln = 1 + $speci;
    $spec =~ /\S/ or next;
    my($uri, $fnamej, $rest) = split " ", $spec;
    length($rest) and die "invalid spec (too many words, line $specln): $spec";
    $uri =~ m"^(https?://[!#-;=?-Z\[\]_a-z~]*)$" or
        die "invalid uri (line $specln): $uri";
    $uri = $1;
    if (!defined($fnamej)) {
        # No filename given: use the last path segment of the URL.
        $uri =~ m"/([^/]+)/*$" or
            die "error: cannot find suffix of uri (line $specln): $uri";
        $fnamej = $1;
    }
    $fnamej =~ /\A([\x01-\xff]*)\z/ or
        die "error: invalid filename (line $specln): $fnamej";
    $fnamej = $1;
    my @fnamej_seg = URI::file->new($fnamej)->path_segments;
    length($fnamej_seg[0]) or
        die "error: filename is absolute (line $specln): $fnamej";
    grep { ".." eq $_ } @fnamej_seg and
        die "error: filename contains up-dir (line $specln): $fnamej";
    $fname{$fnamej}++ and
        warn "warning: filename appears twice (line $specln): $fnamej";
    $exclude{$fnamej} and next;
    my $fname = catfile($dirprefix, $fnamej);
    -f $fname and next;
    my $timestart = montime();
    print STDERR "GET $uri $fname : ";
    flush STDERR;
    my($hbody, $hhead, $status, $reason);
    my $cntconn = 0;
    {
        my $cond = AnyEvent->condvar;
        my $hagent = AnyEvent::HTTP::http_get(
            $uri,
            recurse => 0,
            timeout => 120,
            on_prepare => sub { $cntconn++ },
            headers => {"user-agent" => $useragent},
            sub {
                ($hbody, $hhead) = @_;
                $cond->send;
            },
        );
        $cond->recv;
        ($status, $reason) = @$hhead{(qw"Status Reason")};
    }
    my $timeend = montime();
    my $timedelta = $timeend - $timestart;
    my $bodylen = length($hbody);
    $sleepcycle++;
    $sleepnormcredit += abs($sleepnorm * (0.8 + rand(0.4)));
    $sleepextracredit += abs(1.2 * $timedelta);
    print STDERR sprintf "%.2fs %dc %db %s %s\n",
        $timeend-$timestart, $cntconn, $bodylen, $status, $reason;
    flush STDERR;
    if (200 == $status) {
        # Write to a temporary file first, then rename: the download appears
        # atomically at its final name, so an interrupted run never leaves a
        # partial file there.
        my($O, $oname) = tempfile("get-XXXXXXXX", SUFFIX => ".tmp",
            DIR => $dirtemp, UNLINK => 0) or die;
        print $O $hbody or die "error writing output file";
        close $O or die "error closing output file";
        rename $oname, $fname or die "error renaming output file: $!";
    } elsif ($keepgoing && (404 == $status || 403 == $status)) {
        1;
    } else {
        die "Error getting page: $status $reason";
    }
    if ($OUTLIST) {
        say $OUTLIST $fnamej or die "error writing to outlistfile: $!";
        flush $OUTLIST;
    }
    paysleep 1;
} # PAGE

if ($OUTLIST) {
    close $OUTLIST or die "error closing outlistfile: $!";
}
paysleep 0;
say STDERR "All done.";

__END__
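The paysleep bookkeeping above may be easier to see in isolation. Here is a minimal sketch of the same burst pacing, using a plain blocking sleep instead of AnyEvent timers (the download stub and URL list are made up, and the extra credit proportional to download time is left out):

    use strict;
    use warnings;

    sub download { print "GET $_[0]\n" }   # stand-in for the real HTTP fetch

    my @urls = map { "http://example.com/file$_.txt" } 1 .. 12;
    my ($nburst, $sleepnorm) = (5, 60);    # defaults of the -b and -w options
    my ($cycle, $credit) = (0, 0);

    for my $url (@urls) {
        download($url);
        $cycle++;
        # Accrue a randomized nominal wait per file, as the script does.
        $credit += $sleepnorm * (0.8 + rand 0.4);
        if ($nburst <= $cycle) {    # end of a burst: pay the accumulated debt
            sleep int $credit;      # roughly 5 minutes for a 5-file burst
            ($cycle, $credit) = (0, 0);
        }
    }
    sleep int $credit if 1 <= $credit;     # pay any remainder at the end

This is why the NOTES section speaks of bursts of 5 files with about 5 minutes of waiting between them.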
Update 2011-11-07: slightly modified version of the code. This eliminates Coro from the dependencies, as it's not really necessary, and fixes the URL handling so that it accepts tildes. Also, the script now runs in taint mode (perl -T).
Update 2013-04-29: see also reshtml - extract image (and other resource) urls from a HTML.