use warnings; use 5.014; use EV; use AnyEvent; use AnyEvent::HTTP; use File::Temp qw/tempfile/; use File::Spec::Functions; use Fcntl (); use URI; use URI::file; use Getopt::Long; use Time::HiRes (); our $VERSION = 0.002; $EV::DIED = sub { warn $@; exit 1; }; $AnyEvent::HTTP::PERSISTENT_TIMEOUT = 10; sub montime { Time::HiRes::clock_gettime(Time::HiRes::CLOCK_MONOTONIC()); } our $dirprefix = "."; our $listfile; our $dirtemp; our $useragent = "wgetas"; our $sleepnorm = 60; our $nburst = 5; our $outlistfile; our $keepgoing; our($versionopt, $helpopt); for (@ARGV) { /\A(.*)\z/s or die "internal error: untaint argv"; $_ = $1; } Getopt::Long::Configure qw"bundling gnu_compat prefix_pattern=(--|-)"; GetOptions( "i|input_file=s" => \$listfile, "P|directory-prefix=s" => \$dirprefix, "m|directory-temp=s" => \$dirtemp, "w|wait=f" => \$sleepnorm, "U|user-agent=s" => \$useragent, "b|burst=i" => \$nburst, "e|outlistfile=s" => \$outlistfile, "f|keep-going!" => \$keepgoing, "version" => \$versionopt, "help" => \$helpopt, ); if ($versionopt) { die "wgetas $VERSION\n"; } elsif ($helpopt) { my $helpstr = q(Usage: wgetas -i listfile Downloads a list of small files through HTTP. Each line of the listfile has an URL and a filename to save as. Downloads for existing files are skipped. For more documentation and options, try perldoc wgetas. ); $helpstr =~ s/\n\s*/\n/g; die $helpstr; } @ARGV and die "error: too many command-line arguments"; length($listfile) or die "error: missing argument -i listfile"; length($dirtemp) or $dirtemp = $dirprefix; our @spec; { open my $U, "<", $listfile; chomp(@spec = <$U>); } our %exclude; our $OUTLIST; if (length($outlistfile)) { if (!open $OUTLIST, "+<", $outlistfile) { if ($!{ENOENT}) { open $OUTLIST, "+>", $outlistfile or die "error: cannot create outlistfile ($outlistfile): $!"; } else { die "error: cannot open outlistfile ($outlistfile): $!"; } } flock $OUTLIST, Fcntl::LOCK_EX | Fcntl::LOCK_NB or die "error: cannot lock outlistfile ($outlistfile): $!"; while (defined(my $l = <$OUTLIST>)) { chomp($l); $exclude{$l} = 1; } eof($OUTLIST) or die "error reading outlistfile: $!"; } else { $keepgoing and die "error: -f option not allowed because -e option is unset"; } our %fname; our $sleepcycle = 0; our $sleepnormcredit = 0; our $sleepextracredit = 0; sub paysleep { my($burst) = @_; my $s = 0; if ( !$burst || $nburst <= $sleepcycle || $sleepnorm < $sleepextracredit ) { $s = $sleepnormcredit + $sleepextracredit; $sleepnormcredit = $sleepextracredit = 0; $sleepcycle = 0; } my $ct = AnyEvent->condvar; my $t = AnyEvent->timer(after => $s, cb => sub { $ct->send() }); $ct->recv; } PAGE: for my $speci (keys @spec) { my $spec = $spec[$speci]; my $specln = 1 + $speci; $spec =~ /\S/ or next; my($uri, $fnamej, $rest) = split " ", $spec; length($rest) and die "invalid spec (too many words, line $specln): $spec"; $uri =~ m"^(https?://[!#-;=?-Z\[\]_a-z~]*)$" or die "invalid uri (line $specln): $uri"; $uri = $1; if (!defined($fnamej)) { $uri =~ m"/([^/]+)/*$" or die "error: cannot find suffix or uri (line $specln): $uri"; $fnamej = $1; } $fnamej =~ /\A([\x01-\xff]*)\z/ or die "error: invalid filename (line $specln): $fnamej"; $fnamej = $1; my @fnamej_seg = URI::file->new($fnamej)->path_segments; length($fnamej_seg[0]) or die "error: filename is absolute (line $specln): $fnamej"; grep { ".." eq $_ } @fnamej_seg and die "error: filename contains up-dir (line $specln): $fnamej"; $fname{$fnamej}++ and warn "warning: filename appears twice (line $specln): $fnamej"; $exclude{$fnamej} and next; my $fname = catfile($dirprefix, $fnamej); -f $fname and next; my $timestart = montime(); print STDERR "GET $uri $fname : "; flush STDERR; my($hbody, $hhead, $status, $reason); my $cntconn = 0; { my $cond = AnyEvent->condvar; my $hagent = AnyEvent::HTTP::http_get( $uri, recurse => 0, timeout => 120, on_prepare => sub { $cntconn++ }, headers => {"user-agent" => $useragent}, sub { ($hbody, $hhead) = @_; $cond->send; }, ); $cond->recv; ($status, $reason) = @$hhead{(qw"Status Reason")}; } my $timeend = montime(); my $timedelta = $timeend - $timestart; my $bodylen = length($hbody); $sleepcycle++; $sleepnormcredit += abs($sleepnorm * (0.8 + rand(0.4))); $sleepextracredit += abs(1.2 * $timedelta); print STDERR sprintf "%.2fs %dc %db %s %s\n", $timeend-$timestart, $cntconn, $bodylen, $status, $reason; flush STDERR; if (200 == $status) { my($O, $oname) = tempfile("get-XXXXXXXX", SUFFIX => ".tmp", DIR => $dirtemp, UNLINK => 0) or die; print $O $hbody or die "error writing output file"; close $O or die "error closing output file"; rename $oname, $fname or die "error renaming output file: $!"; } elsif ($keepgoing && (404 == $status || 403 == $status)) { 1; } else { die "Error getting page: $status $reason"; } if ($OUTLIST) { say $OUTLIST $fnamej or die "error writing to outlistfile: $!"; flush $OUTLIST; } paysleep 1; } # PAGE if ($OUTLIST) { close $OUTLIST or die "error closing outlistfile: $!"; } paysleep 0; say STDERR "All done."; __END__