#!/usr/bin/perl
# $Id: eftp.pl,v 1.13 2002/09/16 15:36:45 erolison Exp erolison $

use strict;
use warnings;
require Net::FTP;
require IO::File;
require threads;
use integer;

# Features: parallelise the download, search for mirrors, save state and resume.
# Be warned: this program deposits temp files in the current directory.

my $debug = 1;    # debug is nice because it tells you what's going on...

# Some defaults.
my $buffer = 4096;        # how many bytes to roll back a failed download
my $block_read = 1024;    # how many bytes to read at a time; should be less than
                          # $buffer / 2 to prevent resume corruption
my $min_segment_size = 102400;  # 100k, minimum download per 'parallel slice'
                                # (so we never download 1 byte from 1000 different servers)
my $num_threads = 10;     # how many mirrors to 'get' on the search, and thus
                          # how many downloads to run at once
my $timeout = 60;         # how long to wait for a connection
my $anon_email_addr = 'test@localhost.com';

sub parse_url {
    my ( $hostname, $path, $filename, $user, $pass );
    my $url = shift(@_);
    if ( $url =~ s,ftp://,, ) {
        if ( $debug ) { print "$url is an FTP url\n"; }
        my ( $first, $second ) = split( "@", $url );
        if ( $second ) {
            # If $second exists there was an '@' in the URL, so the first part is the login.
            my @tmp = split( ":", $first );
            $user = $tmp[0];
            if ( $tmp[1] ) { $pass = $tmp[1]; }
            if ( $debug ) { print "User: $user, pass: $pass\n"; }
            $first = $second;
        }
        my @tmp = split( "/", $first );
        $hostname = shift(@tmp);
        $filename = pop(@tmp);
        $path = join( "/", @tmp );
        if ( $debug ) { print "$hostname - $path - $filename\n"; }
    }
    return ( $hostname, $path, $filename, $user, $pass );
}

sub single_download {
    my @args = @_;
    my ( $host, $path, $filename, $user, $pass, $start_offset, $segment, $expected_size ) = @args;
    if ( $debug ) {
        print "single_download got:\n";
        print "host:$host\npath:$path\nfilename:$filename\n";
        print "start_offset:$start_offset, segment:$segment, expected size: $expected_size\n";
    }
    $start_offset = 0 unless ( $start_offset );

    # Per-segment temp file, named "$filename:$start_offset:$host".
    #my $stash = "$host:$filename$start_offset";
    my $stash = "$filename:$start_offset:$host";

    # First we check whether we can resume this download; a part file may already exist.
    my $ftp_session = Net::FTP -> new( $host, Debug => $debug, Timeout => $timeout )
        or return "FAILED:$stash";
    if ( !$user && !$pass ) {
        if ( $debug ) { print "Using anonymous login\n"; }
        $user = "anonymous";
        $pass = $anon_email_addr;
    }
    $ftp_session -> login( $user, $pass );
    $ftp_session -> binary;

    my $fhandle = IO::File -> new;
    if ( -f "$stash" ) {
        # A slightly odd open mode, but we need to be able to seek within the file.
        open( $fhandle, "+<$stash" ) || die "$!";
        my $seek_offset = ( -s $stash ) - $buffer;
        $seek_offset = ( $seek_offset < 0 ) ? 0 : $seek_offset;
        $start_offset += $seek_offset;
        print "$stash already exists, resuming download from $start_offset (file offset $seek_offset)\n";
        # Third argument to seek(): 0 means absolute, 1 means current + offset, 2 means EOF + offset.
        seek( $fhandle, $seek_offset, 0 ) or die "couldn't seek: $!";
    } else {
        open( $fhandle, ">$stash" ) || die "$!";
    }

    # TODO: add some checking here. The server path may be /var/tmp, or it may
    # be ~/var/tmp. Check either for the file (a sketch follows below).
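    # Hedged sketch for the TODO above (an assumption, not part of the original
    # logic; left commented out so behaviour is unchanged). If enabled it would
    # replace the plain cwd() call that follows: try the absolute path first,
    # then the home-relative variant, and give up only if neither can be entered.
    #
    #   unless ( $ftp_session -> cwd($path) or $ftp_session -> cwd("~/$path") ) {
    #       close($fhandle);
    #       return "FAILED:$stash";
    #   }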
    $ftp_session -> cwd($path);
    if ( $ftp_session -> size($filename) != $expected_size ) { return "FAILED:$stash"; }

    if ( !$segment ) {
        if ( $debug ) { print "segment size set to EOF\n"; }
        $segment = $ftp_session -> size($filename);
        $segment -= $start_offset;
    }
    if ( $debug ) { print "$segment bytes to fetch ( $stash )\n"; }

    if ( $start_offset ) {
        # Poke Net::FTP's internal restart marker so the RETR begins at our offset.
        ${*$ftp_session}{'net_ftp_rest'} = $start_offset;
    }
    my $data = $ftp_session -> retr($filename);

    my $iobuf;
    my $remaining = $segment;
    while ( $data
        && ( $remaining > 0 )
        && ( my $amount = $data -> read( $iobuf, ( $block_read < $remaining ) ? $block_read : $remaining, $timeout ) ) )
    {
        print $fhandle $iobuf;
        $remaining -= $amount;
        #if ( $debug ) { print "$remaining bytes left to read\n"; }  # even for debugging, this is too noisy
    }
    # We only wanted $segment bytes, so abort the rest of the transfer.
    if ( $data ) { $data -> abort(); }
    close($fhandle);
    $ftp_session -> quit;

    if ( ( -s $stash ) >= $segment || $start_offset + ( -s $stash ) >= $expected_size ) {
        print STDOUT "$stash finished.\n";
        return $stash;
    } else {
        print STDOUT "$stash failed.\n";
        unlink $stash;
        return "FAILED:$stash";
    }
}

sub byOffset {
    # Segment files are named "$filename:$offset:$host"; sort them by offset.
    ( split( ":", $a ) )[1] <=> ( split( ":", $b ) )[1];
}

sub merge_files {
    my @args = @_;
    # $args[0] should be a file name of the form $filename:$offset:$host.
    my @tmp = split( ':', $args[0] );
    pop(@tmp);    # ditch the host off the end
    pop(@tmp);    # and the offset
    my $target = join( ':', @tmp );    # in the unlikely event that the filename contains a ':'
    open( DEST, ">$target" ) or die "$!";

    # Take the list of files to merge, sort them into order, and recombine them.
    foreach my $file ( sort byOffset ( @_ ) ) {
        if ( $debug ) { print "opening $file for merge\n"; }
        open( SRC, "$file" ) or die "$!";
        seek( DEST, ( split( ":", $file ) )[1], 0 );
        # Copy this segment's bytes into place in fixed-size chunks.
        my $chunk;
        while ( read( SRC, $chunk, $block_read ) ) { print DEST $chunk; }
        close SRC;
        #unlink $file;
    }
    close DEST;
}

sub get_mirrors {
    use LWP::Simple;

    # Not pretty, but they don't write nice HTML for me.
    my $hit_pattern = 'click.alltheweb.com/go2/2/atw';
    my ( $hostname, $path, $filename, $user, $pass, $filesize ) = @_;
    my $srch_url = join( "",
        "http://www.alltheweb.com/search?advanced=1&cat=ftp&q=", "$filename",
        "&jsact=&cs=utf-8&ftype=6&ld=&lp=",
        ( $filesize > 0 ) ? "&ls1=$filesize&ls2=$filesize&" : "&",
        "hits=$num_threads", "&matches=&hitsprmatch=" );
    if ( $debug ) { print "Searching: $srch_url\n"; }

    # WARNING: the LWP::UserAgent code below was removed (and subsequent bits
    # hacked to fit) because it causes bizarre SvTYPE [248] errors with threading.
    #my $user_agent = LWP::UserAgent -> new;
    #$user_agent -> agent ("eftp/0.3 ");
    #my $request = HTTP::Request -> new ( GET => $srch_url );
    #my $response = $user_agent -> request ( $request );
    my $response = get($srch_url);

    if ( $response ) {
        my @links = split( "\n", $response );
        my %results;
        foreach my $link ( @links ) {
            next unless ( $link =~ m/$hit_pattern/ );
            if ( $debug ) { print "LINK: $link"; }
            # Strip the surrounding anchor tag, leaving just the redirector URL.
            $link =~ s/.*href=\"//gi;
            $link =~ s/\">.*//gi;
            # Discard the redirector prefix up to the embedded "ftp" element,
            # then rebuild it as an ftp:// URL.
            my @tmp = split( "/", $link );
            while ( @tmp && !( $tmp[0] eq "ftp" ) ) { shift(@tmp); }
            $tmp[0] = "ftp:/";
            if ( $debug ) { print join( "/", @tmp ), "\n"; }
            $results{ join( "/", @tmp ) } = 1;
        }
        return keys( %results );
    } else {
        print "Mirror search failed.\n";
        # Return an empty list so the caller falls back to the original URL.
        return ();
    }
}

sub get_size {
    # Log into a remote server and run a SIZE command, to give us some idea
    # for parallelising.
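    # Returns the remote file's size in bytes, or 0 if the connection, login,
    # or SIZE command fails; callers treat 0 as "this mirror is unusable".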
    my @args = @_;
    my ( $host, $path, $filename, $user, $pass ) = @args;
    my $ftp_session = Net::FTP -> new( $host, Debug => $debug, Timeout => $timeout ) || return 0;
    if ( !$user && !$pass ) {
        if ( $debug ) { print "Using anonymous login\n"; }
        $user = "anonymous";
        $pass = $anon_email_addr;
    }
    $ftp_session -> login( $user, $pass );
    $ftp_session -> binary;
    $ftp_session -> cwd($path);
    my $size = $ftp_session -> size( $filename );
    $ftp_session -> quit;
    if ( !$size ) { return 0; }
    return $size;
}

# Main program.
my $URL = $ARGV[0];
if ( !$URL ) { print "Usage: eftp.pl <ftp-url>\n"; exit 1; }

my @login_info = parse_url($URL);
my $filesize = get_size(@login_info);
if ( $debug ) { print "$URL: got size of $filesize\n"; }

my @mirrors = get_mirrors( @login_info, $filesize );
# For testing: fall back to the original URL if the mirror search found nothing.
if ( $#mirrors == -1 ) { @mirrors = ( "$URL" ); }
if ( $debug ) {
    print "Mirror List:\n";
    foreach my $mir (@mirrors) { print "$mir\n"; }
}

# One chunk per mirror (plus a little slack), but never below the minimum segment size.
my $chunk_size =
    ( $filesize / ( 1 + $#mirrors ) > $min_segment_size )
    ? $filesize / ( 1 + $#mirrors ) + 10
    : $min_segment_size;

my $start_point = 0;
my @thread_list;
while ( $start_point < $filesize ) {
    foreach my $mirror ( @mirrors ) {
        next unless ( $start_point < $filesize );
        if ( $debug ) { print "starting connect $mirror $start_point $chunk_size\n"; }

        # get_size() opens an FTP session and stats the file to make sure it exists.
        if ( get_size( parse_url($mirror) ) == $filesize ) {
            if ( $debug ) { print "OK to download $mirror.\n"; }
            push( @thread_list,
                threads -> new( \&single_download, parse_url($mirror), $start_point, $chunk_size, $filesize ) );
            $start_point += $chunk_size;
        } else {
            print "Error retrieving from $start_point:$chunk_size $mirror. Filesize doesn't match.\n";
        }
    }    # foreach
    if ( $start_point == 0 ) { die "No valid mirrors for file.\n"; }
}

my @merge_list;
if ( $debug ) { print "Waiting for thread merge...\n"; }
foreach my $thread ( @thread_list ) {
    my $file = $thread -> join;
    if ( $file && !( $file =~ m/FAIL/ ) ) {
        if ( $debug ) { print "Adding $file to merge list\n"; }
        push( @merge_list, $file );
    } else {
        # The threaded download failed: retry that segment in non-threaded
        # mode from the primary mirror.
        print "$file downloading in non-threaded mode from primary mirror\n";
        my ( $fail, $fname, $startpoint, $mirror ) = split( ":", $file );
        if ( $debug ) { print "$file = File: $fname, Offset: $startpoint, mirror: $mirror\n"; }
        $file = single_download( parse_url($URL), $startpoint, $chunk_size, $filesize );
        if ( $debug ) { print "RET: $file\n"; }
        if ( $file =~ m/FAIL/ ) {
            print "WARNING: Significant error. Retry of $file unsuccessful.\n";
            exit(1);
        }
        push( @merge_list, $file );
    }
}

merge_files(@merge_list);
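# Illustrative invocation (the URL below is purely an example, not from the
# original source):
#
#   ./eftp.pl ftp://ftp.example.org/pub/iso/somefile.iso
#
# The script parses the URL, looks the file up via alltheweb's FTP search,
# downloads one segment per working mirror in parallel, writes each segment to
# "<filename>:<offset>:<host>" in the current directory, and finally merges
# the segments back into <filename>.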