On my (Windows) PC I have had numerous download managers: the sort of program that lets you start and resume file downloads, and run parallel downloads from separate mirror servers. Then insanity struck, and it occurred to me that the same could be done in Perl.
All written under 'strict'.
Invocation is 'eftp.pl ftp://ftp.cpan.org/pub/CPAN/src/perl-5.8.0.tar.gz'.
A ^C in the middle should stop it and leave all the temp files in place, and restarting should cause a resume. (Hopefully; I haven't tested it _extensively_, but it seems to work.)
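A session might look something like this (the resume message is the one the script prints; hostnames and offsets will vary):

  $ eftp.pl ftp://ftp.cpan.org/pub/CPAN/src/perl-5.8.0.tar.gz
  ^C
  $ eftp.pl ftp://ftp.cpan.org/pub/CPAN/src/perl-5.8.0.tar.gz
  perl-5.8.0.tar.gz:0:ftp.cpan.org already exists, resuming download from ...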
It was also an excuse to try out Perl threading.
Future features include being able to specify alternate mirrors (rather than forcing reliance on the mirror search) and parallel downloads of multiple different files. And maybe even something neat like bandwidth control... (oh, and of course, much documentation :))
#!/usr/bin/perl
# $Id: eftp.pl,v 1.13 2002/09/16 15:36:45 erolison Exp erolison $
use strict;
use warnings;
require Net::FTP;
require IO::File;
require threads;
use integer;
#features: parallelise download, search mirrors, save state and resume.
#Be warned. This program deposits temp files in the current directory.
my $debug = 1;
#DEBUG is nice cos it tells you what's going on...
#some defs.
my $buffer = 4096; # how many bytes to roll back a failed download.
my $block_read = 1024; #how many bytes to read at a time. should be less than $buffer / 2
#to prevent resume corruption.
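#(As I read that safety margin, with the defaults above: a resume seeks back
# $buffer = 4096 bytes and re-fetches them, so one whole block plus one torn
# block of unflushed data, at most 2 * $block_read = 2048 bytes, still falls
# inside the rolled-back window.)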
my $min_segment_size = 102400; # 100k, minimum download per 'parallel slice'
#(so we don't download 1 byte from 1000 different servers ever).
my $num_threads = 10;
#how many mirrors to 'get' on the search, and thus how many downloads to run at once.
my $timeout = 60; #how long to wait for a connection.
my $anon_email_addr = 'test@localhost.com';
sub parse_url
{
my ( $hostname, $path, $filename, $user, $pass );
my $url = shift(@_);
if ( $url =~ s,ftp://,, )
{
if ( $debug ) { print "$url is an FTP url\n"; }
my ( $first, $second ) = split ("@", $url );
if ( $second )
{
#if second exists, it means there was an '@' in the url, so the first part is the logon
my @tmp = split(":", $first);
$user = $tmp[0];
if ( $tmp[1] ) { $pass = $tmp[1]; }
if ( $debug ) { print "User: $user, pass: $pass\n"; }
$first=$second;
}
my @tmp = split("/", $first);
$hostname = shift(@tmp);
$filename = pop(@tmp);
$path = join("/", @tmp );
if ( $debug ) { print "$hostname - $path - $filename\n"; }
}
return ( $hostname, $path, $filename, $user, $pass );
}
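#A quick illustration of what parse_url returns (hypothetical credentials,
#not part of the script's normal flow):
#  parse_url('ftp://user:secret@ftp.cpan.org/pub/CPAN/src/perl-5.8.0.tar.gz')
#gives
#  ('ftp.cpan.org', 'pub/CPAN/src', 'perl-5.8.0.tar.gz', 'user', 'secret')
#and with no user@host part, $user and $pass come back undef.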
sub single_download
{
my @args = @_;
my ( $host, $path, $filename, $user, $pass,
$start_offset, $segment, $expected_size ) = @args;
if ( $debug )
{
print "single_download got:\n" ;
print "host:$host\npath:$path\nfilename:$filename\n";
print "start_offset:$start_offset, segment:$segment, expected size
+: $expected_size\n";
}
$start_offset = 0 unless ( $start_offset );
#my $stash = "$host:$filename$start_offset";
my $stash = "$filename:$start_offset:$host";
#first we check if we can resume this download... it may be that there is a part file already.
my $ftp_session = Net::FTP -> new($host, Debug => $debug, Timeout => $timeout )
or return "FAILED:$stash";
if ( !$user && !$pass)
{
if ( $debug ) { print "Using anonymous login\n"; }
$user = "anonymous";
$pass = $anon_email_addr; #use the one address defined up top
}
$ftp_session -> login($user, $pass);
$ftp_session -> binary;
my $fhandle = new IO::File;
if ( -f "$stash" )
{
open ( $fhandle, "+<$stash" ) || die "$!";
#bit of an odd open, but this is because we
#need to seek within it.
my $seek_offset = ( -s $stash ) - $buffer;
$seek_offset = ($seek_offset < 0) ? 0 : $seek_offset;
$start_offset += $seek_offset;
print "$stash already exists, resuming download from $start_offset
+ (file offset $seek_offset)\n";
seek ($fhandle, $seek_offset, 0 ) or die "couldn't seek: $!";
# 0 to seek means absolute offset, 1 means current + offset, 2 means EOF + offset
}
else
{
open ( $fhandle, ">$stash" ) || die "$!";
}
#need to add some checking. It may be server path /var/tmp
# or it may be server path ~/var/tmp. Check either for the file.
$ftp_session -> cwd($path);
if ( $ftp_session -> size($filename) != $expected_size )
{
return "FAILED:$stash";
}
if ( !$segment )
{
if ( $debug ) { print "segment size set to EOF\n"; }
$segment = ( $ftp_session -> size($filename) );
$segment -= $start_offset;
}
if ( $debug ) { print "$segment bytes to fetch ( $stash )\n"; }
#poke the REST offset into the session internals so retr() resumes mid-file
if ( $start_offset ) { ${*$ftp_session}{'net_ftp_rest'} = $start_offset; }
my $data = $ftp_session -> retr($filename);
my $iobuf;
my $remaining = $segment;
while ( $data && ($remaining > 0) &&
( my $amount = $data -> read ( $iobuf,
($block_read < $segment) ? $block_read : $segment,
$timeout) ) )
{
print $fhandle $iobuf;
$remaining -= $amount;
#if ( $debug ) { print "$segment bytes left to read\n"; }
#even for debugging, this is too noisy.
}
if ( $data ) { $data -> abort(); }
close($fhandle);
$ftp_session -> quit;
if ( ( -s $stash ) >= $segment || $start_offset + ( -s $stash ) >= $expected_size )
{
print STDOUT "$stash finished.\n";
return $stash;
}
else
{
print STDOUT "$stash failed.\n";
unlink $stash;
return "FAILED:$stash";
}
}
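#For example (made-up values), a call like
#  single_download('ftp.cpan.org', 'pub/CPAN/src', 'perl-5.8.0.tar.gz',
#                  undef, undef, 102400, 102400, $total_size)
#fetches the second 100k slice into the part file
#'perl-5.8.0.tar.gz:102400:ftp.cpan.org' and returns that name on success,
#or "FAILED:..." on any error.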
sub byOffset
{
( split(":", $a ) )[1] <=> ( split (":", $b ) )[1];
}
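#e.g. (made-up part files) sort byOffset qw(f:204800:hostC f:0:hostA f:102400:hostB)
#yields f:0:hostA, f:102400:hostB, f:204800:hostC, i.e. numeric order on the offset field.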
sub merge_files
{
my @args = @_;
#args 0 should be a file name of the type $filename:$offset:$host
my @tmp = split(':', $args[0]);
pop(@tmp); # ditch the @host off the end.
pop(@tmp); # and the offset.
my $target = join(':', @tmp); #in the unlikely event that the filename contains a ':'
open ( DEST, ">$target" ) or die "$!";
#take a list of files to merge, sort them into order, and re-combine them.
foreach my $file ( sort byOffset ( @_ ) )
{
#my $fname = ( split('@', $file) )[0];
if ( $debug ) { print "opening $file for merge\n"; }
open ( SRC, "$file" ) or die "$!";
seek ( DEST, (split(":", $file))[1], 0);
print DEST <SRC>;
close SRC;
#unlink $file;
}
close DEST;
}
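#Continuing the made-up example above: merge_files on f:0:hostA,
#f:102400:hostB and f:204800:hostC seeks to offsets 0, 102400 and 204800
#in turn and stitches the slices back into the single file 'f'.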
sub get_mirrors
{
use LWP::Simple;
my $hit_pattern = 'click.alltheweb.com/go2/2/atw';
#not pretty, but they don't write nice HTML for me.
my ( $hostname, $path, $filename, $user, $pass, $filesize ) = @_;
my $srch_url = join("",
"http://www.alltheweb.com/search?advanced=1&cat=ftp&q=",
"$filename",
"&jsact=&cs=utf-8&ftype=6&ld=&lp=",
($filesize > 0 ) ? "&ls1=$filesize&ls2=$filesize&" : "&",
"hits=$num_threads",
"&matches=&hitsprmatch=" );
if ( $debug ) { print "Searching: $srch_url\n"; }
#WARNING: This code was removed (and subsequent bits hacked to fit) because it causes
#Bizarre SvTYPE [248] errors with threading.
#my $user_agent = LWP::UserAgent -> new;
#$user_agent -> agent ("eftp/0.3 ");
#my $request = HTTP::Request -> new ( GET => $srch_url );
#my $response = $user_agent -> request ( $request );
my $response = get($srch_url);
#with return @test here, it is not...
if ( $response ) {
my @links = split("\n", $response );
my %results;
foreach my $link ( @links )
{
next unless ( $link =~ m/$hit_pattern/ );
if ( $debug ) { print "LINK: $link"; };
$link =~ s/.*href=\"//gi;
$link =~ s/\">.*//gi;
my @tmp=split("/", $link);
while ( @tmp && !($tmp[0] eq "ftp" ) )
{
shift(@tmp);
}
$tmp[0]="ftp:/";
if ( $debug ) { print join("/",@tmp), "\n"; }
$results{join("/",@tmp)} = 1;
}
#return @test;
return keys ( %results );
}
else
{
print "Mirror search failed.\n";
return ( "" );
}
}
sub get_size
{
#log into a remote server, and do a 'size' command to give us some
#idea of paralellising.
my @args = @_;
my ( $host, $path, $filename, $user, $pass) = @args;
my $ftp_session = Net::FTP -> new ( $host, Debug => $debug, Timeout => $timeout) || return 0;
if ( !$user && !$pass)
{
if ( $debug ) { print "Using anonymous login\n"; }
$user = "anonymous";
$pass = $anon_email_addr;
}
$ftp_session -> login($user, $pass);
$ftp_session -> binary;
$ftp_session -> cwd($path);
my $size = $ftp_session -> size ( $filename );
$ftp_session -> quit;
if ( !$size ) { return 0 };
return $size;
}
my $URL = $ARGV[0];
if ( ! $URL ) { print "Usage: eftp.pl <url to fetch>\n"; exit 1;}
my @login_info = parse_url($URL);
my $filesize = get_size(@login_info);
if ( $debug ) { print "$URL: got size of $filesize\n";}
my @mirrors = get_mirrors(@login_info, $filesize);
#for testing
if ( $#mirrors == -1 )
{
@mirrors = ( "$URL" );
}
if ( $debug )
{
print "Mirror List:\n";
foreach my $mir (@mirrors)
{
print "$mir\n";
}
}
my $chunk_size = ( $filesize / ( 1 + $#mirrors ) > $min_segment_size )
  ? $filesize / ( 1 + $#mirrors ) + 10 : $min_segment_size ;
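#Worked example (made-up numbers): a 10_000_000 byte file over 10 mirrors
#gives 10_000_000 / 10 + 10 = 1_000_010 bytes per chunk; the +10 absorbs the
#truncation from 'use integer', so the chunks always reach end of file.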
my $start_point = 0;
my @thread_list;
while ( $start_point < $filesize )
{
foreach my $mirror ( @mirrors )
{
next unless ( $start_point < $filesize );
if ( $debug ) { print "starting connect $mirror $start_point $chun
+k_size\n"; }
if ( get_size(parse_url($mirror)) == $filesize )
#opens an ftp session and stats the file
#to make sure it exists.
{
if ( $debug ) { print "OK to download $mirror.\n" };
push( @thread_list,
threads -> new ( \&single_download, parse_url($mirror),
$start_point, $chunk_size, $filesize ) );
$start_point += $chunk_size;
}
else
{
print "Error retrieving from $start_point:$chunk_size $mirror. F
+ilesize doesn't match.\n";
}
} #foreach
if ( $start_point == 0 ) { die "No valid mirrors for file.\n"; }
}
my @merge_list;
if ( $debug ) { print "Waiting for thread merge...\n";}
foreach my $thread ( @thread_list )
{
my $file = $thread -> join;
if ( $file && ! ( $file =~ m/FAIL/ ) )
{
if ( $debug ) { print "Adding $file to merge list\n" };
push(@merge_list, $file);
}
else
{
print "$file downloading in non-threaded mode from primary mirror\
+n";
my ($fail, $fname, $startpoint, $mirror) = split(":", $file );
if ( $debug ) { print "$file = File: $fname, Offset: $startpoint,
+mirror: $mirror\n"; }
$file = single_download(parse_url($URL), $startpoint,
$chunk_size, $filesize);
if ( $debug ) { print "RET: $file\n"; }
if ( $file =~ m/FAIL/ )
{
print "WARNING: Significant error. retry of $file unsuccessful\n
+";
exit(1);
}
push(@merge_list, $file);
}
}
merge_files(@merge_list);
--
It's not pessimism if there is a worse option, it's not paranoia when they are and it's not cynicism when you're right.