#!/usr/bin/perl
# $Id: eftp.pl,v 1.13 2002/09/16 15:36:45 erolison Exp erolison $

use strict;
use warnings;
require Net::FTP;
require IO::File;
require threads;
use integer;

# Features: parallelise the download, search for mirrors, save state and resume.
# Be warned: this program deposits temp files in the current directory.

my $debug = 1;    # debug is nice because it tells you what's going on...

# Some defaults.
my $buffer = 4096;        # how many bytes to roll back a failed download
my $block_read = 1024;    # how many bytes to read at a time; should be less than
                          # $buffer / 2 to prevent resume corruption
my $min_segment_size = 102400;  # 100k, minimum download per 'parallel slice'
                                # (so we never download 1 byte from 1000 different servers)
my $num_threads = 10;     # how many mirrors to 'get' on the search, and thus
                          # how many downloads to run at once
my $timeout = 60;         # how long to wait for a connection
my $anon_email_addr = 'test@localhost.com';

sub parse_url {
    my ( $hostname, $path, $filename, $user, $pass );
    my $url = shift(@_);
    if ( $url =~ s,ftp://,, ) {
        if ( $debug ) { print "$url is an FTP url\n"; }
        my ( $first, $second ) = split( "@", $url );
        if ( $second ) {
            # If $second exists there was an '@' in the URL, so the first part is the login.
            my @tmp = split( ":", $first );
            $user = $tmp[0];
            if ( $tmp[1] ) { $pass = $tmp[1]; }
            if ( $debug ) { print "User: $user, pass: $pass\n"; }
            $first = $second;
        }
        my @tmp = split( "/", $first );
        $hostname = shift(@tmp);
        $filename = pop(@tmp);
        $path = join( "/", @tmp );
        if ( $debug ) { print "$hostname - $path - $filename\n"; }
    }
    return ( $hostname, $path, $filename, $user, $pass );
}

sub single_download {
    my @args = @_;
    my ( $host, $path, $filename, $user, $pass, $start_offset, $segment, $expected_size ) = @args;
    if ( $debug ) {
        print "single_download got:\n";
        print "host:$host\npath:$path\nfilename:$filename\n";
        print "start_offset:$start_offset, segment:$segment, expected size: $expected_size\n";
    }
    $start_offset = 0 unless ( $start_offset );

    # Per-segment temp file, named "$filename:$start_offset:$host".
    #my $stash = "$host:$filename$start_offset";
    my $stash = "$filename:$start_offset:$host";

    # First we check whether we can resume this download; a part file may already exist.
    my $ftp_session = Net::FTP -> new( $host, Debug => $debug, Timeout => $timeout )
        or return "FAILED:$stash";
    if ( !$user && !$pass ) {
        if ( $debug ) { print "Using anonymous login\n"; }
        $user = "anonymous";
        $pass = $anon_email_addr;
    }
    $ftp_session -> login( $user, $pass );
    $ftp_session -> binary;

    my $fhandle = IO::File -> new;
    if ( -f "$stash" ) {
        # A slightly odd open mode, but we need to be able to seek within the file.
        open( $fhandle, "+<$stash" ) || die "$!";
        my $seek_offset = ( -s $stash ) - $buffer;
        $seek_offset = ( $seek_offset < 0 ) ? 0 : $seek_offset;
        $start_offset += $seek_offset;
        print "$stash already exists, resuming download from $start_offset (file offset $seek_offset)\n";
        # Third argument to seek(): 0 means absolute, 1 means current + offset, 2 means EOF + offset.
        seek( $fhandle, $seek_offset, 0 ) or die "couldn't seek: $!";
    } else {
        open( $fhandle, ">$stash" ) || die "$!";
    }

    # TODO: add some checking here. The server path may be /var/tmp, or it may
    # be ~/var/tmp. Check either for the file (a sketch follows below).
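    # Hedged sketch for the TODO above (an assumption, not part of the original
    # logic; left commented out so behaviour is unchanged). If enabled it would
    # replace the plain cwd() call that follows: try the absolute path first,
    # then the home-relative variant, and give up only if neither can be entered.
    #
    #   unless ( $ftp_session -> cwd($path) or $ftp_session -> cwd("~/$path") ) {
    #       close($fhandle);
    #       return "FAILED:$stash";
    #   }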
    $ftp_session -> cwd($path);
    if ( $ftp_session -> size($filename) != $expected_size ) { return "FAILED:$stash"; }

    if ( !$segment ) {
        if ( $debug ) { print "segment size set to EOF\n"; }
        $segment = $ftp_session -> size($filename);
        $segment -= $start_offset;
    }
    if ( $debug ) { print "$segment bytes to fetch ( $stash )\n"; }

    if ( $start_offset ) {
        # Poke Net::FTP's internal restart marker so the RETR begins at our offset.
        ${*$ftp_session}{'net_ftp_rest'} = $start_offset;
    }
    my $data = $ftp_session -> retr($filename);

    my $iobuf;
    my $remaining = $segment;
    while ( $data
        && ( $remaining > 0 )
        && ( my $amount = $data -> read( $iobuf, ( $block_read < $remaining ) ? $block_read : $remaining, $timeout ) ) )
    {
        print $fhandle $iobuf;
        $remaining -= $amount;
        #if ( $debug ) { print "$remaining bytes left to read\n"; }  # even for debugging, this is too noisy
    }
    # We only wanted $segment bytes, so abort the rest of the transfer.
    if ( $data ) { $data -> abort(); }
    close($fhandle);
    $ftp_session -> quit;

    if ( ( -s $stash ) >= $segment || $start_offset + ( -s $stash ) >= $expected_size ) {
        print STDOUT "$stash finished.\n";
        return $stash;
    } else {
        print STDOUT "$stash failed.\n";
        unlink $stash;
        return "FAILED:$stash";
    }
}

sub byOffset {
    # Segment files are named "$filename:$offset:$host"; sort them by offset.
    ( split( ":", $a ) )[1] <=> ( split( ":", $b ) )[1];
}

sub merge_files {
    my @args = @_;
    # $args[0] should be a file name of the form $filename:$offset:$host.
    my @tmp = split( ':', $args[0] );
    pop(@tmp);    # ditch the host off the end
    pop(@tmp);    # and the offset
    my $target = join( ':', @tmp );    # in the unlikely event that the filename contains a ':'
    open( DEST, ">$target" ) or die "$!";

    # Take the list of files to merge, sort them into order, and recombine them.
    foreach my $file ( sort byOffset ( @_ ) ) {
        if ( $debug ) { print "opening $file for merge\n"; }
        open( SRC, "$file" ) or die "$!";
        seek( DEST, ( split( ":", $file ) )[1], 0 );
        # Copy this segment's bytes into place in fixed-size chunks.
        my $chunk;
        while ( read( SRC, $chunk, $block_read ) ) { print DEST $chunk; }
        close SRC;
        #unlink $file;
    }
    close DEST;
}

sub get_mirrors {
    use LWP::Simple;

    # Not pretty, but they don't write nice HTML for me.
    my $hit_pattern = 'click.alltheweb.com/go2/2/atw';
    my ( $hostname, $path, $filename, $user, $pass, $filesize ) = @_;
    my $srch_url = join( "",
        "http://www.alltheweb.com/search?advanced=1&cat=ftp&q=", "$filename",
        "&jsact=&cs=utf-8&ftype=6&ld=&lp=",
        ( $filesize > 0 ) ? "&ls1=$filesize&ls2=$filesize&" : "&",
        "hits=$num_threads", "&matches=&hitsprmatch=" );
    if ( $debug ) { print "Searching: $srch_url\n"; }

    # WARNING: the LWP::UserAgent code below was removed (and subsequent bits
    # hacked to fit) because it causes bizarre SvTYPE [248] errors with threading.
    #my $user_agent = LWP::UserAgent -> new;
    #$user_agent -> agent ("eftp/0.3 ");
    #my $request = HTTP::Request -> new ( GET => $srch_url );
    #my $response = $user_agent -> request ( $request );
    my $response = get($srch_url);

    if ( $response ) {
        my @links = split( "\n", $response );
        my %results;
        foreach my $link ( @links ) {
            next unless ( $link =~ m/$hit_pattern/ );
            if ( $debug ) { print "LINK: $link"; }
            # Strip the surrounding anchor tag, leaving just the redirector URL.
            $link =~ s/.*href=\"//gi;
            $link =~ s/\">.*//gi;
            # Discard the redirector prefix up to the embedded "ftp" element,
            # then rebuild it as an ftp:// URL.
            my @tmp = split( "/", $link );
            while ( @tmp && !( $tmp[0] eq "ftp" ) ) { shift(@tmp); }
            $tmp[0] = "ftp:/";
            if ( $debug ) { print join( "/", @tmp ), "\n"; }
            $results{ join( "/", @tmp ) } = 1;
        }
        return keys( %results );
    } else {
        print "Mirror search failed.\n";
        # Return an empty list so the caller falls back to the original URL.
        return ();
    }
}

sub get_size {
    # Log into a remote server and run a SIZE command, to give us some idea
    # for parallelising.
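    # Returns the remote file's size in bytes, or 0 if the connection, login,
    # or SIZE command fails; callers treat 0 as "this mirror is unusable".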
    my @args = @_;
    my ( $host, $path, $filename, $user, $pass ) = @args;
    my $ftp_session = Net::FTP -> new( $host, Debug => $debug, Timeout => $timeout ) || return 0;
    if ( !$user && !$pass ) {
        if ( $debug ) { print "Using anonymous login\n"; }
        $user = "anonymous";
        $pass = $anon_email_addr;
    }
    $ftp_session -> login( $user, $pass );
    $ftp_session -> binary;
    $ftp_session -> cwd($path);
    my $size = $ftp_session -> size( $filename );
    $ftp_session -> quit;
    if ( !$size ) { return 0; }
    return $size;
}

# Main program.
my $URL = $ARGV[0];
if ( !$URL ) { print "Usage: eftp.pl <ftp-url>\n"; exit 1; }

my @login_info = parse_url($URL);
my $filesize = get_size(@login_info);
if ( $debug ) { print "$URL: got size of $filesize\n"; }

my @mirrors = get_mirrors( @login_info, $filesize );
# For testing: fall back to the original URL if the mirror search found nothing.
if ( $#mirrors == -1 ) { @mirrors = ( "$URL" ); }
if ( $debug ) {
    print "Mirror List:\n";
    foreach my $mir (@mirrors) { print "$mir\n"; }
}

# One chunk per mirror (plus a little slack), but never below the minimum segment size.
my $chunk_size =
    ( $filesize / ( 1 + $#mirrors ) > $min_segment_size )
    ? $filesize / ( 1 + $#mirrors ) + 10
    : $min_segment_size;

my $start_point = 0;
my @thread_list;
while ( $start_point < $filesize ) {
    foreach my $mirror ( @mirrors ) {
        next unless ( $start_point < $filesize );
        if ( $debug ) { print "starting connect $mirror $start_point $chunk_size\n"; }

        # get_size() opens an FTP session and stats the file to make sure it exists.
        if ( get_size( parse_url($mirror) ) == $filesize ) {
            if ( $debug ) { print "OK to download $mirror.\n"; }
            push( @thread_list,
                threads -> new( \&single_download, parse_url($mirror), $start_point, $chunk_size, $filesize ) );
            $start_point += $chunk_size;
        } else {
            print "Error retrieving from $start_point:$chunk_size $mirror. Filesize doesn't match.\n";
        }
    }    # foreach
    if ( $start_point == 0 ) { die "No valid mirrors for file.\n"; }
}

my @merge_list;
if ( $debug ) { print "Waiting for thread merge...\n"; }
foreach my $thread ( @thread_list ) {
    my $file = $thread -> join;
    if ( $file && !( $file =~ m/FAIL/ ) ) {
        if ( $debug ) { print "Adding $file to merge list\n"; }
        push( @merge_list, $file );
    } else {
        # The threaded download failed: retry that segment in non-threaded
        # mode from the primary mirror.
        print "$file downloading in non-threaded mode from primary mirror\n";
        my ( $fail, $fname, $startpoint, $mirror ) = split( ":", $file );
        if ( $debug ) { print "$file = File: $fname, Offset: $startpoint, mirror: $mirror\n"; }
        $file = single_download( parse_url($URL), $startpoint, $chunk_size, $filesize );
        if ( $debug ) { print "RET: $file\n"; }
        if ( $file =~ m/FAIL/ ) {
            print "WARNING: Significant error. Retry of $file unsuccessful.\n";
            exit(1);
        }
        push( @merge_list, $file );
    }
}

merge_files(@merge_list);
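# Illustrative invocation (the URL below is purely an example, not from the
# original source):
#
#   ./eftp.pl ftp://ftp.example.org/pub/iso/somefile.iso
#
# The script parses the URL, looks the file up via alltheweb's FTP search,
# downloads one segment per working mirror in parallel, writes each segment to
# "<filename>:<offset>:<host>" in the current directory, and finally merges
# the segments back into <filename>.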