#!/usr/bin/perl
#
# TwoKeySort.pl
#
# Sort a large record-oriented file on two embedded keys without holding the
# records in memory.  Pass 1 (loadKeysAndOffsets) scans the file once and
# records each matching record's byte offset and length under a sortable
# composite key.  Pass 2 (sortFile) walks the keys in sorted order, seeks back
# into the input file, and copies each record to the output file.

use strict;
use warnings;

my $NEWLINE_SIZE = length "\n";  # The size of the newline "character" in this OS
my $OS_ADJUST    = 1;            # A way to somewhat generically do OS-specific offset
                                 # computation (e.g. the CR of a CRLF pair on Windows).
                                 # NOTE(review): NEWLINE_SIZE + OS_ADJUST assumes a
                                 # 2-byte line ending -- verify on your OS (see WARNING).
my $KEY_OFFSET   = 'O';          # Optimized key name for offset value
my $KEY_LENGTH   = 'L';          # Optimized key name for length value
my $SEEK_SET     = 0;            # In case you don't want to export the constant for seek()

# composite key => { O => [byte offsets], L => [record lengths] }
my %SeekInfo = ();

{
    loadKeysAndOffsets();
    sortFile();
}
exit;

# Pass 1: record the byte offset and length of every record that matches the
# expected "keyN keyM ..." layout.  The offset is advanced for EVERY line read
# (matching or not), so a stray non-matching line no longer skews the offsets
# of everything after it.
sub loadKeysAndOffsets {
    my $inputOffset = 0;

    # NOTE(review): the original open/while was garbled in transit; it is
    # reconstructed to read the same file that sortFile() re-opens.
    open my $input_fh, '<', 'test1.dat'
        or die "Cannot open test1.dat for reading: $!";

    while ( my $inputBuffer = <$input_fh> ) {
        chomp $inputBuffer;
        my $inputLength = length $inputBuffer;

        # Only capture records which match the structure
        if ( $inputBuffer =~ /^\s*key(\d+)\s+key(\d+)\s+/ ) {

            # Capture the keys
            my $primaryKey   = $1;
            my $secondaryKey = $2;

            # Optimize the keys: a fixed-width composite so a plain string
            # sort orders key2 before key10, key43, etc. (keys up to 99).
            my $optimizedKey = sprintf "%02d%02d", $primaryKey, $secondaryKey;

            # Store the offset in a HoA (covers possibility of duplicate key pairs)
            push @{ $SeekInfo{$optimizedKey}{$KEY_OFFSET} }, $inputOffset;
            push @{ $SeekInfo{$optimizedKey}{$KEY_LENGTH} }, $inputLength;
        }

        # Adjust the offset for the read just committed.
        #####################################################################
        ### WARNING ###
        # Test on a small file to ensure you are getting the right results
        # on your OS -- the per-record adjustment below encodes the size of
        # the line ending that chomp/text-mode translation removed.
        #####################################################################
        $inputOffset += $inputLength;
        $inputOffset += $NEWLINE_SIZE;
        $inputOffset += $OS_ADJUST;
    }

    close $input_fh;
}

# Pass 2: seek-and-copy the records out in composite-key order.
# In the old days this would also be known as shakeTheHardDrive().
sub sortFile {
    open my $input_fh, '<', 'test1.dat'
        or die "Cannot open test1.dat for reading: $!";
    binmode $input_fh;    # raw byte offsets -- must match the pass-1 arithmetic

    open my $output_fh, '>', 'test1.dat-output.dat'
        or die "Cannot open test1.dat-output.dat for writing: $!";

    foreach my $keyValue ( sort keys %SeekInfo ) {

        # Work on a copy so the stored lengths survive a possible re-run.
        my @workingLength = @{ $SeekInfo{$keyValue}{$KEY_LENGTH} };

        foreach my $inputOffset ( @{ $SeekInfo{$keyValue}{$KEY_OFFSET} } ) {
            my $workingLength = shift @workingLength;

            seek $input_fh, $inputOffset, $SEEK_SET
                or die "Cannot seek to offset $inputOffset: $!";

            my $inputBuffer = '';
            my $inputCount  = read $input_fh, $inputBuffer, $workingLength;
            defined $inputCount
                or die "Read of $workingLength bytes at $inputOffset failed: $!";

            print {$output_fh} "$inputBuffer\n";
        }
    }

    # Buffered write errors surface at close -- check it on the write handle.
    close $output_fh or die "Cannot close output file: $!";
    close $input_fh;
}

__END__
5M lines 20 GB file = 20 * 1024 * 1024 * 1024 = 21,474,836,480

To store offsets into this file you will need more than 2GB: Cannot use 4-byte integer
So we need to go with 8-byte integers (hope your Perl supports this)

5M lines x 8 bytes per offset slot = 40,000,000 which is ~40MB RAM so far.
Double it to store record size as well; up to ~80MB RAM
-- Here we could probably use 4-byte integers
-- But can you do both in the same script?
Triple it to store optimized hash keys puts us up to 120MB RAM.
Perl extras will likely be minimal.
So, not quite fitting in 100MB but really close.
-- Can squeeze it in if 32-bit and 64-bit integers can cohabitate (??)

####
C:\Steve\Dev\PerlMonks\P-2013-09-16@0043-TwoKeySort>perl TwoKeySort.pl

C:\Steve\Dev\PerlMonks\P-2013-09-16@0043-TwoKeySort>type test1.dat
key1 key2 ndnjfgdsjfjjkjjfjf...
key1 key2 kdfkjdfgdfugbjndkfgkjgndkjfjkd
key43 key21 sdkjfhdghdbgbd
key1 key3 jujdejnsduhffnjj
key2 key2 jhzezhdjjf...

C:\Steve\Dev\PerlMonks\P-2013-09-16@0043-TwoKeySort>type test1.dat-output.dat
key1 key2 ndnjfgdsjfjjkjjfjf...
key1 key2 kdfkjdfgdfugbjndkfgkjgndkjfjkd
key1 key3 jujdejnsduhffnjj
key2 key2 jhzezhdjjf...
key43 key21 sdkjfhdghdbgbd

C:\Steve\Dev\PerlMonks\P-2013-09-16@0043-TwoKeySort>