# TwoKeySortDisk1.pl (extracts keys/offset/length and writes to file)
#   Starting RAM: 2.74GB
#   Start Time: 05:03:13
#   Peak RAM: 2.78GB
#   Ending Time: 05:03:32
#   Run Time: 19 sec
#   Peak RAM Usage: 0.04GB * 1024 = 40MB
#
# Sort (probably Cygwin, not Windows native)
#   Starting RAM: 2.75GB
#   Start Time: 05:04:39
#   Peak RAM: 3.02GB
#   Ending Time: 05:04:49
#   Run Time: 10 sec
#   Peak RAM Usage: 0.27GB * 1024 = 276MB
#
# TwoKeySortDisk2.pl (reads sorted keys, reads original file in random mode,
# writes output text file)
#   Starting RAM: 2.73GB
#   Start Time: 05:07:56
#   Peak RAM: 2.82GB
#   Ending Time: 05:08:45
#   Run Time: 49 sec
#   Peak RAM Usage: 0.09GB * 1024 = 92MB
####
#!/usr/bin/perl -w
use strict;
use warnings;

# TwoKeySortDisk1.pl
#
# Pass 1 of a disk-based two-key sort.  Scans the input file once and, for
# every record of the form "key<N> key<M> ...", writes one line
#
#     <sort key>|<byte offset of record>|<byte length of record>
#
# to the presort file.  Only these small index lines need to be sorted;
# the records themselves stay where they are in the input file.

my $Inpfnm = 'test2.dat';                   # Original, unsorted records
my $Wrkfnm = $Inpfnm . '-presort.dat';      # Index lines written by this script
my $Srtfnm = $Inpfnm . '-sorted.dat';       # Index lines after the sort step (not produced here)
my $Outfnm = $Inpfnm . '-output.dat';       # Final sorted records (not produced here)

{
    convertKeysAndOffsets();
}
exit;

# convertKeysAndOffsets()
#
# Reads $Inpfnm line by line and writes one "key|offset|length" index line
# to $Wrkfnm for each line that matches the record structure.
#
# Takes no arguments and returns nothing.  Dies if either file cannot be
# opened, if a write fails, or if the presort file cannot be closed.
sub convertKeysAndOffsets {
    open my $inputFile, '<', $Inpfnm
        or die "Cannot open '$Inpfnm' for reading: $!";

    # Read raw bytes so that length() of a line is exactly the number of
    # bytes it occupies on disk, whatever the line-ending convention is.
    binmode $inputFile;

    open my $presortFile, '>', $Wrkfnm
        or die "Cannot open '$Wrkfnm' for writing: $!";

    my $inputOffset = 0;    # Byte offset of the line currently being examined

    while (my $inputBuffer = <$inputFile>) {
        # Remember the on-disk size (terminator included) before trimming.
        my $rawLength = length $inputBuffer;

        # Drop the line terminator, LF or CRLF; it is not part of the record.
        $inputBuffer =~ s/\r?\n\z//;

        # Only capture records which match the structure
        if ($inputBuffer =~ /^\s*key(\d+)\s+key(\d+)\s+/) {
            my $primaryKey   = $1;
            my $secondaryKey = $2;
            my $inputLength  = length $inputBuffer;

            # Zero-pad both keys so the index lines can be ordered by a
            # plain text sort.
            # NOTE(review): two digits per key assumes keys stay below 100
            # -- confirm against the real data.
            my $optimizedKey = sprintf "%02d%02d", $primaryKey, $secondaryKey;

            print {$presortFile} "$optimizedKey|$inputOffset|$inputLength\n"
                or die "Cannot write to '$Wrkfnm': $!";
        }

        # Advance past this line whether or not it was a record; otherwise
        # every offset after a non-matching line would be wrong.
        $inputOffset += $rawLength;
    }

    close $presortFile or die "Cannot close '$Wrkfnm': $!";
    close $inputFile;

    return;
}

__END__
####
#!/usr/bin/perl -w
use strict;
use warnings;

# TwoKeySortDisk2.pl
#
# Final pass of a disk-based two-key sort.  Reads the sorted
# "key|offset|length" index lines, fetches each record from the original
# input file by seeking to its offset, and writes the records to the output
# file in index order.  Afterwards the intermediate index files are removed.

my $SEEK_SET = 0;                           # In case you don't want to export the constant for seek()

my $Inpfnm = 'test2.dat';                   # Original, unsorted records
my $Wrkfnm = $Inpfnm . '-presort.dat';      # Unsorted index lines (removed by cleanUp)
my $Srtfnm = $Inpfnm . '-sorted.dat';       # Sorted index lines read by this script
my $Outfnm = $Inpfnm . '-output.dat';       # Sorted records written by this script

{
    sortFile();
    cleanUp();
}
exit;

# sortFile()
#
# For every index line in $Srtfnm, seeks to the given byte offset in
# $Inpfnm, reads the given number of bytes and writes them, followed by a
# newline, to $Outfnm.
#
# Takes no arguments and returns nothing.  Dies if a file cannot be opened,
# if an index line is malformed, or if a seek, read or write fails, so that
# cleanUp() is never reached with a partial output file.
sub sortFile {
    # In the old days this would also be known as shakeTheHardDrive()
    open my $inputFile, '<', $Inpfnm
        or die "Cannot open '$Inpfnm' for reading: $!";
    binmode $inputFile;    # Offsets and lengths in the index are in bytes

    open my $outputFile, '>', $Outfnm
        or die "Cannot open '$Outfnm' for writing: $!";
    open my $sortedFile, '<', $Srtfnm
        or die "Cannot open '$Srtfnm' for reading: $!";

    while (my $sortedKeyBuffer = <$sortedFile>) {
        # Capture the line number now: seek() on another handle below
        # changes which handle $. refers to.
        my $lineNumber = $.;

        $sortedKeyBuffer =~ s/\r?\n\z//;
        next if $sortedKeyBuffer eq '';

        # The leading sort key has done its job; only offset/length matter.
        my (undef, $inputOffset, $workingLength) = split /\|/, $sortedKeyBuffer;
        die "Malformed index line $lineNumber in '$Srtfnm': '$sortedKeyBuffer'"
            unless defined $workingLength
                && $inputOffset   =~ /\A\d+\z/
                && $workingLength =~ /\A\d+\z/;

        seek $inputFile, $inputOffset, $SEEK_SET
            or die "Cannot seek to offset $inputOffset in '$Inpfnm': $!";

        my $inputBuffer = '';
        my $inputCount  = read $inputFile, $inputBuffer, $workingLength;
        die "Cannot read from '$Inpfnm': $!"
            unless defined $inputCount;
        die "Short read at offset $inputOffset in '$Inpfnm': "
            . "wanted $workingLength bytes, got $inputCount"
            unless $inputCount == $workingLength;

        print {$outputFile} "$inputBuffer\n"
            or die "Cannot write to '$Outfnm': $!";
    }

    close $sortedFile;
    close $outputFile or die "Cannot close '$Outfnm': $!";
    close $inputFile;

    return;
}

# cleanUp()
#
# Removes the intermediate index files $Wrkfnm and $Srtfnm.  A file that is
# already absent is skipped silently; a failed removal only warns, because
# the sorted output has already been written by then.
sub cleanUp {
    for my $workFile ($Wrkfnm, $Srtfnm) {
        next unless -e $workFile;
        unlink $workFile
            or warn "Cannot remove '$workFile': $!";
    }

    return;
}

__END__