TwoKeySortDisk1.pl (extracts keys/offset/length and writes to file)
Starting RAM: 2.74GB
Start Time: 05:03:13
Peak RAM: 2.78GB
Ending Time: 05:03:32
Run Time: 19 sec
Peak RAM Usage: 0.04GB * 1024 = 40MB
Sort (probably Cygwin, not Windows native)
Starting RAM: 2.75GB
Start Time: 05:04:39
Peak RAM: 3.02GB
Ending Time: 05:04:49
Run Time: 10 sec
Peak RAM Usage: 0.27GB * 1024 = 276MB
TwoKeySortDisk2.pl (reads sorted keys, reads original file in random mode, writes output text file)
Starting RAM: 2.73GB
Start Time: 05:07:56
Peak RAM: 2.82GB
Ending Time: 05:08:45
Run Time: 49 sec
Peak RAM Usage: 0.09GB * 1024 = 92MB
####
#!/usr/bin/perl
# Stage 1 of the two-key disk sort: scan the input file and write a compact
# "key|offset|length" index file that can then be sorted on its own.
use strict;
use warnings;

# Shared configuration header (the stage 2 script carries the same one).
my $NEWLINE_SIZE = length "\n"; # The size of the newline "character" in this OS
my $OS_ADJUST = 1; # A way to somewhat generically do OS-specific offset computation
my $KEY_OFFSET = 'O'; # Optimized key name for offset value (not referenced in this script)
my $KEY_LENGTH = 'L'; # Optimized key name for length value (not referenced in this script)
my $SEEK_SET = 0; # In case you don't want to export the constant for seek() (not referenced in this script)

# File names are all derived from the input file name.
my $Inpfnm = 'test2.dat';                 # original, unsorted data
my $Wrkfnm = $Inpfnm . '-presort.dat';    # index written by this script
my $Srtfnm = $Inpfnm . '-sorted.dat';     # sorted index (not referenced in this script)
my $Outfnm = $Inpfnm . '-output.dat';     # final output (not referenced in this script)

# Plain call syntax: the '&' sigil form is only needed to bypass prototypes.
convertKeysAndOffsets();
exit;
# convertKeysAndOffsets()
#
# Reads $Inpfnm line by line and, for every record that looks like
#     key<digits> key<digits> <more text>
# writes one index line to $Wrkfnm in the form
#     <zero-padded primary key><zero-padded secondary key>|<byte offset>|<record length>
# where the offset is the byte position of the start of the line in the input
# file and the length is the size of the line without its line terminator.
#
# Takes no arguments; uses the file-level $Inpfnm and $Wrkfnm.
# Dies if either file cannot be opened, written or closed.
#
# NOTE(review): the "%02d%02d" key only sorts correctly as text while both
# keys stay below 100 -- confirm against the real data.
sub convertKeysAndOffsets
{
    # Raw mode so that every byte (including a "\r" before "\n") is seen and
    # counted; that makes the offsets exact on every OS without needing the
    # file-level $NEWLINE_SIZE / $OS_ADJUST fudge factors.
    open my $inputFile, '<:raw', $Inpfnm
        or die "Cannot open '$Inpfnm' for reading: $!";
    open my $presortFile, '>', $Wrkfnm
        or die "Cannot open '$Wrkfnm' for writing: $!";

    my $inputOffset = 0;    # byte offset of the next line to be read
    while (defined(my $inputBuffer = <$inputFile>))
    {
        # Remember where this line starts, then step past the whole physical
        # line.  This happens for EVERY line, so records that are skipped
        # below no longer throw off the offsets of the records after them.
        my $recordOffset = $inputOffset;
        $inputOffset += length $inputBuffer;

        # Drop the line terminator (LF or CRLF); a last line without one is
        # left as is.
        $inputBuffer =~ s/\r?\n\z//;

        # Only capture records which match the structure
        next unless $inputBuffer =~ /^\s*key(\d+)\s+key(\d+)\s+/;
        my ($primaryKey, $secondaryKey) = ($1, $2);
        my $inputLength = length $inputBuffer;

        # Optimize the keys into one fixed-format string for the sort step
        my $optimizedKey = sprintf "%02d%02d", $primaryKey, $secondaryKey;
        print {$presortFile} "$optimizedKey|$recordOffset|$inputLength\n"
            or die "Cannot write to '$Wrkfnm': $!";
    }

    # Buffered write errors only surface at close, so check it.
    close $presortFile or die "Cannot close '$Wrkfnm': $!";
    close $inputFile   or die "Cannot close '$Inpfnm': $!";
    return;
}
__END__
####
#!/usr/bin/perl
# Stage 2 of the two-key disk sort: walk the sorted "key|offset|length" index,
# pull each record out of the original file by offset, write the records in
# sorted order, then remove the intermediate index files.
use strict;
use warnings;

# Shared configuration header (the stage 1 script carries the same one).
my $NEWLINE_SIZE = length "\n"; # The size of the newline "character" in this OS
my $OS_ADJUST = 1; # A way to somewhat generically do OS-specific offset computation
my $KEY_OFFSET = 'O'; # Optimized key name for offset value
my $KEY_LENGTH = 'L'; # Optimized key name for length value
my $SEEK_SET = 0; # In case you don't want to export the constant for seek()

# File names are all derived from the input file name.
my $Inpfnm = 'test2.dat';                 # original, unsorted data
my $Wrkfnm = $Inpfnm . '-presort.dat';    # unsorted index (deleted by cleanUp)
my $Srtfnm = $Inpfnm . '-sorted.dat';     # sorted index read by sortFile
my $Outfnm = $Inpfnm . '-output.dat';     # sorted records written by sortFile

# Plain call syntax: the '&' sigil form is only needed to bypass prototypes.
sortFile();
cleanUp();
exit;
# sortFile()
#
# Reads the sorted index $Srtfnm, one "key|offset|length" line at a time.
# For each line it seeks to <offset> in $Inpfnm, reads <length> bytes and
# writes them, followed by a newline, to $Outfnm.  The key field itself is
# not needed here; it has already done its job in the sort step.
#
# Takes no arguments; uses the file-level $Inpfnm, $Outfnm, $Srtfnm and
# $SEEK_SET.  Dies if a file cannot be opened, if a seek/read/write fails, or
# if fewer bytes than the index promises can be read (index and data file do
# not belong together).  Blank index lines are skipped silently; malformed
# ones are skipped with a warning.
sub sortFile
{
    # In the old days this would also be known as shakeTheHardDrive()

    # Raw mode on the data file: index offsets are byte positions, so no
    # newline translation may get in the way of seek/read.
    open my $inputFile, '<:raw', $Inpfnm
        or die "Cannot open '$Inpfnm' for reading: $!";
    open my $outputFile, '>', $Outfnm
        or die "Cannot open '$Outfnm' for writing: $!";
    open my $sortedFile, '<', $Srtfnm
        or die "Cannot open '$Srtfnm' for reading: $!";

    while (defined(my $sortedKeyBuffer = <$sortedFile>))
    {
        # Accept both LF and CRLF, whatever tool produced the sorted index.
        $sortedKeyBuffer =~ s/\r?\n\z//;
        next if $sortedKeyBuffer eq '';

        my (undef, $inputOffset, $workingLength) = split /\|/, $sortedKeyBuffer;
        unless (defined $inputOffset && defined $workingLength
            && $inputOffset =~ /\A\d+\z/ && $workingLength =~ /\A\d+\z/)
        {
            warn "Skipping malformed index line $. in '$Srtfnm': $sortedKeyBuffer\n";
            next;
        }

        seek($inputFile, $inputOffset, $SEEK_SET)
            or die "Cannot seek to offset $inputOffset in '$Inpfnm': $!";

        my $inputBuffer = '';
        my $inputCount = read($inputFile, $inputBuffer, $workingLength);
        defined $inputCount
            or die "Cannot read from '$Inpfnm' at offset $inputOffset: $!";

        # A short read means the index does not describe this data file;
        # writing a truncated record would silently corrupt the output.
        $inputCount == $workingLength
            or die "Short read from '$Inpfnm' at offset $inputOffset: "
                 . "wanted $workingLength bytes, got $inputCount";

        print {$outputFile} "$inputBuffer\n"
            or die "Cannot write to '$Outfnm': $!";
    }

    close $sortedFile or die "Cannot close '$Srtfnm': $!";
    # Buffered write errors only surface at close, so check it.
    close $outputFile or die "Cannot close '$Outfnm': $!";
    close $inputFile  or die "Cannot close '$Inpfnm': $!";
    return;
}
# cleanUp()
#
# Deletes the two intermediate index files ($Wrkfnm first, then $Srtfnm).
# Failures are ignored; the result of the last unlink is returned.
sub cleanUp
{
    my $removed;
    for my $scratchFile ($Wrkfnm, $Srtfnm)
    {
        $removed = unlink $scratchFile;
    }
    return $removed;
}
__END__