#!/usr/bin/perl
#
# TwoKeySort.pl
#
# Sort a large record-oriented file on two embedded keys without holding the
# records in memory.  Pass 1 (loadKeysAndOffsets) scans the file once and
# records each matching record's byte offset and length under a sortable
# composite key.  Pass 2 (sortFile) walks the keys in sorted order, seeks back
# into the input file, and copies each record to the output file.

use strict;
use warnings;

my $NEWLINE_SIZE = length "\n";  # The size of the newline "character" in this OS
my $OS_ADJUST    = 1;            # A way to somewhat generically do OS-specific offset
                                 # computation (e.g. the CR of a CRLF pair on Windows).
                                 # NOTE(review): NEWLINE_SIZE + OS_ADJUST assumes a
                                 # 2-byte line ending -- verify on your OS (see WARNING).
my $KEY_OFFSET   = 'O';          # Optimized key name for offset value
my $KEY_LENGTH   = 'L';          # Optimized key name for length value
my $SEEK_SET     = 0;            # In case you don't want to export the constant for seek()

# composite key => { O => [byte offsets], L => [record lengths] }
my %SeekInfo = ();

{
    loadKeysAndOffsets();
    sortFile();
}
exit;

# Pass 1: record the byte offset and length of every record that matches the
# expected "keyN keyM ..." layout.  The offset is advanced for EVERY line read
# (matching or not), so a stray non-matching line no longer skews the offsets
# of everything after it.
sub loadKeysAndOffsets {
    my $inputOffset = 0;

    # NOTE(review): the original open/while was garbled in transit; it is
    # reconstructed to read the same file that sortFile() re-opens.
    open my $input_fh, '<', 'test1.dat'
        or die "Cannot open test1.dat for reading: $!";

    while ( my $inputBuffer = <$input_fh> ) {
        chomp $inputBuffer;
        my $inputLength = length $inputBuffer;

        # Only capture records which match the structure
        if ( $inputBuffer =~ /^\s*key(\d+)\s+key(\d+)\s+/ ) {

            # Capture the keys
            my $primaryKey   = $1;
            my $secondaryKey = $2;

            # Optimize the keys: a fixed-width composite so a plain string
            # sort orders key2 before key10, key43, etc. (keys up to 99).
            my $optimizedKey = sprintf "%02d%02d", $primaryKey, $secondaryKey;

            # Store the offset in a HoA (covers possibility of duplicate key pairs)
            push @{ $SeekInfo{$optimizedKey}{$KEY_OFFSET} }, $inputOffset;
            push @{ $SeekInfo{$optimizedKey}{$KEY_LENGTH} }, $inputLength;
        }

        # Adjust the offset for the read just committed.
        #####################################################################
        ### WARNING ###
        # Test on a small file to ensure you are getting the right results
        # on your OS -- the per-record adjustment below encodes the size of
        # the line ending that chomp/text-mode translation removed.
        #####################################################################
        $inputOffset += $inputLength;
        $inputOffset += $NEWLINE_SIZE;
        $inputOffset += $OS_ADJUST;
    }

    close $input_fh;
}

# Pass 2: seek-and-copy the records out in composite-key order.
# In the old days this would also be known as shakeTheHardDrive().
sub sortFile {
    open my $input_fh, '<', 'test1.dat'
        or die "Cannot open test1.dat for reading: $!";
    binmode $input_fh;    # raw byte offsets -- must match the pass-1 arithmetic

    open my $output_fh, '>', 'test1.dat-output.dat'
        or die "Cannot open test1.dat-output.dat for writing: $!";

    foreach my $keyValue ( sort keys %SeekInfo ) {

        # Work on a copy so the stored lengths survive a possible re-run.
        my @workingLength = @{ $SeekInfo{$keyValue}{$KEY_LENGTH} };

        foreach my $inputOffset ( @{ $SeekInfo{$keyValue}{$KEY_OFFSET} } ) {
            my $workingLength = shift @workingLength;

            seek $input_fh, $inputOffset, $SEEK_SET
                or die "Cannot seek to offset $inputOffset: $!";

            my $inputBuffer = '';
            my $inputCount  = read $input_fh, $inputBuffer, $workingLength;
            defined $inputCount
                or die "Read of $workingLength bytes at $inputOffset failed: $!";

            print {$output_fh} "$inputBuffer\n";
        }
    }

    # Buffered write errors surface at close -- check it on the write handle.
    close $output_fh or die "Cannot close output file: $!";
    close $input_fh;
}

__END__
5M lines 20 GB file = 20 * 1024 * 1024 * 1024 = 21,474,836,480

To store offsets into this file you will need more than 2GB: Cannot use 4-byte integer
So we need to go with 8-byte integers (hope your Perl supports this)

5M lines x 8 bytes per offset slot = 40,000,000 which is ~40MB RAM so far.
Double it to store record size as well; up to ~80MB RAM
-- Here we could probably use 4-byte integers
-- But can you do both in the same script?
Triple it to store optimized hash keys puts us up to 120MB RAM.
Perl extras will likely be minimal.
So, not quite fitting in 100MB but really close.
-- Can squeeze it in if 32-bit and 64-bit integers can cohabitate (??)

####
C:\Steve\Dev\PerlMonks\P-2013-09-16@0043-TwoKeySort>perl TwoKeySort.pl

C:\Steve\Dev\PerlMonks\P-2013-09-16@0043-TwoKeySort>type test1.dat
key1 key2 ndnjfgdsjfjjkjjfjf...
key1 key2 kdfkjdfgdfugbjndkfgkjgndkjfjkd
key43 key21 sdkjfhdghdbgbd
key1 key3 jujdejnsduhffnjj
key2 key2 jhzezhdjjf...

C:\Steve\Dev\PerlMonks\P-2013-09-16@0043-TwoKeySort>type test1.dat-output.dat
key1 key2 ndnjfgdsjfjjkjjfjf...
key1 key2 kdfkjdfgdfugbjndkfgkjgndkjfjkd
key1 key3 jujdejnsduhffnjj
key2 key2 jhzezhdjjf...
key43 key21 sdkjfhdghdbgbd

C:\Steve\Dev\PerlMonks\P-2013-09-16@0043-TwoKeySort>