in reply to Randomizing Big Files

Here is some code I wrote when someone asked this question on IRC some time ago (that is, how to randomize a big file; it ignores the distribute-over-processes aspect). It's a self-tuning recursive distribute/shuffle/collect. It tries hard to be friendly to memory and the filesystem cache and to do only streaming disk accesses. The drawback for your case might be that it takes a long time before the first results start coming out.
#!/usr/bin/perl -w
# Shuffle the lines in a file potentially much bigger than memory
# Author: Ton Hospel
# License: GNU or artistic
#
# Usage: shuffle [in_file [out_file]]
#   in_file  defaults to STDIN, out_file defaults to STDOUT.
#
# Method: lines are distributed at random over a set of temporary files,
# each temporary file is shuffled (recursively if it is still too big to
# be shuffled in memory) and the results are concatenated to the output.
use strict;
use File::Path;
use List::Util qw(shuffle);

# Limits on the helper chunks
my $max_size  = 10e6;   # largest chunk (in bytes) we aim to shuffle in memory
my $max_files = 128;    # maximum number of temporary files per level

# Scratch directory for the temporary files, unique per process
my $dir = "Shuffle.$$";

# big_shuffle($d, $in, $out)
#   $d   - name prefix for the temporary files of this level; at the top
#          level this is the scratch directory itself
#   $in  - filehandle to read lines from (closed before returning)
#   $out - filehandle the shuffled lines are written to (left open)
# Dies with a descriptive message on any I/O error.
sub big_shuffle {
    my ($d, $in, $out) = @_;

    my $files;
    if (-f $in) {
        # Regular file: tune the number of helper files to its size
        $files = int(($max_size - 1 + -s _) / $max_size);
        $files = $max_files if $files > $max_files;
        # Small enough (also covers the empty file, where $files is 0):
        # shuffle in memory.  Files up to 1.5 chunks are still done in
        # one go instead of being split in two.
        if ($files <= 1 || $files == 2 && $max_size * 1.5 > -s _) {
            print {$out} shuffle(<$in>)
                or die "Unexpected write error: $!\n";
            # Close here too, so the caller can unlink the file even on
            # platforms that refuse to remove open files.
            close($in) || die "Unexpected input close error: $!\n";
            return;
        }
    } else {
        # Pipe, terminal, ...: size unknown, so assume the worst
        $files = $max_files;
    }

    # Top level temporary files live inside the scratch directory,
    # deeper levels extend the name of the file they were split from.
    my $format = sprintf("%s%s%%0%dd",
                         $d, $d eq $dir ? "/" : ".", length($files));
    my (@fhs, @names);
    for my $i (0 .. $files - 1) {
        $names[$i] = sprintf($format, $i + 1);
        open($fhs[$i], ">", $names[$i])
            || die "Could not create $names[$i]: $!";
    }

    # Distribute every line to a randomly chosen helper file, remembering
    # how many lines each one received.
    my @lines = (0) x $files;
    while (my $line = <$in>) {
        my $i = int rand $files;
        print { $fhs[$i] } $line or die "Unexpected write error: $!\n";
        $lines[$i]++;
    }
    for my $fh (@fhs) {
        close($fh) || die "Unexpected close error: $!\n";
    }
    close($in) || die "Unexpected input close error: $!\n";

    # Collect: shuffle each helper file in turn and append it to the output
    for my $i (0 .. $#names) {
        my $name = $names[$i];
        open(my $fh, "<", $name) || die "Could not open $name: $!";
        if ($lines[$i] <= 1) {
            # Nothing to shuffle.  Copying directly also guarantees
            # termination when a single line is bigger than the chunk
            # size: recursing would redistribute that same line forever.
            my $line = <$fh>;
            if (defined $line) {
                print {$out} $line or die "Unexpected write error: $!\n";
            }
            close($fh) || die "Unexpected input close error: $!\n";
        } else {
            big_shuffle($name, $fh, $out);
        }
        unlink($name) || die "Could not unlink $name: $!";
    }
    return;
}

die "Too many arguments. Usage: $0 [in_file [out_file]]\n" if @ARGV > 2;
my ($in, $out) = @ARGV;

# Replace the (optional) file names by open filehandles
if (defined($in) && $in ne "") {
    open(my $fh, "<", $in) || die "Could not open $in: $!";
    $in = $fh;
} else {
    $in = \*STDIN;
}
if (defined($out) && $out ne "") {
    open(my $fh, ">", $out) || die "Could not create $out: $!";
    $out = $fh;
} else {
    $out = \*STDOUT;
}

mkpath($dir);
# Trap errors so the scratch directory is always cleaned up
eval {
    big_shuffle($dir, $in, $out);
    close($out) || die "Unexpected output close error: $!\n";
};
my $rc = $@;
rmtree($dir);
die $rc if $rc;