This was hacked up while I ate lunch today. Give it a maximum number of records per file and a very large input source, and away you go.
Use your own sortsub, of course, and your own input records (I used random numbers). Otherwise, it's fit to use.
#!/usr/bin/perl -w
# Mergesort
use IO::Handle; # For the ->getline
require 5.6.0; # Sort sub prototypes
# Demo only: how many random records to generate and sort.
# Leave this out of the real thing, where the input decides the count.
our $recs = 13;
# Maximum number of records per merge file (and per in-memory pile).
our $max = 5;
# One entry per merge file written so far; store() appends to it.
our @files = ();
# Numeric three-way comparison of two records: returns -1, 0 or 1.
# The ($$) prototype makes sort() hand the two elements over in @_
# instead of in $a/$b, so the very same sub can be used both as a named
# sort comparator and as an ordinary two-argument function call.
sub sortsub ($$) {
    return $_[0] <=> $_[1];
}
{
    # Name of the next scratch file under /tmp.  The string auto-increment
    # at the end of store() walks fooaa, fooab, fooac, ... so every pile
    # gets its own file name.
    # NOTE(review): a fixed, predictable name in /tmp is not safe on a
    # shared machine; File::Temp would be the robust replacement.
    $tempname = "fooaa";

    # store(\@records)
    #
    # Sorts one pile of records with sortsub(), writes it to a fresh scratch
    # file, rewinds that file and registers it in @files as
    #     { fh => read/write handle, queued => first record (or undef) }
    # Dies if the scratch file cannot be created or written.
    sub store {
        my ($records) = @_;    # not "$a": a lexical $a would hide sort's $a
        my $path = "/tmp/$tempname";

        # Three-argument open keeps the mode separate from the file name.
        open(my $f, "+>", $path) or die "Can't create $path: $!";

        # The open handle keeps the data reachable, so remove the directory
        # entry right away (UNIX semantics; the path is /tmp anyway).  This
        # way no scratch files are left behind once the handles are closed.
        unlink($path) or warn "Can't unlink $path: $!";

        # Sort small pile and spill it to disk.
        print {$f} sort sortsub @$records
            or die "Can't write $path: $!";

        # Rewind so the merge phase reads the pile back from its start.
        seek $f, 0, 0 or warn "Can't seek: $!";
        push(@files, {
            fh     => $f,
            queued => scalar <$f>,    # first (lowest) record of this file
        });
        $tempname++;
    }
}
# This is where you'd read the input file to exhaustion.
# I'm just making up data. The important part is the batching: collect
# records in @sortarr and, whenever the pile reaches $max records, hand it
# to store() (which sorts it and spills it to a merge file) and start over.
while ($recs-- > 0) {    # "> 0": a negative $recs must not loop forever
    my $record = rand() . "\n";    # named lexical instead of clobbering $_
    push(@sortarr, $record);
    # ">=" rather than "==" so a non-positive or fractional $max still
    # flushes the pile instead of letting it grow without bound.
    if (@sortarr >= $max) {
        store(\@sortarr);
        @sortarr = ();
    }
}
store(\@sortarr) if @sortarr; # Store the leftovers
# Merge phase: every entry of @files holds its next unread record in
# {queued} (undef once that file is exhausted).  Each pass picks the entry
# whose queued record compares lowest under sortsub(), emits that record and
# refills the entry from its file, until no entry has a record left.
LOOP: {
    my $lowest;
    # A single linear scan finds the minimum; no need to sort all files on
    # every pass.  The strict "< 0" keeps the earliest file on ties.
    for my $file (@files) {
        next unless defined $file->{queued};
        $lowest = $file
            if !defined $lowest
            || sortsub($file->{queued}, $lowest->{queued}) < 0;
    }
    # No file has a record queued any more: the merge is complete.
    last LOOP unless defined $lowest;
    # Do your processing here
    print $lowest->{queued};
    $lowest->{queued} = $lowest->{fh}->getline();
    # getline() returned undef: this file is drained, release its handle.
    unless (defined $lowest->{queued}) {
        close($lowest->{fh}) or warn "Can't close merge file: $!";
    }
    redo LOOP;
}
|