comment on

I am trying to sort a very large text file where the sort key is composed of multiple columns, which are strings and numerics. If I sort it using Unix sort, it takes about 10 minutes without hogging the memory. However, with Unix sort I cannot configure the input record separator, so if the data comes with embedded \n characters in the middle of a record, this breaks the sort. So I wrote a Perl sort based on a hash tied to a btree (the DB_File CPAN module), localizing $/ to two characters: ASCII char(164) followed by \n. I re-created the unsorted file to have these two characters at the end of each line. This works, but the sort takes about 40 minutes on a Linux machine with 8 CPUs (4 CPUs with HyperThreading) and 15GB of memory. During this time the memory is eventually hogged completely, so that other processes fail to start until this sort is done. Is there an equivalent of $/ in Unix? Or is there an equivalent Perl sort that is as efficient as the Unix sort? Here is the sorting code, just in case you are wondering how I am doing this:

# btree - sort delimited records by loading them into a DB_File B-tree and
# writing them back out in key order.
#
# Parameters (positional):
#   $disk         - true: back the B-tree with "$workdir/$outFileName.idx"
#                   (an existing index file is removed first);
#                   false: let DB_File create an in-memory B-tree
#   $workdir      - directory for the on-disk index file
#   $outFileName  - name of the output file, created inside $outdir
#   $sep          - field separator; used verbatim both as a split regex and
#                   as the join string, so regex metacharacters are NOT escaped
#   $keyMap_aref  - array ref handed to genComp() to describe how each key
#                   column is compared
#   $verbose      - true to print the generated code and timings to STDOUT
#   $infile_aref  - array ref of input file names, read from $indir
#   $indir        - directory holding the input files
#   $outdir       - directory receiving the output file
#   $splitSize    - LIMIT argument passed to split() for each record
#   $keyPos_aref  - array ref of column indexes that make up the sort key
#   $outCol_aref  - optional array ref of column indexes to emit; when
#                   undefined the whole record is written
#
# Records are read using the current value of $/ and written back terminated
# by $/, so the caller is expected to have set $/ to the record separator.
#
# Side effect: installs the generated comparator in the global $DB_BTREE.
#
# Croaks if generated code fails to compile, or if a file or the B-tree
# cannot be opened.
#
# NOTE(review): records are stored through a tied hash, so two records with
# the same key appear to overwrite each other unless $DB_BTREE has been
# configured elsewhere to allow duplicates - confirm this is intended.
sub btree{
    my ($disk,$workdir,$outFileName,$sep,$keyMap_aref,$verbose,
        $infile_aref,$indir,$outdir,$splitSize,$keyPos_aref,$outCol_aref)=@_;
    my @infile  = @{$infile_aref};
    my $outfile = $outdir.'/'.$outFileName;
    my @keyPos  = @{$keyPos_aref};
    # "my ... if COND" has undefined behaviour, so initialise unconditionally.
    my $useCols = defined $outCol_aref;
    my @outCol  = $useCols ? @{$outCol_aref} : ();
    my $idx;
    my %h;

    # generate a ref to anonymous sub for parsing key from data line
    # #################################
    # The separator, split limit and key columns are baked into the source
    # text so the per-record call does no extra variable lookups.
    my $parse;
    my $list  = join ',', @keyPos;
    my $pcode = '$parse=sub {my($line)=@_;join(\''."$sep".'\',((split(/'
              . "$sep".'/,$line,'."$splitSize".'))['."$list".']));};';
    print "pcode: $pcode\n" if $verbose;
    eval $pcode;
    croak "cannot compile key parser: $@" if $@;
    # #################################

    if ($disk){
        $idx = "$workdir".'/'."$outFileName".'.idx';
        if (-f $idx){
            unlink "$idx" or croak "cannot unlik $idx: $!\n";
        }
    }

    # create the btree object
    # #################################
    my $t = '$DB_BTREE->{\'compare\'} = '.genComp($sep,$keyMap_aref);
    print "sort criteria: $t\n" if $verbose;
    eval $t;
    croak "cannot compile sort criteria: $@" if $@;

    my $mybtree;
    if ($disk){
        $mybtree = tie %h, "DB_File", "$idx", O_RDWR|O_CREAT, 0666, $DB_BTREE
            or croak "Cannot open file $idx: $!\n";
    }
    else{
        # An undefined file name asks DB_File for an in-memory database.
        $mybtree = tie %h, "DB_File", undef, O_RDWR|O_CREAT, 0666, $DB_BTREE
            or croak "Cannot create in-memory btree: $!\n";
    }

    # lets do it. lets fill up the btree object
    # #################################
    my $bench0 = Benchmark->new;
    foreach my $infile (@infile){
        print "infile=$infile\n" if $verbose;
        my $in = IO::File->new("$indir".'/'."$infile", "r")
            or croak "Cannot open file $infile: $!\n";
        while (defined(my $line = <$in>)){
            my $key = $parse->($line);
            # Store either the selected output columns or the whole record.
            $h{$key} = $useCols
                ? join($sep, (split(/$sep/, $line, $splitSize))[@outCol])
                : $line;
        }
        $in->close();
    }

    # Cycle through the keys printing them in order.
    # #################################
    my $out = IO::File->new("$outfile", "w")
        or croak "Cannot open file $outfile: $!\n";

    my $bench1 = Benchmark->new;

    my ($key,$value);
    for (my $status = $mybtree->seq($key, $value, R_FIRST);
         $status == 0;
         $status = $mybtree->seq($key, $value, R_NEXT)){
        # Normalise the terminator: strip one if present, then always add one.
        chomp $value;
        print $out "$value".$/;
    }

    my $bench2 = Benchmark->new;
    # done
    # Buffered write errors only surface at close, so check it.
    $out->close or croak "Cannot close file $outfile: $!\n";

    # Drop the object reference first; untie warns while one is still held.
    undef $mybtree;
    untie %h;

    my $sortTime  = timediff($bench1,$bench0);
    print "sort ".timestr($sortTime)."\n" if $verbose;
    my $writeTime = timediff($bench2,$bench1);
    print "write ".timestr($writeTime)."\n" if $verbose;
    my $totalTime = timediff($bench2,$bench0);
    print "total ".timestr($totalTime)."\n" if $verbose;
}

# genComp - build the Perl source text of a key comparator.
#
# Parameters:
#   $sep        - field separator; interpolated verbatim into the generated
#                 split pattern, so regex metacharacters are NOT escaped
#   $keyMap_ref - array ref with one entry per key column, in key order:
#                 'C' selects a string comparison (cmp), any other value a
#                 numeric comparison (<=>)
#
# Returns a string holding the source of an anonymous sub. That sub takes two
# keys, splits each on $sep, and compares them column by column, returning
# the result of the first comparison that is non-zero. The caller is
# responsible for eval'ing the string.
sub genComp{
        my ($sep,$keyMap_ref)=@_;
        my @keyMap=@{$keyMap_ref};

        # One "LEFT op RIGHT" term per key column.
        my @terms;
        for my $i (0 .. $#keyMap){
                my $op = $keyMap[$i] eq 'C' ? 'cmp' : '<=>';
                push @terms, "\$k1[$i] $op \$k2[$i]";
        }

        my $code  = 'sub {';
        $code    .= 'my($k1,$k2)=@_;';
        $code    .= ' my @k1=split /'.$sep.'/,$k1;';
        $code    .= ' my @k2=split /'.$sep.'/,$k2;';
        # Chain the terms with || so later columns only break ties.
        $code    .= join(' || ', @terms);
        $code    .= ';';
        $code    .= '}';
        return $code;
}
[download]

update (broquaint): added <readmore> tag

In reply to perl sort versus Unix sort by Anonymous Monk

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.