#! perl -sw
# Index builder.
#
# Usage: indexer DATAFILE INDEXFILE
#
# Reads DATAFILE line by line and writes one fixed-width, 12-byte entry per
# line to INDEXFILE:
#   - 4 bytes: the number that starts the line (the digits before the first
#              comma), packed big-endian ('N') so that plain string
#              comparison of the packed keys matches numeric order;
#   - 8 bytes: the byte offset of the start of that line, packed as a native
#              64-bit unsigned ('Q' -- requires a perl built with 64-bit
#              integer support).
# The searcher below binary-searches this index, so the keys must appear in
# ascending order in DATAFILE.
use strict;
use warnings;

@ARGV == 2 or die "usage: $0 datafile indexfile\n";
my( $dataFile, $idxFile ) = @ARGV;

open my $in,  '<',     $dataFile or die "open $dataFile: $!";
open my $out, '>:raw', $idxFile  or die "open $idxFile: $!";

my $pos = 0;    # byte offset of the line about to be read
while( my $line = <$in> ) {
    if( $line =~ m[^(\d+),] ) {
        print {$out} pack( 'NQ', $1, $pos ) or die "write $idxFile: $!";
    }
    else {
        # Without a key, pack would be handed too few values and emit a
        # bogus entry that breaks the index ordering; leave the line out.
        warn "line at offset $pos has no leading numeric key; not indexed\n";
    }
    $pos = tell( $in );
}

# Buffered write errors only surface at close time.
close $out or die "close $idxFile: $!";
close $in;

####

#! perl -slw
# Random-lookup benchmark.
#
# Usage: searcher [-N=lookups] [-MAX=highestKey] DATAFILE INDEXFILE
#
# Loads the whole index into memory, then looks up N random keys in the
# range 1 .. MAX, printing the matching line of DATAFILE (or 'Not found')
# for each and finally the average time per lookup.
use strict;
use warnings;
use Time::HiRes qw[ time ];
# qw[] is required here: a bare [ rand srand ] is an array reference, which
# the module's import ignores, silently leaving the core rand/srand in use.
use Math::Random::MT qw[ rand srand ];

# Layout of one index entry, as written by the index builder.
use constant KEY_LEN => 4;                    # pack 'N'
use constant POS_LEN => 8;                    # pack 'Q'
use constant REC_LEN => KEY_LEN + POS_LEN;

srand( time() );

# binsearch( \$index, $key )
#
# Binary search of a packed index held in a single string.
#   $index - string of REC_LEN-byte entries sorted ascending by key
#   $key   - record number to find
# Returns the byte offset stored with the key, or undef when the key is not
# in the index. Offset 0 is a valid result, so callers must test the return
# value with defined() rather than for truth.
sub binsearch {
    use integer;
    my( $idxRef, $key ) = @_;

    # Compare in packed form: big-endian keys order correctly under lt/eq.
    $key = pack 'N', $key;

    my $count = length( $$idxRef ) / REC_LEN;
    my( $min, $max ) = ( 0, $count );

    # Lower-bound search: narrows to the first entry whose key is >= $key.
    while( $min < $max ) {
        my $mid = ( $min + $max ) >> 1;
        if( substr( $$idxRef, $mid * REC_LEN, KEY_LEN ) lt $key ) {
            $min = $mid + 1;
        }
        else {
            $max = $mid;
        }
    }

    return unpack 'Q', substr( $$idxRef, $min * REC_LEN + KEY_LEN, POS_LEN )
        if $min < $count
        and substr( $$idxRef, $min * REC_LEN, KEY_LEN ) eq $key;

    return undef;
}

# Both are settable from the command line through perl's -s switch parsing.
our $N   //= 1000;     # number of lookups to time
our $MAX //= 160e6;    # keys are drawn from 1 .. $MAX

@ARGV == 2 or die "usage: $0 [-N=lookups] [-MAX=highestKey] datafile indexfile\n";
my( $datPath, $idxPath ) = @ARGV;

# Slurp the index. A single sysread may legitimately return fewer bytes than
# requested, so keep reading until the whole file is in memory.
open my $idxFh, '<:raw', $idxPath or die "open $idxPath: $!";
my $idxSize = -s $idxFh;
my $idx = '';
while( length( $idx ) < $idxSize ) {
    my $got = sysread( $idxFh, $idx, $idxSize - length( $idx ), length( $idx ) );
    defined $got or die "read $idxPath: $!";
    last if $got == 0;
}
close $idxFh;

open my $datFh, '<', $datPath or die "open $datPath: $!";

my $start = time();
for ( 1 .. $N ) {
    my $rndRec = 1 + int rand( $MAX );
    printf 'Record %d: ', $rndRec;

    my $pos = binsearch( \$idx, $rndRec );
    if( defined $pos ) {    # offset 0 is the first record, not a miss
        seek $datFh, $pos, 0 or die "seek $datPath: $!";
        printf "'%s'", scalar <$datFh>;
    }
    else {
        print 'Not found';
    }
}

printf "Lookup averaged %.6f seconds/record\n", ( time() - $start ) / $N;

close $datFh;

__END__
C:\>1118102-searcher -N=10000 30GB.dat 30GB.idx | find /v "Record"
Lookup averaged 0.000513 seconds/record

C:\>1118102-searcher -N=10000 30GB.dat 30GB.idx | find /v "Record"
Lookup averaged 0.000397 seconds/record

C:\>1118102-searcher -N=10000 30GB.dat 30GB.idx | find /v "Record"
Lookup averaged 0.000314 seconds/record

S:\>1118102-searcher -N=10000 30GB.dat 30GB.idx | find /v "Record"
Lookup averaged 0.000248 seconds/record

C:\>1118102-searcher -N=10000 30GB.dat 30GB.idx | find /v "Record"
Lookup averaged 0.000201 seconds/record

C:\>1118102-searcher -N=10000 30GB.dat 30GB.idx | find /v "Record"
Lookup averaged 0.000162 seconds/record