llil2d start
get_properties : 10 secs
sort + output : 21 secs
total : 31 secs
2317524 Kbytes of RAM were used
####
my_test start
get_properties : 13 secs
sort + output : 7 secs
total : 20 secs
349124 Kbytes of RAM were used
##
##
use strict;
use warnings;
use Judy::HS qw/ Set Get Free /;
use Sort::Packed 'sort_packed';
# Record layout (12 bytes): a 16-bit big-endian inverted count followed by the
# word in a NUL-padded 10-byte field.  The count is stored as
# $COUNT_MAX - total so that records with larger totals have smaller leading
# bytes; the word bytes follow the count within each record.
my $DATA_TEMPLATE    = 'nZ10';
my $DATA_SIZE        = 12;
my $COUNT_SIZE_BYTES = 2;
my $COUNT_SIZE_BITS  = 16;
my $COUNT_MAX        = int( 2 ** $COUNT_SIZE_BITS - 1 );

# pack 'Z10' always writes a trailing NUL, so only 9 bytes of a word survive.
my $WORD_MAX_BYTES = $DATA_SIZE - $COUNT_SIZE_BYTES - 1;

# vec() offsets are expressed in elements of $COUNT_SIZE_BITS bits, so the
# count of record number R lives at element R * $SLOTS_PER_RECORD.
my $SLOTS_PER_RECORD = $DATA_SIZE / $COUNT_SIZE_BYTES;

@ARGV or die "usage: $0 file...\n";
my @llil_files = @ARGV;

warn "my_test start\n";
my $tstart1 = time;

# $data    : all records concatenated into one string
# $current : index of the next record to be appended to $data
# $judy    : Judy::HS map from word to the index of its record in $data
my ( $data, $current ) = ( '', 0 );
my $judy;

for my $fname ( @llil_files ) {
    open( my $fh, '<', $fname ) or die "Cannot open '$fname': $!";

    while ( my $line = <$fh> ) {
        chomp $line;
        my ( $word, $count ) = split /\t/, $line;

        # Only the second element returned by Get is used; it is treated as
        # undefined when the word has not been stored yet.
        ( undef, my $val ) = Get( $judy, $word );

        if ( defined $val ) {
            # Known word: the stored value is the room left before the
            # 16-bit counter would wrap, so refuse to subtract more than that.
            my $slot = $val * $SLOTS_PER_RECORD;
            my $room = vec( $data, $slot, $COUNT_SIZE_BITS );
            $count <= $room
                or die "Total for '$word' in '$fname' exceeds $COUNT_MAX";
            vec( $data, $slot, $COUNT_SIZE_BITS ) = $room - $count;
        }
        else {
            # New word: make sure neither field of the record would be
            # silently truncated or wrapped by pack.
            length( $word ) <= $WORD_MAX_BYTES
                or die "Word '$word' in '$fname' is longer than "
                     . "$WORD_MAX_BYTES bytes";
            $count <= $COUNT_MAX
                or die "Count for '$word' in '$fname' exceeds $COUNT_MAX";

            $data .= pack $DATA_TEMPLATE, $COUNT_MAX - $count, $word;
            Set( $judy, $word, $current );
            $current++;
        }
    }

    close $fh or die "Cannot close '$fname': $!";
}

# The lookup structure is not needed once every record has been built.
Free( $judy );

my $tend1 = time;
warn "get_properties : ", $tend1 - $tstart1, " secs\n";
my $tstart2 = time;

# Sort the fixed-width records in place.  Each record starts with the
# inverted count, so ascending record order means descending totals
# (assuming sort_packed compares the $DATA_SIZE unsigned bytes of each
# record in order -- confirm against the Sort::Packed documentation).
sort_packed( "C$DATA_SIZE", $data );

# Peel one record at a time off the front of $data and print it.  The loop
# tests the remaining length rather than string truthiness, so it cannot stop
# early on a remainder that happens to be the string "0".
while ( length $data ) {
    my $record = substr $data, 0, $DATA_SIZE, '';
    my ( $inverted, $word ) = unpack $DATA_TEMPLATE, $record;

    # Undo the inversion applied when the record was packed.
    printf "%s\t%d\n", $word, $COUNT_MAX - $inverted;
}

my $tend2 = time;
warn "sort + output : ", $tend2 - $tstart2, " secs\n";
warn "total : ", $tend2 - $tstart1, " secs\n";
# Take a single memory snapshot of this process and report one figure from it
# on STDERR.  Field 3 of the first recorded snapshot is used -- presumably the
# resident set size; confirm against the Memory::Usage documentation.
use Memory::Usage;
my $usage = Memory::Usage-> new;
$usage-> record;
my $kbytes = $usage-> state-> [0][3];
warn $kbytes, " Kbytes of RAM were used\n";
##
##
my_test start
get_properties : 21 secs
sort + output : 23 secs
total : 44 secs
841880 Kbytes of RAM were used
##
##
use strict;
use warnings;
use Judy::HS qw/ Set Get Free /;
use Crypt::xxHash 'xxhash3_128bits_hex';
use Sort::Packed 'sort_packed';
# Record layout (10 bytes, every field big-endian):
#   n  word count, stored inverted as $COUNT_MAX - total
#   n  file index (position of the file in @llil_files)
#   N  word position (byte offset of the line where the word was first seen)
#   n  word length in bytes
my $DATA_TEMPLATE    = 'nnNn';
my $DATA_SIZE        = 10;
my $COUNT_SIZE_BYTES = 2;
my $COUNT_SIZE_BITS  = 16;
my $COUNT_MAX        = int( 2 ** $COUNT_SIZE_BITS - 1 );

# Largest values the 'n' and 'N' fields can hold; pack wraps silently beyond.
my $FIELD16_MAX = 0xFFFF;
my $FIELD32_MAX = 0xFFFF_FFFF;

# vec() offsets are expressed in elements of $COUNT_SIZE_BITS bits, so the
# count of record number R lives at element R * $SLOTS_PER_RECORD.
my $SLOTS_PER_RECORD = $DATA_SIZE / $COUNT_SIZE_BYTES;

@ARGV or die "usage: $0 file...\n";
my @llil_files = @ARGV;
$#llil_files <= $FIELD16_MAX
    or die "Too many input files: at most ", $FIELD16_MAX + 1, " supported\n";

warn "my_test start\n";
my $tstart1 = time;

# $data    : all records concatenated into one string
# $current : index of the next record to be appended to $data
# $judy    : Judy::HS map from the word's hash to its record index in $data
my ( $data, $current ) = ( '', 0 );
my $judy;

for my $idx ( 0 .. $#llil_files ) {
    my $fname = $llil_files[ $idx ];
    open( my $fh, '<', $fname ) or die "Cannot open '$fname': $!";

    until ( eof $fh ) {
        # Remember where the line starts; the word itself is not kept in
        # memory and is read back from this offset during output.
        my $pos  = tell $fh;
        my $line = <$fh>;
        chomp $line;
        my ( $word, $count ) = split /\t/, $line;

        # Words are keyed by their 128-bit hash rather than by the word
        # itself.  NOTE(review): two different words with the same hash would
        # be merged into one record.
        my $xx = xxhash3_128bits_hex( $word, 0 );
        ( undef, my $val ) = Get( $judy, $xx );

        if ( defined $val ) {
            # Known word: the stored value is the room left before the
            # 16-bit counter would wrap, so refuse to subtract more than that.
            my $slot = $val * $SLOTS_PER_RECORD;
            my $room = vec( $data, $slot, $COUNT_SIZE_BITS );
            $count <= $room
                or die "Total for '$word' in '$fname' exceeds $COUNT_MAX";
            vec( $data, $slot, $COUNT_SIZE_BITS ) = $room - $count;
        }
        else {
            # New word: make sure no field of the record would wrap in pack.
            $count <= $COUNT_MAX
                or die "Count for '$word' in '$fname' exceeds $COUNT_MAX";
            $pos <= $FIELD32_MAX
                or die "Offset $pos in '$fname' does not fit in 32 bits";
            length( $word ) <= $FIELD16_MAX
                or die "Word at offset $pos in '$fname' is longer than "
                     . "$FIELD16_MAX bytes";

            $data .= pack $DATA_TEMPLATE,
                $COUNT_MAX - $count,
                $idx,
                $pos,
                length $word;
            Set( $judy, $xx, $current );
            $current++;
        }
    }

    close $fh or die "Cannot close '$fname': $!";
}

# The lookup structure is not needed once every record has been built.
Free( $judy );

my $tend1 = time;
warn "get_properties : ", $tend1 - $tstart1, " secs\n";
my $tstart2 = time;

# Sort the fixed-width records in place.  Each record starts with the
# inverted count, so ascending record order means descending totals.
# NOTE(review): the bytes after the count are file index, offset and length,
# so records with equal totals appear to come out in (file, offset) order
# rather than in word order -- confirm that this is acceptable.
sort_packed( "C$DATA_SIZE", $data );

# Reopen every input file so words can be fetched by offset.  These handles
# are only ever used with sysseek/sysread.
my @fh;
for my $file_no ( 0 .. $#llil_files ) {
    my $fname = $llil_files[ $file_no ];
    open( $fh[ $file_no ], '<', $fname )
        or die "Cannot reopen '$fname': $!";
}

# Peel one record at a time off the front of $data.  The loop tests the
# remaining length rather than string truthiness, so it cannot stop early on
# a remainder that happens to be the string "0".
while ( length $data ) {
    my ( $inverted, $idx, $pos, $len )
        = unpack $DATA_TEMPLATE, substr $data, 0, $DATA_SIZE, '';
    my $fname = $llil_files[ $idx ];

    # sysseek returns undef on failure ("0 but true" for offset 0).
    defined( sysseek( $fh[ $idx ], $pos, 0 ) )
        or die "Cannot seek to offset $pos in '$fname': $!";

    # A failed or short read would otherwise print a wrong or partial word.
    my $word;
    my $got = sysread( $fh[ $idx ], $word, $len );
    defined( $got )
        or die "Cannot read at offset $pos in '$fname': $!";
    $got == $len
        or die "Short read at offset $pos in '$fname': "
             . "wanted $len bytes, got $got";

    # Undo the inversion applied when the record was packed.
    printf "%s\t%d\n", $word, $COUNT_MAX - $inverted;
}

close $_ for @fh;

my $tend2 = time;
warn "sort + output : ", $tend2 - $tstart2, " secs\n";
warn "total : ", $tend2 - $tstart1, " secs\n";
# Take a single memory snapshot of this process and report one figure from it
# on STDERR.  Field 3 of the first recorded snapshot is used -- presumably the
# resident set size; confirm against the Memory::Usage documentation.
use Memory::Usage;
my $usage = Memory::Usage-> new;
$usage-> record;
my $kbytes = $usage-> state-> [0][3];
warn $kbytes, " Kbytes of RAM were used\n";