llil2d start
get_properties : 10 secs
sort + output : 21 secs
total : 31 secs
2317524 Kbytes of RAM were used
####
my_test start
get_properties : 13 secs
sort + output : 7 secs
total : 20 secs
349124 Kbytes of RAM were used
####
# Variant 1: keep the word itself inside each packed record.
#
# Reads tab-separated "word<TAB>count" lines from every file named on the
# command line, totals the counts per word, and prints "word<TAB>total".
#
# Each distinct word owns one fixed-size record inside the packed string
# $data; a Judy::HS array maps the word to the index of its record.
#
# Limitations visible in the code:
#   * the count field is 16 bits wide, so a total above $COUNT_MAX (65535)
#     cannot be represented;
#   * pack 'Z10' stores at most 9 bytes of a word plus a terminating NUL,
#     while the Judy key uses the whole word, so longer words are printed
#     truncated.

use strict;
use warnings;

use Judy::HS qw/ Set Get Free /;
use Sort::Packed 'sort_packed';

my $DATA_TEMPLATE    = 'nZ10';   # 16-bit big-endian count, NUL-padded word
my $DATA_SIZE        = 12;       # bytes per record: 2 (n) + 10 (Z10)
my $COUNT_SIZE_BYTES = 2;
my $COUNT_SIZE_BITS  = 16;
my $COUNT_MAX        = int( 2 ** $COUNT_SIZE_BITS - 1 );

@ARGV or die "usage: $0 file...\n";
my @llil_files = @ARGV;

warn "my_test start\n";
my $tstart1 = time;

my ( $data, $current ) = ( '', 0 );
my $judy;

for my $fname ( @llil_files ) {
    open( my $fh, '<', $fname ) or die "Cannot open '$fname': $!";
    while ( <$fh> ) {
        chomp;
        my ( $word, $count ) = split /\t/;
        ( undef, my $val ) = Get( $judy, $word );
        if ( defined $val ) {
            # Known word: $val is its record index.  The count field is the
            # first 16-bit element of the record, and it holds
            # $COUNT_MAX - total, so adding to the total means subtracting
            # from the stored value.
            vec( $data, $val * $DATA_SIZE / $COUNT_SIZE_BYTES, $COUNT_SIZE_BITS ) -= $count;
        } else {
            # New word: append a record and remember its index.
            $data .= pack $DATA_TEMPLATE, $COUNT_MAX - $count, $word;
            Set( $judy, $word, $current );
            $current++;
        }
    }
    close $fh;
}
Free( $judy );

my $tend1 = time;
warn "get_properties : ", $tend1 - $tstart1, " secs\n";

my $tstart2 = time;

# Sort the buffer in place as fixed-size records of $DATA_SIZE bytes.
# Storing $COUNT_MAX - total presumably makes an ascending sort put the
# largest totals first, with the word bytes as the tie-breaker.
sort_packed "C$DATA_SIZE", $data;

# Consume the buffer one record at a time.  Test the length rather than the
# truthiness of the string so the loop cannot stop early on a false-looking
# remainder.
while ( length $data ) {
    my ( $count, $word ) = unpack $DATA_TEMPLATE, substr $data, 0, $DATA_SIZE, '';
    printf "%s\t%d\n", $word, $COUNT_MAX - $count;
}

my $tend2 = time;
warn "sort + output : ", $tend2 - $tstart2, " secs\n";
warn "total : ", $tend2 - $tstart1, " secs\n";

# Report field [0][3] of the first Memory::Usage record
# (NOTE(review): resident set size according to the module docs - confirm).
use Memory::Usage;
my $m = Memory::Usage-> new;
$m-> record;
warn $m-> state-> [0][3], " Kbytes of RAM were used\n";
####
my_test start
get_properties : 21 secs
sort + output : 23 secs
total : 44 secs
841880 Kbytes of RAM were used
####
# Variant 2: keep only a reference to the word inside each packed record.
#
# Same input and output format as variant 1, but a record stores the index
# of the file and the byte offset and length of the word's first occurrence
# instead of the word text.  The Judy::HS array is keyed by the hex digest
# returned by xxhash3_128bits_hex rather than by the word.  During output
# each word is read back from its file.
#
# Limitations visible in the code:
#   * the count field is 16 bits wide, so a total above $COUNT_MAX (65535)
#     cannot be represented;
#   * the offset is packed as 32-bit 'N', the file index and word length as
#     16-bit 'n';
#   * the bytes that follow the count in a record are file index and offset,
#     so records with equal totals are ordered by where the word first
#     appeared, not by the word itself (NOTE(review): confirm this tie order
#     is acceptable).

use strict;
use warnings;

use Judy::HS qw/ Set Get Free /;
use Crypt::xxHash 'xxhash3_128bits_hex';
use Sort::Packed 'sort_packed';

my $DATA_TEMPLATE = 'nnNn';      # word count
                                 # file index
                                 # word position
                                 # word length
my $DATA_SIZE        = 10;       # bytes per record: 2 + 2 + 4 + 2
my $COUNT_SIZE_BYTES = 2;
my $COUNT_SIZE_BITS  = 16;
my $COUNT_MAX        = int( 2 ** $COUNT_SIZE_BITS - 1 );

@ARGV or die "usage: $0 file...\n";
my @llil_files = @ARGV;

warn "my_test start\n";
my $tstart1 = time;

my ( $data, $current ) = ( '', 0 );
my $judy;

for my $idx ( 0 .. $#llil_files ) {
    my $fname = $llil_files[ $idx ];
    open( my $fh, '<', $fname ) or die "Cannot open '$fname': $!";
    until ( eof $fh ) {
        # Remember where the line starts; the word is its first field.
        my $pos  = tell $fh;
        my $line = <$fh>;
        chomp $line;
        my ( $word, $count ) = split /\t/, $line;
        my $xx = xxhash3_128bits_hex( $word, 0 );
        ( undef, my $val ) = Get( $judy, $xx );
        if ( defined $val ) {
            # Known word: the stored field holds $COUNT_MAX - total, so
            # adding to the total means subtracting from the stored value.
            vec( $data, $val * $DATA_SIZE / $COUNT_SIZE_BYTES, $COUNT_SIZE_BITS ) -= $count;
        } else {
            # New word: record where it can be read back from later.
            $data .= pack $DATA_TEMPLATE, $COUNT_MAX - $count, $idx, $pos, length $word;
            Set( $judy, $xx, $current );
            $current++;
        }
    }
    close $fh;
}
Free( $judy );

my $tend1 = time;
warn "get_properties : ", $tend1 - $tstart1, " secs\n";

my $tstart2 = time;

# Sort the buffer in place as fixed-size records of $DATA_SIZE bytes.
sort_packed "C$DATA_SIZE", $data;

# Reopen every input file for the random-access reads below, failing loudly
# if any of them cannot be opened.
my @fh;
for my $idx ( 0 .. $#llil_files ) {
    my $fname = $llil_files[ $idx ];
    open $fh[ $idx ], '<', $fname
        or die "Cannot reopen '$fname': $!";
}

# Consume the buffer one record at a time.  Test the length rather than the
# truthiness of the string so the loop cannot stop early on a false-looking
# remainder.
while ( length $data ) {
    my ( $count, $idx, $pos, $len ) = unpack $DATA_TEMPLATE, substr $data, 0, $DATA_SIZE, '';

    # sysseek returns undef on failure ("0 but true" for offset 0).
    defined sysseek( $fh[ $idx ], $pos, 0 )
        or die "Cannot seek to $pos in '$llil_files[$idx]': $!";

    # sysread returns undef on error and the number of bytes read otherwise.
    my $word;
    my $got = sysread( $fh[ $idx ], $word, $len );
    defined $got
        or die "Cannot read from '$llil_files[$idx]': $!";
    $got == $len
        or die "Short read from '$llil_files[$idx]' at offset $pos: wanted $len bytes, got $got";

    printf "%s\t%d\n", $word, $COUNT_MAX - $count;
}

close $_ for @fh;

my $tend2 = time;
warn "sort + output : ", $tend2 - $tstart2, " secs\n";
warn "total : ", $tend2 - $tstart1, " secs\n";

# Report field [0][3] of the first Memory::Usage record
# (NOTE(review): resident set size according to the module docs - confirm).
use Memory::Usage;
my $m = Memory::Usage-> new;
$m-> record;
warn $m-> state-> [0][3], " Kbytes of RAM were used\n";