c:\test>678848 Run with 15000 unique words in 5000 documents (Ave: 554 words/doc) ASCII uses 26235503 bytes Binary uses 16398152 bytes Elias uses 22057989 bytes 1 trial of Packing ascii (163.453s total) 1 trial of Unpacking ascii (14.564s total) 1 trial of Packing binary (222.252s total) 1 trial of Unpacking binary (7.203s total) 1 trial of Packing Elias (337.402s total) 1 trial of Unpacking Elias (14.141s total) #### #! perl -slw use strict; use Inline C => 'DATA', NAME => '_678848', CLEAN_AFTER_BUILD => 0; use Benchmark::Timer; my $T = new Benchmark::Timer; use List::Util qw[ sum ]; use Math::Random::MT qw[ rand srand ]; use Data::Dump qw[ pp ]; $Data::Dump::MAX_WIDTH = 1000; $|++; srand 1; use constant NO_UNIQUE_WORDS => 15000; use constant NO_OF_DOCS => 5000; use constant WORDS_PER_DOC => 554; ## Build some test data my @words = do{ local @ARGV = 'words'; <> }; close ARGV; chomp @words; @words = @words[ map{ int( rand scalar @words ) } 1 .. NO_UNIQUE_WORDS ]; printf "Run with %s unique words in %s documents (Ave: %s words/doc)\n\n", NO_UNIQUE_WORDS, NO_OF_DOCS, WORDS_PER_DOC; my %index; for my $docId ( 1 .. NO_OF_DOCS ) { my $pos = 0; my @doc = @words[ map{ int rand scalar @words } 1 .. WORDS_PER_DOC ]; for my $w ( 0 .. $#doc ) { my $word = $doc[ $w ]; push @{ $index{ $word }{ $docId } }, $pos; $pos += 1 + length $word; } } #pp \%index; <>; ## Building the posting lists in ASCII $T->start( my $label = 'Packing ascii' ); my @postings = map { my $wordHash = $_; join ' ', map{ "$_:" . join ';', @{ $wordHash->{ $_ } }; } keys %{ $wordHash }; } values %index; $T->stop( $label ); #pp \@postings; <>; printf "ASCII uses %d bytes\n", sum map length, @postings; ## Unpack the ASCII $T->start( $label = 'Unpacking ascii' ); my $wordIndex = 0; for my $wordData ( @postings ) { # printf "\n$words[ $wordIndex ++ ]: "; for my $docData ( split ' ', $wordData ) { my( $doc, @posns ) = split '[:;]', $docData; # printf "$doc:%s ", join ';', @posns; } } $T->stop( $label ); undef @postings; ## Doing them in binary $T->start( $label = 'Packing binary' ); my @binPostings = map { my $wordHash = $_; pack '(v/a*)*', map { pack 'v*', $_, @{ $wordHash->{ $_ } }; } keys %{ $wordHash }; } values %index; $T->stop( $label ); printf "Binary uses %d bytes\n", sum map length, @binPostings; ## Unpack the packed binary $T->start( $label = 'Unpacking binary' ); $wordIndex = 0; for my $wordData ( @binPostings ) { # printf "\n$words[ $wordIndex ++ ]: "; for my $docData ( unpack '(v/a*)*', $wordData ) { my( $doc, @posns ) = unpack 'v*', $docData; # printf "$doc:%s ", join ';', @posns; } } $T->stop( $label ); undef @binPostings; ## Elias Gamma Encoding $T->start( $label = 'Packing Elias' ); my @packBinPostings = map { my $wordHash = $_; pack '(v/a*)*', map { packEm( $_, @{ $wordHash->{ $_ } } ); } keys %{ $wordHash }; } values %index; $T->stop( $label ); printf "Elias uses %d bytes\n", sum map length, @packBinPostings; ## Unpack the packed binary $T->start( $label = 'Unpacking Elias' ); $wordIndex = 0; for my $wordData ( @packBinPostings ) { # printf "\n$words[ $wordIndex ++ ]: "; for my $docData ( unpack '(v/a*)*', $wordData ) { my( $doc, @posns ) = unpackEm( $docData ); # printf "$doc:%s ", join ';', @posns; } } $T->stop( $label ); $T->report; __DATA__ c:\test>678848 Run with 15000 unique words in 5000 documents (Ave: 554 words/doc) ASCII uses 26235503 bytes Binary uses 16398152 bytes Elias uses 22057989 bytes 1 trial of Packing ascii (163.453s total) 1 trial of Unpacking ASCII (14.564s total) 1 trial of Packing binary (222.252s total) 1 trial of Unpacking binary (7.203s total) 1 trial of Packing Elias (337.402s total) 1 trial of Unpacking Elias (14.141s total) c:\test>678848 Run with 15000 unique words in 5000 documents (Ave: 554 words/doc) ASCII uses 26237329 bytes Binary uses 16396556 bytes Elias uses 22065316 bytes 1 trial of Packing ascii (167.313s total) 1 trial of Unpacking ASCII (14.782s total) 1 trial of Packing binary (247.297s total) 1 trial of Unpacking binary (7.438s total) 1 trial of Packing Elias (378.662s total) 1 trial of Unpacking Elias (14.906s total) __C__ #include #define XS_Vars dXSARGS #define XS_Items items #define XS_Item(x) ST(x) #define XS_Reset sp = mark #define XS_Push(x) XPUSHs(x) #define XS_Done PUTBACK #define XS_Return(x) XSRETURN(x) #define XS_Void XSRETURN(0) #define TRACE printf( "%s:%d\n", __FILE__, __LINE__ ); _inline double log2( double n ) { return log( n ) / log( 2 ); } _inline void setBit( register U8 *bs, U32 offset ) { *( bs + ( offset / 8 ) ) |= (U8)( 1 << ( offset % 8 ) ); } _inline U8 getBit( register U8 *bs, U32 offset ) { return *( bs + ( offset / 8 ) ) & (U8)( 1 << ( offset % 8 ) ) ? 1 : 0; } void packEm ( SV *dummy, ... ) { XS_Vars; U32 in = 0, out = 0, i = 0; register U8 *packed; Newz( 0xfaece5, packed, XS_Items*4, U8 ); for( in = 0; in < (U32)XS_Items; in++ ) { U32 num = SvUV( XS_Item( in ) ); U32 len = (U32)log2( num ); out += len; setBit( packed, out++ ); for( i = 0; i < len; i++ ) { if( num & ( 1 << i ) ) setBit( packed, out ); out++; } } out = ( out + 8 ) / 8; XS_Reset; XS_Push( sv_2mortal( newSVpvn( packed, out ) ) ); Safefree( packed ); XS_Done; } void unpackEm( SV * svPacked ) { XS_Vars; U32 bits = SvCUR( svPacked ) * 8; register U8 *packed = SvPVX( svPacked ); U32 in = 0, i = 0; XS_Reset; while( in < bits ) { U32 len = 0; U32 num = 0; while( in < bits && !getBit( packed, in++ ) ) len++; if( in == bits ) break; for( i = 0; i < len; i++ ) { if( getBit( packed, in++ ) ) num |= ( 1 << i ); } num |= ( 1 << len ); XS_Push( sv_2mortal( newSVuv( num ) ) ); } XS_Done; return; }