use strict; use warnings; use utf8; use feature 'say'; use Data::Dump 'dd'; use FFI::Raw; use Config; use Benchmark 'cmpthese'; die unless $Config{ ptrsize } == 8; # some numbers hardcoded below my $str = join ' ', ('this is the text to play with') x 5; my $ngramWindow_MIN = 2; my $ngramWindow_MAX = 6; my ( $low, $high ) = ( $ngramWindow_MIN - 1, $ngramWindow_MAX - 1 ); my $i_ = FFI::Raw::ulong; my $s_ = FFI::Raw::str; my $p_ = FFI::Raw::ptr; my $dll = "$ENV{HOME}/j64-807/bin/j.dll"; # libj.so"; my $pJ = FFI::Raw-> new( $dll, 'JInit', $i_ )-> call or die; my $cmd = 'ngrams =: ;a:-.~,L (<@:,&(0}a.)@:}:@:;)\ w '. '[indxs =: <:0-.~,(>:@:I.@:|.)"1 ]2> L -"0 _ i.# w '. '[w =: <;.2 s '. "[L =: @{[ $ngramWindow_MIN .. $ngramWindow_MAX ]} ". "[s =: '$str',' '"; my $JDo = FFI::Raw-> new( $dll, 'JDo', $i_, $i_, $s_ )-> coderef; my $JGetM = FFI::Raw-> new( $dll, 'JGetM', $i_, $i_, $s_, ( $p_ ) x 4 )-> coderef; my $type = FFI::Raw::memptr( 8 ); my $rank = FFI::Raw::memptr( 8 ); my $shape = FFI::Raw::memptr( 8 ); my $data = FFI::Raw::memptr( 8 ); # Demo dd j(); # Benchmark cmpthese -2, { j => \&j, AnomalousMonk => \&AnomalousMonk, johngg => \&johngg, tybalt89 => \&tybalt89, }; sub j { $JDo-> ( $pJ, $cmd ); $JGetM-> ( $pJ, 'ngrams', $type, $rank, $shape, $data ); my $len = unpack 'Q', unpack 'P8', $shape-> tostr( 8 ); my @ngrams = unpack '(Z*)*', unpack "P$len", $data-> tostr( 8 ); $JGetM-> ( $pJ, 'indxs', $type, $rank, $shape, $data ); $len = 8 * unpack 'Q', unpack 'P8', $shape-> tostr( 8 ); my @indxs = unpack 'Q*', unpack "P$len", $data-> tostr( 8 ); return \@ngrams, \@indxs } sub AnomalousMonk { my @ngrams; for my $ngramWindow ($ngramWindow_MIN .. $ngramWindow_MAX) { my $m = $ngramWindow - 1; my $ngram = qr{ \b [[:alpha:]]+ (?: \s+ [[:alpha:]]+){$m} \b }xms; my @word_ngrams = $str =~ m{ (?= ($ngram)) }xmsg; push @ngrams, @word_ngrams; } return \@ngrams; } sub johngg { my ( @ngrams, @indxs ); for my $nWords ( $ngramWindow_MIN .. $ngramWindow_MAX ) { my @words = split m{\s+}, $str; my $start = 0; while ( scalar @words >= $nWords ) { push @indxs, $start ++; push @ngrams, join " ", @words[ 0 .. $nWords - 1 ]; shift @words; } }; return \@ngrams, \@indxs } sub tybalt89 { my ( @ngrams, @indxs ); $str =~ /(?