]s =. 'the text to play with' NB. String the text to play with ]L =. 2 3 4 NB. Ngram lengths 2 3 4 NB. Running a little ahead, the sentence to get us each ngram in its box, NB. with each word nicely boxed: ;L ([:<<\)"(0 _)(<;._2) s,' ' +----------+---------+---------+-----------+-------------+--------------+--------------+------------------+-------------------+ |+---+----+|+----+--+|+--+----+|+----+----+|+---+----+--+|+----+--+----+|+--+----+----+|+---+----+--+----+|+----+--+----+----+| ||the|text|||text|to|||to|play|||play|with|||the|text|to|||text|to|play|||to|play|with|||the|text|to|play|||text|to|play|with|| |+---+----+|+----+--+|+--+----+|+----+----+|+---+----+--+|+----+--+----+|+--+----+----+|+---+----+--+----+|+----+--+----+----+| +----------+---------+---------+-----------+-------------+--------------+--------------+------------------+-------------------+ NB. But, this structure would mean a lot of unpleasant pointer arithmetic NB. on Perl side (indirect, indirect, offset, indirect, etc.). It slows things, too. NB. Instead, let's have best of both worlds, by strictly adhering to NB. task in OP -- let each ngram be a string, let's append ASCII-0 to each, catenate, NB. return pointer to Perl, and just unpack with '(Z*)*'. NB. Little explanation (s is string, L is ngram lengths, as above): ]w =. <;.2 s,' ' NB. Words +----+-----+---+-----+-----+ |the |text |to |play |with | +----+-----+---+-----+-----+ NB. ;.2 - cut by last item (char), keeping it NB. , - append, < - box L (<@:,&(35}a.)@:}:@:;)\ w +-----------------+------------------+-------------+----------+ |the text# |text to# |to play# |play with#| +-----------------+------------------+-------------+----------+ |the text to# |text to play# |to play with#| | +-----------------+------------------+-------------+----------+ |the text to play#|text to play with#| | | +-----------------+------------------+-------------+----------+ NB. \ - apply verb to overlapping infixes of lengths L of w NB. @: - composition, & - curry, and read right to left: NB. ; - raze (unbox items), }: - curtail, NB. 35}a. - take element from ASCII table, NB. 35 just for sake of display, it will be 0 in practice NB. Almost there ;a:-.~,L (<@:,&(35}a.)@:}:@:;)\ w the text#text to#to play#play with#the text to#text to play#to play with#the text to play#text to play with# NB. , - ravel (flatten shape) NB. ~ - swap left/right operands (just easier to write) NB. -. - exclude. What to exclude? NB. a: is called "Ace", it's an empty box NB. But OP wanted indexes of words at ngram starts. It's easy: i.# w 0 1 2 3 4 NB. i. - integer list, # - number of items. NB. Then simple, but vectorized, arithmetic. 2 > L -"0 _ i.# w 0 1 1 1 1 0 0 1 1 1 0 0 0 1 1 NB. Then, for each row of matrix above: NB. |. - reverse, I. - indexes of true elements, NB. >: - increment (because short lines are padded with zeroes) (>:@:I.@:|.)"1 ]2 > L -"0 _ i.# w 1 2 3 4 1 2 3 0 1 2 0 0 <:0-.~,(>:@:I.@:|.)"1 ]2> L -"0 _ i.# w 0 1 2 3 0 1 2 0 1 NB. Above -- ravel, exclude zeroes, decrement. NB. So, phrases to get us (catenated) ngrams and list of starting indexes: ;a:-.~,L (<@:,&(35}a.)@:}:@:;)\ w the text#text to#to play#play with#the text to#text to play#to play with#the text to play#text to play with# <:0-.~,(>:@:I.@:|.)"1 ]2> L -"0 _ i.# w 0 1 2 3 0 1 2 0 1 NB. What about sorting not by ngram length, but starting index? NB. Nothing easier: transpose (|:) at appropriate moment: ;a:-.~,|:L (<@:,&(35}a.)@:}:@:;)\ w the text#the text to#the text to play#text to#text to play#text to play with#to play#to play with#play with# <:0-.~,|:(>:@:I.@:|.)"1 ]2> L -"0 _ i.# w 0 0 0 1 1 1 2 2 3 #### use strict; use warnings; use utf8; use feature 'say'; use Data::Dump 'dd'; use FFI::Raw; use Config; use Benchmark 'cmpthese'; die unless $Config{ ptrsize } == 8; # some numbers hardcoded below my $str = join ' ', ('this is the text to play with') x 5; my $ngramWindow_MIN = 2; my $ngramWindow_MAX = 6; my ( $low, $high ) = ( $ngramWindow_MIN - 1, $ngramWindow_MAX - 1 ); my $i_ = FFI::Raw::ulong; my $s_ = FFI::Raw::str; my $p_ = FFI::Raw::ptr; my $dll = "$ENV{HOME}/j64-807/bin/j.dll"; # libj.so"; my $pJ = FFI::Raw-> new( $dll, 'JInit', $i_ )-> call or die; my $cmd = 'ngrams =: ;a:-.~,L (<@:,&(0}a.)@:}:@:;)\ w '. '[indxs =: <:0-.~,(>:@:I.@:|.)"1 ]2> L -"0 _ i.# w '. '[w =: <;.2 s '. "[L =: @{[ $ngramWindow_MIN .. $ngramWindow_MAX ]} ". "[s =: '$str',' '"; my $JDo = FFI::Raw-> new( $dll, 'JDo', $i_, $i_, $s_ )-> coderef; my $JGetM = FFI::Raw-> new( $dll, 'JGetM', $i_, $i_, $s_, ( $p_ ) x 4 )-> coderef; my $type = FFI::Raw::memptr( 8 ); my $rank = FFI::Raw::memptr( 8 ); my $shape = FFI::Raw::memptr( 8 ); my $data = FFI::Raw::memptr( 8 ); # Demo dd j(); # Benchmark cmpthese -2, { j => \&j, AnomalousMonk => \&AnomalousMonk, johngg => \&johngg, tybalt89 => \&tybalt89, }; sub j { $JDo-> ( $pJ, $cmd ); $JGetM-> ( $pJ, 'ngrams', $type, $rank, $shape, $data ); my $len = unpack 'Q', unpack 'P8', $shape-> tostr( 8 ); my @ngrams = unpack '(Z*)*', unpack "P$len", $data-> tostr( 8 ); $JGetM-> ( $pJ, 'indxs', $type, $rank, $shape, $data ); $len = 8 * unpack 'Q', unpack 'P8', $shape-> tostr( 8 ); my @indxs = unpack 'Q*', unpack "P$len", $data-> tostr( 8 ); return \@ngrams, \@indxs } sub AnomalousMonk { my @ngrams; for my $ngramWindow ($ngramWindow_MIN .. $ngramWindow_MAX) { my $m = $ngramWindow - 1; my $ngram = qr{ \b [[:alpha:]]+ (?: \s+ [[:alpha:]]+){$m} \b }xms; my @word_ngrams = $str =~ m{ (?= ($ngram)) }xmsg; push @ngrams, @word_ngrams; } return \@ngrams; } sub johngg { my ( @ngrams, @indxs ); for my $nWords ( $ngramWindow_MIN .. $ngramWindow_MAX ) { my @words = split m{\s+}, $str; my $start = 0; while ( scalar @words >= $nWords ) { push @indxs, $start ++; push @ngrams, join " ", @words[ 0 .. $nWords - 1 ]; shift @words; } }; return \@ngrams, \@indxs } sub tybalt89 { my ( @ngrams, @indxs ); $str =~ /(?