xxxMMxxxxxxxMMxxxxx ## 2-value runs xx?MMxxxxxx?MMxxxxx ## Possible 3-value runs xxxMM?xxxxxxMM?xxxx ## " " " #### [19:58:08.64] P:\test>412061 -LEN=1000 -COUNT=1000 -N=10 -M=2 >log1 Generated & encoded data. 2-char substring index built: 65536 entries. 2-char index trimmed to 65535 entries. Found 65535 2-number runs appearing in at least 2 sets. 3-char index trimmed to 757 entries. Found 757 3-number runs appearing in at least 2 sets. 4-char index trimmed to 1 entries. Found 1 4-number runs appearing in at least 2 sets. 5-char index trimmed to 0 entries. [19:59:09.28] P:\test> #### #! perl -slw use strict; use bytes; use Data::Dumper; $| = 1; $" = ', '; our $LEN ||= 1000; our $COUNT ||= 1000; our $M ||= 2; our $N ||= 3; =disabled Avoid the AoAs step to cut memory usage. my @data = map { [ map{ int rand 256 } 1 .. $LEN ] } 1 .. $COUNT; warn "Gen'd data"; =cut sub uniq { my %x; @x{ @_ } = (); keys %x } my @encoded = map { pack 'C*', map{ int rand 256 } 1 .. $LEN; } 1 .. $COUNT; warn 'Generated & encoded data'; my %seq; for my $i ( 0 .. $#encoded ) { for my $o ( 0 .. $LEN - 2 ) { push @{ $seq{ 2 }{ substr $encoded[ $i ], $o, 2 } }, "$i:$o"; } } warn '2-char substring index built: ', scalar keys %{ $seq{ 2 } }, ' entries.'; for my $n ( 2 .. $N ) { for my $run ( keys %{ $seq{ $n } } ) { ## Remove duplicated line numbers from index if( $n > 2 ) { ## Won't happen on the first pass, @{ $seq{ $n }{ $run } } = uniq @{ $seq{ $n }{ $run } }; } ## Trim any runs that do not appear in at least $M lines delete $seq{ $n }{ $run } unless @{ $seq{ $n }{ $run } } > $M; } warn "$n-char index trimmed to ", scalar keys %{ $seq{ $n } }, ' entries.'; ## Skip out early if nothing more to do last unless keys %{ $seq{ $n } }; ## Ouput the $n-char matches. my $count = 0; for my $run ( keys %{ $seq{ $n } } ) { next unless @{ $seq{ $n }{ $run } } > $M; $count++; printf "Run [%*s] appears in lines: %s\n", $n * 4, join(',', map{ sprintf '%03d', $_ } unpack 'C*', $run ), "[@{ $seq{ $n }{ $run } }]"; } warn "Found $count $n-number runs appearing in at least $M sets"; ## Go through each run remaining in index $n for my $run ( keys %{ $seq{ $n } } ) { ## And each line that run was found in for my $idx ( @{ $seq{ $n }{ $run } } ) { my( $i, $o ) = split ':', $idx; ## And build a new index ($n+1) who's keys are the previous runs ## +1 char infront ## +1 char behind push @{ $seq{ $n+1 }{ substr $encoded[ $i ], $o-1, $n+1 } }, "$i:" . ( $o - 1 ) if $o > 1; push @{ $seq{ $n+1 }{ substr $encoded[ $i ], $o, $n+1 } }, "$i:" . $o if $o < $LEN - $n +1; } } ## Save space by discarding previous index. delete $seq{ $n } } __END__ [20:29:57.57] P:\test>412061 -LEN=1000 -COUNT=1000 -N=10 > log1 Generated & encoded data at P:\test\412061.pl line 26. 2-char substring index built: 65536 entries. at P:\test\412061.pl line 34. 2-char index trimmed to 65535 entries. at P:\test\412061.pl line 45. Found 65535 2-number runs appearing in at least 2 sets at P:\test\412061.pl line 60. 3-char index trimmed to 785 entries. at P:\test\412061.pl line 45. Found 785 3-number runs appearing in at least 2 sets at P:\test\412061.pl line 60. 4-char index trimmed to 0 entries. at P:\test\412061.pl line 45. [20:31:03.28] P:\test>