xxxMMxxxxxxxMMxxxxx ## 2-value runs
xx?MMxxxxxx?MMxxxxx ## Possible 3-value runs
xxxMM?xxxxxxMM?xxxx ##    "        "     "

##</code><code>##

[19:58:08.64] P:\test>412061 -LEN=1000 -COUNT=1000 -N=10 -M=2 >log1
Generated & encoded data.
2-char substring index built: 65536 entries.
2-char index trimmed to 65535 entries.
Found 65535 2-number runs appearing in at least 2 sets.

3-char index trimmed to 757 entries.
Found 757 3-number runs appearing in at least 2 sets.

4-char index trimmed to 1 entries. 
Found 1 4-number runs appearing in at least 2 sets.

5-char index trimmed to 0 entries. 

[19:59:09.28] P:\test>

##</code><code>##

#! perl -slw
use strict;
use bytes;
use Data::Dumper;

## Unbuffered STDOUT, and interpolated arrays are joined with ', '
## (this is what separates the entries in the "appears in lines" lists).
$|++;
$" = ', ';

## Run parameters. They are package variables so that the -s switch on the
## #! line can set them from the command line (e.g. -LEN=500 -M=3); any
## that is unset or false gets the default below.
our( $LEN, $COUNT, $M, $N );
$LEN   = 1000 unless $LEN;      ## numbers (bytes) per generated line
$COUNT = 1000 unless $COUNT;    ## number of lines generated
$M     = 2    unless $M;        ## threshold used when trimming the indexes
$N     = 3    unless $N;        ## longest run length examined

=disabled Avoid the AoAs step to cut memory usage.
my @data = map {
	[ map{ int rand 256 } 1 .. $LEN ]
} 1 .. $COUNT;
warn "Gen'd data";
=cut

## uniq( LIST )
## Returns the distinct values of LIST, each one at the position where it
## was first seen. In scalar context the result is the number of distinct
## values. Keeping first-seen order (rather than hash order) means the
## position lists printed for each run come out the same way on every
## execution.
sub uniq {
	my %seen;
	return grep { !$seen{ $_ }++ } @_;
}

## Build the test data: $COUNT lines, each a string of $LEN random bytes.
## Packing the numbers (0 .. 255) one per character lets runs of numbers
## be looked up with plain substr and used directly as hash keys.
my @encoded;
for ( 1 .. $COUNT ) {
	my @numbers = map { int rand 256 } 1 .. $LEN;
	push @encoded, pack 'C*', @numbers;
}
warn 'Generated & encoded data';

## %seq is keyed by run length, then by the run itself (as packed chars);
## each value is a list of "line:offset" strings saying where that run
## occurs. Seed it with every 2-char substring of every line.
my %seq;
for my $lineNo ( 0 .. $#encoded ) {
	my $line = $encoded[ $lineNo ];
	for my $offset ( 0 .. $LEN - 2 ) {
		my $pair = substr $line, $offset, 2;
		push @{ $seq{ 2 }{ $pair } }, "$lineNo:$offset";
	}
}
warn '2-char substring index built: ', scalar( keys %{ $seq{ 2 } } ), ' entries.';

## Main pass. For each run length $n from 2 up to $N:
##   1. trim the $n-char index to the runs found in at least $M different
##      lines,
##   2. print the survivors,
##   3. grow every survivor by one char in each direction to seed the
##      ($n+1)-char index.
##
## Reads:   $N, $M, $LEN, @encoded and uniq().
## Updates: %seq -- $seq{ $n }{ $run } is a list of "line:offset" strings,
##          one per place $run was found. Each level is deleted once the
##          next one has been built from it.
## Output:  one "Run [...]" line per surviving run via printf, plus
##          progress messages via warn.
for my $n ( 2 .. $N ) {
	my $index = $seq{ $n } ||= {};

	for my $run ( keys %$index ) {
		my $hits = $index->{ $run };

		## The previous pass grows runs both forwards and backwards, so the
		## same position can have been recorded twice. The first pass
		## indexes every position exactly once and needs no clean-up.
		@$hits = uniq @$hits if $n > 2;

		## Count distinct lines, not hits: a run that repeats inside one
		## line still only appears in one set. (Fewer than $M hits cannot
		## span $M lines, so those skip the counting altogether.)
		my %lines;
		if( @$hits >= $M ) {
			$lines{ ( split ':', $_ )[ 0 ] } = () for @$hits;
		}

		## Trim any runs that do not appear in at least $M lines.
		delete $index->{ $run } if scalar( keys %lines ) < $M;
	}
	warn "$n-char index trimmed to ", scalar( keys %$index ), ' entries.';

	## Skip out early if nothing more to do.
	last unless keys %$index;

	## Output the $n-char matches. Everything left in the index has
	## already passed the $M-lines test above.
	my $count = 0;
	for my $run ( keys %$index ) {
		$count++;
		printf "Run [%*s] appears in lines: %s\n",
			$n * 4,
			join( ',', map { sprintf '%03d', $_ } unpack 'C*', $run ),
			"[@{ $index->{ $run } }]";
	}
	warn "Found $count $n-number runs appearing in at least $M sets";

	## Seed the next index -- except on the final pass, where it would
	## never be looked at.
	if( $n < $N ) {
		## Last offset at which a full ($n+1)-char run still fits in a
		## line. Past it substr would silently return a shorter string,
		## putting a truncated key into the wrong index.
		my $lastStart = $LEN - ( $n + 1 );

		## Go through each run remaining in index $n
		for my $run ( keys %$index ) {
			## And each place that run was found
			for my $idx ( @{ $index->{ $run } } ) {
				my( $i, $o ) = split ':', $idx;

				## And build a new index ($n+1) whose keys are the
				## previous runs with:
				## +1 char in front (needs a char before the run)
				push @{ $seq{ $n+1 }{ substr $encoded[ $i ], $o-1, $n+1 } },
					"$i:" . ( $o - 1 ) if $o > 0;
				## +1 char behind (needs a char after the run)
				push @{ $seq{ $n+1 }{ substr $encoded[ $i ], $o,   $n+1 } },
					"$i:" .   $o       if $o <= $lastStart;
			}
		}
	}

	## Save space by discarding previous index.
	delete $seq{ $n };
}


__END__
[20:29:57.57] P:\test>412061 -LEN=1000 -COUNT=1000 -N=10 > log1
Generated & encoded data at P:\test\412061.pl line 26.
2-char substring index built: 65536 entries. at P:\test\412061.pl line 34.
2-char index trimmed to 65535 entries. at P:\test\412061.pl line 45.
Found 65535 2-number runs appearing in at least 2 sets at P:\test\412061.pl line 60.
3-char index trimmed to 785 entries. at P:\test\412061.pl line 45.
Found 785 3-number runs appearing in at least 2 sets at P:\test\412061.pl line 60.
4-char index trimmed to 0 entries. at P:\test\412061.pl line 45.

[20:31:03.28] P:\test>