xxxMMxxxxxxxMMxxxxx ## 2-value runs
xx?MMxxxxxx?MMxxxxx ## Possible 3-value runs
xxxMM?xxxxxxMM?xxxx ## " " "
####
[19:58:08.64] P:\test>412061 -LEN=1000 -COUNT=1000 -N=10 -M=2 >log1
Generated & encoded data.
2-char substring index built: 65536 entries.
2-char index trimmed to 65535 entries.
Found 65535 2-number runs appearing in at least 2 sets.
3-char index trimmed to 757 entries.
Found 757 3-number runs appearing in at least 2 sets.
4-char index trimmed to 1 entries.
Found 1 4-number runs appearing in at least 2 sets.
5-char index trimmed to 0 entries.
[19:59:09.28] P:\test>
####
#! perl -slw
use strict;
use bytes;
use Data::Dumper;
$| = 1;
$" = ', ';
our $LEN ||= 1000;
our $COUNT ||= 1000;
our $M ||= 2;
our $N ||= 3;
=disabled Avoid the AoAs step to cut memory usage.
my @data = map {
[ map{ int rand 256 } 1 .. $LEN ]
} 1 .. $COUNT;
warn "Gen'd data";
=cut
sub uniq { my %x; @x{ @_ } = (); keys %x }
my @encoded = map {
pack 'C*', map{ int rand 256 } 1 .. $LEN;
} 1 .. $COUNT;
warn 'Generated & encoded data';
my %seq;
for my $i ( 0 .. $#encoded ) {
for my $o ( 0 .. $LEN - 2 ) {
push @{ $seq{ 2 }{ substr $encoded[ $i ], $o, 2 } }, "$i:$o";
}
}
warn '2-char substring index built: ', scalar keys %{ $seq{ 2 } }, ' entries.';
for my $n ( 2 .. $N ) {
for my $run ( keys %{ $seq{ $n } } ) {
## Remove duplicated line numbers from index
if( $n > 2 ) { ## Won't happen on the first pass,
@{ $seq{ $n }{ $run } } = uniq @{ $seq{ $n }{ $run } };
}
## Trim any runs that do not appear in at least $M lines
delete $seq{ $n }{ $run } unless @{ $seq{ $n }{ $run } } > $M;
}
warn "$n-char index trimmed to ", scalar keys %{ $seq{ $n } }, ' entries.';
## Skip out early if nothing more to do
last unless keys %{ $seq{ $n } };
## Ouput the $n-char matches.
my $count = 0;
for my $run ( keys %{ $seq{ $n } } ) {
next unless @{ $seq{ $n }{ $run } } > $M;
$count++;
printf "Run [%*s] appears in lines: %s\n",
$n * 4,
join(',', map{ sprintf '%03d', $_ } unpack 'C*', $run ),
"[@{ $seq{ $n }{ $run } }]";
}
warn "Found $count $n-number runs appearing in at least $M sets";
## Go through each run remaining in index $n
for my $run ( keys %{ $seq{ $n } } ) {
## And each line that run was found in
for my $idx ( @{ $seq{ $n }{ $run } } ) {
my( $i, $o ) = split ':', $idx;
## And build a new index ($n+1) who's keys are the previous runs
## +1 char infront
## +1 char behind
push @{ $seq{ $n+1 }{ substr $encoded[ $i ], $o-1, $n+1 } },
"$i:" . ( $o - 1 ) if $o > 1;
push @{ $seq{ $n+1 }{ substr $encoded[ $i ], $o, $n+1 } },
"$i:" . $o if $o < $LEN - $n +1;
}
}
## Save space by discarding previous index.
delete $seq{ $n }
}
__END__
[20:29:57.57] P:\test>412061 -LEN=1000 -COUNT=1000 -N=10 > log1
Generated & encoded data at P:\test\412061.pl line 26.
2-char substring index built: 65536 entries. at P:\test\412061.pl line 34.
2-char index trimmed to 65535 entries. at P:\test\412061.pl line 45.
Found 65535 2-number runs appearing in at least 2 sets at P:\test\412061.pl line 60.
3-char index trimmed to 785 entries. at P:\test\412061.pl line 45.
Found 785 3-number runs appearing in at least 2 sets at P:\test\412061.pl line 60.
4-char index trimmed to 0 entries. at P:\test\412061.pl line 45.
[20:31:03.28] P:\test>