#! perl -slw use strict; use Time::HiRes qw[ time ]; use bytes; sub mem{ my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` =~ m[\"([\d,]+\s+.)\"$]; $mem } $|++; our $FUZZ ||= 2; our $KEYLEN ||= 25; our $MEM ||= 0; die "Need two files" unless @ARGV == 2; my $start = time(); open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 0 ] : $!"; chomp( my @fragments = ); close FRAGS; open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 1 ] : $!"; my $count = 0; while( my $seq = ) { chomp $seq; my $seqLen = length $seq; my( $masked, $offset2, $fuz, $mask ); for my $frag ( @fragments ) { $KEYLEN = length $frag; my $minZeros = chr( 0 ) x int( $KEYLEN / ( $FUZZ + 1 )); my $maskReps = int( $seqLen / $KEYLEN ); my $maskLen = $maskReps * $KEYLEN; my $limit = $seqLen - $KEYLEN; my $mask = $frag x $maskReps; for ( 0 .. $KEYLEN - 1 ) { $masked = $mask ^ substr( $seq, $_, $maskLen ); while( $masked =~ m[$minZeros]g ) { $offset2 = pos( $masked ) - ( pos( $masked ) % $KEYLEN ); last unless $_ + $offset2 <= $limit; if( ( $fuz = substr( $masked, $offset2, $KEYLEN ) =~ tr[\0][]c ) <= $FUZZ ) { ++$count; printf "line:$. offset:%d fuzz:%d '%s'\n", $_ + $offset2, $fuz, $frag; } pos( $masked ) = $offset2 + $KEYLEN; } } } } close SEQ; warn "Found $count matches in (secs): ", time() - $start, ( $MEM ? ' Mem: ' . mem() : () ), $/; ##

##

#! perl -slw
use strict;
use Fuzzy::Matcher::DFA;
use Time::HiRes qw[ time ];

sub mem{ 
    my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` 
        =~ m[\"([\d,]+\s+.)\"$]; 
    $mem 
}

our $FUZZ	||= 2;
our $KEYLEN	||= 25;
our $MEM	||= 0;

die "Need two files" unless @ARGV == 2;

my $start = time();

my $matcher = Fuzzy::Matcher::DFA->new( $FUZZ, $KEYLEN );
open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 1 ] : $!";
while(  ) {
	chomp;
	$matcher->fuzz_store( $_ );
}
close FRAGS;

$matcher->prepare;

open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 0 ] : $!";

my $count = 0;
while(  ) {
	chomp;
	my $results = $matcher->fuzz_search( $_ );
	$count += @$results / 3;
	printf "line:$. offset:%d fuzz:%d '%s'\n", 
        splice @$results, 0, 3 while @$results;
}
my $elapsed = time() - $start;
close SEQ;
warn "Found $count matches in (secs): ", $elapsed, 
    ( $MEM ? ' Mem: ' . mem() : () ), $/;

##

##

#! perl -slw
use strict;
use Time::HiRes qw[ time ];

sub mem{ 
    my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` 
        =~ m[\"([\d,]+\s+.)\"$]; 
    $mem 
}

our $FUZZ	||=  2;
our $KEYLEN ||= 25;
our $MEM	||= 0;

die "Need two files" unless @ARGV == 2;

my $start = time();

open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 0 ] : $!";
chomp( my @fragments =  );
close FRAGS;

# initialization code
my $chunks = join ")(", 
	map "."x ( int(   $_       * $KEYLEN / ( $FUZZ + 1 ) ) 
	         - int( ( $_ - 1 ) * $KEYLEN / ( $FUZZ + 1 ) ) )
	, 1 .. $FUZZ + 1;
my $qr = qr/(?=(($chunks)))./;

my @filters;
for my $frag ( @fragments ) {
	my @m = $frag =~ $qr
		or die "something's horribly wrong, $frag =~ $qr ";
	push @{ $filters[$_]{ $m[ $_ ] } }, $frag for 1..$#m;
}

open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 1 ] : $!";

my $empty = [];
my $count = 0;
# search code
while( my $seq =  ) {
	while( $seq =~ /$qr/g ) {
		my %uniq;
		for ( map @{ 
                $filters[ $_ - 1 ]{ substr $seq, $-[$_], $+[$_]-$-[$_] } 
                || $empty 
            }, 2..$#- ) {
			if ( ( $1 ^ $_ ) =~ tr/\0//c <= $FUZZ && !$uniq{ $_ }++ ) {
				++$count;
				printf "line:$. offset:%d fuzz: %d '%s'\n", $-[ 0 ], 
                    ( ( $1 ^ $_ ) =~ tr/\0//c ), $_;
			}
		}
	}
}
my $elapsed = time() - $start;
close SEQ;
warn "Found $count matches in (secs): ", $elapsed, 
    ( $MEM ? ' Mem: ' . mem() : () ), $/;

##

##

#! perl -slw
use strict;
use Data::Dumper;
use Benchmark::Timer;

our $FUZZ	||=  2;
our $MEM	||=  0;
our $ITERS	||= -2;

my $T = new Benchmark::Timer;
my %tests; 
@ARGV = map quotemeta,  @ARGV;

for my $seqFile ( glob 'seq.*' ) {
	for my $fragFile ( glob 'frg.*' ) {
		my $test = sprintf '[%s][%s]', $fragFile, $seqFile;
		for my $candidate ( @ARGV ) {
			my $cmd = qq[
				$candidate -FUZZ=$FUZZ -MEM=$MEM
				$fragFile 
				$seqFile 
				2>&1
				1> $candidate\\"$test".out
			];
			$cmd =~ s[\s+][ ]g;
#print "'$cmd'";
			for ( 1 .. $ITERS ) {			
				$T->start( $test . $candidate );
				my $result = `$cmd`;
				$T->stop( $test . $candidate );
#print "'$result'";
				if( my( $found, $time, $mem, $scale ) = $result =~ 
				  m[Found (\d+) matches in \(secs\): ([\d\.]+) Mem: ([\d,]+) K]
				) {
					$tests{ $test }{ $candidate }{ found } += $found;
					$tests{ $test }{ $candidate }{ time  } += $time;
					$mem =~ tr[,][]d;
					$tests{ $test }{ $candidate }{ mem   } += $mem;
				}
				else { warn "Bad return data: '$result'"; }
			}
			$tests{ $test }{ $candidate }{ $_ } 
                       /= $ITERS for qw[ found time mem ];
		}
	}
}
for my $test ( sort keys %tests ) {
	print "\n------ $test";
	print "       Who:   Matches Secs(ave)    Memory";
	printf "%10s: %9d  %8.3f %9d\n", $_,
		@{ $tests{ $test }{ $_ } }{ qw[ found time mem ] }
		for @ARGV;
}