#! perl -slw use strict; use Time::HiRes qw[ time ]; use bytes; sub mem{ my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` =~ m[\"([\d,]+\s+.)\"$]; $mem } $|++; our $FUZZ ||= 2; our $KEYLEN ||= 25; our $MEM ||= 0; die "Need two files" unless @ARGV == 2; my $start = time(); open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 0 ] : $!"; chomp( my @fragments = ); close FRAGS; open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 1 ] : $!"; my $count = 0; while( my $seq = ) { chomp $seq; my $seqLen = length $seq; my( $masked, $offset2, $fuz, $mask ); for my $frag ( @fragments ) { $KEYLEN = length $frag; my $minZeros = chr( 0 ) x int( $KEYLEN / ( $FUZZ + 1 )); my $maskReps = int( $seqLen / $KEYLEN ); my $maskLen = $maskReps * $KEYLEN; my $limit = $seqLen - $KEYLEN; my $mask = $frag x $maskReps; for ( 0 .. $KEYLEN - 1 ) { $masked = $mask ^ substr( $seq, $_, $maskLen ); while( $masked =~ m[$minZeros]g ) { $offset2 = pos( $masked ) - ( pos( $masked ) % $KEYLEN ); last unless $_ + $offset2 <= $limit; if( ( $fuz = substr( $masked, $offset2, $KEYLEN ) =~ tr[\0][]c ) <= $FUZZ ) { ++$count; printf "line:$. offset:%d fuzz:%d '%s'\n", $_ + $offset2, $fuz, $frag; } pos( $masked ) = $offset2 + $KEYLEN; } } } } close SEQ; warn "Found $count matches in (secs): ", time() - $start, ( $MEM ? ' Mem: ' . mem() : () ), $/; #### #! perl -slw use strict; use Fuzzy::Matcher::DFA; use Time::HiRes qw[ time ]; sub mem{ my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` =~ m[\"([\d,]+\s+.)\"$]; $mem } our $FUZZ ||= 2; our $KEYLEN ||= 25; our $MEM ||= 0; die "Need two files" unless @ARGV == 2; my $start = time(); my $matcher = Fuzzy::Matcher::DFA->new( $FUZZ, $KEYLEN ); open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 1 ] : $!"; while( ) { chomp; $matcher->fuzz_store( $_ ); } close FRAGS; $matcher->prepare; open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 0 ] : $!"; my $count = 0; while( ) { chomp; my $results = $matcher->fuzz_search( $_ ); $count += @$results / 3; printf "line:$. offset:%d fuzz:%d '%s'\n", splice @$results, 0, 3 while @$results; } my $elapsed = time() - $start; close SEQ; warn "Found $count matches in (secs): ", $elapsed, ( $MEM ? ' Mem: ' . mem() : () ), $/; #### #! perl -slw use strict; use Time::HiRes qw[ time ]; sub mem{ my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` =~ m[\"([\d,]+\s+.)\"$]; $mem } our $FUZZ ||= 2; our $KEYLEN ||= 25; our $MEM ||= 0; die "Need two files" unless @ARGV == 2; my $start = time(); open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 0 ] : $!"; chomp( my @fragments = ); close FRAGS; # initialization code my $chunks = join ")(", map "."x ( int( $_ * $KEYLEN / ( $FUZZ + 1 ) ) - int( ( $_ - 1 ) * $KEYLEN / ( $FUZZ + 1 ) ) ) , 1 .. $FUZZ + 1; my $qr = qr/(?=(($chunks)))./; my @filters; for my $frag ( @fragments ) { my @m = $frag =~ $qr or die "something's horribly wrong, $frag =~ $qr "; push @{ $filters[$_]{ $m[ $_ ] } }, $frag for 1..$#m; } open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 1 ] : $!"; my $empty = []; my $count = 0; # search code while( my $seq = ) { while( $seq =~ /$qr/g ) { my %uniq; for ( map @{ $filters[ $_ - 1 ]{ substr $seq, $-[$_], $+[$_]-$-[$_] } || $empty }, 2..$#- ) { if ( ( $1 ^ $_ ) =~ tr/\0//c <= $FUZZ && !$uniq{ $_ }++ ) { ++$count; printf "line:$. offset:%d fuzz: %d '%s'\n", $-[ 0 ], ( ( $1 ^ $_ ) =~ tr/\0//c ), $_; } } } } my $elapsed = time() - $start; close SEQ; warn "Found $count matches in (secs): ", $elapsed, ( $MEM ? ' Mem: ' . mem() : () ), $/; #### #! perl -slw use strict; use Data::Dumper; use Benchmark::Timer; our $FUZZ ||= 2; our $MEM ||= 0; our $ITERS ||= -2; my $T = new Benchmark::Timer; my %tests; @ARGV = map quotemeta, @ARGV; for my $seqFile ( glob 'seq.*' ) { for my $fragFile ( glob 'frg.*' ) { my $test = sprintf '[%s][%s]', $fragFile, $seqFile; for my $candidate ( @ARGV ) { my $cmd = qq[ $candidate -FUZZ=$FUZZ -MEM=$MEM $fragFile $seqFile 2>&1 1> $candidate\\"$test".out ]; $cmd =~ s[\s+][ ]g; #print "'$cmd'"; for ( 1 .. $ITERS ) { $T->start( $test . $candidate ); my $result = `$cmd`; $T->stop( $test . $candidate ); #print "'$result'"; if( my( $found, $time, $mem, $scale ) = $result =~ m[Found (\d+) matches in \(secs\): ([\d\.]+) Mem: ([\d,]+) K] ) { $tests{ $test }{ $candidate }{ found } += $found; $tests{ $test }{ $candidate }{ time } += $time; $mem =~ tr[,][]d; $tests{ $test }{ $candidate }{ mem } += $mem; } else { warn "Bad return data: '$result'"; } } $tests{ $test }{ $candidate }{ $_ } /= $ITERS for qw[ found time mem ]; } } } for my $test ( sort keys %tests ) { print "\n------ $test"; print " Who: Matches Secs(ave) Memory"; printf "%10s: %9d %8.3f %9d\n", $_, @{ $tests{ $test }{ $_ } }{ qw[ found time mem ] } for @ARGV; }