#! perl -slw use strict; use Time::HiRes qw[ time ]; use bytes; sub mem{ my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` =~ m[\"([\d,]+\s+.)\"$]; $mem } $|++; our $FUZZ ||= 2; our $KEYLEN ||= 25; our $MEM ||= 0; die "Need two files" unless @ARGV == 2; my $start = time(); open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 0 ] : $!"; chomp( my @fragments = ); close FRAGS; open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 1 ] : $!"; my $count = 0; while( my $seq = ) { chomp $seq; my $seqLen = length $seq; my( $masked, $offset2, $fuz, $mask ); for my $frag ( @fragments ) { $KEYLEN = length $frag; my $minZeros = chr( 0 ) x int( $KEYLEN / ( $FUZZ + 1 )); my $maskReps = int( $seqLen / $KEYLEN ); my $maskLen = $maskReps * $KEYLEN; my $limit = $seqLen - $KEYLEN; my $mask = $frag x $maskReps; for ( 0 .. $KEYLEN - 1 ) { $masked = $mask ^ substr( $seq, $_, $maskLen ); while( $masked =~ m[$minZeros]g ) { $offset2 = pos( $masked ) - ( pos( $masked ) % $KEYLEN ); last unless $_ + $offset2 <= $limit; if( ( $fuz = substr( $masked, $offset2, $KEYLEN ) =~ tr[\0][]c ) <= $FUZZ ) { ++$count; printf "line:$. offset:%d fuzz:%d '%s'\n", $_ + $offset2, $fuz, $frag; } pos( $masked ) = $offset2 + $KEYLEN; } } } } close SEQ; warn "Found $count matches in (secs): ", time() - $start, ( $MEM ? ' Mem: ' . mem() : () ), $/;