in reply to Re: Algorithm Showdown: (A litany of failures)
in thread Algorithm Showdown: Fuzzy Matching
Here are the test scripts used in the parent post exercises.
408636-4.pl
#! perl -slw use strict; use Time::HiRes qw[ time ]; use bytes; sub mem{ my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` =~ m[\"([\d,]+\s+.)\"$]; $mem } $|++; our $FUZZ ||= 2; our $KEYLEN ||= 25; our $MEM ||= 0; die "Need two files" unless @ARGV == 2; my $start = time(); open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 0 ] : $!"; chomp( my @fragments = <FRAGS> ); close FRAGS; open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 1 ] : $!"; my $count = 0; while( my $seq = <SEQ> ) { chomp $seq; my $seqLen = length $seq; my( $masked, $offset2, $fuz, $mask ); for my $frag ( @fragments ) { $KEYLEN = length $frag; my $minZeros = chr( 0 ) x int( $KEYLEN / ( $FUZZ + 1 )); my $maskReps = int( $seqLen / $KEYLEN ); my $maskLen = $maskReps * $KEYLEN; my $limit = $seqLen - $KEYLEN; my $mask = $frag x $maskReps; for ( 0 .. $KEYLEN - 1 ) { $masked = $mask ^ substr( $seq, $_, $maskLen ); while( $masked =~ m[$minZeros]g ) { $offset2 = pos( $masked ) - ( pos( $masked ) % $KEYLEN + ); last unless $_ + $offset2 <= $limit; if( ( $fuz = substr( $masked, $offset2, $KEYLEN ) =~ tr +[\0][]c ) <= $FUZZ ) { ++$count; printf "line:$. offset:%d fuzz:%d '%s'\n", $_ + $offset2, $fuz, $frag; } pos( $masked ) = $offset2 + $KEYLEN; } } } } close SEQ; warn "Found $count matches in (secs): ", time() - $start, ( $MEM ? ' Mem: ' . mem() : () ), $/;
demerphq.pl
#! perl -slw use strict; use Fuzzy::Matcher::DFA; use Time::HiRes qw[ time ]; sub mem{ my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` =~ m[\"([\d,]+\s+.)\"$]; $mem } our $FUZZ ||= 2; our $KEYLEN ||= 25; our $MEM ||= 0; die "Need two files" unless @ARGV == 2; my $start = time(); my $matcher = Fuzzy::Matcher::DFA->new( $FUZZ, $KEYLEN ); open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 1 ] : $!"; while( <FRAGS> ) { chomp; $matcher->fuzz_store( $_ ); } close FRAGS; $matcher->prepare; open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 0 ] : $!"; my $count = 0; while( <SEQ> ) { chomp; my $results = $matcher->fuzz_search( $_ ); $count += @$results / 3; printf "line:$. offset:%d fuzz:%d '%s'\n", splice @$results, 0, 3 while @$results; } my $elapsed = time() - $start; close SEQ; warn "Found $count matches in (secs): ", $elapsed, ( $MEM ? ' Mem: ' . mem() : () ), $/;
ysth.pl
#! perl -slw use strict; use Time::HiRes qw[ time ]; sub mem{ my( $mem ) = `tasklist /nh /fo csv /fi \"pid eq $$\"` =~ m[\"([\d,]+\s+.)\"$]; $mem } our $FUZZ ||= 2; our $KEYLEN ||= 25; our $MEM ||= 0; die "Need two files" unless @ARGV == 2; my $start = time(); open FRAGS, '<', $ARGV[ 0 ] or die "$ARGV[ 0 ] : $!"; chomp( my @fragments = <FRAGS> ); close FRAGS; # initialization code my $chunks = join ")(", map "."x ( int( $_ * $KEYLEN / ( $FUZZ + 1 ) ) - int( ( $_ - 1 ) * $KEYLEN / ( $FUZZ + 1 ) ) ) , 1 .. $FUZZ + 1; my $qr = qr/(?=(($chunks)))./; my @filters; for my $frag ( @fragments ) { my @m = $frag =~ $qr or die "something's horribly wrong, $frag =~ $qr "; push @{ $filters[$_]{ $m[ $_ ] } }, $frag for 1..$#m; } open SEQ, '<', $ARGV[ 1 ] or die "$ARGV[ 1 ] : $!"; my $empty = []; my $count = 0; # search code while( my $seq = <SEQ> ) { while( $seq =~ /$qr/g ) { my %uniq; for ( map @{ $filters[ $_ - 1 ]{ substr $seq, $-[$_], $+[$_]-$-[$_] + } || $empty }, 2..$#- ) { if ( ( $1 ^ $_ ) =~ tr/\0//c <= $FUZZ && !$uniq{ $_ }++ ) +{ ++$count; printf "line:$. offset:%d fuzz: %d '%s'\n", $-[ 0 ], ( ( $1 ^ $_ ) =~ tr/\0//c ), $_; } } } } my $elapsed = time() - $start; close SEQ; warn "Found $count matches in (secs): ", $elapsed, ( $MEM ? ' Mem: ' . mem() : () ), $/;
A simple script for automating mass testing of the above scripts, plus any other candidates over any number of tests.
Not used for the exercises above, because it was written, while the above tests were running, as an antithisis of this and this, to show how simple it is to follow the K.I.S.S principle. It's trivially incomplete, but the essence of it is there and works.
showdown2.pl
#! perl -slw use strict; use Data::Dumper; use Benchmark::Timer; our $FUZZ ||= 2; our $MEM ||= 0; our $ITERS ||= -2; my $T = new Benchmark::Timer; my %tests; @ARGV = map quotemeta, @ARGV; for my $seqFile ( glob 'seq.*' ) { for my $fragFile ( glob 'frg.*' ) { my $test = sprintf '[%s][%s]', $fragFile, $seqFile; for my $candidate ( @ARGV ) { my $cmd = qq[ $candidate -FUZZ=$FUZZ -MEM=$MEM $fragFile $seqFile 2>&1 1> $candidate\\"$test".out ]; $cmd =~ s[\s+][ ]g; #print "'$cmd'"; for ( 1 .. $ITERS ) { $T->start( $test . $candidate ); my $result = `$cmd`; $T->stop( $test . $candidate ); #print "'$result'"; if( my( $found, $time, $mem, $scale ) = $result =~ m[Found (\d+) matches in \(secs\): ([\d\.]+) Mem: ([ +\d,]+) K] ) { $tests{ $test }{ $candidate }{ found } += $found; $tests{ $test }{ $candidate }{ time } += $time; $mem =~ tr[,][]d; $tests{ $test }{ $candidate }{ mem } += $mem; } else { warn "Bad return data: '$result'"; } } $tests{ $test }{ $candidate }{ $_ } /= $ITERS for qw[ found time mem ]; } } } for my $test ( sort keys %tests ) { print "\n------ $test"; print " Who: Matches Secs(ave) Memory"; printf "%10s: %9d %8.3f %9d\n", $_, @{ $tests{ $test }{ $_ } }{ qw[ found time mem ] } for @ARGV; }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
| A reply falls below the community's threshold of quality. You may see it by logging in. |