in reply to Re^2: Algorithm Showdown: Fuzzy Matching (Matcher.pm)
in thread Algorithm Showdown: Fuzzy Matching

This is a solution by ysth. It exploits the fact that if a word of M characters is to be matched with N fuzz then if you were to divide the word into N+1 parts at least one of them would have to match exactly. He uses hashes and the regex engine to do his stuff quickly. This appears to scale reasonably well. Ill leave it to ysth to explain in more detail.

package Fuzzy::Matcher::Ysth; use strict; use warnings; use Fuzzy::Matcher; use vars qw/$VERSION @ISA/; @ISA=qw(Fuzzy::Matcher); $VERSION=0.01; # Original implementation by [ysth] # modified to fit Fuzzy::Matcher interface by [demerphq] sub prepare { my ($self)=@_; my $fuzz=$self->{fuzz}; my $length=$self->{strlen}; die "still hardcoded for fuzz 2" if $fuzz != 2; delete $self->{str_hash}; my $fuzz_strings = delete $self->{str_array}; my $chunks = join ")(", map "."x (int($_*$length/($fuzz+1))-int(($_-1)*$length/($fuzz+1) +)), 1..$fuzz+1; my @filters; my $qr = qr/(?=(($chunks)))./; for my $frag (@$fuzz_strings) { my @m = $frag =~ $qr or die "something's horribly wrong, $frag =~ $qr "; push @{$filters[$_]{$m[$_]}}, $frag for 1..$#m; } $self->{ysth_filters} = \@filters; $self->{ysth_qr} = $qr; } sub fuzz_search { my ($self, $seq) = @_; die "Unprepared to fuzz_search!" unless $self->{ysth_filters}; my $fuzz=$self->{fuzz}; my @filters = @{$self->{ysth_filters}}; my $qr = $self->{ysth_qr}; my @matches; my $empty = []; while ($seq =~ /$qr/g) { my %uniq; for (map @{$filters[$_-1]{substr $seq, $-[$_], $+[$_]-$-[$_]}| +|$empty}, 2..$#-) { # print "trying $1 against $_ len $length fuzz $fuzz seq $ +seq\n"; # die; if ( ($1 ^ $_) =~ tr/\0//c <= $fuzz && !$uniq{$_}++) { push @matches, $-[0], ($1 ^ $_) =~ tr/\0//c, $_; } } } return \@matches; } 1;
---
demerphq