in reply to Re^3: Algorithm Showdown: Fuzzy Matching (DFA.pm)
in thread Algorithm Showdown: Fuzzy Matching

Patch applied to fix the bug BrowserUk identified and also a C cleanup. (Updated Patch)

--- PostedDFA.pm 2004-12-11 12:31:19.714617600 +0100 +++ DFA.pm 2004-12-11 15:37:03.829067200 +0100 @@ -48,6 +48,15 @@ sub _init { my ($self,$chars)=@_; + if ($chars) { + $self->_set_chars($chars); + } + return $self; +} + + +sub _set_chars { + my ($self,$chars)=@_; $chars='ACGT' unless defined $chars; my @achars; @@ -55,18 +64,17 @@ $self->{charcount}=0; $self->{charlist}=[]; for my $c (split //,($chars||'ACGT')) { - next if $hchars{$c} || $hchars{lc($c)}; + next if $hchars{$c}; push @{$self->{charlist}},$c; $hchars{$c} = $self->{charcount}; $achars[ord $c] = $self->{charcount}; - $hchars{lc($c)} = $self->{charcount}; - $achars[ord(lc($c))] = $self->{charcount}; $self->{charcount}++ } + @{$self}{'achars','bchars','hchars'}=( \@achars, - pack( "C*", map { defined $_ ? $_ : 255 } @achars), + pack( "C*", map { defined $achars[$_] ? $achars[$_] : 255 } 0..25 +5), \%hchars ); @@ -77,7 +85,6 @@ return $self; } - # Associates an integer to a string in the trie. # trie is a flat string of packed longs. # Space allocated for the trie increases by doubling @@ -128,8 +135,9 @@ # we will never store strings that can completely overlap each other. # Ie if ABBA and BB are valid strings and we search ABBA, we will not # record that we matched both, only ABBA. This isnt a problem for -# the purposes at hand as all of our words will be most 1 character -# different in lengths. +# the purposes at hand as all of our words will be the same length. +# A more complicated storage structure could facilitate the issue of +# overlaps. # The Idea here is we recurse through the trie keeping "cursors" that # represent where we would be if we backtracked at each new letter # encountered. When we find an empty slot we use the state of the @@ -206,6 +214,11 @@ return if $self->{str_hash}{$str}++; + unless ($self->{charlist}) { + $self->{fuzzed_chars}{$_}++ + for split //,$str + } + my $id=++$self->{strings_stored}; my $fuzz=$self->{fuzz}; @@ -254,6 +267,12 @@ # sub prepare { my $self=shift; + + unless ($self->{charlist}) { + $self->_set_chars(join "",sort keys %{$self->{fuzzed_chars}}); + } + + my $partshash=($self->{parts_hash}||={}); my $parts_idx=pack PACKTYPE,0; @@ -336,19 +355,26 @@ " bytes) Uses:".length($$trie)." bytes\n". "Trie Nodes Allocated: $nodes\n". ($strings_stored||0)." Fuzzy Strings Stored.\n"; - #print Dump($strings); + print "Used ".length($$strings)." bytes to store strings\n"; + my $max=50; if ($partsidx) { print "Used ".length($$partsidx)." to store index\n"; - #my @unpack=unpack PACKTYPE."*",$$partsidx; - #print "@unpack\n"; + my @unpack=unpack PACKTYPE."*",$$partsidx; + + if (@unpack>$max) { + my $count=splice @unpack,$max,-1; + print join(" ",@unpack),"... [$count]\n"; + } else { + print join(" ",@unpack),"\n"; + } } print $self->{fuzzed} ? "Prepared " : "Not Prepared ", $self->{filled_in} ? "Converted " : "Not Converted ", "\n"; my $ofs=1; - while ($ofs<$size and $ofs<1000) { + while ($ofs<$size and $ofs<$max) { my $pos=$ofs; printf "%6d=%s\n",$pos, join(":",map { @@ -356,7 +382,7 @@ unpack PACKTYPE, substr($$trie,($ofs++)*4,4) } 0 .. $charc); } - if ($ofs>1000) { + if ($ofs>$max) { print "(Truncated trie dump due to size)\n"; } @@ -413,147 +439,155 @@ char *charmap, UV charcount, UV fuzz, - UV start) { - UV ofs; - UV cpos = 0; - UV upto = search_len-strlen; - UV bmaxlen = strlen+1; - const unsigned char mask[8]={128,64,32,16,8,4,2,1}; - UV num_fullchecks=0; - AV* return_array=newAV(); + UV start) +{ + UV ofs; + UV cpos = 0; + UV upto = search_len-strlen; + UV bmaxlen = strlen+1; + const unsigned char mask[8]={128,64,32,16,8,4,2,1}; + UV num_fullchecks=0; + AV* return_array=newAV(); - start = start ? start : 1; - ofs = start; + start = start ? start : 1; + ofs = start; + #ifdef trie_DEBUG + printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string); + #endif + + while( search_string[cpos] && ofs ) { + UV x; + UV idx; + char curchar; + unsigned char curcharidx; + #ifdef trie_DEBUG - printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string); + printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,cpos,s +trlen,veclen); #endif - while( search_string[cpos] && ofs ) { - UV x; - char c; - - #ifdef trie_DEBUG - printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen, +cpos,strlen,veclen); - #endif + Zero(&seenvec[(cpos % strlen)*veclen],veclen,char); + curchar=search_string[cpos++]; + curcharidx=charmap[ curchar ]; - Zero(&seenvec[(cpos % strlen)*veclen],veclen,char); - /* - for (x=0;x<veclen;x++) - seenvec[(cpos % strlen)*veclen+x]=0; - */ + #ifdef trie_DEBUG + printf("Pos: %d -- State: %d -- Char: '%c' CIdx: '%d'\n",cpos-1,o +fs,curchar,curcharidx); + #endif - #ifdef trie_DEBUG - printf("Pos: %d -- State: %d -- Char: %c\n",cpos,ofs,search +_string[cpos]); - #endif + if (curcharidx==255) { + #ifdef trie_DEBUG + printf("Reset to start: %d (bad char)\n",ofs); + #endif + ofs = start; + continue; + } - c=charmap[ search_string[cpos++] ]; - if (c==255) { - ofs = start; - } else { - ofs = trie[ ofs + c ]; + ofs = trie[ ofs + curcharidx ]; - #ifdef trie_DEBUG - printf("ofs: %d\n",ofs); - #endif + #ifdef trie_DEBUG + printf("New ofs: %d\n",ofs); + #endif - if (trie[ofs+charcount]) { + idx=trie[ofs+charcount]; + if (!idx) + continue; - UV idx=trie[ofs+charcount]; + #ifdef trie_DEBUG + printf("Match idx: %d\n",idx); + #endif - #ifdef trie_DEBUG - printf("Match idx: %d\n",idx); - #endif + while (stridx[idx]<0) { + UV srch_ofs; + IV sofs = stridx[idx++]; - while (stridx[idx]) { - IV sofs = stridx[idx++]; - UV srch_ofs = cpos + sofs; + if ((sofs + cpos<0)||(sofs + cpos > upto)) { + while(stridx[idx]>0) idx++; + continue; + } - #ifdef trie_DEBUG - printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs); - #endif + srch_ofs = cpos + sofs; - x= (srch_ofs % strlen)*veclen; + #ifdef trie_DEBUG + printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs); + #endif - while (stridx[idx]>0) { + x= (srch_ofs % strlen)*veclen; - if ( srch_ofs>=0 && srch_ofs<=upto ) { - UV check_byte=x+(long)(stridx[idx]/8); - unsigned char check_bit=stridx[idx] % 8; - unsigned char seen=mask[check_bit] & seenvec[check_ +byte]; + while (stridx[idx]>0) { + UV check_byte=x+(long)(stridx[idx]/8); + unsigned char check_bit=stridx[idx] % 8; + unsigned char seen=mask[check_bit] & seenvec[check_byte]; - #ifdef trie_DEBUG - printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d +\n", - stridx[idx],check_byte,check_bit,seen); - #endif + #ifdef trie_DEBUG + printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d\n", + stridx[idx],check_byte,check_bit,seen); + #endif - if (!seen) { - char *c= &strings[ (stridx[idx]-1)*bmaxlen ]; - char *s= &search_string[ srch_ofs ]; - UV diff=0; - num_fullchecks++; - seenvec[check_byte]=seenvec[check_byte] | mask[ch +eck_bit]; + if (!seen) { + char *c= &strings[ (stridx[idx]-1)*bmaxlen ]; + char *s= &search_string[ srch_ofs ]; + UV diff=0; + num_fullchecks++; + seenvec[check_byte]=seenvec[check_byte] | mask[check_bit]; - #ifdef trie_DEBUG - printf("\tsearching at ofs %d / strings_of %d ( +cpos:%d)\n",srch_ofs,stridx[idx],cpos); - printf("\t> C '%s' | S %s = ",c,s); - #endif + #ifdef trie_DEBUG + printf("\tsearching at ofs %d / strings_of %d (cpos:%d)\n", +srch_ofs,stridx[idx],cpos); + printf("\t> C '%s' | S %s = ",c,s); + #endif - while (*c && diff<=fuzz) { - if (*c++ != *s++) diff++; - } + while (*c && diff<=fuzz) { + if (*c++ != *s++) diff++; + } - #ifdef trie_DEBUG - printf("\tDiff: %d Fuzz:%d\n",diff,fuzz); - #endif + #ifdef trie_DEBUG + printf("\tDiff: %d Fuzz:%d\n",diff,fuzz); + #endif - if (diff <= fuzz) { - #ifdef trie_PACKED_RETURN - { - Match_Return ret; - #ifdef HAS_HTONL - ret.ofs=PerlSock_htonl(ofs); - ret.diff=PerlSock_htonl(diff); - ret.match=PerlSock_htonl((stridx[idx]-1)); - #else - ret.ofs=ofs; - ret.diff=diff; - ret.match=(stridx[idx]-1); - #endif - { - SV *sv=newSVpvn((char*)(&ret),sizeof(ret)); - av_push(return_array,sv); - } + if (diff <= fuzz) { +#ifdef trie_PACKED_RETURN + { + Match_Return ret; +#ifdef HAS_HTONL + ret.ofs=PerlSock_htonl(ofs); + ret.diff=PerlSock_htonl(diff); + ret.match=PerlSock_htonl((stridx[idx]-1)); +#else + ret.ofs=ofs; + ret.diff=diff; + ret.match=(stridx[idx]-1); +#endif + { + SV *sv=newSVpvn((char*)(&ret),sizeof(ret)); + av_push(return_array,sv); + } - } - #else - { - SV *sv=newSViv((UV)(srch_ofs)); - av_push(return_array,sv); - } - { - SV *sv=newSViv((IV)diff); - av_push(return_array,sv); - } - { - SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmax +len],0); - av_push(return_array,sv); - } - #endif - } - } - } - idx++; - } /* each string */ - } /* each ofs */ - } /* do we have a hit */ - } /* legal char? */ - } /* walk the string */ - { - SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullCheck +s",TRUE); - sv_setuv(num_fullchecks_sv,num_fullchecks); + } +#else + { + SV *sv=newSViv((UV)(srch_ofs)); + av_push(return_array,sv); + } + { + SV *sv=newSViv((IV)diff); + av_push(return_array,sv); + } + { + SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmaxlen],0); + av_push(return_array,sv); + } +#endif + } + } + idx++; + } } - return return_array; + } + { + SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullChecks" +,TRUE); + sv_setuv(num_fullchecks_sv,num_fullchecks); + } + return return_array; } +
---
demerphq