--- PostedDFA.pm 2004-12-11 12:31:19.714617600 +0100
+++ DFA.pm 2004-12-11 15:37:03.829067200 +0100
@@ -48,6 +48,15 @@
sub _init {
my ($self,$chars)=@_;
+ if ($chars) {
+ $self->_set_chars($chars);
+ }
+ return $self;
+}
+
+
+sub _set_chars {
+ my ($self,$chars)=@_;
$chars='ACGT' unless defined $chars;
my @achars;
@@ -55,18 +64,17 @@
$self->{charcount}=0;
$self->{charlist}=[];
for my $c (split //,($chars||'ACGT')) {
- next if $hchars{$c} || $hchars{lc($c)};
+ next if $hchars{$c};
push @{$self->{charlist}},$c;
$hchars{$c} = $self->{charcount};
$achars[ord $c] = $self->{charcount};
- $hchars{lc($c)} = $self->{charcount};
- $achars[ord(lc($c))] = $self->{charcount};
$self->{charcount}++
}
+
@{$self}{'achars','bchars','hchars'}=(
\@achars,
- pack( "C*", map { defined $_ ? $_ : 255 } @achars),
+ pack( "C*", map { defined $achars[$_] ? $achars[$_] : 255 } 0..25
+5),
\%hchars
);
@@ -77,7 +85,6 @@
return $self;
}
-
# Associates an integer to a string in the trie.
# trie is a flat string of packed longs.
# Space allocated for the trie increases by doubling
@@ -128,8 +135,9 @@
# we will never store strings that can completely overlap each other.
# Ie if ABBA and BB are valid strings and we search ABBA, we will not
# record that we matched both, only ABBA. This isnt a problem for
-# the purposes at hand as all of our words will be most 1 character
-# different in lengths.
+# the purposes at hand as all of our words will be the same length.
+# A more complicated storage structure could facilitate the issue of
+# overlaps.
# The Idea here is we recurse through the trie keeping "cursors" that
# represent where we would be if we backtracked at each new letter
# encountered. When we find an empty slot we use the state of the
@@ -206,6 +214,11 @@
return if $self->{str_hash}{$str}++;
+ unless ($self->{charlist}) {
+ $self->{fuzzed_chars}{$_}++
+ for split //,$str
+ }
+
my $id=++$self->{strings_stored};
my $fuzz=$self->{fuzz};
@@ -254,6 +267,12 @@
#
sub prepare {
my $self=shift;
+
+ unless ($self->{charlist}) {
+ $self->_set_chars(join "",sort keys %{$self->{fuzzed_chars}});
+ }
+
+
my $partshash=($self->{parts_hash}||={});
my $parts_idx=pack PACKTYPE,0;
@@ -336,19 +355,26 @@
" bytes) Uses:".length($$trie)." bytes\n".
"Trie Nodes Allocated: $nodes\n".
($strings_stored||0)." Fuzzy Strings Stored.\n";
- #print Dump($strings);
+
print "Used ".length($$strings)." bytes to store strings\n";
+ my $max=50;
if ($partsidx) {
print "Used ".length($$partsidx)." to store index\n";
- #my @unpack=unpack PACKTYPE."*",$$partsidx;
- #print "@unpack\n";
+ my @unpack=unpack PACKTYPE."*",$$partsidx;
+
+ if (@unpack>$max) {
+ my $count=splice @unpack,$max,-1;
+ print join(" ",@unpack),"... [$count]\n";
+ } else {
+ print join(" ",@unpack),"\n";
+ }
}
print $self->{fuzzed} ? "Prepared " : "Not Prepared ",
$self->{filled_in} ? "Converted " : "Not Converted ",
"\n";
my $ofs=1;
- while ($ofs<$size and $ofs<1000) {
+ while ($ofs<$size and $ofs<$max) {
my $pos=$ofs;
printf "%6d=%s\n",$pos,
join(":",map {
@@ -356,7 +382,7 @@
unpack PACKTYPE, substr($$trie,($ofs++)*4,4)
} 0 .. $charc);
}
- if ($ofs>1000) {
+ if ($ofs>$max) {
print "(Truncated trie dump due to size)\n";
}
@@ -413,147 +439,155 @@
char *charmap,
UV charcount,
UV fuzz,
- UV start) {
- UV ofs;
- UV cpos = 0;
- UV upto = search_len-strlen;
- UV bmaxlen = strlen+1;
- const unsigned char mask[8]={128,64,32,16,8,4,2,1};
- UV num_fullchecks=0;
- AV* return_array=newAV();
+ UV start)
+{
+ UV ofs;
+ UV cpos = 0;
+ UV upto = search_len-strlen;
+ UV bmaxlen = strlen+1;
+ const unsigned char mask[8]={128,64,32,16,8,4,2,1};
+ UV num_fullchecks=0;
+ AV* return_array=newAV();
- start = start ? start : 1;
- ofs = start;
+ start = start ? start : 1;
+ ofs = start;
+ #ifdef trie_DEBUG
+ printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string);
+ #endif
+
+ while( search_string[cpos] && ofs ) {
+ UV x;
+ UV idx;
+ char curchar;
+ unsigned char curcharidx;
+
#ifdef trie_DEBUG
- printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string);
+ printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,cpos,s
+trlen,veclen);
#endif
- while( search_string[cpos] && ofs ) {
- UV x;
- char c;
-
- #ifdef trie_DEBUG
- printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,
+cpos,strlen,veclen);
- #endif
+ Zero(&seenvec[(cpos % strlen)*veclen],veclen,char);
+ curchar=search_string[cpos++];
+ curcharidx=charmap[ curchar ];
- Zero(&seenvec[(cpos % strlen)*veclen],veclen,char);
- /*
- for (x=0;x<veclen;x++)
- seenvec[(cpos % strlen)*veclen+x]=0;
- */
+ #ifdef trie_DEBUG
+ printf("Pos: %d -- State: %d -- Char: '%c' CIdx: '%d'\n",cpos-1,o
+fs,curchar,curcharidx);
+ #endif
- #ifdef trie_DEBUG
- printf("Pos: %d -- State: %d -- Char: %c\n",cpos,ofs,search
+_string[cpos]);
- #endif
+ if (curcharidx==255) {
+ #ifdef trie_DEBUG
+ printf("Reset to start: %d (bad char)\n",ofs);
+ #endif
+ ofs = start;
+ continue;
+ }
- c=charmap[ search_string[cpos++] ];
- if (c==255) {
- ofs = start;
- } else {
- ofs = trie[ ofs + c ];
+ ofs = trie[ ofs + curcharidx ];
- #ifdef trie_DEBUG
- printf("ofs: %d\n",ofs);
- #endif
+ #ifdef trie_DEBUG
+ printf("New ofs: %d\n",ofs);
+ #endif
- if (trie[ofs+charcount]) {
+ idx=trie[ofs+charcount];
+ if (!idx)
+ continue;
- UV idx=trie[ofs+charcount];
+ #ifdef trie_DEBUG
+ printf("Match idx: %d\n",idx);
+ #endif
- #ifdef trie_DEBUG
- printf("Match idx: %d\n",idx);
- #endif
+ while (stridx[idx]<0) {
+ UV srch_ofs;
+ IV sofs = stridx[idx++];
- while (stridx[idx]) {
- IV sofs = stridx[idx++];
- UV srch_ofs = cpos + sofs;
+ if ((sofs + cpos<0)||(sofs + cpos > upto)) {
+ while(stridx[idx]>0) idx++;
+ continue;
+ }
- #ifdef trie_DEBUG
- printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs);
- #endif
+ srch_ofs = cpos + sofs;
- x= (srch_ofs % strlen)*veclen;
+ #ifdef trie_DEBUG
+ printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs);
+ #endif
- while (stridx[idx]>0) {
+ x= (srch_ofs % strlen)*veclen;
- if ( srch_ofs>=0 && srch_ofs<=upto ) {
- UV check_byte=x+(long)(stridx[idx]/8);
- unsigned char check_bit=stridx[idx] % 8;
- unsigned char seen=mask[check_bit] & seenvec[check_
+byte];
+ while (stridx[idx]>0) {
+ UV check_byte=x+(long)(stridx[idx]/8);
+ unsigned char check_bit=stridx[idx] % 8;
+ unsigned char seen=mask[check_bit] & seenvec[check_byte];
- #ifdef trie_DEBUG
- printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d
+\n",
- stridx[idx],check_byte,check_bit,seen);
- #endif
+ #ifdef trie_DEBUG
+ printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d\n",
+ stridx[idx],check_byte,check_bit,seen);
+ #endif
- if (!seen) {
- char *c= &strings[ (stridx[idx]-1)*bmaxlen ];
- char *s= &search_string[ srch_ofs ];
- UV diff=0;
- num_fullchecks++;
- seenvec[check_byte]=seenvec[check_byte] | mask[ch
+eck_bit];
+ if (!seen) {
+ char *c= &strings[ (stridx[idx]-1)*bmaxlen ];
+ char *s= &search_string[ srch_ofs ];
+ UV diff=0;
+ num_fullchecks++;
+ seenvec[check_byte]=seenvec[check_byte] | mask[check_bit];
- #ifdef trie_DEBUG
- printf("\tsearching at ofs %d / strings_of %d (
+cpos:%d)\n",srch_ofs,stridx[idx],cpos);
- printf("\t> C '%s' | S %s = ",c,s);
- #endif
+ #ifdef trie_DEBUG
+ printf("\tsearching at ofs %d / strings_of %d (cpos:%d)\n",
+srch_ofs,stridx[idx],cpos);
+ printf("\t> C '%s' | S %s = ",c,s);
+ #endif
- while (*c && diff<=fuzz) {
- if (*c++ != *s++) diff++;
- }
+ while (*c && diff<=fuzz) {
+ if (*c++ != *s++) diff++;
+ }
- #ifdef trie_DEBUG
- printf("\tDiff: %d Fuzz:%d\n",diff,fuzz);
- #endif
+ #ifdef trie_DEBUG
+ printf("\tDiff: %d Fuzz:%d\n",diff,fuzz);
+ #endif
- if (diff <= fuzz) {
- #ifdef trie_PACKED_RETURN
- {
- Match_Return ret;
- #ifdef HAS_HTONL
- ret.ofs=PerlSock_htonl(ofs);
- ret.diff=PerlSock_htonl(diff);
- ret.match=PerlSock_htonl((stridx[idx]-1));
- #else
- ret.ofs=ofs;
- ret.diff=diff;
- ret.match=(stridx[idx]-1);
- #endif
- {
- SV *sv=newSVpvn((char*)(&ret),sizeof(ret));
- av_push(return_array,sv);
- }
+ if (diff <= fuzz) {
+#ifdef trie_PACKED_RETURN
+ {
+ Match_Return ret;
+#ifdef HAS_HTONL
+ ret.ofs=PerlSock_htonl(ofs);
+ ret.diff=PerlSock_htonl(diff);
+ ret.match=PerlSock_htonl((stridx[idx]-1));
+#else
+ ret.ofs=ofs;
+ ret.diff=diff;
+ ret.match=(stridx[idx]-1);
+#endif
+ {
+ SV *sv=newSVpvn((char*)(&ret),sizeof(ret));
+ av_push(return_array,sv);
+ }
- }
- #else
- {
- SV *sv=newSViv((UV)(srch_ofs));
- av_push(return_array,sv);
- }
- {
- SV *sv=newSViv((IV)diff);
- av_push(return_array,sv);
- }
- {
- SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmax
+len],0);
- av_push(return_array,sv);
- }
- #endif
- }
- }
- }
- idx++;
- } /* each string */
- } /* each ofs */
- } /* do we have a hit */
- } /* legal char? */
- } /* walk the string */
- {
- SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullCheck
+s",TRUE);
- sv_setuv(num_fullchecks_sv,num_fullchecks);
+ }
+#else
+ {
+ SV *sv=newSViv((UV)(srch_ofs));
+ av_push(return_array,sv);
+ }
+ {
+ SV *sv=newSViv((IV)diff);
+ av_push(return_array,sv);
+ }
+ {
+ SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmaxlen],0);
+ av_push(return_array,sv);
+ }
+#endif
+ }
+ }
+ idx++;
+ }
}
- return return_array;
+ }
+ {
+ SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullChecks"
+,TRUE);
+ sv_setuv(num_fullchecks_sv,num_fullchecks);
+ }
+ return return_array;
}
+
|