use strict; use warnings; use Time::HiRes qw(time); use POSIX qw(strftime); use Getopt::Long; use vars qw/ $VERBOSE /; $VERBOSE=0; exit(main(@ARGV)); BEGIN { # all of this gunk is for timing and logging stuff my @time=(time); my @comment=('Start'); sub mark_time { push @time,time; push @comment,join "",@_; print @_, sprintf " <%9.4f secs>\n",$time[-1]-$time[-2] if $VERBOSE; return $time[-1]; } sub clear_time { push @time,time; push @comment,'Finish'; @comment=map { s/[\t\r\n]/ /g; $_} @comment; printf STDERR "%%#%% %s %s %s\n",strftime("%H:%M:%S",localtime($time[$_])), $_ ? sprintf " (%9.4f : %9.4f)",$time[$_]-$time[$_-1],$time[$_]-$time[0] : '###', ($comment[$_]||'') for 0..$#time; my $total=$time[-1]-$time[0]; print "Total Elapsed Time: ".$total." secs\n"; @time=(time); @comment=(shift||'Start'); return $total; } sub elapsed { my $time=shift; $time||=$time[-1]; print @_, sprintf " [%9.4f secs]\n",time-$time; return time; } END { clear_time(); } } sub search { my ($node,$string,$idx,$maximum)=@_; $idx=0 unless defined $idx; my @match; my $matches; $maximum||=50; #print "\nsearch($node,$string,$idx)\n" if $VERBOSE<0; for (;$idx{id} : ",($node->{value}?$node->{value}:"''"),"\n" # if $VERBOSE<0; if ( $node->{value} ) { push @match,[$idx-length($node->{string}),${$node->{value}},$node->{fuzz},$node] if $matches++<$maximum; } $node=$node->{$char}||$node->{-$char}; die $char,$idx unless $node; } if ( $node->{value} ) { push @match,[$idx-length($node->{string}),${$node->{value}},$node->{fuzz},$node] if $matches++<$maximum; } return $matches,@match; } BEGIN { my $IDX=1; sub get_idx(){"$IDX"} # this builds the trie sub hash_trie_store { my ($node,$string,$value,$fuzz)=@_; $node->{id}||=$IDX++; my $dot=0; foreach my $char (split //,$string) { unless ($node->{$char}) { $node->{$char}={id=>$IDX++}; } $node=$node->{$char}; } $node->{value}=\$_[2]; $node->{fuzz}=$fuzz if defined $fuzz; $node->{string}=$string; #print "Store: $string:$value:$fuzz\n"; $node->{id} } } # this makes fuzzy words from a string and then inserts into the trie. sub make_variants { my ($string,$depth,$chars,$hash,$trie,$rdepth,$rstring)=@_; my $ltrs=ref $chars ? $chars : [ split //, $chars ]; my @char=split//,$string; $trie||={}; $hash||={}; $rdepth||=0; my $count=0; my @follow; $rstring||=$string; if (!$rdepth) { $hash->{$string}=hash_trie_store($trie,$string,$rstring,$rdepth); $count++; #print "* $string, $rdepth\n"; } foreach my $idx (0..$#char) { foreach my $alt (@$ltrs) { next if $alt eq $char[$idx]; local $char[$idx]=$alt; my $str=join "",@char; if (!$hash->{$str}) { $hash->{$str}=hash_trie_store($trie,$str,$rstring,$rdepth+1); $count++; push @follow,$str if $depth>1; #print "$str, ",$rdepth+1,"\n"; } } } foreach my $str (@follow) { $count+=make_variants($str,$depth-1,$ltrs,$hash,$trie,$rdepth+1,$rstring); } return $count; } # this converts a trie into a "dfa" capable of doing the overlapping matching # note it doesnt handle strings of differing size correctly so dont use it for that. sub traverse { my ($root,$chars,$str,$hash)=@_; $hash||=$root; $str="" unless defined $str; my $ltrs=ref $chars ? $chars : [ split //, $chars ]; foreach my $k (@$ltrs) { if ($hash->{$k}) { traverse($root,$chars,$str.$k,$hash->{$k}); } else { if (length($str)>1) { my @chars=(split(//,$str),$k); while (defined shift @chars) { my $csr=$root; for my $c (@chars) { last unless $csr=$csr->{$c}||$csr->{-$c}; } if ($csr) { $hash->{-$k}=$csr; last } } } $hash->{-$k}||=$root; } } return } # slighlty modified version of BrowserUk's code some changes necessary becuase # we allow for differing length words and also because Perl 5.6.2 complained. sub xor_fuzz { my ($length,$FUZZY,$fuzzfile,$stringfile,$maximum)=@_; use bytes; $FUZZY||=2; $maximum||=50; mark_time "xor_fuzz loading '$fuzzfile'\n"; open my $FUZ, '<', $fuzzfile or die "$fuzzfile : $!"; my %fuz; while( <$FUZ> ) { chomp; $fuz{ $_ } = ''; } close $FUZ; mark_time "Loaded ${ \scalar keys %fuz } $length-ers\n"; open my $SEQ, '<', $stringfile or die "$stringfile : $!"; my $totalLen = 0; my $fuzzyComps = 0; while( my $seq = <$SEQ> ) { my @matches; my $matches=0; chomp $seq; $totalLen += length $seq; for my $offset ( 0 .. length( $seq ) - $length ) { my $ssref = \substr( $seq, $offset, $length ); #printf STDERR "\rProcessing sequence %5d offset %05d", $., $offset; for my $fuz ( keys %fuz ) { $fuzzyComps++; my $m = $length - (my $x=( $fuz ^ $$ssref )) =~ tr[\0][\0]; if( $m <= $FUZZY ) { push @matches,[$offset,$fuz,$m] if $matches++<$maximum; } } } mark_time sprintf "XOR #%04d: %04d > %s",$.,$matches,@matches ? ": ". join(", ",map{ sprintf "%06d:%s:%d",@{$_}[0..2] } @matches) : "None."; } mark_time "\n\nProcessed $. sequences\nAverage length: ", $totalLen / $.,"\n"."Total fuzzy comparisons: ", $fuzzyComps,"\n"; close $SEQ; } #utility function sub write_file(&@) { my ($sub,$file,$count,$override)=@_; return if !$override and -e $file; open my $fh,">","$file.tmp" or die "Error writing to '$file.tmp':$!"; print "Writing '$file'\n" if $VERBOSE; for (1..$count) { print $fh $sub->(),"\n" or die "Failed to print to '$file'\n"; } close $fh or die "Failed to close '$file'\n"; rename "$file","$file.bak" or die "Backup Rename failed:$!" if -e $file; rename "$file.tmp","$file" or die "Tmp Rename failed:$!"; mark_time "File Complete: '$file'\n"; } sub main { $|++; my $Fuzz=2; my $Words=1; my $Word_Size=25; my $Strings=1000; my $String_Size=1000; my $OverWrite=0; GetOptions('fuzz=i' => \$Fuzz, 'words=i' => \$Words, 'strings=i' => \$Strings, 'word_size=i' => \$Word_Size, 'string_size=i'=> \$String_Size, 'over_write!' => \$OverWrite, 'verbose!' => \$VERBOSE) or die "Bad Options\n"; my @Chars=qw(A C T G); my $Word_File=sprintf "Fuzz-Words-W%04d-S%04d-WC%04d-SC%04d.fuzz",$Word_Size,$String_Size,$Words,$Strings; my $String_File=sprintf "Fuzz-Strings-W%04d-S%04d-WC%04d-SC%04d.fuzz",$Word_Size,$String_Size,$Words,$Strings; rename $String_File,"$String_File.bak" if -e $String_File and !-e $Word_File; my @words; write_file { my $str=join "",map { $Chars[rand @Chars] } 1..$Word_Size; push @words,$str; $str; } $Word_File,$Words,$OverWrite; print "Getting strings\n" if $VERBOSE; write_file { my $str=$Chars[rand @Chars] x $String_Size; substr($str,$_-1,1)=$Chars[rand @Chars] for 1..$String_Size; for (1..$Word_Size) { my $p=int rand($String_Size-$Word_Size); my $w=$words[rand @words]; substr($str,$p,$Word_Size,$w); } $str; } $String_File,$Strings,$OverWrite; mark_time "Finished building strings\nWords: $Word_File\nStrings: $String_File\n"; clear_time("Starting TRIE\n"); my $construct_time; { # Trie search @words=do{ open my $ifh,"<",$Word_File or die "Can't read Word File '$Word_File':$!"; map { chomp; $_ } <$ifh> }; my $time=mark_time "Got ".scalar(@words)." to fuzzyify\n"; print "Making fuzzy strings\n" if $VERBOSE; my (%trie,%hash); foreach my $word (@words) { print length($word).":$word:" if $VERBOSE; my $count=make_variants($word,$Fuzz,\@Chars,\%hash,\%trie); $time=elapsed($time,$count) if $VERBOSE; } @words=sort keys %hash; mark_time "Fuzzy Words:".scalar(@words)." [".get_idx()." nodes in tree]\n"; #exit(0); print "Doing traverse...\n" if $VERBOSE; traverse(\%trie,\@Chars); mark_time "Finised Traverse"; $construct_time=clear_time("Starting Trie Scan\n"); open my $fh,"<",$String_File or die "Error reading stringfile '$String_File'\n"; while (<$fh>) { chomp(my $string=$_); my ($count,@matches)=search(\%trie,$string); mark_time sprintf "TRIE #%04d: %04d > %s",$.,$count,@matches ? ": ". join(", ",map{ sprintf "%06d:%s:%d",@{$_}[0..2] } @matches) : "None."; } mark_time "Trie Done\n"; } my $scan_time=clear_time("Starting xor_fuzz search\n"); { # Xor Search xor_fuzz($Word_Size,$Fuzz,$Word_File,$String_File); } my $xor_total=clear_time("xor_fuzz search done.\n"); print STDERR "**** WordSize: $Word_Size StringSize: $String_Size WordCount: $Words StringCount: $Strings Xor: $xor_total Trie: " .($scan_time+$construct_time)." ($scan_time + $construct_time)\n"; print "**** WordSize: $Word_Size StringSize: $String_Size WordCount: $Words StringCount: $Strings Xor: $xor_total Trie: " .($scan_time+$construct_time)." ($scan_time + $construct_time)\n"; 0 } #### del *.fuzz trie_xor.pl --word_size=10 --string_size=1000 --words=1 --over_write 2>fuzzlog-10-1000-01.fuzzlog trie_xor.pl --word_size=10 --string_size=1000 --words=2 --over_write 2>fuzzlog-10-1000-02.fuzzlog trie_xor.pl --word_size=10 --string_size=1000 --words=5 --over_write 2>fuzzlog-10-1000-05.fuzzlog trie_xor.pl --word_size=10 --string_size=1000 --words=10 --over_write 2>fuzzlog-10-1000-10.fuzzlog trie_xor.pl --word_size=10 --string_size=10000 --words=1 --over_write 2>fuzzlog-10-10000-01.fuzzlog trie_xor.pl --word_size=10 --string_size=10000 --words=2 --over_write 2>fuzzlog-10-10000-02.fuzzlog trie_xor.pl --word_size=10 --string_size=10000 --words=5 --over_write 2>fuzzlog-10-10000-05.fuzzlog trie_xor.pl --word_size=10 --string_size=10000 --words=10 --over_write 2>fuzzlog-10-10000-10.fuzzlog trie_xor.pl --word_size=10 --string_size=100000 --words=1 --over_write 2>fuzzlog-10-100000-01.fuzzlog trie_xor.pl --word_size=10 --string_size=100000 --words=2 --over_write 2>fuzzlog-10-100000-02.fuzzlog trie_xor.pl --word_size=10 --string_size=100000 --words=5 --over_write 2>fuzzlog-10-100000-05.fuzzlog trie_xor.pl --word_size=10 --string_size=100000 --words=10 --over_write 2>fuzzlog-10-100000-10.fuzzlog trie_xor.pl --word_size=25 --string_size=1000 --words=1 --over_write 2>fuzzlog-25-1000-01.fuzzlog trie_xor.pl --word_size=25 --string_size=1000 --words=2 --over_write 2>fuzzlog-25-1000-02.fuzzlog trie_xor.pl --word_size=25 --string_size=1000 --words=5 --over_write 2>fuzzlog-25-1000-05.fuzzlog trie_xor.pl --word_size=25 --string_size=1000 --words=10 --over_write 2>fuzzlog-25-1000-10.fuzzlog trie_xor.pl --word_size=25 --string_size=10000 --words=1 --over_write 2>fuzzlog-25-10000-01.fuzzlog trie_xor.pl --word_size=25 --string_size=10000 --words=2 --over_write 2>fuzzlog-25-10000-02.fuzzlog trie_xor.pl --word_size=25 --string_size=10000 --words=5 --over_write 2>fuzzlog-25-10000-05.fuzzlog trie_xor.pl --word_size=25 --string_size=10000 --words=10 --over_write 2>fuzzlog-25-10000-10.fuzzlog trie_xor.pl --word_size=25 --string_size=100000 --words=1 --over_write 2>fuzzlog-25-100000-01.fuzzlog trie_xor.pl --word_size=25 --string_size=100000 --words=2 --over_write 2>fuzzlog-25-100000-02.fuzzlog trie_xor.pl --word_size=25 --string_size=100000 --words=5 --over_write 2>fuzzlog-25-100000-05.fuzzlog trie_xor.pl --word_size=25 --string_size=100000 --words=10 --over_write 2>fuzzlog-25-100000-10.fuzzlog