in reply to Re^2: Fast common substring matching
in thread Fast common substring matching
Give it a whirl.
A test-data generating program followsuse warnings; use strict; use Time::HiRes; if (@ARGV == 0) { print "Finds longest matching substring between any pair of test s +trings\n"; print "in the given file. Pairs of lines are expected with the fir +st of a\n"; print "pair being the string name and the second the test string." +; exit (1); } my $minmatch = 4; my $startTime = [Time::HiRes::gettimeofday ()]; my @strings; while (<>) { chomp(my $label = $_); chomp(my $string = <>); # Compute all substrings push @strings, map [substr($string, $_), $label, $_], 0..(length($st +ring) - $minmatch); } print "Loaded. Sorting...\n"; @strings = sort {$a->[0] cmp $b->[0]} @strings; print "Sorted. Finding matches...\n"; # Now walk through the list. The best match for each string will be th +e # previous or next element in the list that is not from the original s +ubstring, # so for each entry, just look for the next one. See how many initial +letters # match and track the best matches my @matchdata = (0); # (length, index1-into-strings, index2-into-strin +gs) for my $i1 (0..($#strings - 1)) { my $i2 = $i1 + 1; ++$i2 while $i2 <= $#strings and $strings[$i2][1] eq $strings[$i1][1 +]; next if $i2 > $#strings; my ($common) = map length, ($strings[$i1][0] ^ $strings[$i2][0]) =~ +/^(\0*)/; if ($common > $matchdata[0]) { @matchdata = ($common, [$i1, $i2]); } elsif ($common == $matchdata[0]) { push @matchdata, [$i1, $i2]; } } print "Best match: $matchdata[0] chars\n"; for my $i (@matchdata[1..$#matchdata]) { print "$strings[$i->[0]][1] starting at $strings[$i->[0]][2]" . " and $strings[$i->[1]][1] starting at $strings[$i->[1]][2]\n"; } print "Completed in " . Time::HiRes::tv_interval ($startTime) . "\n";
use warnings; use strict; my ($howmany, $howlong) = (20, 1000); # Generate $howmany strings of $howlong characters for my $s (1..$howmany) { print "'String $s'\n"; my $str = ''; $str .= (qw(A C G T))[rand 4] for 1..$howlong; print "$str\n"; }
|
---|
Replies are listed 'Best First'. | |
---|---|
Re^4: Fast common substring matching
by bioMan (Beadle) on Nov 29, 2005 at 16:53 UTC | |
by Roy Johnson (Monsignor) on Nov 29, 2005 at 17:08 UTC | |
by bioMan (Beadle) on Nov 29, 2005 at 22:37 UTC | |
by GrandFather (Saint) on Nov 29, 2005 at 22:56 UTC | |
by bioMan (Beadle) on Nov 30, 2005 at 19:33 UTC | |
by marioroy (Prior) on Feb 18, 2016 at 00:01 UTC | |
Re^4: Fast common substring matching
by marioroy (Prior) on Feb 17, 2016 at 06:39 UTC | |
by marioroy (Prior) on Feb 17, 2016 at 07:36 UTC | |
by marioroy (Prior) on Feb 17, 2016 at 08:34 UTC | |
Re^4: Fast common substring matching
by marioroy (Prior) on Feb 17, 2016 at 20:31 UTC |