my $xor = substr($data[$d1],$s1,$L) ^ substr($data[$d2],$s2,$L); #### #!/your/perl/here # find the longest common substring (contiguous) in a list of strings # # method: # sort all strings by descending length # search from longest overlap to shortest # (the longest substring is more likely to be in the longest overlap) # use strict; use warnings; use Benchmark; my @timer; push @timer, new Benchmark; # predictable randomness srand(atan2(1,1)*4*1000); my @keys = qw( LENGTH STRING FIRST_INDEX FIRST_OFFSET SECOND_INDEX SECOND_OFFSET ); my @chars = qw( A G C T ); my $longest = 0; my @data; foreach my $i (1..300) { my $d; foreach my $j (1..3000) { $d .= $chars[rand(@chars)]; } push @data, $d; $longest = length($d) if ( length($d) > $longest ); } # sort longest to shortest @data = sort { length($b) <=> length ($a) } @data; my %LCS; @LCS{@keys} = (0,'',-1,-1,-1,-1); my $L = $longest; L: while ( $L > $LCS{LENGTH} ) { push @timer, new Benchmark; @timer = time_delta("\nL:$L", @timer); D1: for my $d1 ( 0..$#data-1 ) { # all entries are sorted by descending length # if this entry is too short, all the rest are as well # go to the next "length" my $d1L = length( $data[$d1] ); next L unless ( $d1L >= $L ); D2: for my $d2 ( $d1+1..$#data ) { # all entries are sorted by descending length # if this entry is too short, all the rest are as well # go to the next D1 (which may be long enough) my $d2L = length( $data[$d2] ); next D1 unless ( $d2L >= $L ); # consider overlaps of length $L # these are starting index numbers # overlaps are either at index 0, # or the last $L characters my @s1 = (0, $d1L - $L ); my @s2 = ($d2L - $L, 0 ); S: for my $s ( 0,1 ) { my $xor = substr($data[$d1],$s1[$s],$L) ^ substr($data[$d2],$s2[$s],$L); my $offset = 0; while ( $xor =~ /([^\0]*)(\0{$LCS{LENGTH},})/gsm ) { my ($first, $second) = ($1, $2); $offset += length($first); my $long = length($second); if ( $long > $LCS{'LENGTH'} ) { @LCS{@keys} = ($long, substr($data[$d1],$s1[$s]+$offset,$long), # string $d1,$s1[$s]+$offset, $d2,$s2[$s]+$offset); print "\nLCS: <$LCS{STRING}> $long\n"; } $offset += length($second); } } } } $L--; } print "Longest: <$LCS{STRING}> ($LCS{LENGTH})\n"; print "Item $LCS{FIRST_INDEX}, offset $LCS{FIRST_OFFSET}\n"; print "Item $LCS{SECOND_INDEX}, offset $LCS{SECOND_OFFSET}\n"; push @timer, new Benchmark; @timer = time_delta("total", @timer); exit; ########################################### # nice benchmark timestamp printer sub time_delta { my $msg = shift; my @t = @_; my $total_time = timestr(timediff(@t[-1,0])); my $delta_time = timestr(timediff(@t[-1,-2])); print "$msg\nd:$delta_time\nt:$total_time\n"; return @t[0,-1]; }