in reply to Speed up DNA dotplot

I'm not sure if I really understood your requirements or code.
use strict;
use warnings;

# Demo parameters: length of each random sequence, width of the sliding
# window, and how many positions inside a window are allowed to differ.
my $MAX          = 10_000;
my $WINDOW_SIZE  = 5;
my $MAX_MISMATCH = 1;

# Two random DNA sequences of $MAX bases each.
my @BASES = qw(A T G C);
my $seq1  = join '', map { $BASES[ int rand @BASES ] } 1 .. $MAX;
my $seq2  = join '', map { $BASES[ int rand @BASES ] } 1 .. $MAX;

# with_regexp($seq1, $seq2, $window, $mismatch)
#
# Builds a dot-plot of two sequences as text.  Every window of $window
# characters taken from $seq1 yields one output line; every window of the
# same width taken from $seq2 yields one character on that line: "1" when
# the two windows differ in at most $mismatch positions, "0" otherwise.
#
# Returns the whole plot as a single string, one "\n"-terminated line per
# window of $seq1.  A sequence shorter than $window contributes no windows.
sub with_regexp {
    my ($seq1, $seq2, $window, $mismatch) = @_;

    # The last window starts at length - $window, so that index is included.
    my $last_row = length($seq1) - $window;
    my $last_col = length($seq2) - $window;

    my $retval = '';
    for my $start (0 .. $last_row) {
        my $regex = build_regexp(substr($seq1, $start, $window), $mismatch);

        # Compile once per row; the same pattern is tried at every column.
        my $probe = qr/\A(?:$regex)\z/;

        for my $col (0 .. $last_col) {
            $retval .= substr($seq2, $col, $window) =~ $probe ? 1 : 0;
        }
        $retval .= "\n";
    }

    return $retval;
}

# build_parts($window, $mismatch)
#
# Returns the list of patterns obtained by replacing exactly $mismatch
# characters of $window (or all of them, when $mismatch exceeds its length)
# with the regex wildcard ".".  Because "." also matches the original
# character, the patterns together accept every string that differs from
# $window in at most $mismatch positions.
sub build_parts {
    my ($window, $mismatch) = @_;

    my $len = length $window;
    $mismatch = $len if $mismatch > $len;
    $mismatch = 0    if $mismatch < 0;    # a negative count would never terminate

    return $window    unless $mismatch;
    return '.' x $len if $len == $mismatch;

    # Either keep the first character and spend all wildcards on the rest,
    # or spend one wildcard here and one fewer on the rest.
    my ($first, $rest) = split //, $window, 2;
    return (
        (map { $first . $_ } build_parts($rest, $mismatch)),
        (map { '.' . $_ }    build_parts($rest, $mismatch - 1)),
    );
}

# build_regexp($window, $mismatch)
#
# Joins the patterns from build_parts() into one alternation, each
# alternative wrapped in a non-capturing group.  Returns the pattern as a
# plain string.
sub build_regexp {
    return join '|', map { '(?:' . $_ . ')' } build_parts(@_);
}

# Run the (expensive) demo only when executed as a script, not when the
# file is loaded by another program.
print with_regexp($seq1, $seq2, $WINDOW_SIZE, $MAX_MISMATCH) unless caller;

Replies are listed 'Best First'.
Re^2: Speed up DNA dotplot
by happy.barney (Friar) on Jul 14, 2011 at 09:35 UTC
    small improvements:
    # test_regexps2($seq1, $seq2, $window, $mismatch)
    #
    # Dot-plot of two sequences as text: one "\n"-terminated line per window
    # of $window characters in $seq1, one "0"/"1" character per window start
    # in $seq2 ("1" where the pattern built by build_regexp() for the $seq1
    # window matches at that position).  Rows are cached by window content,
    # so a window that repeats in $seq1 is scanned against $seq2 only once.
    #
    # Returns the whole plot as a single string.
    sub test_regexps2 {
        my ($seq1, $seq2, $window, $mismatch) = @_;

        # Window starts run from 0 to length - $window inclusive, so a row
        # needs one more cell than that difference; otherwise a match at the
        # final position would make that row longer than the others.
        my $last_col = length($seq2) - $window;
        my @mask     = (0) x ($last_col + 1);

        my %cache;
        my $retval = '';

        for my $start (0 .. length($seq1) - $window) {
            my $part = substr($seq1, $start, $window);

            # A cached row always ends in "\n", so it is never false.
            $cache{$part} ||= do {
                my $regex = build_regexp($part, $mismatch);
                my $probe = qr/(?=$regex)/;    # compile once per distinct window

                my @res = @mask;

                # Zero-width lookahead: pos() stays at the start of each
                # match, and overlapping matches are all reported.
                while ($seq2 =~ /$probe/g) {
                    $res[ pos $seq2 ] = 1;
                }

                join '', @res, "\n";
            };

            $retval .= $cache{$part};
        }

        return $retval;
    }
    benchmarks for length 200, 400 and 600
    200:
                    Rate   orig_poster  test_regexps test_regexps2
    orig_poster   9.70/s            --          -51%          -81%
    test_regexps  19.6/s          103%            --          -61%
    test_regexps2 49.8/s          413%          153%            --

    400:
                    Rate   orig_poster  test_regexps test_regexps2
    orig_poster   2.44/s            --          -52%          -84%
    test_regexps  5.08/s          109%            --          -67%
    test_regexps2 15.5/s          535%          205%            --

    600:
                    Rate   orig_poster  test_regexps test_regexps2
    orig_poster   1.06/s            --          -54%          -86%
    test_regexps  2.30/s          117%            --          -70%
    test_regexps2 7.75/s          633%          237%            --