Try something like:
(I'm just using ...ABCxxWXY... to make the permutations and overlaps clear.) (Update: The number that is the second item in each array reference returned is the base-0 offset of the start of the matching subsequence.)c:\@Work\Perl\monks>perl -wMstrict -le "use Data::Dump qw(dd); ;; my $seq = 'xABCxABCxxxWXYxWXZxxxABCxxWXYx'; ;; my $subseq = qr{ ABC \w* (?: WXY | WXZ) }xms; ;; my @all = find_all($seq, $subseq); dd \@all; ;; ;; sub find_all { my ($seq, $regex) = @_; ;; local our @hits; use re 'eval'; $seq =~ m{ ($regex) (?{ push @hits, [ $^N, $-[1] ] }) (?!) }xmsg; ;; return @hits; } " [ ["ABCxABCxxxWXYxWXZxxxABCxxWXY", 1], ["ABCxABCxxxWXYxWXZ", 1], ["ABCxABCxxxWXY", 1], ["ABCxxxWXYxWXZxxxABCxxWXY", 5], ["ABCxxxWXYxWXZ", 5], ["ABCxxxWXY", 5], ["ABCxxWXY", 21], ]
Update: Using your original sequence:
This works with Perl 5.8+ regexes. What version of Perl are you using — it might make a difference in future?c:\@Work\Perl\monks>perl -wMstrict -le "use Data::Dump qw(dd); ;; my $seq = 'AATGGTTTCTCCCATCTCTCCATCGGCATAAAAATACAGAATGATCTAA'; ;; my $subseq = qr{ ATG \w* (?: TAG | TAA | TGA) }xms; ;; my @all = find_all($seq, $subseq); dd \@all; ;; ;; sub find_all { my ($seq, $regex) = @_; ;; local our @hits; use re 'eval'; $seq =~ m{ ($regex) (?{ push @hits, [ $^N, $-[1] ] }) (?!) }xmsg; ;; return @hits; } " [ ["ATGGTTTCTCCCATCTCTCCATCGGCATAAAAATACAGAATGATCTAA", 1], ["ATGGTTTCTCCCATCTCTCCATCGGCATAAAAATACAGAATGA", 1], ["ATGGTTTCTCCCATCTCTCCATCGGCATAA", 1], ["ATGATCTAA", 40], ]
Update 2: Remembering that DNA sequences may sometimes be loooong, it may be advantageous to pass the sequence by reference. Note that both the function and the function invocation must change.
Still runs under Perl 5.8.c:\@Work\Perl\monks>perl -wMstrict -le "use Data::Dump qw(dd); ;; my $seq = 'AATGGTTTCTCCCATCTCTCCATCGGCATAAAAATACAGAATGATCTAA'; ;; my $subseq = qr{ ATG \w* (?: TAG | TAA | TGA) }xms; ;; my @all = find_all(\$seq, $subseq); dd \@all; ;; ;; sub find_all { my ($sr_seq, $regex) = @_; ;; local our @hits; use re 'eval'; $$sr_seq =~ m{ ($regex) (?{ push @hits, [ $^N, $-[1] ] }) (?!) }xmsg; ;; return @hits; } " [ ["ATGGTTTCTCCCATCTCTCCATCGGCATAAAAATACAGAATGATCTAA", 1], ["ATGGTTTCTCCCATCTCTCCATCGGCATAAAAATACAGAATGA", 1], ["ATGGTTTCTCCCATCTCTCCATCGGCATAA", 1], ["ATGATCTAA", 40], ]
Give a man a fish: <%-{-{-{-<
In reply to Re^3: Using Recursion to Find DNA Sequences
by AnomalousMonk
in thread Using Recursion to Find DNA Sequences
by clueless_perl
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |