#! perl -slw use strict; my $needle = 'ATGGAGTCGACGAATTTGAAGAAT'; my $haystack = 'xxxxxxATGGAGyxxxTCGAzxxxxCGAATTTGAAxxwGAAT'; my @needles; #! While we've still needle to process while ($needle) { #! Try shorter and shorter substring of needle for my $start (0 .. length($needle) - 1) { my $bit = substr $needle, $start; #! move of if no match next unless $haystack =~ m[$bit]; #! Got a match, save it $haystack =~ s[$bit][ push @needles, $bit; $bit ]ge; #! and remove it $needle = substr $needle, 0, $start; #! repeat last; } } #! Sort the needles longest first @needles = sort{ length $b <=> length $a } @needles; #! mark their places in the haystack $haystack =~ s[($_)(?!\})][{$1}]g for @needles; #! remove nested marks #! (where a shorter needle was found inside a longer one) $haystack =~ s[ ({[^{}]*?) #! capture everything after the first { ({) #! until we find a second (also captured) ([^{}]*?) #! Then capture everything before the close } (}) #! and the close (nested) } ([^}]*?}) #! and everything from there, to and including the final } ] [$1$3$5]gx; #! Throw away the inner {}. #! Finally print out the none needle parts, and the needle that followed them. print "$1 preceeded $2" while $haystack =~ m[(\G[^{]+){([^}]+)}]g; __END__ C:\test>228122 xxxxxx preceeded ATGGAG yxxx preceeded TCGA zxxxx preceeded CGAATTTGAA xxw preceeded GAAT C:\test> #### #! perl -slw use vars qw[$LEN $N $MAX]; use strict; ($LEN, $N, $MAX) = ($LEN||1000, $N||10, $MAX||20); sub rndStr{local $"=''; "@_[map{rand @_} 0 .. shift]"; } #!" sub findStuff (\$\$) { my ($href, $nref) = @_; my @needles; while ($$nref) { for my $start (0 .. length($$nref) - 1) { my $bit = substr $$nref, $start; next unless 1+index( $$href, $bit ); push @needles, $bit; $$nref = substr $$nref, 0, $start; last; } } $$href =~ s[($_)(?!\})][{$1}]g for sort{ length $b <=> length $a } @needles; $$href =~ s[ ({[^{}]*?) ({) ([^{}]*?) (}) ([^}]*?}) ][$1$3$5]gx; return $$href =~ m[(\G[^{]+{[^}]+})]sg;} print 'Results from sample data'; my $haystack = 'xxxxxxATGGAGyxxxTCGAzxxxxCGAATTTGAAxxwGAAT'; my $needle = 'ATGGAGTCGACGAATTTGAAGAAT'; my @matches = findStuff $haystack, $needle; m[(^.*?)({.*}$)] and printf "%*s was preceeded by %s\n", $MAX+4, $2, $1 for @matches; $haystack = rndStr $LEN, qw[A C G T]; my $p=0; my @needles = map{ my $n = substr($haystack , $p += 4 + rand( $LEN / $N ) , 4 + rand( do{ my $tmp = $LEN - $p; $tmp > $MAX ? $MAX - 4 : $tmp - 4 } ) ); # print $n,':',length $n; $n; } 1 .. $N; $needle = join '', @needles; print <<"EOS"; Results from test data of $N needles; length (4-$MAX) within a haystack of $LEN chars EOS @matches = findStuff $haystack, $needle; m[(^.*?)({.*}$)] and printf "%*s was preceeded by %-60.60s %s\n" , $MAX + 2 , $2 , $1 , length $1 > 60 ? '... ' . substr( $1, -10) : '' for @matches;