#!usr/bin/perl
use strict;
use warnings;
my @starts = ();
while (<>) {
chomp;
next if $. < 2; # Skip first line
my @seq = split '', $_;
for (0..$#seq-5){
if ($seq[$_] eq 'A' && $seq[$_+1] eq 'T' && $seq[$_+2] eq 'G') {
push (@starts, $_);
for (my $j=$_+3; $j<=$#seq-2; $j=$j+3){
if ( ($seq[$j] eq 'T' && $seq[$j+1] eq 'A' && $seq[$j+2] eq 'A
+') ||
($seq[$j] eq 'T' && $seq[$j+1] eq 'G' && $seq[$j+2] eq 'A
+') ||
($seq[$j] eq 'T' && $seq[$j+1] eq 'A' && $seq[$j+2] eq 'G
+') ) {
print "ORF: $_-", ($j+2), "\n";
last; ##lasts the j loop
}
}
}
}
} continue {
close ARGV if eof; # reset $.
}
__END__
$ perl bio_test.pl sequence.fa
ORF: 16-75
ORF: 50-136
ORF: 133-357
ORF: 232-357
ORF: 252-290
ORF: 287-334
ORF: 305-334
ORF: 363-380
ORF: 394-450
ORF: 489-623
ORF: 575-724
ORF: 651-797
ORF: 689-724
ORF: 724-936
ORF: 854-859
ORF: 859-936
ORF: 954-959
ORF: 1051-1200
ORF: 1145-1255
ORF: 1228-1275
ORF: 1249-1275
Update: Another possible way, this is how I would approach it. This is 3/4 of the solution, I do not want to give you the full solution as this looks like school work. It should not be really difficult to continue from here.
As I said before use index for multiple occurrences in the string.
#!usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
my $substring = 'ATG';
while (<>) {
chomp;
next if $. < 2; # Skip first line
my @seq = split ' ', $_;
# print Dumper \@seq; # Remove hashtag and see how the input looks
for (@seq) {
my $found = index($_, $substring);
while ($found != -1) { # check for multiple occurrences in the sam
+e string
print "Found $substring at $found\n"; # Do your calculations h
+ere
my $offset = $found + 1; # keep track of offset in your counte
+r as you do on your primary script and should have it
$found = index($_, $substring, $offset);
}
}
} continue {
close ARGV if eof; # reset $.
}
__END__
$ perl bio_test.pl sequence.fa
Found 'ATG' at 16
Found 'ATG' at 50
Found 'ATG' at 62
Found 'ATG' at 19
Found 'ATG' at 39
Found 'ATG' at 3
Found 'ATG' at 21
Found 'ATG' at 8
Found 'ATG' at 39
Found 'ATG' at 63
Found 'ATG' at 7
Found 'ATG' at 12
Found 'ATG' at 50
Found 'ATG' at 14
Found 'ATG' at 2
Found 'ATG' at 7
Found 'ATG' at 31
Found 'ATG' at 20
Found 'ATG' at 50
Found 'ATG' at 57
Found 'ATG' at 9
Found 'ATG' at 21
Found 'ATG' at 42
Found 'ATG' at 65
Notice that the index is going back to zero, keep track of it and it should give you the same results as your primary script. Let us know if there is something that is not clear.
Update2: If the space in between the sequences in normal you do not need to split and you can loop through like this:
#!usr/bin/perl
use strict;
use warnings;
my $substring = 'ATG';
while (<>) {
chomp;
next if $. < 2; # Skip first line
my $found = index($_, $substring);
while ($found != -1) { # check for multiple occurrences in the sam
+e string
print "Found $substring at $found\n"; # Do your calculations here
my $offset = $found + 1; # keep track of offset in your counter as
+ you do on your primary script and should have it
$found = index($_, $substring, $offset);
}
} continue {
close ARGV if eof; # reset $.
}
__END__
$ perl bio.pl sequence.fa
Found ATG at 16
Found ATG at 50
Found ATG at 133
Found ATG at 232
Found ATG at 252
Found ATG at 287
Found ATG at 305
Found ATG at 363
Found ATG at 394
Found ATG at 489
Found ATG at 575
Found ATG at 651
Found ATG at 689
Found ATG at 724
Found ATG at 854
Found ATG at 859
Found ATG at 954
Found ATG at 1014
Found ATG at 1044
Found ATG at 1051
Found ATG at 1145
Found ATG at 1228
Found ATG at 1249
Found ATG at 1272
Now you just need to apply an if condition and voila. :D
Hope this helps.
Seeking for Perl wisdom...on the process of learning...not there...yet!
|