Dear Choroba
Thank you for your time. I modified the code as per your suggestions but it seems that the code is skipping the first block of alignment and rest the occurrences of nucleotides as percentages are wrong.
I am attaching the code and the data file for the same
#!/usr/bin/perl
use strict;
use warnings;
use Syntax::Construct qw{ // };
my $endpos = 0;
my ($startpos, $count);
my %occurrences;
my $file = $ARGV[0];
open(DATA, $file);
while (<DATA>) {
if (/^CLUSTAL.*/) {next;}
if (/^ +$/) {
$startpos = $endpos + 1;
$count = 0;
}
elsif (/\s+ ([-actg]+) \s*$/x) {
++$count;
my @nucleotides = split //, $1;
$endpos = $endpos + length $1 if $startpos == $endpos + 1;
for my $pos (0 .. $#nucleotides) {
++$occurrences{ $nucleotides[$pos] }[$startpos + $pos]
unless '-' eq $nucleotides[$pos];
}
}
}
for my $pos (1 .. $endpos) {
print "$pos\t";
for my $nucleotide (sort keys %occurrences) {
printf "%s\t%0.1f\t", uc $nucleotide, 100 * ($occurrences{$nuc
+leotide}[$pos] // 0) / $count;
}
print "\n";
}
Data file
CLUSTAL O(1.2.1) multiple sequence alignment
gnl|hbvcds|AB014370_PreC_P-A -----------------------------------
+-------------------------
gnl|hbvcds|AB064314_PreC_P-A -----------------------------------
+-------------------------
gnl|hbvcds|AB014384_C_P-C -----------------------------------
+-------------------------
gnl|hbvcds|AB014385_C_P-C -----------------------------------
+-------------------------
gnl|hbvcds|AB048701_PreS1_P-D atggggcagaatctttccaccagcaatcctctggg
+attctttcccgaccatcagttggat
gnl|hbvcds|AB078031_PreS1_P-D atggggcagaatctttccaccagcaaccctctggg
+attctttcccgaccaccagttggat
gnl|hbvcds|AB030513_S_P-A -----------------------------------
+------------------------a
gnl|hbvcds|AB064314_S_P-A -----------------------------------
+------------------------c
gnl|hbvcds|AB194947_PreS2_P-E -----------------------------------
+------------------------g
gnl|hbvcds|AB194948_PreS2_P-E -----------------------------------
+------------------------g
+
gnl|hbvcds|AB014370_PreC_P-A tagagtctcctgagcattgctcacctcaccatact
+gcactcaggcaagccattctctgct
gnl|hbvcds|AB064314_PreC_P-A tagagtctcctgagcattgctcacctcaccatacg
+gcactcaggcaagccattctctgct
gnl|hbvcds|AB014384_C_P-C tagagtctccggaacattgttcacctcaccataca
+gcactcaggcaagctattctgtgtt
gnl|hbvcds|AB014385_C_P-C tagagtctccggaacattgttcacctcaccataca
+gcactcaggcaagctattctgtgtt
gnl|hbvcds|AB048701_PreS1_P-D gggtttttcttgttgacaagaatcctcacaatacc
+gcagagtctagactcgtggtggact
gnl|hbvcds|AB078031_PreS1_P-D gggtttttcttgttgacaagaatcctcacaatacc
+gcagagtctagactcgtggtggact
gnl|hbvcds|AB030513_S_P-A gggtttttcttgttgacaagaatcctcacaatacc
+gcagagtctagactcgtggtggact
gnl|hbvcds|AB064314_S_P-A gggtttttcttgttgacaagaatcctcacaatacc
+gcagagtctagactcgtggtggact
gnl|hbvcds|AB194947_PreS2_P-E gggtttttcttgttgacaaaaatcctcacaatacc
+gcagagtctagactcgtggtggact
gnl|hbvcds|AB194948_PreS2_P-E gggtttttcttgttgacaaaaatcctcacaatacc
+gcagagtctagactcgtggtggact
* * ** * * ****** ****
+*** * * * *
Regards |