CLUSTAL O(1.2.1) multiple sequence alignment
gnl|hbvcds|AB014370_PreC_P-A ------------------------------------------------------------
gnl|hbvcds|AB064314_PreC_P-A ------------------------------------------------------------
gnl|hbvcds|AB014384_C_P-C ------------------------------------------------------------
gnl|hbvcds|AB014385_C_P-C ------------------------------------------------------------
gnl|hbvcds|AB048701_PreS1_P-D atggggcagaatctttccaccagcaatcctctgggattctttcccgaccatcagttggat
gnl|hbvcds|AB078031_PreS1_P-D atggggcagaatctttccaccagcaaccctctgggattctttcccgaccaccagttggat
gnl|hbvcds|AB030513_S_P-A ------------------------------------------------------------
gnl|hbvcds|AB064314_S_P-A ------------------------------------------------------------
gnl|hbvcds|AB194947_PreS2_P-E ------------------------------------------------------------
gnl|hbvcds|AB194948_PreS2_P-E ------------------------------------------------------------
####
#!/usr/bin/perl
use strict;
use warnings;
my ($rna,$rnalen,$posi, $pos, $base, $dibase,$percent);
my (%bases, %occur, %count); my($max_position, $sum, $tot) = 0;
my @dinucleotide = qw(A C G T);#change the number of bases
my $file = $ARGV[0] or die "Please provide the input file\n";
my $column = $ARGV[1];
open (X, $file) || die "Check the input $file\n";
while () {
# chomp $_;
if ($_=~/^#/) { next; }
if ($_=~/^\s+/) { next; }
if ($_=~/^CLUSTAL/) { next; }
if ($_=~/\*/) { next; }
my @temp = split (/\s+/, $_);
$rna = $temp[$column];
$rna = uc $rna;
$rnalen = length($rna);
my @nucleotides = split (//,$rna);
$tot++;
foreach my $key(@dinucleotide) {
$bases{$key}=1;
$sum++;
}
$max_position = $rnalen if($rnalen > $max_position);
for (my $position=0;$position<=$rnalen-1;$position++) {
$dibase = substr($rna, $position, 1); #change the splitting number
foreach my $dinucleotide(@dinucleotide) {
if ($dibase eq $dinucleotide){
$posi = $position+1;
$occur{$posi,$dibase}++;
# print "$posi\t$dibase\n";
}
}
}
}
for($pos=1;$pos<=$max_position;$pos++) {
print "$pos\t";
foreach $base(sort keys %bases) {
print "$base\t";
if(exists $occur{$pos,$base}) {
my $percent = sprintf ("%0.1f", (($occur{$pos,$base})/$tot)*100);
# print "$occur{$pos,$base}\t";
print "$percent\t";
}
else {
print "0\t";
}
}
print "\n";
}
close (X);
####
1 A 20.0 C 0 G 0 T 0
2 A 0 C 0 G 0 T 20.0
3 A 0 C 0 G 20.0 T 0
4 A 0 C 0 G 20.0 T 0
5 A 0 C 0 G 20.0 T 0
6 A 0 C 0 G 20.0 T 0
7 A 0 C 20.0 G 0 T 0
8 A 20.0 C 0 G 0 T 0
9 A 0 C 0 G 20.0 T 0
10 A 20.0 C 0 G 0 T 0
11 A 20.0 C 0 G 0 T 0
12 A 0 C 0 G 0 T 20.0
13 A 0 C 20.0 G 0 T 0
14 A 0 C 0 G 0 T 20.0
15 A 0 C 0 G 0 T 20.0
16 A 0 C 0 G 0 T 20.0
17 A 0 C 20.0 G 0 T 0
18 A 0 C 20.0 G 0 T 0
19 A 20.0 C 0 G 0 T 0
20 A 0 C 20.0 G 0 T 0
21 A 0 C 20.0 G 0 T 0
22 A 20.0 C 0 G 0 T 0
23 A 0 C 0 G 20.0 T 0
24 A 0 C 20.0 G 0 T 0
25 A 20.0 C 0 G 0 T 0
26 A 20.0 C 0 G 0 T 0
27 A 0 C 10.0 G 0 T 10.0
28 A 0 C 20.0 G 0 T 0
29 A 0 C 20.0 G 0 T 0
30 A 0 C 0 G 0 T 20.0
31 A 0 C 20.0 G 0 T 0
32 A 0 C 0 G 0 T 20.0
33 A 0 C 0 G 20.0 T 0
34 A 0 C 0 G 20.0 T 0
35 A 0 C 0 G 20.0 T 0
36 A 20.0 C 0 G 0 T 0
37 A 0 C 0 G 0 T 20.0
38 A 0 C 0 G 0 T 20.0
39 A 0 C 20.0 G 0 T 0
40 A 0 C 0 G 0 T 20.0
41 A 0 C 0 G 0 T 20.0
42 A 0 C 0 G 0 T 20.0
43 A 0 C 20.0 G 0 T 0
44 A 0 C 20.0 G 0 T 0
45 A 0 C 20.0 G 0 T 0
46 A 0 C 0 G 20.0 T 0
47 A 20.0 C 0 G 0 T 0
48 A 0 C 20.0 G 0 T 0
49 A 0 C 20.0 G 0 T 0
50 A 20.0 C 0 G 0 T 0
51 A 0 C 10.0 G 0 T 10.0
52 A 0 C 20.0 G 0 T 0
53 A 20.0 C 0 G 0 T 0
54 A 0 C 0 G 20.0 T 0
55 A 0 C 0 G 0 T 20.0
56 A 0 C 0 G 0 T 20.0
57 A 0 C 0 G 20.0 T 0
58 A 0 C 0 G 20.0 T 0
59 A 20.0 C 0 G 0 T 0
60 A 0 C 0 G 0 T 20.0
####
CLUSTAL O(1.2.1) multiple sequence alignment
1 60
gnl|hbvcds|AB014370_PreC_P-A ------------------------------------------------------------
gnl|hbvcds|AB064314_PreC_P-A ------------------------------------------------------------
gnl|hbvcds|AB014384_C_P-C ------------------------------------------------------------
gnl|hbvcds|AB014385_C_P-C ------------------------------------------------------------
gnl|hbvcds|AB048701_PreS1_P-D atggggcagaatctttccaccagcaatcctctgggattctttcccgaccatcagttggat
gnl|hbvcds|AB078031_PreS1_P-D atggggcagaatctttccaccagcaaccctctgggattctttcccgaccaccagttggat
gnl|hbvcds|AB030513_S_P-A ------------------------------------------------------------
gnl|hbvcds|AB064314_S_P-A ------------------------------------------------------------
gnl|hbvcds|AB194947_PreS2_P-E ------------------------------------------------------------
gnl|hbvcds|AB194948_PreS2_P-E ------------------------------------------------------------
61 120
gnl|hbvcds|AB014370_PreC_P-A tagagtctcctgagcattgctcacctcaccatactgcactcaggcaagccattctctgct
gnl|hbvcds|AB064314_PreC_P-A tagagtctcctgagcattgctcacctcaccatacggcactcaggcaagccattctctgct
gnl|hbvcds|AB014384_C_P-C tagagtctccggaacattgttcacctcaccatacagcactcaggcaagctattctgtgtt
gnl|hbvcds|AB014385_C_P-C tagagtctccggaacattgttcacctcaccatacagcactcaggcaagctattctgtgtt
gnl|hbvcds|AB048701_PreS1_P-D gggtttttcttgttgacaagaatcctcacaataccgcagagtctagactcgtggtggact
gnl|hbvcds|AB078031_PreS1_P-D gggtttttcttgttgacaagaatcctcacaataccgcagagtctagactcgtggtggact
gnl|hbvcds|AB030513_S_P-A gggtttttcttgttgacaagaatcctcacaataccgcagagtctagactcgtggtggact
gnl|hbvcds|AB064314_S_P-A gggtttttcttgttgacaagaatcctcacaataccgcagagtctagactcgtggtggact
gnl|hbvcds|AB194947_PreS2_P-E gggtttttcttgttgacaaaaatcctcacaataccgcagagtctagactcgtggtggact
gnl|hbvcds|AB194948_PreS2_P-E gggtttttcttgttgacaaaaatcctcacaataccgcagagtctagactcgtggtggact
* * ** * * ****** **** *** * * * *