#!/usr/bin/perl use strict; use warnings; $/ = ''; # Set paragraph mode my @count; my %absent; my $name; while ( my $para = ) { # Remove fasta header line if ( $para =~ s/^>(.*)//m ){ $name = $1; }; # Remove comment line(s) $para =~ s/^\s*#.*//mg; my %prot; $para =~ s/([ACDEFGHIKLMNPQRSTVWY])/ ++$prot{ $1 } /eg; my $len = length($para); my $num = scalar keys %prot; push @count,[$num,$name]; printf "Counted %d for %s ..\n",$num,substr($name,0,50); print "$name\n"; print join( ' ', map "$_=$prot{$_}", sort keys %prot ), "\n"; printf "Amino acid alphabet = %d\n\n",$num ; print "Sequence length = $len\n"; # count absent for ('A'..'Z'){ ++$absent{$_} unless exists $prot{$_}; }; }; # sort names by count in ascending order to get lowest my @sorted = sort { $a->[0] <=> $b->[0] } @count; my $lowest = $sorted[0]->[0]; # maybe more than 1 lowest printf "Least number of amino acids is %d in these entries\n",$lowest; my @lowest = grep { $_->[0] == $lowest } @sorted; print "$_->[1]\n" for @lowest; # show all results print "\nAll results in ascending count\n"; for (@sorted){ printf "%d %s\n", @$_; }; print "\nExclusion of various amino acids is as follows\n"; for (sort keys %absent){ printf "%s=%d\n",$_,$absent{$_}; }; __DATA__ >sp_0005_SySynthetic ConstructTumor protein p53 N-terminal transcription-activation domain VQLQESGGGLVQAGGSLRLSCAASGRAVSMYNMGWFRQAPGQERELVAAISRGGSIYYA DSVKGRFTISRDNAKNTLYLQMNNLKPEDTGVYQCRQGSTLGQGTQVTVSS >sp_0017_CaCamelidSorghum bicolor multidrug and toxic compound extrusion sbmate HVQLVESGGGSVQAGGSLRLTCAASGFTFSNYYMSWVRQAPGKGLEWVSSIYSVGSNGYY ADSVKGRSTISRDNAKNTLYLQMNSLKPEDTAVYYCAAEPGGSWWDAYSYWGQGTQVTVS S #### Counted 19 for sp_0005_SySynthetic ConstructTumor protein p53 N-t .. sp_0005_SySynthetic ConstructTumor protein p53 N-terminal transcription-activation domain A=9 C=2 D=3 E=4 F=2 G=15 I=3 K=3 L=9 M=3 N=5 P=2 Q=10 R=8 S=12 T=6 V=8 W=1 Y=5 Amino acid alphabet = 19 Sequence length = 124 Counted 20 for sp_0017_CaCamelidSorghum bicolor multidrug and tox .. sp_0017_CaCamelidSorghum bicolor multidrug and toxic compound extrusion sbmate A=10 C=2 D=4 E=4 F=2 G=15 H=1 I=2 K=4 L=7 M=2 N=5 P=3 Q=6 R=4 S=18 T=7 V=10 W=5 Y=10 Amino acid alphabet = 20 Sequence length = 143 Least number of amino acids is 19 in these entries sp_0005_SySynthetic ConstructTumor protein p53 N-terminal transcription-activation domain All results in ascending count 19 sp_0005_SySynthetic ConstructTumor protein p53 N-terminal transcription-activation domain 20 sp_0017_CaCamelidSorghum bicolor multidrug and toxic compound extrusion sbmate Exclusion of various amino acids is as follows B=2 H=1 J=2 O=2 U=2 X=2 Z=2