yuvraj_ghaly has asked for the wisdom of the Perl Monks concerning the following question:

I have a fasta file containing many protein sequences in fasta format. I want to calculate isoelectric point of each sequences.

I have this code but it is giving errors.


use Bio::Tools::pICalculator;
use Bio::SeqIO;

my $in = Bio::SeqIO->new( -fh => \*STDIN ,-format => 'Fasta' );

my $calc = Bio::Tools::pICalculator->new(-places => 2,
-pKset => 'EMBOSS');

while ( my $seq = $in->next_seq ) {
$calc->seq($seq);
my $iep = $calc->iep;
print sprintf( "%s\t%s\t%.2f\n",
$seq->id,
$iep,
$calc->charge_at_pH($iep) );

for( my $i = 0; $i <= 14; $i += 0.5 ){
print sprintf( "pH = %.2f\tCharge = %.2f\n",
$i,
$calc->charge_at_pH($i) );
}
}

package Bio::Tools::pICalculator;
use strict;


use base qw(Bio::Root::Root);
my $DTASelect_pK = { N_term => 8.0,
K => 10.0, # Lys
R => 12.0, # Arg
H => 6.5, # His
D => 4.4, # Asp
E => 4.4, # Glu
C => 8.5, # Cys
Y => 10.0, # Tyr
C_term => 3.1
};
my $Emboss_pK = { N_term => 8.6,
K => 10.8, # Lys
R => 12.5, # Arg
H => 6.5, # His
D => 3.9, # Asp
E => 4.1, # Glu
C => 8.5, # Cys
Y => 10.1, # Tyr
C_term => 3.6
};
sub new {
my( $class, %opts ) = @_;
my $self = $class->SUPER::new(%opts);
$self = bless {}, ref $self || $self;
$self->seq( $opts{-seq} ) if exists $opts{-seq};
$self->pKset( $opts{-pKset} || 'EMBOSS' );
exists $opts{-places} ? $self->places( $opts{-places} ) :
$self->places(2);
return $self;
}
sub seq {
my( $this, $seq ) = @_;
unless( defined $seq && UNIVERSAL::isa($seq,'Bio::Seq') ){
die $seq . " is not a valid Bio::Seq object\n";
}
$this->{-seq} = $seq;
$this->{-count} = count_charged_residues( $seq );
return $this->{-seq};
}
sub pKset {
my ( $this, $pKset ) = @_;
if( ref $pKset eq 'HASH' ){ # user defined pK values
$this->{-pKset} = $pKset; }elsif( $pKset =~ /^emboss$/i ){ # from EMBOSS's iep program
$this->{-pKset} = $Emboss_pK;
}elsif( $pKset =~ /^dtaselect$/i ){ # from DTASelect (scripps)
$this->{-pKset} = $DTASelect_pK;
}else{ # default to EMBOSS
$this->{-pKset} = $Emboss_pK;
}
return $this->{-pKset};
}
sub places {
my $this = shift;
$this->{-places} = shift if @_;
return $this->{-places};
}
sub iep {
my $this = shift;
return _calculate_iep($this->{-pKset},
$this->{-places},
$this->{-seq},
$this->{-count}
);
}
sub charge_at_pH {
my $this = shift;
return _calculate_charge_at_pH( shift, $this->{-pKset},
$this->{-count} );
}
sub count_charged_residues {
my $seq = shift;
my $sequence = $seq->seq;
my $count;
for ( qw( K R H D E C Y ) ){ # charged AA's
$count->{$_}++ while $sequence =~ /$_/ig;
}
return $count;
}
sub _calculate_iep {
my( $pK, $places, $seq, $count ) = @_;
my $pH = 7.0;
my $step = 3.5;
my $last_charge = 0.0;
my $format = "%.${places}f";

unless( defined $count ){
$count = count_charged_residues($seq);
}
while(1){
my $charge = _calculate_charge_at_pH( $pH, $pK, $count );
last if sprintf($format,$charge) ==
sprintf($format,$last_charge);
$charge > 0 ? ( $pH += $step ) : ( $pH -= $step );
$step /= 2.0;
$last_charge = $charge;
}
return sprintf( $format, $pH );
}
sub _calculate_charge_at_pH {
no warnings; # don't complain if a given key doesn't exist
my( $pH, $pK, $count ) = @_;
my $charge = _partial_charge( $pK->{N_term}, $pH )
+ $count->{K} * _partial_charge( $pK->{K}, $pH )
+ $count->{R} * _partial_charge( $pK->{R}, $pH )
+ $count->{H} * _partial_charge( $pK->{H}, $pH )
- $count->{D} * _partial_charge( $pH, $pK->{D} )
- $count->{E} * _partial_charge( $pH, $pK->{E} )
- $count->{C} * _partial_charge( $pH, $pK->{C} )
- $count->{Y} * _partial_charge( $pH, $pK->{Y} )
- _partial_charge( $pH, $pK->{C_term} );
return $charge;
}
sub _partial_charge {
my $cr = 10 ** ( $_[0] - $_1 );
return $cr / ( $cr + 1 );
}
1;
  • Comment on Caluclation of isoelectric point of protein in fasta file using per;

Replies are listed 'Best First'.
Re: Caluclation of isoelectric point of protein in fasta file using per;
by marto (Cardinal) on Jul 02, 2013 at 09:52 UTC

    Bio::Tools::pICalculator:

    "Calculates the isoelectric point of a protein, the pH at which there is no overall charge on the protein. Calculates the charge on a protein at a given pH. Can use built-in sets of pK values or custom pK sets."

    See also Perl and Bioinformatics and pICalculator.

    Update: For context, prior to OP updating their post it read "I have a fasta file containing many protein sequences in fasta format. I want to calculate isoelectric point of each sequences."

    A reply falls below the community's threshold of quality. You may see it by logging in.
Re: Caluclation of isoelectric point of protein in fasta file using per;
by marto (Cardinal) on Jul 15, 2013 at 19:54 UTC

    If you're going to update a post please mark it as such, and use the proper formatting. If you have errors why aren't you telling us what they are? you provide no input data or expected output. All of the above is addressed in How do I post a question effectively?.

Re: Caluclation of isoelectric point of protein in fasta file using per;
by yuvraj_ghaly (Sexton) on Jul 23, 2013 at 09:26 UTC

    Thank you all users of Perl Monks who helped me to solve my issue.

    Here is the code which I have now and running as per my requirement

    #!/usr/bin/perl -w # calculate pI. # based on example from, and using code from, Fred Lindberg (lindberg@ +id.wustl.edu) # this script came from http://www.id.wustl.edu/~lindberg/docs/program +s/, but see # http://www-nmr.cabm.rutgers.edu/bioinformatics/ZebaView/help.html #use strict; use warnings; my $file = shift || die "pI.pl <fasta file of sequences>\n"; my $allowed= "ACDGHKLQVWSY"; # Supported amino acids my @acids=('C','D','E','Y'); # Acidic for pI my @bases=('H','K','R'); # Basic for pI my $water=18.0152; # mol wt of H2O (added decimals, since # it's multiplied by no_aa-1). # molecular weights my %aawt=('A', 89.09, 'C', 121.16, 'D', 133.10, 'E', 147.13, 'F', 165. +19, 'G', 75.07, 'H', 155.16, 'I', 131.18, 'K', 146.19, 'L', 131.18, 'M',149.21, 'N', 132.12, 'P', 115.13, 'Q', 146.15, 'R', 174.20, 'S',105.09, 'T', 119.12, 'V', 117.15, 'W', 204.23, 'Y', 181.19) +; # pKa/pKbs (< & > are default NT & CT) my %pka= ('C', 8.3, 'D', 3.86, 'E', 4.25, 'Y', 10.07, 'H', 6.04, 'K', 10.79, 'R', 12.48); # NT pKa my %nt= ('A', 9.69, 'E', 9.67, 'M', 9.21, 'P', 10.60, 'S', 9.15, 'T', 9.10, 'V', 9.62); # CT pKa my %ct= ('D', 2.09, 'E', 2.19); # A280s (half cysteine for C) my %A280 = ('W', 5690., 'Y', 1280., 'C', 120.); my %protein; { open (IN, $file) || die "Can't open $file\n"; my $tag; my $seq; while (<IN>) { chomp; if (/^>/) { if ($seq) {$protein{$tag} = uc($seq); undef $seq} $tag = $_; } else {$seq .= $_} } close IN; } open OUT, ">$file.pI" || die "Can't open >$file.pI for writing"; foreach my $proteinid (keys %protein) { my $protein = $protein{$proteinid}; my $first = substr($protein,0,1); # get NT aa my $last = substr($protein, -1,1); # get CT aa my %base; my %acid; $base{'start'}=1; # One amino terminus $acid{'end'}=1; # One carboxy terminus # NT different for some aa, 7.50 default if($nt{$first}) {$pka{'start'} = $nt{$first}} else {$pka{'start'} = +7.50} # CT different for some aa, 3.55 default if($ct{$last}) {$pka{'end'} = $ct{$last}} else {$pka{'end'} = 3.55} # count the amino acid composition my %residue; foreach my $aa (split(//, $allowed)) {$residue{$aa} = ($protein =~ s +/$aa//g)} foreach my $aa (@acids) { if($residue{$aa}) { $acid{$aa} = $residue{$aa}; # collect acids } } foreach my $aa (@bases) { if($residue{$aa}) { $base{$aa} = $residue{$aa}; # collect bases } } # binary search for pI my $hi=14; # Theoretical max my $lo=0; # Theoretical min my $pI=7; my $old_pI=1; my $iterations = 0; while(abs($pI-$old_pI)>0.001) { # Two correct decimals if($iterations++ > 15) { # 14/0.001 > 2^14, so this shouldn't h +appen print STDERR "Excessive iterations. Something's badly wrong\n"; last; } my $result=0; foreach my $aa (keys(%acid)) { # Left (acid) side unless ($pka{$aa}) {print STDERR "pka for $aa NOT FOUND\n"} $result += $acid{$aa}/(1+10**($pka{$aa}-$pI)); } foreach my $aa (keys(%base)) { # Right (base) side $result -= $base{$aa}/(1+10**($pI-$pka{$aa})); } $old_pI = $pI; if($result > 0){ $pI=($pI+$lo)/2; # Go lower since charge is neg $hi=$old_pI; } else { $pI=($hi+$pI)/2; # Go higher; charge is pos $lo=$old_pI; } } print OUT "$proteinid\t$pI\n"; }

      Good job then.