in reply to Iso electric point calculation using perl

If I were you, I'd email the author (lindberg@id.wustl.edu) and ask him about his script. I don't know a whole lot about isoelectric points, but this seems to work:

#!/usr/bin/perl -w # calculate pI. # based on example from, and using code from, Fred Lindberg (lindberg@ +id.wustl.edu) # this script came from http://www.id.wustl.edu/~lindberg/docs/program +s/, but see # http://www-nmr.cabm.rutgers.edu/bioinformatics/ZebaView/help.html use strict; use warnings; my $file = shift || die "pI.pl <fasta file of sequences>;\n"; print STDERR "start\n"; my $allowed="ACDEFGHIKLMNPQRSTVWY";# Supported amino acids my @acids=('C','D','E','Y'); # Acidic for pI my @bases=('H','K','R'); # Basic for pI my $water=18.0152; # mol wt of H2O (added decimals, since # it's multiplied by no_aa-1). # molecular weights my %aawt=('A', 89.09, 'C', 121.16, 'D', 133.10, 'E', 147.13, 'F', 165. +19, 'G', 75.07, 'H', 155.16, 'I', 131.18, 'K', 146.19, 'L', 131.18, 'M',149.21, 'N', 132.12, 'P', 115.13, 'Q', 146.15, 'R', 174.20, 'S',105.09, 'T', 119.12, 'V', 117.15, 'W', 204.23, 'Y', 181.19) +; # pKa/pKbs (&lt; &amp; &gt; are default NT &amp; CT) my %pka= ('C', 9.0, 'D', 4.05, 'E', 4.45, 'Y', 10.0, 'H', 5.98, 'K', 10.0, 'R', 12.0); # NT pKa my %nt= ('A', 7.59, 'E', 7.70, 'M', 7.00, 'P', 8.36, 'S', 6.93, 'T', 6.82, 'V', 7.44); # CT pKa my %ct= ('D', 4.55, 'E', 4.75); # A280s (half cysteine for C) my %A280 = ('W', 5690., 'Y', 1280., 'C', 120.); my @currentProtein; my %protein; { open (IN, $file) || die "Can't open $file\n"; my $tag; my $seq; while (<IN>) { #print STDERR "$_"; chomp; if (($_ !~ /^>/) && ($_ =~ /\w/)) { push (@currentProtein, $_); } if (/^>/) { $tag = $_; } if ((($_ !~ /\w/) || (eof)) && (@currentProtein > 0)) { $seq = join("", @currentProtein); $protein{$tag} = uc($seq); @currentProtein = (); } } close IN; } #open OUT, "> $file.txt" || die "Can't open > $file.txt for writing"; my $counter = keys %protein; foreach my $proteinid (keys %protein) { my $protein = $protein{$proteinid}; my $first = substr($protein,0,1); # get NT aa my $last = substr($protein, -1,1); # get CT aa my %base; my %acid; $base{'start'}=1; # One amino terminus $acid{'end'}=1; # One carboxy terminus # NT different for some aa, 7.50 default if($nt{$first}) {$pka{'start'} = $nt{$first}} else {$pka{'start'} =7 +.50} # CT different for some aa, 3.55 default if($ct{$last}) {$pka{'end'} = $ct{$last}} else {$pka{'end'} = 3.55} # count the amino acid composition my %residue; foreach my $aa (split('', $allowed)) {$residue{$aa} = ($protein =~ s +/$aa//g)} foreach my $aa (@acids) { if($residue{$aa}) { $acid{$aa} = $residue{$aa}; # collect acids } } foreach my $aa (@bases) { if($residue{$aa}) { $base{$aa} = $residue{$aa}; # collect bases } } # binary search for pI my $hi=14; # Theoretical max my $lo=0; # Theoretical min my $pI=7; my $old_pI=1; my $iterations = 0; while(abs($pI-$old_pI)>0.001) { # Two correct decimals if($iterations++ > 15) { # 14/0.001 &gt; 2^14, so this shouldn' +t happen print STDERR "Excessive iterations. Something's badly wrong\n"; last; } my $result=0; foreach my $aa (keys(%acid)) { # Left (acid) side unless ($pka{$aa}) {print STDERR "pka for $aa NOT FOUND\n"} $result += $acid{$aa}/(1+10**($pka{$aa}-$pI)); } foreach my $aa (keys(%base)) { # Right (base) side $result -= $base{$aa}/(1+10**($pI-$pka{$aa})); } $old_pI = $pI; if($result > 0){ $pI=($pI+$lo)/2; # Go lower since charge is neg $hi=$old_pI; } else { $pI=($hi+$pI)/2; # Go higher; charge is pos $lo=$old_pI; } } print STDERR "$proteinid\t$pI\n"; }

Replies are listed 'Best First'.
Re^2: Iso electric point calculation using perl
by yuvraj_ghaly (Sexton) on Jul 25, 2013 at 06:13 UTC

    your code is running but here is a error i have found in your code. The output it is showing contains pI of first sequence in first place however it eliminated the pI of last sequence and after first sequence result it shows the pI of second last sequence then then showed pI result in descending manner.

    this is my sample.fasta containing sequences

    >gi|226694487|sp|Q1DF98.2|ATPF_MYXXD RecName: Full=ATP synthase subuni +t b; AltName: Full=ATP synthase F(0) sector subunit b; AltName: Full= +ATPase subunit I; AltName: Full=F-type ATPase subunit b; Short=F-ATPa +se subunit b MFLPSVLAASNLVKVQPGLIFWTLVTFVIAAVVLKWKAWGPILSLVEEREKQIASSIESAKRERAEAEKL LADQKTAIAEARREAAEMMRRNTQEMEKFREELMAKSRKEAEELKLSARREIDEQKAKAIAEVRSMAVDL AMEVAGKLISERMDDSKQRALAEQFVQGLPLNSTSATGAVRRTA<br><br> >gi|172046103|sp|Q1D4N0.2|LIPA_MYXXD RecName: Full=Lipoyl synthase; Al +tName: Full=Lip-syn; Short=LS; AltName: Full=Lipoate synthase; AltNam +e: Full=Lipoic acid synthase; AltName: Full=Sulfur insertion protein +LipA MTETTRKPEWLKVRLPHGEGYERVKAIVKRTKLATVCEEARCPNIAECWGGGTATVMLMGEVCTRACRFC HVKVGAPPPLDPMEPIHLAQAVKEMDLEYIVVTSVNRDDRPDGGASHFASAIRELRRESPRTIVEVLIPD FKGVEKDLTTVAEAKPHVVAHNVETVERLTPTVRDRRAKYHQSLRVLEYLKNRPEGLYTKTSVMVGLGET DAELEQTFKDLRDVGVDVLTLGQYLQPSQYHLRVERFVTPAQFEAYKTLAESYGFLYVASGPLVRSSYRA AEFFMKGLMERERLERLG<br><br> >gi|123374798|sp|Q1DDB3.1|KDSB_MYXXD RecName: Full=3-deoxy-manno-octul +osonate cytidylyltransferase; AltName: Full=CMP-2-keto-3-deoxyoctulos +onic acid synthase; Short=CKS; Short=CMP-KDO synthase MQSCRTVAVIPARHASTRFPGKPLAIIAGRTMIEHVWRRCQEAQAFDEVWVATDDDRIRAAVEGFGGKAV MTSPACATGTDRVAEVALGRPDIDIWVNVQGDEPLVDPATLQRLAGLFQDASVRMGTLVRPLEADEAASP HVVKAVLALNGDALYFSRSLVPHVREPGTPVQRWGHIGLYGYRREVLLSLAKLAPTPLEDAEKLEQLRAL EHGIPIRCAKVTSHTVAVDLPGDVEKVEALMRARGG<br><br> >gi|123374766|sp|Q1DCG7.1|FMT_MYXXD RecName: Full=Methionyl-tRNA formy +ltransferase MSRPRIVFMGTPEFAVSSLAACFELGDVVAVVTQPDKPKGRGNTVTAPPVKELALSRGVPVLQPTKLRTP PFAEELRQYAPDVCVVTAYGRILPKDLLELPTHGCVNVHGSLLPRFRGAAPIQWAIAHGDTETGVSLMVM DEGLDTGPVLAMKRMAIAPDETSASLYPKLAALGGEVLREFLPAYLSGELKPVPQPSEGMVLAPIIEKDQ GRLDFTKPAVELERRLRAFTPWPGAFTTLGGKLLKVHRAQARGGSGAPGTVLASGPDGIEVACGEGSLVL LDLQPEGKRVMRAADFLQGHKLAPGSQPFVAG<br><br> >gi|123374695|sp|Q1DAN7.1|SSRP_MYXXD RecName: Full=SsrA-binding protei +n MTSGGKSKGVGSEPGVRVIAENRRARFDYTVDEKVEAGLALTGSEVKSLRDGIANLSDAYALPKGDELFL LNANIGSYKAASFFDHLPTRGRKLLMHRGEIDRWTAKVRERGYSIIPLVLYFRNGRAKVELGLCRGKTHE DRRHDIKERETKREMDRAMRRR<br><br> >gi|123374693|sp|Q1DAM1.1|PNP_MYXXD RecName: Full=Polyribonucleotide n +ucleotidyltransferase; AltName: Full=Polynucleotide phosphorylase; Sh +ort=PNPase MLKKSVKIGESELSIEVGRLAKQADGSVVVRYGDTMLLVTAVSAREKKDIDFLPLTVEYQEKLYSAGRIP GSYFKREGRLTEKETLASRLVDRSCRPLFPEGYAYETQIIASVISSDPENEGDIHGITGASAALWVSDIP FDGPIAGIRVGRVGGQLVANPTAKQREQSDLDLVMAVSRKAIVMVEGGAEEVSEADMVAALDFGFTTAQP ALDLQDELRRELNKQVRSFEKPAAVDEGLRAKVRELAMDGIKAGYGIKEKGARYEALGKTKKEALAKLKE QLGDGYTPLVEKHAKAVVEDLKYEHMREMTVNGGRIGDRGHDVVRSITCEVGVLPRTHGSAVFTRGETQA LVVTTLGTSDDEQRLEMLGGMAFKRFMLHYNFPPFSVNETKPLRGPGRREVGHGALAERALRNMVPKSES FPYTVRLVSDILESNGSSSMASVCGGTLALMDAGVPLKAPVAGIAMGLVKEGDKIAILSDILGDEDHLGD MDFKVCGTSKGITSIQMDIKITGLTTEIMSRALEQARQGRLHILGEMLKTLAESRKEISQYAPRITTIQI RPEFIKNVIGPGGKVIKDIIARTGAAINIEDSGRVDIASANGEAVKAAIAMIQALTREAEIGKIYTGTVR KIAEFGAFVELFPGTDGLIHISELSDKRVKSVSDVLNEGDEVLVKVVSIDKTGKIRLSRKEAMAERAAQQ GAAAGEAAAQPAPAPTQPDAKA

    this is the output it showed using your code

    <code>>gi|226694487|sp|Q1DF98.2|ATPF_MYXXD RecName: Full=ATP synthase subunit b; AltName: Full=ATP synthase F(0) sector subunit b; AltName: Full=ATPase subunit I; AltName: Full=F-type ATPase subunit b; Short=F-ATPase subunit b 10.9674072265625
    >gi|123374695|sp|Q1DAN7.1|SSRP_MYXXD RecName: Full=SsrA-binding protein 9.0909423828125
    >gi|123374766|sp|Q1DCG7.1|FMT_MYXXD RecName: Full=Methionyl-tRNA formyltransferase 7.8262939453125
    >gi|123374798|sp|Q1DDB3.1|KDSB_MYXXD RecName: Full=3-deoxy-manno-octulosonate cytidylyltransferase; AltName: Full=CMP-2-keto-3-deoxyoctulosonic acid synthase; Short=CKS; Short=CMP-KDO synthase 4.8543701171875
    >gi|172046103|sp|Q1D4N0.2|LIPA_MYXXD RecName: Full=Lipoyl synthase; AltName: Full=Lip-syn; Short=LS; AltName: Full=Lipoate synthase; AltName: Full=Lipoic acid synthase; AltName: Full=Sulfur insertion protein LipA 8.4586181640625
      I've made the same changes to the script that I made to this script for you about half an hour ago.

      You should get a book about perl.

      Also, see this.

      #!/usr/bin/perl -w # calculate pI. # based on example from, and using code from, Fred Lindberg (lindberg@ +id.wustl.edu) # this script came from http://www.id.wustl.edu/~lindberg/docs/program +s/, but see # http://www-nmr.cabm.rutgers.edu/bioinformatics/ZebaView/help.html use strict; use warnings; my $file = shift || die "pI.pl <fasta file of sequences>;\n"; print STDERR "start\n"; my $allowed="ACDEFGHIKLMNPQRSTVWY";# Supported amino acids my @acids=('C','D','E','Y'); # Acidic for pI my @bases=('H','K','R'); # Basic for pI my $water=18.0152; # mol wt of H2O (added decimals, since # it's multiplied by no_aa-1). # molecular weights my %aawt=('A', 89.09, 'C', 121.16, 'D', 133.10, 'E', 147.13, 'F', 165. +19, 'G', 75.07, 'H', 155.16, 'I', 131.18, 'K', 146.19, 'L', 131.18, 'M',149.21, 'N', 132.12, 'P', 115.13, 'Q', 146.15, 'R', 174.20, 'S',105.09, 'T', 119.12, 'V', 117.15, 'W', 204.23, 'Y', 181.19) +; # pKa/pKbs (&lt; &amp; &gt; are default NT &amp; CT) my %pka= ('C', 9.0, 'D', 4.05, 'E', 4.45, 'Y', 10.0, 'H', 5.98, 'K', 10.0, 'R', 12.0); # NT pKa my %nt= ('A', 7.59, 'E', 7.70, 'M', 7.00, 'P', 8.36, 'S', 6.93, 'T', 6.82, 'V', 7.44); # CT pKa my %ct= ('D', 4.55, 'E', 4.75); # A280s (half cysteine for C) my %A280 = ('W', 5690., 'Y', 1280., 'C', 120.); my @currentProtein; my %protein; my $fastaLine; my $protSequence; my @fastaTag; my @tagOrder; open (FASTA, $file) || die "Can't open $file\n"; my $tag; my $seq; while (<FASTA>) { #print STDERR "CURRENT: $_"; chomp; if (($_ !~ /^>/) && ($_ =~ /\w/)) { push (@currentProtein, $_); } if ((/^>/) || (eof)) { if ((@currentProtein > 0) || (eof)) { $protSequence = join("", @currentProtein); $protSequence =~ s/ //g; @fastaTag = split(" ", $fastaLine); $protein{$fastaTag[0]} = $protSequence; push (@tagOrder, $fastaTag[0]); @currentProtein = (); } $fastaLine = $_ if $_ =~ /\w/; } } close FASTA; #open OUT, "> $file.txt" || die "Can't open > $file.txt for writing"; my $counter = keys %protein; for (@tagOrder) { #print STDERR "ID: $_\t SEQUENCE:$protein{$_}\n"; <--uncomment t +o show tags/sequences; my $protein = $protein{$_}; my $first = substr($protein,0,1); # get NT aa my $last = substr($protein, -1,1); # get CT aa my %base; my %acid; $base{'start'}=1; # One amino terminus $acid{'end'}=1; # One carboxy terminus # NT different for some aa, 7.50 default if($nt{$first}) {$pka{'start'} = $nt{$first}} else {$pka{'start' +} =7.50} # CT different for some aa, 3.55 default if($ct{$last}) {$pka{'end'} = $ct{$last}} else {$pka{'end'} = 3. +55} # count the amino acid composition my %residue; foreach my $aa (split('', $allowed)) {$residue{$aa} = ($protein +=~ s/$aa//g)} foreach my $aa (@acids) { if($residue{$aa}) { $acid{$aa} = $residue{$aa}; # collect acids } } foreach my $aa (@bases) { if($residue{$aa}) { $base{$aa} = $residue{$aa}; # collect bases } } # binary search for pI my $hi=14; # Theoretical max my $lo=0; # Theoretical min my $pI=7; my $old_pI=1; my $iterations = 0; while(abs($pI-$old_pI)>0.001) { # Two correct decimals if($iterations++ > 15) { # 14/0.001 &gt; 2^14, so this shou +ldn't happen print STDERR "Excessive iterations. Something's badly wrong\ +n"; last; } my $result=0; foreach my $aa (keys(%acid)) { # Left (acid) side unless ($pka{$aa}) {print STDERR "pka for $aa NOT FOUND\n"} $result += $acid{$aa}/(1+10**($pka{$aa}-$pI)); } foreach my $aa (keys(%base)) { # Right (base) side $result -= $base{$aa}/(1+10**($pI-$pka{$aa})); } $old_pI = $pI; if($result > 0){ $pI=($pI+$lo)/2; # Go lower since charge is neg $hi=$old_pI; } else { $pI=($hi+$pI)/2; # Go higher; charge is pos $lo=$old_pI; } } print STDERR "$_\t$pI\n"; }

        thank you for the code @mtmcc...hurray it works :)