The code -- revised version by GrandFather.
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
# Read the alignment file and build %info: GI number => aligned sequence.
# Also records the GI and accession of the human reference record.
my $data = '/DATA/alignment.fas';
open my $inFile, '<', $data or die "Failed at opening $data: $!\n";
# Populate the info hash with GIs as keys and sequences as values
my $humanGi;      # GI taken from the header that mentions HUMAN or Homo
my $accession;    # First six characters of that header's accession field
my $gi;           # Current gi while reading sequences
my %info;
while (my $line = <$inFile>) {
    chomp $line;

    # Stop only on a line that consists of END. An unanchored match also
    # fires on sequence lines that contain the residues E-N-D, which would
    # silently drop that record and every record after it.
    last if $line =~ m/^END\s*$/;

    # A blank line carries no sequence data and must not touch %info.
    next if $line !~ m/\S/;

    if ($line =~ m/^\S+\|(\d+)/) {
        # Header line: remember which record the following lines belong to.
        $gi = $1;

        # Only header lines are inspected for the human marker, and the
        # reference values are replaced only when the full pattern matches,
        # so a non-matching line can no longer reset them to undef.
        if ($line =~ m/(?:HUMAN|Homo)/
            && $line =~ m/^\S+\|(\d+)\|\w+\|(\S{6})/)
        {
            ($humanGi, $accession) = ($1, $2);
        }
        next;
    }

    if (!defined $gi) {
        # Sequence data before any header has no GI to be stored under.
        warn "Skipping line $. of $data: sequence data before first header\n";
        next;
    }

    # Append rather than assign so that a sequence wrapped over several
    # lines is kept whole instead of being reduced to its last line.
    $info{$gi} .= $line;
}
# Debugging aid: show what was parsed from the alignment.
print Dumper (\%info);
close $inFile;
# For every variant in the variant list, report the residue each aligned
# sequence has at the variant's position, and which sequences already carry
# the substituted residue. Output is a tab-separated report.
my $data2 = '/DATA/variantList.txt';
open $inFile, '<', $data2 or die "Failed at opening $data2: $!\n";
my $data3 = '/DATA/pathogenList.txt';
open my $outFile, '>', $data3 or die "Failed at opening $data3: $!\n";

# Fall back to a visible placeholder instead of interpolating undef when no
# human record was identified while reading the alignment.
my $shownGi        = defined $humanGi   ? $humanGi   : 'unknown';
my $shownAccession = defined $accession ? $accession : 'unknown';
print $outFile
"This is [GI: $shownGi] and [Accession: $shownAccession]\nVARIANT\t\tPOTENTIAL\t\tPD\n";

while (defined (my $Variant = <$inFile>)) {
    # Grab a variant from the file (in this example: P82L). Trailing
    # whitespace, including a stray carriage return, is removed so it does
    # not end up in the report.
    $Variant =~ s/\s+\z//;
    next if $Variant eq '';

    # <source><position><sink>, e.g. P, 82, L. The source residue is parsed
    # past but not checked against the human sequence.
    my (undef, $position, $sink) = $Variant =~ m/^(.*?)(\d+)(\w)/;
    if (!defined $position || $position < 1) {
        # Without this guard an unparsable line or position 0 would index
        # from the end of each sequence and report the wrong column.
        warn "Skipping line $. of $data2: cannot parse variant '$Variant'\n";
        next;
    }

    my @VariantList;
    my @PDList;

    # Scan all sequences to check what amino acid they have at the given
    # position. Keys are sorted so the report is identical from run to run.
    # NOTE(review): $position indexes the stored alignment string, gap
    # characters included, so it equals a residue number only when no gaps
    # precede that column.
    foreach my $seqGi (sort keys %info) {
        my $sequence = $info{$seqGi};

        # A sequence shorter than the position has nothing to report.
        next if $position > length $sequence;

        my $potential = substr $sequence, $position - 1, 1;
        my $entry     = $potential . '{' . $seqGi . '}';
        push @VariantList, $entry;

        # Note the cases where we observe the sink (i.e., L) at this position
        push @PDList, $entry if $potential eq $sink;
    }
    print $outFile "$Variant\t@VariantList\t@PDList\n";
}
close $inFile;
# Buffered write errors only surface when the handle is closed.
close $outFile or die "Failed at closing $data3: $!\n";
####
First input: the variant list (the script reads it from /DATA/variantList.txt).
A5V
A5S
A5T
C7F
V8E
L9Q
L9V
G13R
V15G
V15M
G17S
F21C
E22K
E22G
Q23L
G38R
L39R
L39V
G42D
G42S
H44R
F46C
H47R
H49R
H49Q
E50K
T55R
N66S
L68R
G73S
D77Y
H81A
L85F
L85V
G86R
N87S
V88A
A90T
A90V
D91A
D91V
G94A
G94C
G94D
G94R
G94V
V98M
E101G
E101K
D102N
D102G
I105F
S106L
L107V
G109V
I113M
I113T
I114T
G115A
R116G
V119L
D125V
D125G
D126H
L127S
S135N
N140K
L145F
L145S
A146T
C147R
G148R
V149G
V149I
I150T
I152T
####
Second input: the alignment file (the script reads it from /DATA/alignment.fas).
>gi|134611|sp|P00441.2|_Homo_sapiens
MATKAVCVLKGDGPVQGIINFEQKESNGPVKVWGSIKGLTEGLHGFHVHEFGDNTAGCTSAGPHFNPLSRKHGGPKDEERHVGDLGNVTADKDGVADVSIEDSVISLSGDHCIIGRTLVVHEKADDLGKGGNEESTKTGNAGSRLACGVIGIAQ
>gi|112419222|Xenopus_laevis
AMVKAVCVLAGSGDVKGVVRFEQQDD-GDVTVEGKIEGLTDGNHGFHIHVFGDNTNGCLSAGPHFNPQNKNHGSPKDADRHVGDLGNVTA-EGGVAQFKFTDPQISLKGERSIIGRTAVVHEKQDDLGKGGDDESLKTGNAGGRLACGVIGFCP
>gi|62858937|_Xenopus_(Silurana)_tropi...
-MVRAVCVLAGSGDVKGVVHFQQQDE-GPVTVEGKIYGLTDGKHGFHIHEFGDNTNGCISAGPHFNPESKTHGAPEDAVRHVGDLGNVTA-KDGVAEFKLTDSLISLKGNHSIIGRCAVVHEKEDDLGKGGNDESLKTGNAGGRLACGVIGLCQ
>gi|226372562|_Rana_catesbeiana
--MKAICVLKGSSEVTGVVRFEQEED-GPVTVTGQITGLTDGKHGFHIHTYGDNTDGCVSAGPHFNPQGKTHGGPDDEVRHVGDLGNVTS-AGGVADINIKDKLISLKGEHSIIGRTAVVHEKEDDLGKGGDNESLITGNAGGRLACGVIGICQ
>gi|116048074|_Scyliorhinus_torazame
--MKAICVLKGTGEVTGTVQFDQAGG-GPVTVKGSITGLTPGKHGFHVHAFGDNTNGCISAGPHYNPFLKTHGGPGDEERHVGDLGNVEANGDGVATFEIQDNQLHLSGERSIIGRTLVVHEKEDDLGKGEDEESTRTGNAGSRLACGVIGIAK
>gi|216963348|_Ctenopharyngodon_idella
-------------------YFEQEGEKSPVTLSGEITGLTAGKHGFHVHAFGDNTNGCISAGPHFNPYSKNHGGPTDSERHVGDLGNVIAGENGVAKIDIVDKMLTLSGPDSIIGRTMVIHEKEDDLGKGGNEESLKTGNAGGRLACGVIGITQ
>gi|226232347|_Pimephales_promelas
---------------------------------------------------------------HFNPHTQNHGGPTDSARHVGDLGNVTAGENGVAKIDIVDKMLTLSGQHSIIGRTMVIHEKEDDLGKGGNE---------------------
>gi|238801237|_Hemibarbus_mylodon
MAKKAVCVLKGTGEVTGTVFFEQETDGSPVKLSGTISGLTAGKHGFHVHVFGDNTNGCISAGPHFNPHNKNHGGPTDGDRHVGDLGNVTAGESGVAKIDIVDKMLTLSGQHSIIGRTMVIHEKEDDLGKGGNEESLKTGNAGGRLACGVIGITG
>gi|47227092|_Tetraodon_nigroviridis
MVIKAVCVLKGAGETSGTVYFEQQDEKAPVKLTGEIKGLTAGEHGFHVHAFGDNTNGCISAGPHYNPHNKTHAGPNDENRHVGDLGNVTAEADQIAKIDITDSVISLHGKFSIIGRTMVIHEKADDLGKGGNEESLKTGNAGGRLACGVIGITQ
>gi|225706520|_Osmerus_mordax
MVLKAVCVLKGTGEVTGTVFFEQEGDNGPVKLTGEISGLTPGEHGFHVHAFGDNTNGCISAGPHFNPHSKTHGGPTDDVRHVGDLGNVTAGQDNVAKISIQDKHLTLNGVHSIIGRTMVIHEKADDLGKGGNEESLKTGNAGGRLACGVIGITQ
>gi|185132317|_Oncorhynchus_mykiss
MAMKAVCVLKGTGEVTGTVFFEQEGADGPVKLIGEISGLAPGEHGFHVHAYGDNTNGCMSAGPHFNPHNQTHGGPTDAVRHVGDLGNVTAGADNVAKINIQDKMLTLTGPDSIIGRTMVIHEKADDLGKGGNEESLKTGNAGGRQACGVIGIAQ
>gi|56790262|_Danio_rerio
MVNKAVCVLKGTGEVTGTVYFNQEGEKKPVKVTGEITGLTPGKHGFHVHAFGDNTNGCISAGPHFNPHDKTHGGPTDSVRHVGDLGNVTADASGVAKIEIEDAMLTLSGQHSIIGRTMVIHEKEDDLGKGGNEESLKTGNAGGRLACGVIGITQ
>gi|185135289|_Salmo_salar
MALKAVCVLKGTGEVTGTVFFEQEGDGAPVKLTGEIAGLTPGEHGFHVHAFGDNTNGCMSAGPHFNPHNHTHGGPTDTVRHVGDLGNVTAAADSVAKINIQDEILSLAGPHSIIGRTMVIHEKADDLGKGDNEESRKTGNAGSRLACGVIGIAQ
>gi|134284932|_Carassius_auratus
---------------------------------------------FHVHAFGDNTNGCTSAGPHYNPHNQTHGGPTDSVRHVGDLGNV------------------------------------------------------------------
>gi|110180503|_Oryzias_javanicus
----------------------------------------PGEHGFHVHAFGDNTNGCISAGPHFNPYGKDHAGPTDEHRHVGDLGNVTANAENVAKLDFTDKVITLAGPHSIIGRTMVIHEKKDDLGKGGNEESLKTGNA-------------
>gi|229365862|_Anoplopoma_fimbria
MVVKAVCVLKGAGETSGVVHFEQEGDTAAVKLTGEIIGLTPGEHGFHVHAFGDNTNGCISAGPHFNPHNNTHAGPTDEQRHVGDLGNVTAGGDNIAKIDITDKIITLTGQHSIIGRTMVIHEKADDLGKGGNDESLKTGNAGARLACGVIGIAQ
>gi|226934254|_Dicentrarchus_labrax
-------------------------------------------------------------------------------RHVGDLGDVTAGGDNIAKIDITDKMLTLTGPLFIIGRTMVIHEKADDLGKGGNEESLKTG---------------
>gi|54873355|_Sebastes_schlegelii
---------------------------------GEIKGLTPGEHGFHVHAFGDNTNGCISAGPHFNPHGKDHAGPTDQERHVGDLGNVTAGAANVAKIDITDKMLTLTGPLSIIRRTMVIHEKKDDLGKGGNEESLKTGNAGG-----------
>gi|62550923|_Sparus_aurata
-------------------------------------------------------------------HGKNHGGPTDAERHVGDLGNVTAGADNVAKIDITDKMLTLSGPLSIIGRTMVIHEKVDDLGKGGNEE--------------------
>gi|27462182|_Pagrus_major
MVQKAVCVLKGAGETTGVVHFEQESESAPVTLKGEISGLTPDEHGFHVHAFGDNTNGCISAGPHFNPHNKNHAGPTDAERHVGDLGNVTAGADNVAKIDITDKMLTLNGPFSIIGRTMVIHEKADDLGKGGNEESLKTGNAGGRLACGVIGICQ
>gi|12733941|_Platichthys_flesus
-----------------------------------IAGLAPGEHGFHVHSFGDNTNGCMSAGPHFNPHGKNHAGPTDADRHVGDLGNVTAGADNVAEINISDKMLTLNGPNSIIGRTMVIHEKADDLGKGGNDESLKTGNA-------------
>gi|151549024|Paralichthys_olivaceus
------------------------------------------EHGFHVHAFGDNTNGCISAGPHFNPHGKNHAGPTDAERHVGDLGNVTAGKDNVAEINISDKIITLFGAHSIIGRTMVIHEKADDLGKGGNEESLKTGNAGARLACGVIG---
>gi|57908848|_Trematomus_bernacchii
---KAVCVFKGTGEASGTVFFEQENDSAPVKLTGEIKGLTPGEHGFHVHAFGDNTNGCISAGPHFNPHNKTHAGPTDEDRHVGDLGNVTAAADNVAKLNITDKMITLAGQYSIIGRTMVIHEKADDLGKGGNDESLKTGNAGGRLACGVIGIAQ
>gi|57908852|Chionodraco_hamatus
---KAVCVFKGAGEASGTVFFEQETDSCPVKLTGEIKGLTPGEHGFHVHAFGDNTNGCISAGPHFNPHNKTHAGPTDENRHVGDLGNVTAAADNVAKLDITDKMITLAGQYSIIGRTMVIHEKADDLGKGGNDESLKTGNAGGRLACGVIGIAQ
>gi|157152709|_Takifugu_obscurus
MAMKAVCVLKGAGDTSGTVYFEQENESAPVKLTGEIKGLTPGEHGFHVHAFGDNTNGCISAGPHYNPHNKTHAGPTDADRHVGDLGNVTAGADNIAKIDIKDSMLTLTGPYSIIGRTMVIHEKADDLGKGGNEESLKTGNAGGRLACGVIGITQ
>gi|67772081|_Siniperca_chuatsi
--------------------------------------FTPGEHGSHVHVFGDNTNGCISAGPHYNPHGKNHAGPNDAERHVGDLGNVTAGADNVAKIDITDKMPSLTGPYSIIGRTMVIHEKADDLGKGGNEESLKTGNAGGRLACGVIGITQ
>gi|40218091|_Oreochromis_mossambicus
MVLKAVCVLKGTGDTSGTVYFEQENDSAPVKLTGEIKGLTPGEHGFHVHAFGDNTNGCISAGPHFNPYNKNHGGPKDAERHVGDLGNVTAGADNVAKIEITDKVITLTGRDSIIGRTMVIHEKVDDLXKGGNEESLKTGNAGGRLACGVIGITQ
>gi|37542151|_Epinephelus_malabaricus
MVLKAVCVLKGAGETSGTVYFEQETDSAPVKLTGEIKGLTPGEHGFQVHAFGDNTNGCISAGPHFNPHNKHHAGPTDAERHVGDLGNVTAGGDNVAKIDITDKIITLNGPYSIIGRTMVIHEKADDLGTGGNEESLKTGNAGGRLACGVIGISQ
>gi|56785775|Epinephelus_coioides
MDLKAVCVLKGAGETSGTVYFEQESDSAPVKLTGEIKGLTPGEHGFHVHAFGDNTNGCISAGPHFNPHNKQHAGPTDADRHVGDLGNVTAGGDNVAKIDITDKMLTLNGPYSIIGRTMVIHEKADDLGRGGNDESLKTGNAGGRLACGVIGIAQ
>gi|47607437|_Oplegnathus_fasciatus
MVLKAVCVLKGAGETTGTVYFEQESDSAPVKLTGEIKGLTPGEHGFHVHAFGDNTNGCISAGPHFNPHNKNHAGPNDAERHVGDLGNVTAGADNVAKIDIKDHIITLTGPDSIIGRTMVIHEKADDLGKGGNEESLKTGNAGGRLACGVIGITQ
>gi|115392225|_Rachycentron_canadum
MVLKAVCVLKGAGETTGTVYFEQESDSAPVKVTGEIKGLTPGEHGFHVHAFGDNTNGCISAGPHFNPHNKNHAGPNDEERHIGDLGNVTAGADNVAKVDITDKMLTLNGPYSIIGRTMVIHEKADDLGKGGNEESLKTGNAGGRLACGVIGIAQ
>gi|224044145|_Taeniopygia_guttata
AAMRAVCVMQGEGAVKGVIHFEQQGT-GPVKVTGEITGLADGEHGFHVHEFGDNTNGCTSAGPHFNPEQKKHGGPSDAERHVGDLGNVTA-KGGVAQVSIQDSVISLSGPHCIIGRTMVVHERRDDLGRGGNDESLLTGNAGPRLACGVIGIAK
>gi|45384218|_Gallus_gallus
ATLKAVCVMKGDAPVEGVIHFQQQGS-GPVKVTGKITGLSDGDHGFHVHEFGDNTNGCTSAGAHFNPEGKQHGGPKDADRHVGDLGNVTA-KGGVAEVEIEDSVISLTGPHCIIGRTMVVHAKSDDLGRGGDNESKLTGNAGPRLACGVIGIAK
>gi|29373121|_Melopsittacus_undulatus
ATLKAVCVMKGEGPVQGVIHFQQQGN-GPVKVTGKISGLADGDHGFHVHEFGDNTNGCTSAGPHFNPEGKQHGGPSDAERHVGDLGNVTA-KGGVAEVAIEDSIISLSGPHSIVGRTMVVHEKCDDLGRGGDNESKLTGNAGPRLACGVIGIAK
>gi|89515076|_Bufo_gargarizans
-MVKAICVLKGNGPVHGIVGFNQDG--GEVTVKGTINGLTDGLHGFHIHVYGDNTNGCMSAGPHFNPHGKSHGAPEDEERHVGDLGNITS-KDGVAEFEFKDKIISLEGEHNIIGRTAVVHEKADDLGKGGDNESKVTGNAGGRLACGVIGICQ
>gi|226844835|_Trachemys_scripta_elegans
---------------------------------------------------------CTSAGAHFNPNGKNHGGPQDKERHVGDLGNVIANKDGVAEVSIKDSLISLTGPLSIIGRTMVVHEKEDDLGKGNN----------------------
>gi|265797|_Caretta_caretta
---------------------------ATVKAVCVLKGEDPVKEPVKGPVKEPVKGIIYFEQQGN-GPVTLSGSITGLTEGKHGFHVHEFGDNTNGCTSAGAHFNPPGKNHGGPQDNERHVGDLGNVIANKEGVAEVCIKDSLISLTGSQSIIG
>gi|126352669|_Equus_caballus
MALKAVCVLKGDGPVHGVIHFEQQQEGGPVVLKGFIEGLTKGDHGFHVHEFGDNTQGCTTAGAHFNPLSKKHGGPKDEERHVGDLGNVTADENGKADVDMKDSVISLSGKHSIIGRTMVVHEKQDDLGKGGNEESTKTGNAGSRLACGVIGIAP
>gi|126325231|_Monodelphis_domestica
MVLKAVCVLKGDGPVQGTIFFEQKQVGEPVELSGSIKGLAEGDHGFHVHEFGDNTQGCTSAGAHFNPHSKKHGGPTDEERHVGDLGNVTANKDGVATVSIKDSHIELSGPMSIIGRTMVVHEKADDLGKGGNAESEKTGNAGPRLACGVIGIAK
>gi|130497065|_Oryctolagus_cuniculus
MATKAVCVLKGDGPVEATIHFEQKGT-GPVVVKGRITGLTEGLHEFHVHQFGDNRQGCTSAGPHFNPLSKKHGGPKDEERHVGDLGNVTAGSNGVADVLIEDSVISLSGDMSVIGRTLVVHEKEDDLGKGGNDESTKTGNAGSRLACGVIGISP
>gi|74136167|_Macaca_mulatta
MAMKAVCVLKGDSPVQGTINFEQKESNGPVKVWGSITGLTEGLHGFHVHQFGDNTQGCTSAGPHFNPLSRQHGGPKDEERHVGDLGNVTAGKDGVAKVSFEDSVISLSGDHSIIGRTLVVHEKADDLGKGGNEESKKTGNAGGRLACGVIGIAQ
>gi|84579183|_Macaca_fascicularis
MAMKAVCVLKGDSPVQGTINFEQKESNGPVKVWGSITGLTEGLHGYHVHQFGDNTQGCTSAGPHFNPLSRQHGGPKDEERHVGDLGNVTAGKDGVAKVSFEDSVISLSGDHSIIGRTLVVHEKADDLGKGGNEESKKTGNAGGRLACGVIGIAH
>gi|197102620|_Pongo_abelii
MATKAVCVLKGDSPVKGIINFEQKERNGPVKVWGSIEGLTEGLHGFHVHEFGDNTVGCTSAGPHFNPLSRKHGGPKDEERHVGDLGNVTADKDGVVSVSIEDSVISLSGDHCIIGRTLVVHEKADDLGKGGNEESTKTGNAGSRLACGVIGIAQ
>gi|223633904|_Ovis_aries
MATKAVCVLKGDGPVQGTIRFEAKGD--KVVVTGSITGLTEGDHGFHVHQFGDNTQGCTSAGPHFNPLSKKHGGPKDEERHVGDLGNVKADKNGVAIVDIVDPLISLSGEYSIIGRTMVVHERPDDLGRGGNEESTKTGNAGGRLACGVIGIAP
>gi|194672519|_Bos_taurus
MATKAVCVLKGDGPVQGTIHFEAKGN--TVVVTGSITGLTEGDHGFHVHQFGDNTQGCTSAGPHFNPLSKKHSGPKDEERHVGDLGNVTADKNGVAVVDIVDSLISLSGEYSIIGRTMVVHEKPDDLGRGGNEESTKTGNAGSRLACGVIGIAK
>gi|2660692|_Cervus_elaphus
MATKAVCVMKGDGPVQGTIRFEAKGN--TVVVTGSITGLTEGDHGFHVHQFGDNTQGCTSAGPHFNPLSKKHGGPKDEERHVGDLGNVTADKNGVAKVDIVDSLISLSGEHSIIGRTMVVHEKPDDLGRGGNEESTKTGNARNRLACGVIGIAQ
>gi|39578718|_Cavia_porcellus
-ATKAVCVLKGDGPVQGIIHFEQKAN-GPVVVKGRITGLVEGKHGFHVHEFGDNTQGCTSAGPHFNPLSKKHGGPQDEERHVGDLGNVTAGADGVANVSIEDSLISLSGANSIIGRTMVVHEKPDDLGKGGNEESTKTGNAGSRLACGVIGIAQ
>gi|15082144|_Sus_scrofa
---KAVCVLKGDGPVQGTIYFELKGE-KTVLVTGTIKGLAEGDHGFHVHQFGDNTQGCTSAGPHFNPESKKHGGPKDQERHVGDLGNVTAGKDGVATVYIEDSVIALSGDHSIIGRTMVVHEKPDDLGRGGNEESTKTGNAGSRLACGVIG---
>gi|281348263|_Ailuropoda_melanoleuca
--------------------------------------------------------GCTSAGPHFNPLSKKHGGPKDEERHVGDLGNVTAGKDGVATVSLEDSLIALSGDHSIIGRTMVVHEKRDDLGKGGNEESTQTGNAGSRLACGVIGIAK
>gi|8394328|_Rattus_norvegicus
MAMKAVCVLKGDGPVQGVIHFEQKASGEPVVVSGQITGLTEGEHGFHVHQYGDNTQGCTTAGPHFNPHSKKHGGPADEERHVGDLGNVAAGKDGVANVSIEDRVISLSGEHSIIGRTMVVHEKQDDLGKGGNEESTKTGNAGSRLACGVIGIAQ
>gi|45597447|_Mus_musculus
MAMKAVCVLKGDGPVQGTIHFEQKASGEPVVLSGQITGLTEGQHGFHVHQYGDNTQGCTSAGPHFNPHSKKHGGPADEERHVGDLGNVTAGKDGVANVSIEDRVISLSGEHSIIGRTMVVHEKQDDLGKGGNEESTKTGNAGSRLACGVIGIAQ
>gi|55925004|_Mus_spretus
------------------------------------------------HQYGDNTQGCTSAGPHFNPHS-------------------------------------------------------------------------------------
END