#!/usr/bin/perl -w
use strict;
use warnings;
use Data::Dumper;

# Checks a list of amino-acid variants (e.g. "P82L") against a set of
# protein sequences.
#
# Inputs and outputs (fixed paths):
#   /DATA/proteinfile.txt   - header lines matching /^\S+\|(\d+)/ (the captured
#                             digits are used as the GI), each followed by one
#                             or more sequence lines; reading stops at a line
#                             whose only word characters are "END".
#   /DATA/variantlist.txt   - one variant per line: source residue, position,
#                             sink residue (e.g. P82L).
#   /DATA/VariantOutput.txt - one report entry per sequence that carries the
#                             sink residue at the variant position.
#
# For every variant the script prints to STDOUT whether the sequence whose
# header contains "HUMAN" has the source residue at the given position.
#
# NOTE(review): positions are treated as 1-based (P82L = 82nd residue) and
# sequences are assumed to share a coordinate system - confirm against the
# real data files.

my $protein_file = '/DATA/proteinfile.txt';
my $variant_file = '/DATA/variantlist.txt';
my $output_file  = '/DATA/VariantOutput.txt';

my ( $sequence_of, $humangi ) = read_sequences($protein_file);

#print Dumper($sequence_of);
die "No HUMAN entry with a GI was found in $protein_file\n"
    unless defined $humangi;
print "$humangi\n";

open my $variant_fh, '<', $variant_file
    or die "Failed at opening $variant_file! $!\n";
open my $out_fh, '>', $output_file
    or die "Failed at opening $output_file! $!\n";

VARIANT:
while ( my $variant = <$variant_fh> ) {

    # Strip the newline (and any CR / padding) before parsing.
    $variant =~ s/\A\s+//;
    $variant =~ s/\s+\z//;
    next VARIANT if $variant eq '';

    # Split the variant into three parts, e.g. P82L -> (P, 82, L).
    my ( $source, $position, $sink ) = parse_variant($variant);
    if ( !defined $source ) {
        warn "Skipping unrecognised variant '$variant'\n";
        next VARIANT;
    }
    print "$source , $position , $sink\n";

    # Check whether the human sequence has the source residue at the position.
    my $human_residue = residue_at( $sequence_of->{$humangi}, $position );
    if ( defined $human_residue && $human_residue eq $source ) {
        print "Yep, $source has been confirmed!\n";
    }
    else {
        print "There is something wrong!\n";
    }

    # Scan every sequence and note the ones that carry the sink residue.
    # Keys are sorted so the report order is the same on every run.
    my @variant_list;
    for my $gi ( sort { $a <=> $b } keys %{$sequence_of} ) {
        my $residue = residue_at( $sequence_of->{$gi}, $position );
        next if !defined $residue;    # sequence too short for this position
        push @variant_list, $residue;

        if ( $residue eq $sink ) {
            print {$out_fh}
                "A pathogenic deviation has been found at site $position - from $source to $sink !\n"
                . " And the corresponding gi for this deviation is: $gi\n";
        }
    }

    #print {$out_fh} "Variant list contains: @variant_list\n";
}

close $variant_fh;

# Buffered write errors only surface when the handle is closed.
close $out_fh or die "Failed at closing $output_file! $!\n";

# Reads the protein file and builds a hash of GI => sequence.
# Returns (hashref of sequences, GI of the last header containing "HUMAN");
# the second value is undef when no such header exists.
# Dies if the file cannot be opened.
sub read_sequences {
    my ($path) = @_;

    open my $fh, '<', $path or die "Failed at opening $path! $!\n";

    my %sequence_of;
    my $current_gi;
    my $human_gi;

    LINE:
    while ( my $line = <$fh> ) {
        chomp $line;

        # Only a line with no word characters other than END terminates the
        # input; an unanchored match would also fire on the residues E-N-D
        # inside a sequence.
        last LINE if $line =~ m/^\W*END\W*$/;

        if ( $line =~ m/^\S+\|(\d+)/ ) {
            $current_gi = $1;

            # Only header lines may select the human entry.
            $human_gi = $current_gi if $line =~ m/HUMAN/;
            next LINE;
        }

        ( my $residues = $line ) =~ s/\s+//g;
        next LINE if $residues eq '' || !defined $current_gi;

        # Append so that sequences wrapped over several lines stay complete.
        $sequence_of{$current_gi} .= $residues;
    }

    close $fh;
    return ( \%sequence_of, $human_gi );
}

# Splits a variant such as "P82L" into (source, position, sink).
# Returns an empty list when the text does not have that shape.
sub parse_variant {
    my ($variant) = @_;

    return if $variant !~ m/\A([A-Za-z])(\d+)([A-Za-z])\z/;
    return ( $1, $2, $3 );
}

# Returns the residue at a 1-based position of a sequence, or undef when the
# sequence is missing or the position lies outside it.
sub residue_at {
    my ( $sequence, $position ) = @_;

    return if !defined $sequence;
    return if $position < 1 || $position > length($sequence);
    return substr( $sequence, $position - 1, 1 );
}