5390_7_9 MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE
EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED
LSNKDYLKRV KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL
VKDIVAKGQT PFGIAGADAW TLNGYNQLAF ATATGGGKEA NQYLRYSQPN AIKLSDPIMK
DDIKVMDILR INGSKQKNWE GAGYTDVIGA FARGDVLMTP NGSWAITAIN EQKPNFKIGT
FMIPGKEKRQ SLTVGAGDLA WSISATTKHP KEANAFVEYM TRPEVMQKYY DVDGSPTAIE
GVKQAGEDSP LAGMTEYAFT DRHLVWLQQY WTSEADFHTL TMNYVLTGDK QGMVNDLNAF
FNPMKADVD
####
5390_7_9 MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE
EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED
5390_8_1 MKWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE
EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED
5390_8_2 MEWYKKIGLL ATTALALFGL GACSNYGKSA DDTVTIEYFN QKKEMTKILE
EITRDFEKEN SKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED
5390_7_9 MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE
EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED
LSNKDYLKRV KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL
5390_8_1 MKWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE
5390_8_2 MEWYKKIGLL ATTALALFGL GACSNYGKSA DDTVTIEYFN QKKEMTKILE
EITRDFEKEN SKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED
LSNKDYLKRV KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL
####
5390_7_9 MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE
EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED
MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE EITRDFEKEN
PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED LSNKDYLKRV
KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL
5390_8_1 MKWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE
EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED
MKWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE
5390_8_2 MEWYKKIGLL ATTALALFGL GACSNYGKSA DDTVTIEYFN QKKEMTKILE
EITRDFEKEN SKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED
MEWYKKIGLL ATTALALFGL GACSNYGKSA DDTVTIEYFN QKKEMTKILE EITRDFEKEN
SKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED LSNKDYLKRV
KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL
####
#!/usr/bin/perl
#
# Group the sequence blocks of an alignment file by protein ID.
#
# Usage: script.pl <alignment_file>
#
# Input handling:
#   * A line that begins with an ID of the form digits_digits_digits
#     (e.g. "5390_7_9") starts a block for that ID. Any text after the ID
#     on the same line is the first chunk of the block.
#   * Every following non-empty line that does not begin with an ID is a
#     continuation of the current block.
#   * A line consisting only of '#' characters ends the current block and is
#     not copied to the output.
#     NOTE(review): treating "####" as a separator is an assumption based on
#     sample data -- confirm against the real input format.
#
# Output: the file "new_alignment_<alignment_file>" (created in the current
# directory; any previous copy is removed first). It contains one line per
# unique ID, in order of first appearance: the ID followed by all chunks
# collected for that ID, joined with single spaces.
use strict;
use warnings;

if (scalar(@ARGV) != 1) {
    die "\nUsage: script.pl <alignment_file>\n";
}

my ($FILENAME) = @ARGV;
my $outputfile = "new_alignment_" . $FILENAME;

# Open the input first so that a bad file name does not destroy old results.
open(my $in, '<', $FILENAME)
    or die "Cannot open file \"$FILENAME\" to read: $!\n";

## remove any existing results file
if (unlink($outputfile) == 1) {
    print "Existing \"$outputfile\" file was removed\n";
}

## generate results file
open(my $out, '>', $outputfile)
    or die "Cannot open file \"$outputfile\" to write to: $!\n";

## single pass over the input: collect the chunks belonging to each ID
my @ids        = ();    # unique IDs, in order of first appearance
my %chunks_for = ();    # ID => array ref of sequence chunks
my $current_id;         # ID of the block being read; undef outside a block

while (my $line = <$in>) {
    $line =~ s/\s+$//;              # drop newline, "\r" and trailing blanks
    next if $line eq '';

    if ($line =~ /^\s*#+$/) {       # separator line closes the current block
        undef $current_id;
        next;
    }

    if ($line =~ /^\s*(\d+_\d+_\d+)\s*(.*)$/) {
        # Header line: key on the captured ID, not on the whole line, so
        # that blocks with different sequence text still group together.
        my ($id, $rest) = ($1, $2);
        $current_id = $id;
        if (!exists $chunks_for{$id}) {
            push(@ids, $id);
            $chunks_for{$id} = [];
        }
        push(@{ $chunks_for{$id} }, $rest) if $rest ne '';
    }
    elsif (defined $current_id) {
        # Continuation line of the current block.
        $line =~ s/^\s+//;
        push(@{ $chunks_for{$current_id} }, $line);
    }
    # Lines that appear before any ID has been seen are ignored.
}
close($in);

## write one joined record per ID
foreach my $id (@ids) {
    print {$out} join(' ', $id, @{ $chunks_for{$id} }), "\n";
}

# Buffered write errors only surface at close, so check it.
close($out)
    or die "Error while writing \"$outputfile\": $!\n";