5390_7_9 MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED LSNKDYLKRV KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL VKDIVAKGQT PFGIAGADAW TLNGYNQLAF ATATGGGKEA NQYLRYSQPN AIKLSDPIMK DDIKVMDILR INGSKQKNWE GAGYTDVIGA FARGDVLMTP NGSWAITAIN EQKPNFKIGT FMIPGKEKRQ SLTVGAGDLA WSISATTKHP KEANAFVEYM TRPEVMQKYY DVDGSPTAIE GVKQAGEDSP LAGMTEYAFT DRHLVWLQQY WTSEADFHTL TMNYVLTGDK QGMVNDLNAF FNPMKADVD #### 5390_7_9 MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED 5390_8_1 MKWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED 5390_8_2 MEWYKKIGLL ATTALALFGL GACSNYGKSA DDTVTIEYFN QKKEMTKILE EITRDFEKEN SKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED 5390_7_9 MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED LSNKDYLKRV KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL 5390_8_1 MKWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE 5390_8_2 MEWYKKIGLL ATTALALFGL GACSNYGKSA DDTVTIEYFN QKKEMTKILE EITRDFEKEN SKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED LSNKDYLKRV KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL #### 5390_7_9 MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED MEWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED LSNKDYLKRV KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL 5390_8_1 MKWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE EITRDFEKEN PKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED MKWYKKIGLL ATTGLALVGL GACSNYGKSA DGTVTIEYFN QKKEMTKTLE 5390_8_2 MEWYKKIGLL ATTALALFGL GACSNYGKSA DDTVTIEYFN QKKEMTKILE EITRDFEKEN SKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED MEWYKKIGLL ATTALALFGL GACSNYGKSA 
DDTVTIEYFN QKKEMTKILE EITRDFEKEN SKIKVKVVNV PNAGEVLKTR VLAGDVPDVV NIYPQSIELQ EWAKAGVFED LSNKDYLKRV KNGYAEKYAV NEKVYNVPFT ANAYGIYYNK DKFEELGLKV PETWDEFEQL ####
#!/usr/bin/perl
#
# NOTE(review): the data line directly above the shebang is the tail of the
# sample alignment that precedes this script; it is kept verbatim and is not
# part of the program. Copy the script from the shebang line downwards.
#
# Purpose
#   Regroup a block-wise alignment by sequence ID: every ID is written once,
#   followed by all of the sequence text that appears under that ID anywhere
#   in the input file.
#
# Usage
#   script.pl <alignment_file>
#
# Input (as far as the sample data shows -- confirm against a real file)
#   * A line that starts with an ID of the form <digits>_<digits>_<digits>
#     opens a record. Anything after the ID on the same line is sequence text.
#   * Following lines that do not start with an ID belong to the open record.
#   * A line made only of '#' characters is treated as a block delimiter; it
#     closes the open record and is not copied to the output.
#     NOTE(review): assumes '####' is a delimiter rather than data -- confirm.
#
# Output
#   Records are written to "new_alignment_<alignment_file>" and echoed to
#   STDOUT (the original script printed to STDOUT and left the results file
#   empty; writing to both keeps either consumer working). IDs are emitted in
#   the order in which they first appear in the input.
#
# Exit status
#   0 on success; non-zero on a usage error or any I/O failure.

use strict;
use warnings;

if (scalar(@ARGV) != 1) {
    print STDERR "\nUsage: $0 <alignment_file>\n\n";
    exit 1;
}

my ($FILENAME) = @ARGV;

# Three-argument open with a lexical handle; fail loudly instead of silently
# reading nothing when the file is missing or unreadable.
open(my $in_fh, '<', $FILENAME)
    or die "Cannot open file \"$FILENAME\" to read from: $!\n";

my $outputfile = "new_alignment_" . $FILENAME;

# Remove any results file left over from a previous run and say so.
if (unlink($outputfile) == 1) {
    print "Existing \"$outputfile\" file was removed\n";
}

open(my $out_fh, '>', $outputfile)
    or die "Cannot open file \"$outputfile\" to write to!!: $!\n";

# An ID must be the first token on the line and be followed by whitespace or
# end-of-line, so a longer token such as 1_2_3_4 is not mistaken for an ID.
my $ID_LINE = qr{\A \s* (\d+_\d+_\d+) (?!\S) \s* (.*?) \s* \z}xms;

# A block delimiter: nothing but '#' characters (and optional padding).
my $DELIMITER_LINE = qr{\A \s* [#]+ \s* \z}xms;

my @ids_in_order;    # IDs in first-seen order (each listed once)
my %chunks_for;      # ID => array ref of sequence lines, in input order
my $current_id;      # ID of the record currently being filled, if any

LINE:
while (my $line = <$in_fh>) {
    chomp $line;
    $line =~ s/\r\z//;    # tolerate files with CRLF line endings

    next LINE if $line !~ /\S/;    # blank lines carry no data

    if ($line =~ $DELIMITER_LINE) {
        undef $current_id;
        next LINE;
    }

    # List assignment in scalar context is true whenever the pattern matched,
    # even if the text after the ID is empty.
    if (my ($id, $rest) = $line =~ $ID_LINE) {
        $current_id = $id;
        if (!exists $chunks_for{$id}) {
            push @ids_in_order, $id;
            $chunks_for{$id} = [];
        }
        push @{ $chunks_for{$id} }, $rest if length $rest;
        next LINE;
    }

    # Sequence text with no ID in front of it cannot be attributed to a
    # record; report it rather than dropping it silently.
    if (!defined $current_id) {
        warn "$FILENAME line $.: sequence text without a preceding ID, skipped\n";
        next LINE;
    }

    push @{ $chunks_for{$current_id} }, $line;
}

close($in_fh)
    or die "Error while reading \"$FILENAME\": $!\n";

# One record per ID: the ID on its own line, then every sequence line that
# was collected for it across all blocks.
for my $protein_id (@ids_in_order) {
    my $record = join("\n", $protein_id, @{ $chunks_for{$protein_id} }) . "\n";

    print {$out_fh} $record
        or die "Cannot write to \"$outputfile\": $!\n";
    print $record;
}

# Buffered write errors only surface at close, so check it.
close($out_fh)
    or die "Cannot finish writing \"$outputfile\": $!\n";