#!/usr/bin/perl

# Regroup the lines of a file by sequence ID.
#
# Usage: script.pl <file>
#
# Pass 1 scans <file> and collects every distinct ID of the form
# <digits>_<digits>_<digits>. Pass 2 takes the IDs in reverse order of
# first appearance and, for each one, writes every line of <file> that
# contains that ID to the results file "new_alignment_<file>".

use strict;
use warnings;

if (scalar(@ARGV) != 1) {
    print "\n";
    print "Usage: script.pl <file>";
    print "\n";
    exit();
}

my ($FILENAME) = @ARGV;

# Three-arg open with a lexical handle; fail loudly if the input is missing.
open(my $in_fh, '<', $FILENAME)
    or die "Cannot open file \"$FILENAME\" for reading: $!\n";

## remove any existing results file, then create a fresh one
my $outputfile = "new_alignment_" . $FILENAME;
if (unlink($outputfile) == 1) {
    print "Existing \"$outputfile\" file was removed\n";
}

my $out_fh;
unless (open($out_fh, '>', $outputfile)) {
    print "Cannot open file \"$outputfile\" to write to!!\n\n";
    exit 1;
}

## pass 1: collect each ID once, in order of first appearance
my @ids;
my %seen;
while (my $line = <$in_fh>) {
    next unless $line =~ /(\d+_\d+_\d+)/;

    # Store the matched ID itself (not the whole line) so that every
    # line carrying the same ID is grouped under one entry.
    push @ids, $1 unless $seen{$1}++;
}

## pass 2: for every ID, write out all lines that carry it
# NOTE(review): matching lines go to the results file rather than STDOUT,
# since that file is otherwise left empty -- confirm this is the intent.
for my $protein_id (reverse @ids) {

    # The first pass left the handle at end-of-file; rewind before rescanning.
    seek($in_fh, 0, 0)
        or die "Cannot rewind \"$FILENAME\": $!\n";

    # \Q..\E keeps the ID literal; the digit guards stop "1_2_3" from
    # matching inside a longer ID such as "11_2_3" or "1_2_34".
    my $id_re = qr/(?<!\d)\Q$protein_id\E(?!\d)/;

    while (my $line = <$in_fh>) {
        print {$out_fh} $line if $line =~ $id_re;
    }
}

close($in_fh);

# Buffered write errors only surface at close, so check it.
close($out_fh)
    or die "Cannot close \"$outputfile\": $!\n";