If I'm following you, I would fashion a data structure like this:
%protein = (
protein1 => 'ASFGTHTRHTHRHTHTRHTRHTR',
protein2 => 'ERYRYTRYHTRHTGEFEWWFEEFFFFREFRGRE',
#...
);
Then it's just a matter of looping over the file and, whenever a key already exists, using
length to keep only the longer of the two sequences.
Similar to this:
use strict;
use warnings;
use Data::Dumper;

# Emit hash keys in sorted order so the dump is reproducible between runs.
$Data::Dumper::Sortkeys = 1;

# Longest sequence seen so far for each protein, keyed by the header name
# without its leading '>' (e.g. protein1 => 'ASFG...').
my %protein;

# Name taken from the most recent header line; empty when none is pending.
my $key = '';

# Read one line at a time instead of slurping all of DATA into a list.
while ( my $line = <DATA> ) {
    chomp $line;

    # A header line names the protein whose sequence is on the next line.
    if ( $line =~ /^>(protein.*)/ ) {
        $key = $1;
        next;
    }

    # Skip blank lines and any line that is not preceded by a header.
    # Compare against '' explicitly so a line of "0" is not treated as empty.
    next if $key eq '' or $line eq '';

    # Store the sequence when it is the first one seen for this protein,
    # or when it is longer than the one already stored.
    if ( !exists $protein{$key} or length( $protein{$key} ) < length($line) ) {
        $protein{$key} = $line;
    }

    $key = '';    # Each header is paired with exactly one sequence line.
}

print Dumper \%protein;
__DATA__
>protein1
ASFGTHTRHTHRHTHTRHTRHTR
>protein2
ERYRYTRYHTRHTGEFEWWFEEFFFFREFRGRE
>protein3
AWEERERGRGRGREGRGREGRRRRRRRRTTHTHTRHRHTRHTR
>protein2
AASEFEFEFE
>protein4
REYTRHTRGRVEVCREVR
grep
| XP matters not. Look at me. Judge me by my XP, do you? |