use strict;
use warnings;
use Data::Dumper;

# Maps each sequence ID to the longest sequence seen so far for that ID.
my %sequences;

# The DATA section holds records as pairs of lines: a ">ID" header line
# followed by the sequence line that belongs to it.
while ( defined( my $id = <DATA> ) ) {    # Read the ID line.
    defined( my $sequence = <DATA> )      # Read the matching sequence line.
        or die "Odd number of lines in DATA.\n";

    chomp for ( $id, $sequence );         # Drop the trailing newlines.
    $id =~ s/^>//;                        # Strip the leading > from the ID.

    # Skip this record unless its sequence is strictly longer than the one
    # already stored; on a tie the first sequence seen is kept.
    next
        if exists $sequences{$id}
        and length( $sequences{$id} ) >= length($sequence);

    $sequences{$id} = $sequence;
}

# %sequences now contains the longest sequence for each ID. Dump it with
# sorted keys so that the output order is the same on every run.
{
    local $Data::Dumper::Sortkeys = 1;
    print Dumper \%sequences;
}

__DATA__
>protein1
ASFGTHTRHTHRHTHTRHTRHTR
>protein2
ERYRYTRYHTRHTGEFEWWFEEFFFFREFRGRE
>protein3
AWEERERGRGRGREGRGREGRRRRRRRRTTHTHTRHRHTRHTR
>protein2
AASEFEFEFE
>protein4
REYTRHTRGRVEVCREVR