>GTM1_RAT GLUTATHIONE S-TRANSFERASE YB1 (EC 2.5.1.18) ( (217 aa)
WFAGDKVTYVDFLAYDILDQYHIFEPKCLDAFPNLKDFLARFEGLKKISAYMKSSRYLST
PIFSKLAQWSNK
>GTMU_CRILO GLUTATHIONE S-TRANSFERASE Y1 (EC 2.5.1.18) (217 aa)
FAGDKVTLCGFLAYDVLDQYQMFEPKCLDPFPNLKDFLARFEGLKKISAYMKTSRFLRRP
IFSKMAQWSNK
>GTM1_HUMAN GLUTATHIONE S-TRANSFERASE MU 1 (EC 2.5.1.18 (217 aa)
LPEKLKLYSEFLGKRPWFAGNKITFVDFLVYDVLDLHRIFEPKCLDAFPNLKDFISRFEG
LEKISAYMKSSRFLPRPVFSKMAVWGNK
>GLNA_ANASP GLUTAMINE SYNTHETASE (EC 6.3.1.2) (GLUTAMAT (473 aa)
SLELALEALENDHAFLTDTGVFTEDFIQNWIDYKLANEVKQMQLRPH-PYEFSIYYDV
>GTM4_HUMAN GLUTATHIONE S-TRANSFERASE MU 4 (EC 2.5.1.18 (218 aa)
LPTMMQHFSQFLGKRPWFVGDKITFVDFLAYDVLDLHRIFEPNCLDAFPNLKDFISRFEG
LEKISAYMKSSRFLPKPLYTRVAVWGNK
####
#!/usr/bin/perl
# Check fasta file format, report cases that don't meet expectations
use strict;
use warnings;
# Read FASTA records from the files named on the command line (or STDIN),
# warn on STDERR about anything unexpected, and finally echo the GTM1_*
# records to STDOUT.
#
# Checks performed on every record:
#   * the first line must be a header starting with '>'
#   * every following line must consist only of the characters A-Z and '-'
#     (surrounding whitespace is tolerated)
#   * a header seen more than once must always carry the same sequence data
#
# NOTE: records are expected to be separated by blank lines.  Input without
# blank lines arrives as one single record, in which case every header after
# the first is reported as odd data.

my %records;        # header line => sequence lines, each terminated by "\n"
my $checked = 0;    # number of records read, duplicates included

{
    # Paragraph mode: any run of one or more blank lines separates records.
    # Localised so the setting cannot leak into code outside this block.
    local $/ = '';

    RECORD:
    while ( my $record = <> )
    {
        # split on line-breaks; a record that begins with a line-break would
        # otherwise yield an empty leading field
        my @lines = grep { length } split /[\r\n]+/, $record;
        next RECORD unless @lines;    # nothing but line-breaks

        $checked++;

        # $. is reset per input file (see the continue block), so name the
        # file as well to make the position unambiguous
        my $where = "record $. of $ARGV";

        my $key = shift @lines;    # first line should be the identifier
        if ( $key !~ /^>/ )
        {
            warn "$where does not start with '>':\n$record\n";
        }

        my $data = '';
        for my $line ( @lines )
        {
            # No /x modifier here, so whitespace inside the pattern would be
            # matched literally; optional leading blanks are spelled \s*.
            warn "$where contains odd data:\n$line\n"
                unless $line =~ /\A\s*[-A-Z]+\s*\z/;
            $data .= $line . "\n";    # (remember to put linefeeds back)
        }

        if ( exists $records{$key} )    # have we seen this identifier before?
        {
            warn "key $key found on different data:\n$data\n\n$records{$key}\n\n"
                if $records{$key} ne $data;
        }
        else    # we haven't seen this value of $key before
        {
            $records{$key} = $data;
        }
    }
    continue
    {
        # <> never closes its files explicitly, so without this $. would keep
        # counting across all input files
        close ARGV if eof;
    }
}

# Report the number of records actually read; counting the keys of %records
# would silently drop records whose header appeared more than once.
printf STDERR "%d records checked\n", $checked;

# just for fun, let's try to output some data:
# (sorted, so that the output does not depend on hash ordering; the trailing
# '' in the join leaves a blank line after each record)
my @chosen = grep { /^>GTM1_/ } keys %records;
for my $chosen ( sort @chosen )
{
    print join "\n", $chosen, $records{$chosen}, '';
}
####
# Run the checker on every .fasta file in the current directory.  Only STDOUT
# (the records the script prints) is redirected into test.out; the warnings and
# the record count are written to STDERR and so still appear on the terminal.
# NOTE(review): assumes the Perl script above was saved as test-format.pl.
perl test-format.pl *.fasta > test.out