#!/usr/bin/perl
# Check fasta file format, report cases that don't meet expectations.

use strict;
use warnings;

=head1 NAME

test-format.pl - check FASTA-style records and report unexpected formatting

=head1 SYNOPSIS

    perl test-format.pl *.fasta > test.out

=head1 DESCRIPTION

Input (the files named on the command line, or STDIN) is read in paragraph
mode, so a record is a block of lines separated from the next record by one
or more blank lines.  The first line of a record is used as its identifier;
the remaining lines are its sequence data.

A warning is written to STDERR when

=over 4

=item * a record's first line does not start with ">",

=item * a data line contains anything other than upper-case letters and "-"
(surrounding whitespace is tolerated),

=item * an identifier that was already seen turns up again with different data.

=back

After all input is read, the number of distinct identifiers is reported on
STDERR and every record whose identifier starts with ">GTM1_" is printed to
STDOUT, each followed by a blank line.

=head1 SAMPLE INPUT

The line breaks and blank lines below are reconstructed (the original sample
was stored on a single line) - confirm against the real data files.

    >GTM1_RAT GLUTATHIONE S-TRANSFERASE YB1 (EC 2.5.1.18) ( (217 aa)
    WFAGDKVTYVDFLAYDILDQYHIFEPKCLDAFPNLKDFLARFEGLKKISAYMKSSRYLST
    PIFSKLAQWSNK

    >GTMU_CRILO GLUTATHIONE S-TRANSFERASE Y1 (EC 2.5.1.18) (217 aa)
    FAGDKVTLCGFLAYDVLDQYQMFEPKCLDPFPNLKDFLARFEGLKKISAYMKTSRFLRRP
    IFSKMAQWSNK

    >GTM1_HUMAN GLUTATHIONE S-TRANSFERASE MU 1 (EC 2.5.1.18 (217 aa)
    LPEKLKLYSEFLGKRPWFAGNKITFVDFLVYDVLDLHRIFEPKCLDAFPNLKDFISRFEG
    LEKISAYMKSSRFLPRPVFSKMAVWGNK

    >GLNA_ANASP GLUTAMINE SYNTHETASE (EC 6.3.1.2) (GLUTAMAT (473 aa)
    SLELALEALENDHAFLTDTGVFTEDFIQNWIDYKLANEVKQMQLRPH-PYEFSIYYDV

    >GTM4_HUMAN GLUTATHIONE S-TRANSFERASE MU 4 (EC 2.5.1.18 (218 aa)
    LPTMMQHFSQFLGKRPWFVGDKITFVDFLAYDVLDLHRIFEPNCLDAFPNLKDFISRFEG
    LEKISAYMKSSRFLPKPLYTRVAVWGNK

=cut

# Run only when executed as a script; loading the file (require/do) just
# defines the subs so they can be exercised on their own.
main() unless caller;

# Read every record from <>, check it, then report and print the selection.
sub main {
    my %records;    # identifier line => sequence data (one "\n" per line)

    # Special Perl rule: an empty-string $/ means "treat each run of one or
    # more blank lines as a record separator".  Localized so the change does
    # not leak out of this routine.
    local $/ = '';

    while ( my $record = <> ) {
        # In paragraph mode $. counts records, not lines.
        check_record( $record, $., \%records );
    }

    # Note: this is the number of distinct identifiers, so repeated
    # identifiers are counted once.
    printf STDERR "%d records checked\n", scalar keys %records;

    # Just for fun, output some data.  Sorted so the output order does not
    # depend on hash ordering.
    for my $chosen ( sort grep { /^>GTM1_/ } keys %records ) {
        # The stored data already ends in "\n", so this join leaves one
        # blank line after each record.
        print join "\n", $chosen, $records{$chosen}, '';
    }

    return;
}

# Check one record and remember it.
#
#   $record        - text of the record (identifier line + data lines)
#   $record_number - number used in warning messages
#   $records       - hash ref, identifier => data; updated in place
#
# Problems are reported with warn(); nothing useful is returned.
sub check_record {
    my ( $record, $record_number, $records ) = @_;

    my @lines = split /[\r\n]+/, $record;    # split on line-breaks
    my $key   = shift @lines;                # first line should be the identifier

    if ( !defined $key ) {
        # Nothing but line-breaks (or nothing at all): no identifier to store.
        warn "record $record_number is empty\n";
        return;
    }

    if ( $key !~ /^>/ ) {
        warn "record $record_number does not start with '>':\n$record\n";
    }

    for my $line (@lines) {
        # The pattern used to contain a literal space right after the anchor,
        # which flagged every data line that did not begin with a space.
        # Leading whitespace is optional now, so previously accepted lines
        # are still accepted.
        next if $line =~ m/\A \s* [-A-Z]+ \s* \z/x;
        warn "record $record_number contains odd data:\n$record\n";
    }

    # Remember to put the linefeeds back.
    my $data = join '', map { "$_\n" } @lines;

    if ( exists $records->{$key} ) {    # have we seen this identifier before?
        warn "key $key found on different data:\n$data\n\n$records->{$key}\n\n"
            if $records->{$key} ne $data;
    }
    else {                              # first time we see this identifier
        $records->{$key} = $data;
    }

    return;
}