#!/usr/bin/perl
# cleanup.pl
use strict;
use warnings;
use File::Basename qw(fileparse);

# Clean up a protein sequence file.
#
# Prompts for an input filename, reads it one blank-line-separated paragraph
# at a time, and writes the surviving records to a file named after the input
# with a 'cleaned_' prefix.  A record is dropped when its '>' header line
# matches "(Fragments)" (case-insensitively).  A record whose remaining text
# (header line removed, '#' comment lines removed, trailing whitespace
# trimmed) has already been seen is not written again.  Finally the number of
# paragraphs read and records written is reported on STDOUT.
#
# Dies if no filename is supplied, if either file cannot be opened, or if the
# output file cannot be closed cleanly.

print 'PLEASE ENTER THE FILENAME OF THE PROTEIN SEQUENCE: ';
my $prot_filename = <STDIN>;

# EOF on STDIN yields undef; fail clearly instead of warning in chomp/open.
defined $prot_filename
    or die "No filename was entered\n";
chomp $prot_filename;

open my $PROTFILE, '<', $prot_filename
    or die "Cannot open '$prot_filename' because: $!";

# Prefix only the last path component so that an input such as
# 'dir/seq.fa' becomes 'dir/cleaned_seq.fa' rather than 'cleaned_dir/seq.fa'.
# For a bare filename the result is exactly 'cleaned_' . $prot_filename.
my ($basename)   = fileparse($prot_filename);
my $dir_prefix   = substr $prot_filename, 0,
    length($prot_filename) - length($basename);
my $out_filename = $dir_prefix . 'cleaned_' . $basename;

open my $OUTFILE, '>', $out_filename
    or die "Cannot open '$out_filename' because: $!";

my %fasta_seen;        # cleaned record text => number of times seen so far
my $header    = '';    # most recent header; carried over to header-less paragraphs
my $count_in  = 0;     # paragraphs read
my $count_out = 0;     # records written

{
    # Paragraph mode, confined to this block so the global input record
    # separator is restored afterwards.
    local $/ = '';

    while ( my $record = <$PROTFILE> ) {
        ++$count_in;

        # Strip the header line's text out of the record.  The capture does
        # not include the leading '>', and the newline that followed the
        # header stays in $record, so the output is "header\nsequence".
        # NOTE(review): the '>' is therefore absent from the output file --
        # confirm that this is intended.
        if ( $record =~ s/^>(.*)//m ) {
            $header = $1;

            # skip fragments
            next if $header =~ /\(Fragments\)/i;
        }

        # Remove comment line(s)
        $record =~ s/^\s*#.*//mg;

        # trim trailing whitespace
        $record =~ s/\s+$//;

        if ( $fasta_seen{$record}++ ) {
            # Duplicate: the record itself is not repeated, but a blank line
            # is still written in its place.
            # NOTE(review): confirm the placeholder blank line is wanted.
            print {$OUTFILE} "\n";
        }
        else {
            print {$OUTFILE} $header . $record . "\n\n";
            ++$count_out;
        }
    }
}

# Buffered write errors only surface at close, so check it.
close $OUTFILE
    or die "Cannot close '$out_filename' because: $!";
close $PROTFILE;

printf "%d records read from %s\n",  $count_in,  $prot_filename;
printf "%d records written to %s\n", $count_out, $out_filename;
I'm sure an acknowledgement of perlmonks.org would be appreciated by the community here.
poj
|