#!/usr/bin/perl
use warnings;
use strict;

# Remove duplicate gene entries from a protein sequence file.
#
# Input:  a file of records, each introduced by a header line that starts
#         with '>' (FASTA-style) and may carry a ' GN=<gene name>' field.
#         Every line up to the next '>' line belongs to the same record.
# Output: 'duplicate_gene_entries_in_<input filename>', containing only the
#         first record seen for each gene name.  Records whose header has no
#         GN= field cannot be compared, so they are always kept.
#         Any lines that precede the first header are dropped.
#
# The prefix is prepended to the filename exactly as typed, so the input is
# expected to be a bare filename in the current directory.

print 'Enter protein sequence filename: ';
my $prot_filename = <STDIN>;
die "No filename given\n" unless defined $prot_filename;    # EOF on STDIN
chomp $prot_filename;

open my $PROTFILE, '<', $prot_filename
    or die "Cannot open '$prot_filename' because: $!";

my $out_filename = 'duplicate_gene_entries_in_' . $prot_filename;
open my $OUTFILE, '>', $out_filename
    or die "Cannot open '$out_filename' because: $!";

my %seen;             # gene name => number of records seen with that name
my $count_in  = 0;    # records (header lines) read
my $count_out = 0;    # records copied to the output file
my $keep      = 0;    # true while the current record is being copied

# Read line by line rather than in paragraph mode: records are delimited by
# their '>' header, so this works whether or not blank lines separate them,
# and the global $/ is left untouched.
while ( my $line = <$PROTFILE> ) {
    if ( $line =~ /^>/ ) {
        $count_in++;

        # \S+ (not [^ ]+) so a GN= field at the very end of the header
        # does not swallow the line terminator into the gene name.
        if ( $line =~ /^>.* GN=(\S+)/ ) {
            $keep = !$seen{$1}++;    # keep only the first record per gene
        }
        else {
            $keep = 1;               # no gene name: nothing to deduplicate on
        }

        $count_out++ if $keep;
    }

    print {$OUTFILE} $line if $keep;
}

# Buffered write errors only surface at close, so check it.
close $OUTFILE or die "Cannot close '$out_filename' because: $!";
close $PROTFILE;

printf "%d total records read from '%s'\n", $count_in, $prot_filename;
printf "%d records written to '%s' after removing duplicate entries\n",
    $count_out, $out_filename;