use strict;
use warnings;
use Data::Dumper;
#
# The keys of %entries are the Descriptions and Gene Names from all th
+e entries.
# The values are references to anonymous arrays, with each entry in th
+e array
# being a hash reference returned by read_entry(). If there are more t
+han one
# elements in the array, then there are duplicate uses of the Descript
+ion or
# Gene Name.
#
my %entries;
# Populate the hash of entries
while ( my $entry = read_entry() ) {
foreach ( @{ $entry->{DE} }, @{ $entry->{GN} } ) {
push( @{ $entries{$_} }, $entry);
}
if( @{ $entry->{DE} } == 0 and @{ $entry->{GN} } == 0 ) {
print "No names for: " . Dumper($entry) . "\n";
}
}
#
# Report all entries for each Description or Gene Name, noting those w
+ith
# duplicate entries associated.
#
foreach ( sort keys %entries ) {
print "-------------------------\n";
print "Duplicate " if ( @{ $entries{$_} } > 1 );
print "Description or Gene Name: $_\n";
foreach ( @{ $entries{$_} } ) {
local $" = ', ';
print <<EOF;
ID: $_->{ID}
Accession Numbers: @{ $_->{AC} }
Descriptions: @{ $_->{DE} }
Gene Names: @{ $_->{GN} }
EOF
}
print "\n";
}
exit(0);
#
# read_entry() returns a hash reference representing the next entry in
+ the
# file, or undef at end of file.
#
# Each entry has four keys:
#
# ID The IDentifier of the entry, as a string
#
# AC The ACcession numbers of the entry, as an anonymous array refe
+rence
# with each element of the array being one accession number
#
# DE The DEscriptions of the entry, as an anonymous array reference
# with each element of the array being one description
#
# GN The Gene Names of the entry, as an anonymous array reference
# with each element of the array being one gene name
#
sub read_entry {
my $entry = {
ID => 'This entry had no ID',
AC => [],
DE => [],
GN => [],
};
my $line = <>;
$line = <> while( defined( $line) and $line !~ /^ID\s/ );
return(undef) unless(defined($line));
while(defined($line)) {
if($line =~ m/^\/\//) {
last;
} elsif($line =~ m/^ID/) {
if ($line =~ m/^ID\s+(\S+)/) {
$entry->{ID} = $1;
} else {
error("malformed ID line: $line");
}
} elsif ($line =~ m/^AC\s+(.*)/) {
my $accession_numbers;
do {
$accession_numbers .= $1;
} while ( ($line = <>) =~ m/^AC\s+(.*)/ );
$entry->{AC} = [ $accession_numbers =~ m/([^;]+);/g ];
next;
} elsif ($line =~ m/^DE\s+(.*)/) {
my $description;
do {
$description .= $1;
} while ( ($line = <>) =~ m/^DE\s+(.*)/ );
$entry->{DE} = [ map { lc } $description =~ m/=([^;]+);/g
+];
next;
} elsif ($line =~ m/^GN\s+(.*)/) {
my $gene_names;
do {
$gene_names .= $1;
} while ( ($line = <>) =~ m/^DE\s+(.*)/ );
$entry->{GN} = [ map { lc } $gene_names =~ m/=([^;]+);/g ]
+;
next;
}
$line = <>;
}
return($entry);
}
|