comment on

A practical example of nested data structures may help.

use strict;
use warnings;
use Data::Dumper;

#
# The keys of %entries are the Descriptions and Gene Names from all th
+e entries.
# The values are references to anonymous arrays, with each entry in th
+e array
# being a hash reference returned by read_entry(). If there are more t
+han one
# elements in the array, then there are duplicate uses of the Descript
+ion or
# Gene Name.
#
my %entries;

# Populate the hash of entries
while ( my $entry = read_entry() ) {

    foreach ( @{ $entry->{DE} }, @{ $entry->{GN} } ) {
        push( @{ $entries{$_} }, $entry);
    }

    if( @{ $entry->{DE} } == 0 and @{ $entry->{GN} } == 0 ) {
        print "No names for: " . Dumper($entry) . "\n";
    }
}


#
# Report all entries for each Description or Gene Name, noting those w
+ith
# duplicate entries associated.
#
foreach ( sort keys %entries ) {
    print "-------------------------\n";
    print "Duplicate " if ( @{ $entries{$_} } > 1 );
    print "Description or Gene Name: $_\n";

    foreach ( @{ $entries{$_} } ) {
        local $" = ', ';

        print <<EOF;
    ID: $_->{ID}
      Accession Numbers: @{ $_->{AC} }
      Descriptions:      @{ $_->{DE} }
      Gene Names:        @{ $_->{GN} }
EOF

    }
    print "\n";
}



exit(0);






#
# read_entry() returns a hash reference representing the next entry in
+ the
# file, or undef at end of file.
#
# Each entry has four keys:
#
#   ID  The IDentifier of the entry, as a string
#
#   AC  The ACcession numbers of the entry, as an anonymous array refe
+rence
#       with each element of the array being one accession number
#
#   DE  The DEscriptions of the entry, as an anonymous array reference
#       with each element of the array being one description
#
#   GN  The Gene Names of the entry, as an anonymous array reference
#       with each element of the array being one gene name
#
sub read_entry {

    my $entry = {
        ID  =>  'This entry had no ID',
        AC  =>  [],
        DE  =>  [],
        GN  =>  [],
    };

    my $line = <>;

    $line = <> while( defined( $line) and $line !~ /^ID\s/ );

    return(undef) unless(defined($line));

    while(defined($line)) {

        if($line =~ m/^\/\//) {

            last;

        } elsif($line =~ m/^ID/) {

            if ($line =~ m/^ID\s+(\S+)/) {
                $entry->{ID} = $1;
            } else {
                error("malformed ID line: $line");
            }

        } elsif ($line =~ m/^AC\s+(.*)/) {

            my $accession_numbers;
            do {
                $accession_numbers .= $1;
            } while ( ($line = <>) =~ m/^AC\s+(.*)/ );

            $entry->{AC} = [ $accession_numbers =~ m/([^;]+);/g ];

            next;

        } elsif ($line =~ m/^DE\s+(.*)/) {

            my $description;
            do {
                $description .= $1;
            } while ( ($line = <>) =~ m/^DE\s+(.*)/ );

            $entry->{DE} = [ map { lc } $description =~ m/=([^;]+);/g 
+];

            next;


        } elsif ($line =~ m/^GN\s+(.*)/) {

            my $gene_names;
            do {
                $gene_names .= $1;
            } while ( ($line = <>) =~ m/^DE\s+(.*)/ );

            $entry->{GN} = [ map { lc } $gene_names =~ m/=([^;]+);/g ]
+;

            next;

        }

        $line = <>;

    }


    return($entry);
}
[download]

In reply to Re^3: how to parse a UniProt Flat file by ig
in thread how to parse a UniProt Flat file by stanleysj

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.