A practical example of nested data structures may help.
use strict; use warnings; use Data::Dumper; # # The keys of %entries are the Descriptions and Gene Names from all th +e entries. # The values are references to anonymous arrays, with each entry in th +e array # being a hash reference returned by read_entry(). If there are more t +han one # elements in the array, then there are duplicate uses of the Descript +ion or # Gene Name. # my %entries; # Populate the hash of entries while ( my $entry = read_entry() ) { foreach ( @{ $entry->{DE} }, @{ $entry->{GN} } ) { push( @{ $entries{$_} }, $entry); } if( @{ $entry->{DE} } == 0 and @{ $entry->{GN} } == 0 ) { print "No names for: " . Dumper($entry) . "\n"; } } # # Report all entries for each Description or Gene Name, noting those w +ith # duplicate entries associated. # foreach ( sort keys %entries ) { print "-------------------------\n"; print "Duplicate " if ( @{ $entries{$_} } > 1 ); print "Description or Gene Name: $_\n"; foreach ( @{ $entries{$_} } ) { local $" = ', '; print <<EOF; ID: $_->{ID} Accession Numbers: @{ $_->{AC} } Descriptions: @{ $_->{DE} } Gene Names: @{ $_->{GN} } EOF } print "\n"; } exit(0); # # read_entry() returns a hash reference representing the next entry in + the # file, or undef at end of file. # # Each entry has four keys: # # ID The IDentifier of the entry, as a string # # AC The ACcession numbers of the entry, as an anonymous array refe +rence # with each element of the array being one accession number # # DE The DEscriptions of the entry, as an anonymous array reference # with each element of the array being one description # # GN The Gene Names of the entry, as an anonymous array reference # with each element of the array being one gene name # sub read_entry { my $entry = { ID => 'This entry had no ID', AC => [], DE => [], GN => [], }; my $line = <>; $line = <> while( defined( $line) and $line !~ /^ID\s/ ); return(undef) unless(defined($line)); while(defined($line)) { if($line =~ m/^\/\//) { last; } elsif($line =~ m/^ID/) { if ($line =~ m/^ID\s+(\S+)/) { $entry->{ID} = $1; } else { error("malformed ID line: $line"); } } elsif ($line =~ m/^AC\s+(.*)/) { my $accession_numbers; do { $accession_numbers .= $1; } while ( ($line = <>) =~ m/^AC\s+(.*)/ ); $entry->{AC} = [ $accession_numbers =~ m/([^;]+);/g ]; next; } elsif ($line =~ m/^DE\s+(.*)/) { my $description; do { $description .= $1; } while ( ($line = <>) =~ m/^DE\s+(.*)/ ); $entry->{DE} = [ map { lc } $description =~ m/=([^;]+);/g +]; next; } elsif ($line =~ m/^GN\s+(.*)/) { my $gene_names; do { $gene_names .= $1; } while ( ($line = <>) =~ m/^DE\s+(.*)/ ); $entry->{GN} = [ map { lc } $gene_names =~ m/=([^;]+);/g ] +; next; } $line = <>; } return($entry); }
In reply to Re^3: how to parse a UniProt Flat file
by ig
in thread how to parse a UniProt Flat file
by stanleysj
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |