in reply to how to parse a UniProt Flat file

If you can install Swissknife, you don't have to write your own parser for UniProt, and a program similar to the following might do what you need.

use strict;
use warnings;
use Data::Dumper;

#
# SWISS::Entry is part of Swissknife
# Available from http://swissknife.sourceforge.net/
# See: http://swissknife.sourceforge.net/docs/
#
use SWISS::Entry;

# Index of parsed entries: lower-cased key => array ref holding every
# SWISS::Entry object that was filed under that key.
my %entries;

# Change the line termination string so we read an entire entry at a time
local $/ = "\n//\n";

# Read in all the entries and fill %entries
while ( my $record = <> ) {

    # Whitespace left over after the final terminator is not an entry.
    next if $record !~ /\S/;

    my $entry = SWISS::Entry->fromText($record);

    #
    # Add this entry to %entries once for each IDentifier, DEscription
    # and Gene Name in the entry, all keys converted to lower case.
    # The hash values are pointers to anonymous arrays, so push the
    # entries onto the arrays.
    #
    # IDs are used as returned; DE and GN items are reduced to their
    # ->text.  The original single-expression form got the same result
    # by letting the first map consume the remainder of the list; the
    # intermediate arrays below just make that explicit.
    #
    # NOTE(review): the Name/Synonyms accessors are kept exactly as
    # posted -- confirm them against the installed Swissknife version.
    #
    my @gene_names = map { ( $_->Name, $_->Synonyms ) } $entry->GNs->elements;
    my @text_items = ( $entry->DEs->elements, @gene_names );
    my @keys       = ( $entry->IDs->elements, map { $_->text } @text_items );

    foreach my $key (@keys) {
        push @{ $entries{ lc $key } }, $entry;
    }
}

#
# Now report on each key in %entries
#
foreach my $key ( sort keys %entries ) {
    my @matches = @{ $entries{$key} };

    print "\n\n----------------------\n";
    print "DUPLICATE " if @matches > 1;    # key is shared by several entries
    print "key $key\n";

    foreach my $entry (@matches) {
        print "\n";

        print " IDs " . join( ", ", $entry->IDs->elements ) . "\n"
            if $entry->IDs;

        print " DEs "
            . join( ", ", map { $_->text } $entry->DEs->elements ) . "\n"
            if $entry->DEs;

        print " GNs "
            . join( ", ",
                map { $_->text }
                map { ( $_->Name, $_->Synonyms ) } $entry->GNs->elements )
            . "\n"
            if $entry->GNs;
    }
}

Replies are listed 'Best First'.
Re^2: how to parse a UniProt Flat file
by stanleysj (Novice) on Dec 30, 2008 at 10:11 UTC

    Finally, I have come up with code that does my work of parsing a UniProt file and getting the terms. I hope this code will be useful to others. Thanks to all who helped me out in this node.

    use strict;
    use warnings;

    # Print every distinct protein/gene term found in the DE and GN lines
    # of a UniProt flat file (files named on the command line, or STDIN).
    # Terms are lower-cased, numbered in order of first appearance and
    # printed one per line; a blank line is printed after each entry.

    my %seen;         # lower-cased term => number of times encountered
    my $count = 0;    # number of distinct terms printed so far

    # Return the value of every "key=value;" field on the DE and GN lines
    # of one entry, skipping values that are empty or whitespace only.
    sub extract_terms {
        my ($chunk) = @_;
        return grep { !/^\s*$/ }
               map  { /.+?=(.+?);/g }              # list context: all captures
               grep { /^(?:DE|GN).+?=.+?;/ }
               split /\n/, $chunk;
    }

    # Break one field value into individual terms:
    #   "a, b"   -> ("a", "b")
    #   "a (b)"  -> ("a", "b")    only the first parenthesised group is used
    #   "a"      -> ("a")
    # Surrounding whitespace is trimmed so that "a (b)" yields "a", not "a ",
    # and empty pieces are dropped.
    sub split_term {
        my ($term) = @_;
        my @parts;

        if ( $term =~ /,/ ) {
            @parts = split /,\s/, $term;
        }
        elsif ( $term =~ /(.+?)\((.+?)\)/ ) {
            @parts = ( $1, $2 );
        }
        else {
            @parts = ($term);
        }

        s/^\s+|\s+$//g for @parts;
        return grep { length } @parts;
    }

    # Return, in order, the lower-cased terms of one entry that have not
    # been returned by any earlier call.  Updates %seen.
    sub new_terms_in {
        my ($chunk) = @_;
        my @fresh;

        for my $term ( extract_terms($chunk) ) {

            # Placeholder names carry no information; accept both spellings.
            next if $term =~ /Putative uncharacteri[sz]ed protein/;

            # Each part is checked on its own: an already-seen name must
            # not suppress a new alias that follows it in parentheses.
            for my $part ( split_term($term) ) {
                my $key = lc $part;
                push @fresh, $key unless $seen{$key}++;
            }
        }

        return @fresh;
    }

    # Read the input one entry at a time and print the numbered new terms.
    sub main {

        # An entry ends with a line holding only "//".  Using the whole
        # line as the separator keeps a "//" inside a URL from splitting
        # an entry; local restores the previous value on return.
        local $/ = "\n//\n";

        while ( my $chunk = <> ) {
            for my $term ( new_terms_in($chunk) ) {
                $count++;
                print "$count $term\n";
            }
            print "\n";
        }

        return;
    }

    # Run only when executed as a script, not when loaded by another file.
    main() unless caller;