=head1 MySAXHandler

MySAXHandler is a small package describing what to do with the events
generated by the SAX parser. Every event is handled by one of the methods
documented below. MySAXHandler is instantiated in the parser declaration.

The hash C<%entryArgs> accumulates the data of the current entry until the
C<end_element> event for C<entry> arrives; the data is then written out.
Its keys are:

=over 3

=item 'Accession'

A nested hash, because an entry can carry more than one accession number.
Each accession number is a key of that hash; the value is only an occurrence
count and is not used.

=item 'Organism'

Stores the organism name.

=back

Other variables used in this package:

=over 3

=item $currentElement

Name of the current element.

=item $currentAttributes

Hash reference to the current element's attributes.

=item $organismAttrs

Hash reference to the attributes of the most recently opened C<name> element.

=item $entryCounter

Counts the C<entry> start tags seen.

=item $entryDone

Second entry counter kept from the original design; no handler in this
package updates it.

=item $accessionNumber

Text of the accession number currently being read.

=item $dbReferenceOn

Set to 1 while the parser is inside a C<dbReference> block.

=item $commentOn

As above, for a C<comment> block.

=item $organismOn

As above, for an C<organism> block.

=back

=head1 Methods

=cut

package MySAXHandler;

use strict;
use warnings;
use base qw(XML::SAX::Base);

# Unbuffered output, so progress shows up while a large file is parsed.
$| = 1;

# Data collected for the entry currently being parsed.
my %entryArgs;

# Parser state shared by all the handlers of this package.
my $currentElement = '';
my $currentAttributes;
my $organismAttrs;    # used by start_element() and characters()
my $entryCounter = 0;
my $entryDone    = 0;
my $accessionNumber;

# Flags telling which interesting block we are currently inside.
my ( $dbReferenceOn, $commentOn, $organismOn ) = ( 0, 0, 0 );

=head2 start_document

Called once when parsing starts. Prints a start banner.

=cut

sub start_document {
    my ( $self, $doc ) = @_;

    print "Here we go ..............\n\n";
}

=head2 start_element

Event 'start_element' calls this method. Depending on the tag name it does
various things. The tag name is stored in $currentElement and its attributes
in $currentAttributes.

=cut

sub start_element {
    my ( $self, $el ) = @_;
    my $name = $el->{Name};

    # A new entry starts: drop everything collected for the previous one.
    if ( $name eq 'entry' ) {
        %entryArgs = ();
        $entryCounter++;
    }

    # Remember the current element name and attributes for characters().
    $currentElement    = $name;
    $currentAttributes = $el->{Attributes};

    # Raise the flag of the interesting block we are entering.
    $dbReferenceOn = 1 if $name eq 'dbReference';
    $commentOn     = 1 if $name eq 'comment';
    $organismOn    = 1 if $name eq 'organism';

    # characters() needs the attributes of the enclosing <name> element to
    # tell which kind of name it is reading.
    $organismAttrs = $currentAttributes if $name eq 'name';
}

=head2 end_element

When the closing tag is 'entry', prints the accession number(s) and the
organism name collected for that entry. When it is 'accession', records the
accession number that was just read. Sets the block flags back to 0 and
forgets the current element name.

=cut

sub end_element {
    my ( $self, $el ) = @_;
    my $name = $el->{Name};

    # End of an entry: print out what was collected.
    if ( $name eq 'entry' ) {
        print "Accession number(s) :\n";
        foreach my $key ( sort keys %{ $entryArgs{'Accession'} || {} } ) {
            print "\t=>$key\n";
        }

        # An entry without a matching organism name prints an empty value
        # rather than raising an "uninitialized value" warning.
        my $organism =
          defined $entryArgs{'Organism'} ? $entryArgs{'Organism'} : '';
        print "\n\tOrganism Name = " . $organism . "\n";
    }

    # Store the accession number; an empty <accession/> is ignored.
    if ( $name eq 'accession' ) {
        $entryArgs{'Accession'}{$accessionNumber}++
          if defined $accessionNumber and length $accessionNumber;
        undef $accessionNumber;
    }

    # Set the flags back.
    $dbReferenceOn = 0 if $name eq 'dbReference';
    $commentOn     = 0 if $name eq 'comment';
    $organismOn    = 0 if $name eq 'organism';

    # The element is closed: text arriving before the next start tag (for
    # instance the newline and indentation between two sibling elements)
    # must not be credited to it by characters().
    $currentElement = '';
}

=head2 characters

Store data into the entryArgs hash.

=cut

sub characters {
    my ( $self, $char ) = @_;
    my $text    = $char->{Data};
    my $element = defined $currentElement ? $currentElement : '';

    # Accession number: the text may arrive in several pieces, so append it
    # and strip any whitespace.
    if ( $element eq 'accession' ) {
        $accessionNumber .= $text;
        $accessionNumber =~ s/\s+//g;
    }

    # Organism name: only keep the <name> whose 'type' attribute is
    # 'scientific'. The attribute is looked up defensively so that a <name>
    # without a 'type' attribute is simply skipped.
    if ( $organismOn == 1 and $element eq 'name' ) {
        my $type = $organismAttrs ? $organismAttrs->{'{}type'} : undef;
        if (    $type
            and defined $type->{Value}
            and $type->{Value} eq 'scientific' )
        {
            $entryArgs{'Organism'} .= $text;
        }
    }
}

=head2 end_document

Called once when parsing ends. Prints a closing banner.

=cut

sub end_document {
    my ( $self, $doc ) = @_;

    print "\n\n ...................... et voilą ;-)\n";
}

# A module must return a true value when it is loaded.
1;

# NOTE(review): the '####' line below appears to separate the module above
# from the script that uses it (the script loads MySAXHandler with 'use') --
# confirm they are meant to live in two files.
####
#!/usr/bin/perl -w
# POD documentation - main docs before the code

=head1 NAME

parseUNIPROT4MySQL

=head1 DESCRIPTION

This script parses the XML file containing the UniProt knowledgebase, fetches
information and inserts it into a MySQL database. (The database statements
are currently commented out; the script only prints what it parses.)

=head1 CONTACT

TEXTORIS Julien

=head1 USAGE

./parseUNIPROT4MySQL.pl <filename> [function]

function :

 - 'coordinate'

The function argument is read but not used by the code below.

=cut

use strict;
use warnings;
use DBI;
use XML::SAX;
use lib './';
use MySAXHandler;

$| = 1;

=head1 Variables Declaration

=over 3

=item $filename : XML file to parse

=item $function : implemented function to use

=item $dbd : MySQL connection

=item $dbname : database name

=item $host : computer which hosts the database

=item $user / $pass : user and password used to connect to the database

=item $insert_ensembl : inserts EnsEMBL_ID into the ensEMBL_ID table if it does not exist

=item $insert_uniprot : inserts Uniprot_ID into the Uniprot_ID table if it does not exist

=item $insert_relEnsemblUniprot : inserts values into the relational table (n:m relation)

=item $parser : new XML parser object

=item $doc : XML object loaded from $filename; the strip-whitespace option makes parsing 30% faster

=item $rootNode : XML root tag

=item $AccNumber : stores the Uniprot accession number of the entry being processed

=item $type : type argument of a tag

=item $value : value of a tag of a given comment tag

=back

Note: $dbd and the $insert_* statements are commented out below, and $doc,
$rootNode, $AccNumber, $type and $value are not declared in this script.

=cut

my $filename = $ARGV[0];
my $function = $ARGV[1];

# Fail early with a usage message instead of handing an undefined file name
# to the parser.
die "Usage: $0 <filename> [function]\n" unless defined $filename;

# NOTE(review): the database credentials are hard-coded in the source; they
# are only needed by the commented-out DBI code below.
my $dbname = "AnnotationDB";
my $host   = "localhost";
my $user   = "jtextori";
my $pass   = "marcel";

#my $dbd = DBI->connect( "DBI:mysql:$dbname;host=$host",
#                        $user, $pass, { RaiseError => 1 } )
#  or die
#  "can't connect : \nerror1 ==> $DBI::errstr\nerror1 ==> $@\nerror1 ==> $!\n\n";
#
#my $insert_ensembl = $dbd->prepare('INSERT INTO ensEMBL_ID (`ensEMBL_ID`) VALUES (?)');
#my $insert_uniprot = $dbd->prepare('INSERT INTO Uniprot_ID (`Uniprot_ID`) VALUES (?)');
#my $insert_relEnsemblUniprot = $dbd->prepare('INSERT INTO ensEMBL_ID_has_Uniprot_ID (`ensEMBL_ID_ensEMBL_ID`,`Uniprot_ID_Uniprot_ID`) VALUES(?,?)');

# Every SAX event of the document is dispatched to a MySAXHandler instance.
my $parser = XML::SAX::ParserFactory->parser( Handler => MySAXHandler->new );
$parser->parse_uri($filename);

exit(0);