in reply to Re: How to Parse Huge XML Files ?
in thread How to Parse Huge XML Files ?
=head1 MySAXHandler

MySAXHandler is a small package describing what to do with the events
generated by the SAX parser. Every event is handled by one of the methods
below. MySAXHandler is instantiated in the parser declaration.

The variable C<%entryArgs> accumulates data until the closing C<entry> tag
is seen, at which point the data is written out. Its keys are:

=over 3

=item 'Accession'

A hash reference, because an entry can carry more than one accession
number. Each accession number is a key; the value is the number of times
that accession was seen inside the current entry.

=item 'Organism'

The organism name (only text of C<name> elements whose C<type> attribute
is C<scientific>).

=back

Other file-scoped variables used in this package:

=over 3

=item $currentElement : name of the element currently open ('' between elements)

=item $currentAttributes : hash reference to the attributes of the last opened element

=item $organismAttrs : hash reference to the attributes of the last opened C<name> element

=item $accessionNumber : text collected so far for the current C<accession> element

=item $entryCounter : counts opening C<entry> tags seen

=item $entryDone : counts closing C<entry> tags seen

=item $dbReferenceOn : set to 1 while inside a C<dbReference> block

=item $commentOn : as above, for C<comment> blocks

=item $organismOn : as above, for C<organism> blocks

=back

Note: this state is shared by every MySAXHandler instance in the process.

=head1 Methods

=cut

package MySAXHandler;

use strict;
use warnings;

use base qw(XML::SAX::Base);

# Unbuffered STDOUT so progress shows up while a huge file is being parsed.
$| = 1;

my %entryArgs;
my $currentElement = '';
my ( $currentAttributes, $organismAttrs, $accessionNumber );
my ( $entryCounter,  $entryDone )             = ( 0, 0 );
my ( $dbReferenceOn, $commentOn, $organismOn ) = ( 0, 0, 0 );

=head2 start_document

Called once when parsing starts. Only prints a banner for now.

=cut

sub start_document {
    my ( $self, $doc ) = @_;

    print "Here we go ..............\n\n";
    return;
}

=head2 start_element

Called for every opening tag. Resets the per-entry storage when a new
C<entry> begins, records the tag name in C<$currentElement> and its
attributes in C<$currentAttributes>, and raises the flag of the block
being entered.

=cut

sub start_element {
    my ( $self, $el ) = @_;
    my $name = $el->{Name};

    # A new entry starts: forget everything collected for the previous one.
    if ( $name eq 'entry' ) {
        %entryArgs = ();
        $entryCounter++;
    }

    $currentElement    = $name;
    $currentAttributes = $el->{Attributes};

    # Flags for the blocks we are interested in.
    $dbReferenceOn = 1 if $name eq 'dbReference';
    $commentOn     = 1 if $name eq 'comment';
    $organismOn    = 1 if $name eq 'organism';

    # Remembered so that characters() can check the name's "type" attribute.
    $organismAttrs = $currentAttributes if $name eq 'name';

    return;
}

=head2 end_element

Called for every closing tag. Prints the collected accession numbers and
organism name when an C<entry> closes, stores the accession number when an
C<accession> closes, and sets the block flags back to 0.

=cut

sub end_element {
    my ( $self, $el ) = @_;
    my $name = $el->{Name};

    # What to do now ... print out (or, later, store in the database).
    if ( $name eq 'entry' ) {
        $entryDone++;

        print "Accession number(s) :\n";
        foreach my $key ( sort keys %{ $entryArgs{'Accession'} || {} } ) {
            print "\t=>$key\n";
        }

        my $organism = $entryArgs{'Organism'};
        $organism = '' unless defined $organism;
        print "\n\tOrganism Name = " . $organism . "\n";
    }

    # Store the accession number; an empty <accession/> is ignored.
    if ( $name eq 'accession' ) {
        $entryArgs{'Accession'}{$accessionNumber}++
            if defined $accessionNumber && length $accessionNumber;
        undef $accessionNumber;
    }

    # Set the flags back.
    $dbReferenceOn = 0 if $name eq 'dbReference';
    $commentOn     = 0 if $name eq 'comment';
    $organismOn    = 0 if $name eq 'organism';

    # No element is "current" any more: without this, whitespace following
    # a closing tag would be attributed to the element that just closed.
    $currentElement = '';

    return;
}

=head2 characters

Called for every chunk of text. Text inside C<accession> is appended to
C<$accessionNumber> with all whitespace removed; text inside a C<name>
element of type C<scientific> within an C<organism> block is appended to
C<$entryArgs{'Organism'}>. A parser may deliver one text node in several
chunks, hence the appending.

=cut

sub characters {
    my ( $self, $char ) = @_;
    my $text = $char->{Data};
    return unless defined $text;

    # Store accession number.
    if ( $currentElement eq 'accession' ) {
        $accessionNumber .= $text;
        $accessionNumber =~ s/\s+//g;
    }

    # Store organism name.
    if ( $organismOn and $currentElement eq 'name' ) {

        # Look the attribute up step by step so that a <name> without a
        # "type" attribute neither warns nor adds keys to the parser's hash.
        my $type = $organismAttrs ? $organismAttrs->{'{}type'} : undef;
        if ( $type and defined $type->{Value} and $type->{Value} eq 'scientific' ) {
            $entryArgs{'Organism'} .= $text;
        }
    }

    return;
}

=head2 end_document

Called once when parsing ends. Prints a closing banner and returns a true
value.

=cut

sub end_document {
    my ( $self, $doc ) = @_;

    print "\n\n ...................... et voilą ;-)\n";

    # NOTE(review): SAX parsers are expected to hand this value back to the
    # caller of parse(); the original returned print's result - confirm.
    return 1;
}

1;
#!/usr/bin/perl -w

# POD documentation - main docs before the code

=head1 NAME

parseUNIPROT4MySQL

=head1 DESCRIPTION

This script parses the XML file containing the UniProt knowledgebase,
fetches information from it and inserts it into a MySQL database.
(The database part is currently commented out; entries are only printed
by MySAXHandler.)

=head1 CONTACT

TEXTORIS Julien , <julien.textoris@gmail.com>

=head1 USAGE

  ./parseUNIPROT4MySQL.pl <XML_filename> <function>

  function :
    - 'coordinate'
    -

=cut

use strict;
use warnings;

use DBI;
use FindBin;
use XML::SAX;

# './' is kept first for backward compatibility; the script's own directory
# is added so MySAXHandler.pm is found whatever the current directory is.
use lib './', $FindBin::Bin;
use MySAXHandler;

$| = 1;

=head1 Variables Declaration

=over 3

=item $filename : XML file to parse

=item $function : implemented function to use (read but not used yet)

=item $dbd : MySQL connection (currently commented out)

=item $dbname : DB name

=item $host : computer which hosts the database

=item $user / $pass : user and password to connect to the DB

=item $insert_ensembl : inserts EnsEMBL_ID into the ensEMBL_ID table if it doesn't exist (currently commented out)

=item $insert_uniprot : inserts Uniprot_ID into the Uniprot_ID table if it doesn't exist (currently commented out)

=item $insert_relEnsemblUniprot : inserts values into the relational table (n:m relation) (currently commented out)

=item $parser : new XML parser object

=back

The variables $doc, $rootNode, $AccNumber, $type and $value listed in
earlier versions of this documentation do not exist in this SAX-based
version of the script.

=cut

my $filename = $ARGV[0];
my $function = $ARGV[1];

# Fail early with a usage message instead of handing undef to the parser.
die "Usage: $0 <XML_filename> [function]\n"
    unless defined $filename && length $filename;

my $dbname = "AnnotationDB";
my $host   = "localhost";
my $user   = "jtextori";

# NOTE(review): plaintext credential kept as in the original; move it to a
# config file or the environment before the database code is re-enabled.
my $pass = "marcel";

#my $dbd = DBI->connect( "DBI:mysql:$dbname;host=$host",
#    $user, $pass, { RaiseError => 1 } )
#    or die
#    "can't connect : \nerror1 ==> $DBI::errstr\nerror1 ==> $@\nerror1 ==> $!\n\n";
#
#my $insert_ensembl = $dbd->prepare('INSERT INTO ensEMBL_ID (`ensEMBL_ID`) VALUES (?)');
#my $insert_uniprot = $dbd->prepare('INSERT INTO Uniprot_ID (`Uniprot_ID`) VALUES (?)');
#my $insert_relEnsemblUniprot = $dbd->prepare('INSERT INTO ensEMBL_ID_has_Uniprot_ID
#    (`ensEMBL_ID_ensEMBL_ID`,`Uniprot_ID_Uniprot_ID`) VALUES(?,?)');

# The handler prints each entry as the parser streams through the file.
my $parser = XML::SAX::ParserFactory->parser( Handler => MySAXHandler->new );
$parser->parse_uri($filename);

exit(0);
Edited by planetscape - added readmore tags
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^3: How to Parse Huge XML Files ?
by jsegal (Friar) on Jun 01, 2006 at 16:05 UTC |