in reply to Using Expat: how to extranct and manipulate elements?
OK, it's time for the XML exercise of the day.
Just so you can compare the programming styles of some of the most common XML modules, here are solutions using XML::Parser, XML::Twig, XML::PYX and XML::XPath. Choose the style you prefer: TMTOWTDI!
So first a solution using XML::Parser:
#!/bin/perl
use strict;
use warnings;

use XML::Parser;

# For every <elt> in the document, report the text of its <child>
# element, or a "missing child" message carrying the elt's id.

# One entry per <elt>. Global for simplicity; it could instead be
# attached to the parser object or passed to the handlers.
my @results;

# Parsing state shared by the three handlers below.
my $elt_id;              # id attribute of the <elt> currently being parsed
my $child_seen = 0;      # true once a <child> was found in the current <elt>
my $in_child   = 0;      # true while we are between <child> and </child>
my $child_text = '';     # text accumulated for the current <child>

my $p = XML::Parser->new(
    Handlers => {
        Start => \&start,    # called when a start tag is found
        End   => \&end,      # called when an end tag is found
        Char  => \&char,     # called when characters are found
    },
);

$p->parse(\*DATA);           # use parsefile to parse... a file

print join "\n", @results;
print "\n";

# Start-tag handler. Arguments are set by XML::Parser::Expat:
# the expat object, the element name, then attribute name/value pairs.
sub start {
    my ($expat, $elt, %atts) = @_;

    if ($elt eq 'elt') {             # we found an elt start tag
        $child_seen = 0;             # reset the flag, no child found yet
        $elt_id     = $atts{id};     # store it in case we need it
    }
    elsif ($elt eq 'child') {        # found a child start tag
        $child_seen = 1;             # we've seen a child
        $in_child   = 1;             # we are in the child
        $child_text = '';            # reset the child text
    }
    return;
}

# End-tag handler: on </elt> record the result for that element,
# on </child> stop collecting text.
sub end {
    my ($expat, $elt) = @_;

    if ($elt eq 'elt') {             # found an elt end tag
        if ($child_seen) {
            push @results, $child_text;
        }
        else {
            push @results, "missing child for elt $elt_id";
        }
    }
    elsif ($elt eq 'child') {        # found a child end tag
        $in_child = 0;               # Toto, I guess we are not in the child any more
    }
    return;
}

# Character-data handler, called for all non mark-up text.
# We append rather than assign because, per the XML::Parser docs, a single
# run of text may be delivered through several calls to this handler.
sub char {
    my ($expat, $string) = @_;

    $child_text .= $string if $in_child;
    return;
}

__DATA__
<doc>
  <dummy>I am a </dummy>
  <elt id="elt1"><child_1/><child>child 1</child></elt>
  <elt id="elt2"><child_1/></elt>
  <elt id="elt3"><child_1/><child>child 2</child></elt>
  <elt id="elt4"><child_1/></elt>
</doc>
Then the obligatory XML::Twig plug:
#!/bin/perl
use strict;
use warnings;

use XML::Twig;

# For every <elt> in the document, report the text of its <child>
# element, or a "missing child" message carrying the elt's id.

my @results;    # does not have to be global, it's just easier

# Create the twig; see the docs for why to use TwigRoots.
# check_elt is called every time an element elt has been parsed.
my $t = XML::Twig->new( TwigRoots => { elt => \&check_elt } );

$t->parse(\*DATA);    # parse the XML (use parsefile to parse... a file)

print join "\n", @results;
print "\n";

# Handler for <elt> elements.
#   $t   is the XML::Twig object
#   $elt is an XML::Twig::Elt object for the element just parsed
sub check_elt {
    my ($t, $elt) = @_;

    # that's how you navigate the element
    if (my $child = $elt->first_child('child')) {
        push @results, $child->text;    # text includes sub elements of child
    }
    else {
        push @results, "missing child for elt " . $elt->att('id');
    }

    $t->purge;    # call only if your document is huge, to free the memory
    return;
}

__DATA__
<doc>
  <dummy>I am a </dummy>
  <elt id="elt1"><child_1/><child>child 1</child></elt>
  <elt id="elt2"><child_1/></elt>
  <elt id="elt3"><child_1/><child>child 2</child></elt>
  <elt id="elt4"><child_1/></elt>
</doc>
Now the XML::PYX way. To run this one you need to have the document in a file, say doc.xml, and then run: pyx doc.xml | perl test_pyx
#!/bin/perl -n -w
use strict;

# PYX filter: reads one PYX line per iteration of the implicit -n loop and,
# for every <elt>, records the text of its <child> element or a
# "missing child" message carrying the elt's id. Results are printed once,
# from the END block, after all input has been consumed.

# Package variables (not "my") because the whole body is re-run for each
# input line and the state has to survive from one line to the next.
our @results;       # one entry per <elt>

# We need those to hold info about the parsing.
# @in_element is a stack of open elements,
# the current element is $in_element[-1].
our ( @in_element, $elt_id, $child_seen, $child_text );

if (m/^\((.*)$/) {                       # element start tag: (tag
    my $tag = $1;
    push @in_element, $tag;
    if ( $tag eq 'elt' ) {               # elt start tag
        $child_seen = 0;                 # reset the flag
    }
    elsif ( $tag eq 'child' ) {          # child start tag
        $child_seen = 1;                 # set the flag
        $child_text = '';                # reset the text
    }
}
elsif (m/^A([^\s]*) (.*)$/) {            # attribute: Aatt value
    my ( $att, $value ) = ( $1, $2 );

    # store the id for elt elements
    $elt_id = $value
        if @in_element && $in_element[-1] eq 'elt' && $att eq 'id';
}
elsif (m/^-(.*)\n/) {                    # text: -text
    my $text = $1;
    $child_text .= $text
        if @in_element && $in_element[-1] eq 'child';
}
elsif (m/^\)(.*)$/) {                    # end tag: )tag (anchored at line start)
    my $tag = $1;
    if ( $tag eq 'elt' ) {
        if ($child_seen) {
            push @results, $child_text;
        }
        else {
            push @results, "missing child for elt $elt_id";
        }
    }

    # Leave the element: without this the stack only ever grows and
    # $in_element[-1] keeps naming an element that is already closed.
    pop @in_element;
}

END {
    print join "\n", @results;
    print "\n";
}
And finally the XML::XPath version. I am not very familiar with this module so it is probably not the most elegant way to do it, but hey, it works!
#!/bin/perl
use strict;
use warnings;

use XML::XPath;

# For every /doc/elt in the document, report the text of its first <child>
# element, or a "missing child" message carrying the elt's id.

my @results;

# create the xpath object from the DATA filehandle
my $xp = XML::XPath->new( ioref => \*DATA );

# find all elt elements directly under the doc root
my $elts = $xp->find('/doc/elt');

foreach my $elt ( $elts->get_nodelist ) {

    # Select the <child> children with a relative XPath query evaluated
    # with $elt as the context node, instead of walking every child node
    # and filtering on its name by hand.
    my @children = $xp->findnodes( 'child', $elt );

    if (@children) {
        # that's how you get the text
        push @results, $children[0]->string_value;
    }
    else {
        push @results, "missing child for elt " . $elt->getAttribute('id');
    }
}

print join "\n", @results;
print "\n";

__DATA__
<doc>
  <dummy>I am a </dummy>
  <elt id="elt1"><child_1/><child>child 1</child></elt>
  <elt id="elt2"><child_1/></elt>
  <elt id="elt3"><child_1/><child>child 2</child></elt>
  <elt id="elt4"><child_1/></elt>
</doc>
|
|---|