=head1 NAME

XML::TokeParser::Simple - pretty much like HTML::TokeParser::Simple

=head1 DESCRIPTION

Get the benefits of L<HTML::TokeParser::Simple> for XML:
use XML::TokeParser::Simple and get the blessing ;)

Hopefully XML::TokeParser will add this in the next version.

=head1 SYNOPSIS

    # file: printComments.pl
    # desc: prints all the comments from an xml file
    use XML::TokeParser::Simple;

    my $p = XML::TokeParser::Simple->new('file.xml');

    while ( defined( my $t = $p->get_token ) ) {
        print $t->text, "\n" if $t->is_comment;
    }

See L</"What is XML::TokeParser::Token">

=cut

package XML::TokeParser::Simple;

use strict;
use warnings;

use XML::TokeParser ();

# Package globals are declared with "our"; the "vars" pragma is obsolete.
our $VERSION = '0.01';
our @ISA     = qw( XML::TokeParser );

# get_token(@args)
#
# Forwards @args to the inherited get_token and blesses the token it
# returns into XML::TokeParser::Token, so the token can be queried with
# methods instead of array indices.
# Returns the blessed token, or nothing (bare return) when the parent
# returns undef.
sub get_token {
    my $self  = shift;
    my $token = $self->SUPER::get_token(@_);

    return unless defined $token;
    return bless $token, 'XML::TokeParser::Token';
}

# get_tag(@args)
#
# Same wrapping as get_token, applied to the inherited get_tag.
# Returns the blessed token, or nothing (bare return) when the parent
# returns undef.
sub get_tag {
    my $self  = shift;
    my $token = $self->SUPER::get_tag(@_);

    return unless defined $token;
    return bless $token, 'XML::TokeParser::Token';
}

package XML::TokeParser::Token;

use strict;

=head1 What is XML::TokeParser::Token

A token is a blessed array reference, that you acquire using
C<< $p->get_token >>, and that might look like:

    ["S",  $tag,    $attr, $attrseq, $raw]
    ["E",  $tag,    $raw]
    ["T",  $text,   $raw]
    ["C",  $text,   $raw]
    ["PI", $target, $data, $raw]

If you don't like remembering array indices, you may access the
attributes of a token like:

    $p->get_token->tag, $t->attr, $t->attrseq, $t->raw ...

=cut
=head2 Methods

Tokens may be inspected using any of these is_* methods

    is_text
    is_comment
    is_pi
    is_process_instruction
    is_start_tag
    is_end_tag
    is_tag

like:

    print $t->target if $t->is_pi;
    print "The comment says ", $t->text if $t->is_comment;

C<is_start_tag>, C<is_end_tag> and C<is_tag> take an optional tag name;
when one is given the token must also carry that name.

The accessors (C<target>, C<data>, C<attr>, C<attrseq>, C<tag>, C<text>)
return a false but defined value when the token is of a type that has no
such field. C<raw> works for every token.

=cut

# All subs up to the DEMO section are methods of the token class.
package XML::TokeParser::Token;

use strict;
use warnings;

# ---- accessors for "PI" tokens: ["PI", $target, $data, $raw] ----

# Target of a processing instruction; '' for any other token.
sub target {
    my ($self) = @_;
    return $self->is_pi ? $self->[1] : '';
}

# Data of a processing instruction; '' for any other token.
sub data {
    my ($self) = @_;
    return $self->is_pi ? $self->[2] : '';
}

# Raw source text of the token: always the last array element.
sub raw {
    my ($self) = @_;
    return $self->[-1];
}

# ---- accessors for "S" tokens: ["S", $tag, $attr, $attrseq, $raw] ----

# Element [2] of a start tag (the $attr slot); 0 for any other token.
sub attr {
    my ($self) = @_;
    return $self->is_start_tag() ? $self->[2] : 0;
}

# Element [3] of a start tag (the $attrseq slot); 0 for any other token.
sub attrseq {
    my ($self) = @_;
    return $self->is_start_tag() ? $self->[3] : 0;
}

# ---- accessor for "S" and "E" tokens ----

# Tag name of a start or end tag; 0 for any other token.
sub tag {
    my ($self) = @_;
    return $self->is_tag ? $self->[1] : 0;
}

# ---- accessor for "C" and "T" tokens ----

# Text of a text or comment token; '' for any other token.
sub text {
    my ($self) = @_;
    return ( $self->is_text or $self->is_comment ) ? $self->[1] : '';
}

# ---- test your token ----
# The three type-only predicates return 1 or '' explicitly instead of
# relying on whatever value falls off the end of the sub.

sub is_text {
    my ($self) = @_;
    return $self->[0] eq 'T' ? 1 : '';
}

sub is_comment {
    my ($self) = @_;
    return $self->[0] eq 'C' ? 1 : '';
}

sub is_pi {
    my ($self) = @_;
    return $self->[0] eq 'PI' ? 1 : '';
}

# Long-named alias for is_pi; hands the same @_ straight to it.
sub is_process_instruction { goto &is_pi; }

# is_start_tag([$name]) - 1 for a start tag (named $name if given), else 0.
sub is_start_tag {
    my ( $self, $name ) = @_;
    return $self->_is( S => $name );
}

# is_end_tag([$name]) - 1 for an end tag (named $name if given), else 0.
sub is_end_tag {
    my ( $self, $name ) = @_;
    return $self->_is( E => $name );
}

# is_tag([$name]) - 1 for a start or end tag (named $name if given), else 0.
sub is_tag {
    my ( $self, $name ) = @_;
    return $self->_is( S => $name ) || $self->_is( E => $name );
}

# _is($type [, $name])
#
# Private helper behind the tag predicates. Returns 1 when the token's
# type code (element [0]) equals $type and, if $name is defined, element
# [1] equals $name; returns 0 otherwise.
sub _is {
    my ( $self, $type, $name ) = @_;

    return 0 unless $self->[0] eq $type;
    return 1 unless defined $name;
    return $self->[1] eq $name ? 1 : 0;
}

1;

=head1 DEMO

execute this file as if it were a script, as in C<perl Simple.pm>,
and you'll see how/that this module works.

=cut

package main;

unless ( caller() ) {
    my $file = 'REC-xml-19980210.xml';

    # NOTE(review): the sample below contains no element markup, yet the
    # loop further down looks for lhs/rhs/prod elements. The tags appear
    # to have been lost from this literal - TODO restore the original
    # sample document.
    $file = \ q[

Document document prolog element Misc*

];
    ## Cause chances are you won't have
    ## http://www.w3.org/TR/1998/REC-xml-19980210.xml
    ## as referenced in
    ## http://www.xmltwig.com/article/ways_to_rome/ways_to_rome.html
    ## in the current directory

    # Whitespace normaliser for one production line. It is a lexical
    # closure on purpose: a named sub here would be compiled even when
    # this guard is false, defining main::clean in every program that
    # merely loads the module.
    my $clean = sub {
        my ($string) = @_;
        $string =~ s/\xc2\xa0/ /sg;
        $string =~ s/\s+/ /g;
        $string =~ s{\s$}{}g;
        return $string;
    };

    my $i   = 0;
    my $p   = XML::TokeParser::Simple->new($file);
    my $Ret = "";

    while ( defined( my $t = $p->get_token() ) ) {
        if ( $t->is_start_tag('lhs') ) {
            # A new production starts: number it and collect its left side.
            $i++;
            $Ret = join '', "[$i] ", $p->get_text('/lhs'), " ::= ";
        }
        elsif ( $t->is_start_tag('rhs') ) {
            $Ret .= $p->get_text('/rhs');
        }
        elsif ( $t->is_end_tag('prod') ) {
            # Production complete: print it and start over.
            print $clean->($Ret), "\n";
            $Ret = "";
        }
    }

    undef $Ret;
    undef $p;

    ## mirod already did this, so I'm borrowing
    ##
    ## The handler below was never called by this demo. It is kept only
    ## as a reference (presumably the XML::Twig version of the loop
    ## above - verify against the article linked earlier).
    ##
    ##   sub prod {
    ##       my( $twig, $prod)= @_;
    ##       my $lhs= $prod->field( 'lhs');
    ##       my $rhs= join '', map {$_->text} $prod->children( 'rhs');
    ##       $i++;
    ##       my $prod_text = "[$i] $lhs ::= $rhs";
    ##       print clean( $prod_text) . "\n";
    ##   }
}

1;

=head1 SEE ALSO

L<XML::TokeParser>, L<HTML::TokeParser>, L<HTML::TokeParser::Simple>,
L<XML::Twig>

=head1 AUTHOR

D.H.

=head1 LICENSE

copyright (c) D.H. 2002 All rights reserved.

This program is released under the same terms as perl itself.
If you don't know what that means, visit http://perl.com
or execute "perl -v" at a commandline (assuming you have perl installed).

=cut