0: =head1 NAME
1:
2: XML::TokeParser::Simple - pretty much like HTML::TokeParser::Simple
3:
4: =head1 DESCRIPTION
5:
6: Get the benefits of L<XML::TokeParser::TOKEN|"What is XML::TokeParser::TOKEN">.
7:
8: use XML::TokeParser::Simple and get the blessing ;)
9:
10: Hopefully XML::TokeParser will add this in the next version.
11:
12:
13: =head1 SYNOPSIS
14:
15: # file: printComments.pl
16: # desc: prints all the comments from an xml file
17: use XML::TokeParser::Simple;
18: my $p = new XML::TokeParser::Simple('file.xml');
19: while(defined( my $t = $p->get_token )) {
20: print $t->text,"\n" if $t->is_comment;
21: }
22:
23: See L<"What is XML::TokeParser::TOKEN">
24:
25: =cut
26:
27: package XML::TokeParser::Simple;
28: use XML::TokeParser();
29: use vars qw/ @ISA $VERSION /;
30: $VERSION = '0.01';
31: @ISA = qw/ XML::TokeParser /;
32:
33: sub get_token {
34: my $self = shift;
35: my $token = $self->SUPER::get_token( @_ );
36: return unless defined $token;
37: bless $token, 'XML::TokeParser::Token';
38: }
39:
40: sub get_tag {
41: my $self = shift;
42: my $token = $self->SUPER::get_tag( @_ );
43: return unless defined $token;
44: bless $token, 'XML::TokeParser::Token';
45: }
46:
47: package XML::TokeParser::Token;
48: use strict;
49:
50: =head1 What is XML::TokeParser::TOKEN
51:
52: A token is a blessed array reference,
53: that you acquire using $p->get_token,
54: and that might look like:
55:
56: ["S", $tag, $attr, $attrseq, $raw]
57: ["E", $tag, $raw]
58: ["T", $text, $raw]
59: ["C", $text, $raw]
60: ["PI", $target, $data, $raw]
61:
62: If you don't like remembering array indices,
63: you may access the attributes of a token like:
64:
65: $p->get_token->tag, $t->attr, $t->attrseg, $t->raw ...
66:
67: =head2 Methods
68:
69: Tokens may be inspected using any of these is_* methods
70:
71: is_text
72: is_comment
73: is_pi
74: is_process_instruction
75: is_start_tag
76: is_end_tag
77: is_tag
78:
79: like:
80:
81: print $t->target if $t->is_pi;
82: print "The comment says ", $t->text if $t->is_comment;
83:
84: =cut
85:
86: # for PI
87: sub target { return $_[0]->[1] if $_[0]->is_pi; }
88: sub data { return $_[0]->[2] if $_[0]->is_pi; }
89: sub raw { return $_[0]->[-1]; }
90:
91: #for S
92: sub attr { return $_[0]->[2] if $_[0]->is_start_tag(); }
93: sub attrseq { return $_[0]->[3] if $_[0]->is_start_tag(); }
94:
95: #for S|E
96: sub tag { return $_[0]->[1] if $_[0]->is_tag; }
97:
98: #for C|T
99: sub text { return $_[0]->[1] if $_[0]->is_text or $_[0]->is_comment; }
100:
101: # test your token
102: sub is_text { return 1 if $_[0]->[0] eq 'T'; }
103: sub is_comment { return 1 if $_[0]->[0] eq 'C'; }
104: sub is_pi { return 1 if $_[0]->[0] eq 'PI'; }
105: sub is_process_instruction { goto &is_pi; }
106: sub is_start_tag { return $_[0]->_is( S => $_[1] ); }
107: sub is_end_tag { return $_[0]->_is( E => $_[1] ); }
108: sub is_tag { return $_[0]->_is( S => $_[1] )
109: || $_[0]->_is( E => $_[1] ); }
110:
111: sub _is {
112: if($_[0]->[0] eq $_[1]){
113: if(defined $_[2]){
114: return 1 if $_[0]->[1] eq $_[2];
115: }else{
116: return 1;
117: }
118: }
119: return 0;
120: }
121:
122: 1;
123:
124: =head1 DEMO
125:
126: execute this file as if it were a script, as in C<perl WhateverYouSavedItAs>,
127: and you'll see how/that this module works.
128:
129: =cut
130:
131: package main;
132:
133: unless(caller()){
134: use Data::Dumper;
135: my $file = 'REC-xml-19980210.xml';
136: $file = \ q[<p>
137: <scrap lang='ebnf' id='document'>
138: <head>Document</head>
139: <prod id='NT-document'><lhs>document</lhs>
140: <rhs><nt def='NT-prolog'>prolog</nt>
141: <nt def='NT-element'>element</nt>
142: <nt def='NT-Misc'>Misc</nt>*</rhs></prod>
143: </scrap>
144: </p>];
145: ## Cause chances are you won't have
146: ## http://www.w3.org/TR/1998/REC-xml-19980210.xml
147: ## as referenced in
148: ## http://www.xmltwig.com/article/ways_to_rome/ways_to_rome.html
149: ## in the current directory
150:
151:
152:
153: my $i = 0;
154: my $p = XML::TokeParser::Simple->new($file);
155:
156: my $Ret = "";
157:
158: while(defined(my $t = $p->get_token() )){
159:
160: if( $t->is_start_tag('lhs') ){
161: $i++;
162: $Ret = join '', "[$i] ", $p->get_text('/lhs'), " ::= ";
163: }elsif( $t->is_start_tag('rhs') ){
164: $Ret .= $p->get_text('/rhs');
165: }elsif( $t->is_end_tag('prod') ){
166: print clean($Ret),"\n";
167: $Ret = "";
168: }
169: }
170:
171: undef $Ret;
172: undef $p;
173:
174: ## mirod already did this, so I'm borrowing
175:
176: sub prod {
177: my( $twig, $prod)= @_;
178: my $lhs= $prod->field( 'lhs');
179: my $rhs= join '', map {$_->text} $prod->children( 'rhs');
180:
181: $i++;
182: my $prod_text = "[$i] $lhs ::= $rhs";
183: print clean( $prod_text) . "\n";
184: }
185:
186:
187: sub clean {
188: my( $string)= @_;
189: $string =~ s/\xc2\xa0/ /sg;
190: $string =~ s/\s+/ /g; $string=~ s{\s$}{}g;
191: return $string;
192: }
193: }
194:
195: 1;
196:
197: =head1 SEE ALSO
198:
199: L<XML::TokeParser>, L<HTML::TokeParser>, L<HTML::TokeParser::Simple>,
200: L<XML::Twig>
201:
202: =head1 AUTHOR
203:
204: D.H. <PodMaster@cpan.org>
205:
206: =head1 LICENSE
207:
208: copyright (c) D.H. 2002 All rights reserved.
209:
210: This program is released under the same terms as perl itself.
211: If you don't know what that means, visit http://perl.com
212: or execute "perl -v" at a commandline (assuming you have perl installed).
213:
214: =cut
Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
Read Where should I post X? if you're not absolutely sure you're posting in the right place.
Please read these before you post! —
Posts may use any of the Perl Monks Approved HTML tags:
- a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
| |
For: |
|
Use: |
| & | | & |
| < | | < |
| > | | > |
| [ | | [ |
| ] | | ] |
Link using PerlMonks shortcuts! What shortcuts can I use for linking?
See Writeup Formatting Tips and other pages linked from there for more info.