################## package HTML::TokeParser::Easy; ################## use strict; use Carp; use HTML::TokeParser; use vars qw/ @ISA $VERSION $AUTOLOAD /; $VERSION = '1.0'; @ISA = qw/ HTML::TokeParser /; use constant START_TAG => 'S'; use constant END_TAG => 'E'; use constant TEXT => 'T'; use constant COMMENT => 'C'; use constant DECLARATION => 'D'; use constant PROCESS_INSTRUCTION => 'PI'; my %token_spec = ( S => { _name => 'START_TAG', tag => 1, attr => 2, attrseq => 3, text => 4 }, E => { _name => 'END_TAG', tag => 1, text => 2 }, T => { _name => 'TEXT', text => 1 }, C => { _name => 'COMMENT', text => 1 }, D => { _name => 'DECLARATION', text => 1 }, PI => { _name => 'PROCESS_INSTRUCTION', token0 => 1, text => 2 } ); sub AUTOLOAD { no strict 'refs'; my ($self, $token) = @_; # was it an is_... method? if ( $AUTOLOAD =~ /.*::is_(\w+)/ ) { my $token_type = uc $1; my $tag = &$token_type; *{ $AUTOLOAD } = sub { return $_[ 1 ]->[ 0 ] eq $tag ? 1 : 0 }; return &$AUTOLOAD; } elsif ( $AUTOLOAD =~ /.*::return_(\w+)/ ) { # was it a return_... method? my $token_attr = $1; *{ $AUTOLOAD } = sub { my $attr = $_[ 1 ]->[ 0 ]; if ( exists $token_spec{ $attr } and exists $token_spec{ $attr }{ $token_attr } ) { return $_[ 1 ]->[ $token_spec{ $attr }{ $token_attr } ]; } else { if ( ! exists $token_spec{ $attr } ) { carp "No such token: '$attr'"; } else { carp "No such attribute: '$token_attr' for $token_spec{ $attr }{ _name }"; } } }; return &$AUTOLOAD; } else { # Yo! You can't do that! croak "No such method: $AUTOLOAD"; } } sub DESTROY {}; __END__ =head1 NAME HTML::TokeParser::Easy - simple method access to TokeParser tokens (no more memorizing array indices). =head1 SYNOPSIS use HTML::TokeParser::Easy; my $p = HTML::TokeParser::Easy->new( $somefile ); while ( my $token = $parser->get_token ) { # This prints all text in an HTML doc (i.e., it strips the HTML) next if ! $parser->is_text( $token ); print $parser->return_text( $token ); } =head1 DESCRIPTION C is a fairly common method of parsing HTML. However, the tokens returned are not exactly intuitive to parse: ["S", $tag, $attr, $attrseq, $text] ["E", $tag, $text] ["T", $text, $is_data] ["C", $text] ["D", $text] ["PI", $token0, $text] To simplify this, C allows the user ask more intuitive (read: more self-documenting) questions about the tokens returned. Specifically, there are 6 C type methods and 6 C type methods. The C methods allow you to determine the token type and the C methods get the data that you need. Since this is a subclass of C, all C methods are available. To truly appreciate the power of this module, please read the documentation for C and C. The following will be brief descriptions of the available methods followed by examples. =head1 C Methods =head2 Note: Due to the way that C has been coded, the portion of the C methods after the C part is case-insensitive. For example, the following lines of code are identical: $parser->is_start_tag( $token ); $parser->is_START_TAG( $token ); $parser->is_stArt_tAg( $token ); Yes, I could have done something about that, but why bother? =over 4 =item 1 C Use this to determine if you have a start tag. =item 2 C Use this to determine if you have an end tag. =item 3 C Use this to determine if you have text. Note that this is I to be confused with the C method described below! C will identify text that the user typically sees display in the Web browser. =item 4 C Are you still reading this? Nobody reads POD. Don't you know you're supposed to go to CLPM, ask a question that's answered in the POD and get flamed? It's a rite of passage. Really. C is used to identify comments. See the HTML::Parser documentation for more information about comments. There's more than you might think. =item 5 C This will match the DTD at the top of your HTML. (You I use DTD's, don't you?) =item 6 C Process Instructions are from XML. This is very handy if you need to parse out PHP and similar things with a parser. =back =head1 The 6 C methods =head2 Note: As noted for the 6 C methods, these methods are case-insensitive after the C part. =over 4 =item 1 C Do you have a start tag or end tag? This will return the type (lower case). =item 2 C If you have a start tag, this will return a hash ref with the attribute names as keys and the values as the values. =item 3 C For a start tag, this is an array reference with the sequence of the attributes, if any. =item 4 C This is the exact text of whatever the token is representing. =item 5 C This text is in a CDATA section. =item 6 C For processing instructions, this will return the token found immediately after the opening tag. For \ phbreport.txt" or die "Cannot open phbreport for writing: $!"; foreach my $doc ( @html_docs ) { print "Processing $doc\n"; my $p = HTML::TokeParser::Easy->new( $doc ); while ( my $token = $p->get_token ) { next if ! $p->is_comment( $token ); print PHB $p->return_text( $token ), "\n"; } } close PHB; =head2 Stripping Comments Uh oh. Turns out that your PHB was right for a change. Many of the comments in the HTML weren't very polite. Since your entire graphics department was just fired, it falls on you need to strip those comments from the HTML. use strict; use HTML::TokeParser::Easy; my $new_folder = 'no_comment/'; my @html_docs = glob( "*.html" ); foreach my $doc ( @html_docs ) { print "Processing $doc\n"; my $new_file = "$new_folder$doc"; open PHB, "> $new_file" or die "Cannot open $new_file for writing: $!"; my $p = HTML::TokeParser::Easy->new( $doc ); while ( my $token = $p->get_token ) { next if $p->is_comment( $token ); print PHB $p->return_text( $token ); } close PHB; } =head2 Changing form tags Your company was foo.com and now is bar.com. Unfortunately, whoever wrote your HTML decided to hardcode "http://www.foo.com/" into the C attribute of the form tags. You need to change it to "http://www.bar.com/". use strict; use HTML::TokeParser::Easy; my $new_folder = 'new_html/'; my @html_docs = glob( "*.html" ); foreach my $doc ( @html_docs ) { print "Processing $doc\n"; my $new_file = "$new_folder$doc"; open FILE, "> $new_file" or die "Cannot open $new_file for writing: $!"; my $p = HTML::TokeParser::Easy->new( $doc ); while ( my $token = $p->get_token ) { if ( $p->is_start_tag( $token ) and $p->return_tag( $token ) eq 'form' ) { my $form_tag = new_form_tag( $p->return_attr( $token ), $p->return_attrseq( $token ) ); print FILE $form_tag; } else { print FILE $p->return_text( $token ); } } close FILE; } sub new_form_tag { my ( $attr, $attrseq ) = @_; if ( exists $attr->{ action } ) { $attr->{ action } =~ s/www\.foo\.com/www.bar.com/; } my $tag = ''; foreach ( @$attrseq ) { $tag .= "$_=\"$attr->{ $_ }\" "; } $tag = "
"; return $tag; } =head1 COPYRIGHT Copyright (c) 2001 Curtis "Ovid" Poe. All rights reserved. This program is free software; you may redistribute it and/or modify it under the same terms as Perl itself =head1 AUTHOR Curtis "Ovid" Poe L =head1 BUGS 2001/10/04 There are no known bugs at this time. Use of C<$HTML::Parser::VERSION> which is less than 3.25 may result in incorrect behavior as older versions do not always handle XHTML correctly. It is the programmer's responsibility to verify that the behavior of this code matches the programmer's needs. Address bug reports and comments to: L. When sending bug reports, please provide the version of HTML::Parser, HTML::TokeParser, HTML::TokeParser::Easy, the version of Perl, and the version of the operating system you are using. =head1 BUGS 2001/10/04 There are no known bugs at this time. Use of C<$HTML::Parser::VERSION> which is less than 3.25 may result in incorrect behavior as older versions do not always handle XHTML correctly. It is the programmer's responsibility to verify that the behavior of this code matches the programmer's needs. =cut