Lana has asked for the wisdom of the Perl Monks concerning the following question:
In this simplified example it strips HTML inside the <div id="body">...</div> and prints out just text, but I need all html formatting to be untouched. How to achieve this? Thanks! :)

```perl
use strict;
use warnings;
use HTML::Parser;

# Sample document: three sibling <div> blocks, of which only the one with
# id="body" is of interest. Single-quoted heredoc: the markup is literal and
# nothing in it is meant to be interpolated.
my $content = <<'EOF';
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Some title goes here</title>
</head>
<body bgcolor="#FFFFFF">

<div id="leftcol">
menu column
</div>

<div id="body">
<p>some text goes here some text goes here<br />
some text goes here some text goes here</p>
<p><strong>some header</strong></p>
<p>some text goes here some text goes here<br />
some text goes here some text goes here</p>
<p><img src="img.gif" /> image here</p>
<p><strong>some header</strong></p>
<p>some text goes here some text goes here<br />
some text goes here some text goes here</p>
</div>

<div id="rightcol">
news column
</div>

</body>
</html>
EOF

# Only a start-tag handler is installed up front; the handlers that actually
# produce output are registered lazily once the wanted <div> is seen.
my $p = HTML::Parser->new( api_version => 3 );
$p->handler( start => \&start_handler, "self,tagname,attr" );
$p->parse($content);
exit;

# start_handler($self, $tagname, $attr)
#
# Start-tag callback (argspec "self,tagname,attr").
#   $self    - the HTML::Parser object
#   $tagname - name of the tag that was just opened
#   $attr    - hash reference of the tag's attributes
#
# Ignores every tag except <div id="body">. When that tag is opened it
# installs two more handlers on the parser:
#   * a text handler that prints each text event, and
#   * an end handler that stops parsing at the next closing </div>.
#
# Why the output contains no markup: the only handler that prints is the
# text handler, and its "dtext" argspec passes decoded text. The tags inside
# the div arrive as start/end events, which are never printed. To keep the
# markup, the handlers for those tag events would also have to print their
# original source (the "text" argspec).
sub start_handler {
    my ( $self, $tagname, $attr ) = @_;

    return unless $tagname eq 'div';

    # A <div> without an id attribute must not trigger an undef comparison.
    return unless defined $attr->{id} and $attr->{id} eq 'body';

    $self->handler(
        text => sub {
            my ($decoded_text) = @_;
            print $decoded_text;
        },
        "dtext",
    );

    # The argspec must be exactly "tagname,self"; anything else in the string
    # (such as a stray "+" left behind by line wrapping) is rejected by
    # HTML::Parser. The first closing tag named like the opening one ends
    # parsing, so a <div> nested inside the body would end the output early.
    $self->handler(
        end => sub {
            my ( $end_tagname, $parser ) = @_;
            $parser->eof if $end_tagname eq $tagname;
        },
        "tagname,self",
    );

    return;
}
```
|
|---|