in reply to Parsing with HTML::Parser

Here's an old sub I have that does that, with an optional flag to replace newlines (and carriage returns) with spaces.

# Extract the text content of an HTML string using HTML::Parser.
#
# Arguments:
#   $content     - the HTML source to parse.
#   $rm_newlines - optional flag; when true, every "\n" and "\r" in the
#                  result is replaced with a single space.
#
# Returns the collected text as a string. One space is appended for every
# start tag and every end tag encountered, and text seen while inside a
# <script> or <style> element is skipped.
sub text_only {
    my ($content, $rm_newlines) = @_;

    my %inside;       # running open-minus-close count, keyed by tag name
    my $text = '';    # start empty so input without text yields '' (not undef)

    # Shared start/end handler: $num is the literal +1 (start) or -1 (end)
    # supplied through the argspec below.
    my $tag = sub {
        my ($tag_name, $num) = @_;
        $inside{$tag_name} += $num;
        $text .= ' ';
        return;
    };

    # Text handler: keep the chunk unless we are inside <script> or <style>.
    my $get_text = sub {
        my ($chunk) = @_;
        $text .= $chunk if !$inside{script} && !$inside{style};
        return;
    };

    # The argspec strings must be exactly "tagname, '+1'" / "tagname, '-1'" /
    # "dtext"; stray "+" line-wrap markers inside them make them invalid.
    my $parser = HTML::Parser->new(
        handlers => [
            start => [ $tag,      "tagname, '+1'" ],
            end   => [ $tag,      "tagname, '-1'" ],
            text  => [ $get_text, "dtext" ],
        ],
        marked_sections => 1,
    ) or die "Failed to create HTML::Parser object\n";

    $parser->parse($content);
    $parser->eof;    # signal end of document so buffered trailing text is delivered

    $text =~ s/[\n\r]/ /g if $rm_newlines;

    return $text;
}