use HTML::Parser 3;
use LWP::Simple;
# Script entry point: download the PerlMonks front page and print its
# text content.  get() comes from LWP::Simple; whatever it returns is handed
# straight to body_text().
my $html = get('http://perlmonks.org');
print( body_text($html) );
# body_text($html)
#
# Reduce an HTML document to a single run of printable-ASCII text.
#
# Parameters:
#   $html - the HTML source as a string.
#
# Returns:
#   The literal string 'EMPTY BODY' when $html is undef or the empty string.
#   Otherwise the document's text with:
#     * everything between <script>...</script> and <style>...</style>
#       dropped,
#     * leftover entity references (e.g. "&nbsp;", "&#160;"), characters
#       outside printable ASCII, and any '<' or '>' replaced by a space,
#     * runs of whitespace collapsed to one space and leading whitespace
#       removed (a single trailing space may remain).
sub body_text {
    my ($content) = @_;

    # Only undef and '' mean "no body"; a document consisting of "0" is
    # still a document, so a plain truth test is not good enough here.
    return 'EMPTY BODY' unless defined $content && length $content;

    my $text = '';    # accumulated text; starts defined so we never return undef
    my $skip = 0;     # > 0 while inside <script> or <style>

    my $is_hidden = sub { $_[0] eq 'script' || $_[0] eq 'style' };

    my $p = HTML::Parser->new(
        start_h => [
            sub {
                my ($tagname) = @_;
                # Every start tag contributes a space so that text on
                # either side of a tag does not run together.
                $text .= ' ';
                $skip++ if $is_hidden->($tagname);
            },
            'tagname',
        ],
        end_h => [
            sub {
                my ($tagname) = @_;
                # Never go below zero: a stray </script> or </style> would
                # otherwise leave a negative (true) counter and suppress
                # all of the text that follows it.
                $skip-- if $skip > 0 && $is_hidden->($tagname);
            },
            'tagname',
        ],
        text_h => [
            sub {
                my ($dtext) = @_;
                $text .= $dtext unless $skip;
            },
            'dtext',
        ],
    );

    # parse() is called separately rather than chained onto new(), so $p is
    # always the parser object regardless of what parse() returns.
    $p->parse($content);
    $p->eof();

    # Remove entity references that survived decoding.  'dtext' has already
    # decoded the entities it knows, so a bare '&' at this point is normally
    # real text; match only well-formed references instead of everything up
    # to the next ';'.
    $text =~ s/&(?:\#[0-9]+|\#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);/ /g;

    # Replace anything outside printable ASCII (space through '~') with a
    # space.  This also covers non-breaking spaces, newlines and tabs.
    $text =~ s/[^\040-\176]+/ /g;

    # Remove any < or > in case the parser choked - rare but happens.
    $text =~ s/[<>]/ /g;

    # Crunch whitespace and drop leading whitespace.
    $text =~ s/\s{2,}/ /g;
    $text =~ s/\A\s+//;

    return $text;
}