use strict;
use warnings;

use HTML::Parser 3;
use LWP::Simple;

# Fetch a page and print its visible text.  LWP::Simple::get returns undef
# on failure, which body_text() reports as 'EMPTY BODY'.
my $html = get("http://perlmonks.org");
print body_text($html);

# body_text($content)
#
# Reduce an HTML document to a single line of printable ASCII text.
#
#   $content - the HTML source as a string.
#
# Returns the literal string 'EMPTY BODY' when $content is undef or the
# empty string; otherwise returns the extracted text (possibly '') with
# script/style contents removed, non-printable characters replaced by
# spaces, runs of whitespace collapsed, and leading whitespace trimmed.
sub body_text {
    my ($content) = @_;
    return 'EMPTY BODY' unless defined $content && length $content;

    # HTML::Parser reports the contents of <script> and <style> as ordinary
    # text events, so we track whether we are inside one and drop that text.
    my %is_skipped = map { $_ => 1 } qw(script style);
    my $skip_depth = 0;
    my $text       = '';    # start defined so a text-free document yields ''

    my $parser = HTML::Parser->new(
        api_version => 3,
        start_h     => [
            sub {
                my ($tagname) = @_;
                # A space per start tag keeps words from adjacent elements
                # from running together.
                $text .= ' ';
                $skip_depth++ if $is_skipped{$tagname};
            },
            'tagname',
        ],
        end_h => [
            sub {
                my ($tagname) = @_;
                # Never go below zero: a stray </script> or </style> would
                # otherwise leave a negative (true) counter and silently
                # suppress all of the text that follows it.
                $skip_depth-- if $is_skipped{$tagname} && $skip_depth > 0;
            },
            'tagname',
        ],
        text_h => [
            sub {
                my ($dtext) = @_;
                $text .= $dtext unless $skip_depth;
            },
            'dtext',
        ],
    );
    $parser->parse($content);
    $parser->eof;

    # 'dtext' has already decoded the entities it knows; strip whatever
    # entity-shaped tokens are left (e.g. unknown names).  The pattern is
    # deliberately narrow so that plain text such as "Tom & Jerry; pals"
    # is not eaten from the ampersand to the next semicolon.
    $text =~ s{ & (?: \# \d+ | \# x [0-9a-f]+ | [a-z] [a-z0-9]* ) ; }{ }gix;

    # Replace everything outside printable ASCII (space through '~').
    # This also covers the decoded non-breaking space and the DEL character.
    $text =~ s/[^\040-\176]+/ /g;

    # Remove any < or > in case the parser choked - rare but it happens.
    $text =~ s/[<>]/ /g;

    # Crunch whitespace and trim the leading edge.
    $text =~ s/\s{2,}/ /g;
    $text =~ s/^\s+//;

    return $text;
}