# Fetch a web page, decode it to Perl characters, and run it through
# libxml2's forgiving HTML parser to obtain a cleaned-up serialization.
use strict;
use warnings;

# use LWP::Debug qw(+);
use LWP::UserAgent;
use XML::LibXML;
use Encode qw/encode decode/;

my $url = 'http://www.cboe.com/Chinese';
my $ua  = 'Mozilla/5.0 (en-US; rv:1.4b) Gecko/20030514';

my $browser  = LWP::UserAgent->new( agent => $ua );
my $response = $browser->get($url);

# Do not try to parse an error page as if it were the requested document.
die "Failed to fetch $url: " . $response->status_line . "\n"
    unless $response->is_success;

print "Cleaning $url...\n";

# $response->content is the raw byte stream from the server.  Those bytes
# must be *decoded* into characters; encoding them a second time treats each
# byte as a character and corrupts every multi-byte sequence.
# decoded_content() applies the charset the response declares (and undoes any
# Content-Encoding).  If it cannot decode, fall back to EUC-CN -- presumably
# the page's charset, judging by the encoding this script originally used;
# confirm against the live response headers.
my $html = $response->decoded_content;
$html = decode( 'euc-cn', $response->content ) unless defined $html;

# The return value is not used here, matching the original script; capture
# or print it to inspect the cleaned markup.
clean_html($html);

# clean_html($input)
#
# Parse an HTML document with libxml2 in recovery mode and serialize it back.
#
#   $input  - the HTML document as a Perl character string (already decoded)
#
# Returns the document as serialized by toStringHTML.
sub clean_html {
    my ($input) = @_;

    my $parser = XML::LibXML->new();
    $parser->recover(1);    # keep going on malformed markup

    # Hand the parser an unambiguous byte string together with its encoding,
    # so the result does not depend on how Perl stores the string internally.
    my $bytes = encode( 'UTF-8', $input );
    my $doc   = $parser->parse_html_string( $bytes, { encoding => 'UTF-8' } );

    return $doc->toStringHTML;
}