use LWP::UserAgent;
use HTML::Parser;
use HTML::TreeBuilder;
use Encode;
use strict;
# Shared HTTP client: 10 second timeout, proxy settings loaded from the
# environment.
my $ua = LWP::UserAgent->new(
    timeout   => 10,
    env_proxy => 1,
);

# Character encoding of the document being fetched. Stays undefined until
# it has been determined from the HTTP header or from the document itself.
my $charset;
# Extract the charset parameter from a Content-Type value (the HTTP header or
# the content attribute of a <meta http-equiv="Content-Type"> tag) and store
# it in the file-level $charset, unless a charset has already been chosen.
#
#   $_[0] - Content-Type string such as "text/html; charset=utf-8"; may be
#           undef when the header or attribute is absent.
#
# Returns nothing useful; the result is the side effect on $charset.
sub set_charset_from_content_type {
    my ($content_type) = @_;
    return if !defined $content_type;

    # The parameter name is case-insensitive, whitespace around ";" and "="
    # is optional, and the value may be quoted or followed by another
    # parameter.
    if ( $content_type =~ m{ ; \s* charset \s* = \s* ["']? ([^\s"';]+) }xi ) {
        my $declared = $1;

        # Skip names Encode does not know, so that a later decode() call
        # cannot die on a bogus declaration.
        $charset ||= $declared if Encode::find_encoding($declared);
    }
    return;
}
# Lightweight sniffing parser. It is fed the raw bytes of the response and is
# used only until $charset is known. It inspects <meta> tags for a charset
# declaration and settles on iso-8859-1 when </head> is reached without one.
my $mini_parser = HTML::Parser->new(
    api_version => 3,
    start_h     => [
        sub {
            my ( $tagname, $attr ) = @_;
            return if $tagname ne 'meta';

            # Legacy form: <meta http-equiv="Content-Type" content="...">
            my $http_equiv = $attr->{'http-equiv'};
            if ( $http_equiv and lc($http_equiv) eq 'content-type' ) {
                set_charset_from_content_type( $attr->{'content'} );
            }

            # HTML5 short form: <meta charset="...">
            my $declared = $attr->{'charset'};
            if ( defined $declared ) {
                $declared =~ s/\A\s+|\s+\z//g;

                # Only accept names Encode can resolve; anything else would
                # make the later decode() call die.
                $charset ||= $declared
                    if length $declared and Encode::find_encoding($declared);
            }
            return;
        },
        "tagname, attr"
    ],
    end_h => [
        sub {
            my ($tagname) = @_;

            # End of <head> without any declaration: use the default.
            $charset ||= "iso-8859-1" if $tagname eq 'head';
            return;
        },
        "tagname"
    ]
);

# utf8_mode does NOT make the parser decode its input. According to the
# HTML::Parser documentation it is meant for parsing raw, undecoded UTF-8 and
# only changes how entities are expanded in the strings reported to handlers.
$mini_parser->utf8_mode(1);

# Tree that receives the decoded document text.
my $root = HTML::TreeBuilder->new;

# True until the Content-Type header of the response has been inspected.
my $isfirst = 1;

# Raw response bytes that have not been decoded and handed to $root yet.
my $unencoded_buffer = '';

# NOTE(review): not referenced anywhere in the visible code.
my $result = '';
# LWP content callback, invoked once per received chunk.
#
#   $_[0] - chunk of raw response bytes
#   $_[1] - the HTTP::Response object being filled in
#
# Every chunk is appended to $unencoded_buffer. While the charset is still
# unknown, the Content-Type header is consulted (once) and the chunk is shown
# to the sniffing parser. As soon as a charset is available, the buffered
# bytes are decoded and handed to the tree builder.
sub process_lwp_response {
    my ( $octets, $response ) = @_;

    $unencoded_buffer .= $octets;

    unless ($charset) {
        if ($isfirst) {
            $isfirst = 0;
            set_charset_from_content_type( $response->header('Content-Type') );
        }

        # The chunk is shown to the sniffing parser even when the header
        # already supplied a charset.
        $mini_parser->parse($octets);
    }

    # Still nothing to decode with: keep buffering.
    return unless $charset;

    # The sniffing parser is no longer needed.
    $mini_parser = undef;

    # With FB_QUIET, decode() removes the bytes it converted from
    # $unencoded_buffer and leaves the unprocessed remainder there (see the
    # Encode documentation), so that remainder is retried on the next chunk.
    my $text = decode( $charset, $unencoded_buffer, Encode::FB_QUIET );
    $root->parse($text);
}
my $targeturl = 'http://www.ras.ru/about.aspx?_Language=ru';

# $targeturl = shift;
my $response = $ua->get( $targeturl, ':content_cb' => \&process_lwp_response );
if ( $response->is_success ) {

    # LWP traps exceptions thrown inside the content callback, aborts the
    # transfer and records the message in the X-Died header while the status
    # stays successful. Report that instead of printing a truncated tree.
    my $callback_error = $response->header('X-Died');
    die "Download aborted: $callback_error" if defined $callback_error;

    # If neither the HTTP header nor the document declared a charset and no
    # </head> was seen, nothing has been parsed yet: fall back to the same
    # default the sniffing parser uses.
    $charset ||= 'iso-8859-1';

    # Flush whatever is still buffered (the whole document in the case
    # above, or bytes the incremental FB_QUIET decode could not consume).
    # FB_DEFAULT substitutes malformed data instead of stopping at it.
    if ( length $unencoded_buffer ) {
        $root->parse(
            decode( $charset, $unencoded_buffer, Encode::FB_DEFAULT ) );
        $unencoded_buffer = '';
    }
    $root->eof();

    # original code
    my @paras = $root->find_by_tag_name('p');
    foreach my $h (@paras) {
        foreach my $item_r ( $h->content_refs_list ) {

            # Only plain text children are of interest; skip child elements.
            next if ref $$item_r;
            ### proprietary JavaScript/HTML inserted with substitution
        }
    }    # end foreach
    print $root->as_HTML;
}
else {
    die $response->status_line;
}