use strict;
use warnings;

use Encode;
use HTML::Parser;
use HTML::TreeBuilder;
use LWP::UserAgent;

# Fetch an HTML page, work out its character encoding while it is still
# streaming in (HTTP Content-Type header first, then <meta> tags in the
# document head), decode the bytes accordingly and feed the decoded text
# to HTML::TreeBuilder.  The resulting tree is printed as HTML.

# Encoding assumed when neither the HTTP header nor the document names one.
my $DEFAULT_CHARSET = 'iso-8859-1';

my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;

# Name of the encoding used to decode the body; undef until detected.
# The first usable value wins and is never overwritten.
my $charset;

# Record $name as the document charset unless one is already known.
# Labels Encode does not recognise are ignored, because handing them to
# decode() later would die in the middle of the download.
sub _remember_charset {
    my ($name) = @_;
    return if $charset;
    return unless defined $name && length $name;
    $charset = $name if Encode::find_encoding($name);
    return;
}

# Extract the "charset" parameter from a Content-Type value such as
# 'text/html; charset=utf-8' and remember it.
#
# Parameters: the Content-Type string (may be undef).
# Returns:    nothing; sets the file-level $charset as a side effect.
sub set_charset_from_content_type {
    my ($content_type) = @_;
    return unless defined $content_type;

    # Case-insensitive, tolerant of optional whitespace and of quotes
    # around the value (charset="utf-8").
    if ( $content_type =~ m{ ; \s* charset \s* = \s* ["']? ([^\s;"']+) }xi ) {
        _remember_charset($1);
    }
    return;
}

# This parser is active only until we get the charset.  It looks at
# <meta> tags and falls back to the default once </head> is reached.
my $mini_parser = HTML::Parser->new(
    api_version => 3,
    start_h     => [
        sub {
            my ( $tagname, $attr ) = @_;
            return unless $tagname eq 'meta';

            if ( defined $attr->{'charset'} ) {

                # HTML5 form: <meta charset="utf-8">
                _remember_charset( $attr->{'charset'} );
            }
            elsif ( defined $attr->{'http-equiv'}
                and lc( $attr->{'http-equiv'} ) eq 'content-type' )
            {
                set_charset_from_content_type( $attr->{'content'} );
            }
            return;
        },
        "tagname, attr"
    ],
    end_h => [
        sub {
            $charset ||= $DEFAULT_CHARSET if $_[0] eq 'head';
            return;
        },
        "tagname"
    ],
);

# This doesn't do what you think it does: it does not decode anything.
# It only tells the parser that its input is raw, undecoded UTF-8 so that
# expanded entities are reported in a compatible form; see the
# HTML::Parser documentation.
$mini_parser->utf8_mode(1);

my $root             = HTML::TreeBuilder->new;
my $isfirst          = 1;     # true until the first chunk has been seen
my $unencoded_buffer = '';    # raw bytes not yet decoded
my $result           = '';

# LWP ':content_cb' callback, called once per received chunk.
#
# Parameters: $chunk       - raw bytes of this part of the body
#             $resp_object - the HTTP::Response being built
#
# Until the charset is known the bytes are only buffered and scanned by
# $mini_parser.  Afterwards the buffer is decoded and passed to the tree
# builder.
sub process_lwp_response {
    my ( $chunk, $resp_object ) = @_;
    $unencoded_buffer .= $chunk;

    if ( !$charset ) {
        if ($isfirst) {
            $isfirst = 0;

            # List context: if several Content-Type values are present,
            # only the first one is consulted.
            my ($content_type) = $resp_object->header('Content-Type');
            set_charset_from_content_type($content_type);
        }
        $mini_parser->parse($chunk) unless $charset;
    }

    if ($charset) {
        $mini_parser = undef;

        # With FB_QUIET, decode() returns what it could decode and leaves
        # the unprocessed tail (e.g. a multi-byte character split across
        # two chunks) in $unencoded_buffer for the next call.
        $root->parse(
            decode( $charset, $unencoded_buffer, Encode::FB_QUIET ) );
    }
    return;
}

# Push whatever is still buffered into the tree builder once the download
# is over.  This covers documents in which no charset was ever detected
# (no </head> seen) and bytes FB_QUIET refused to decode; the latter are
# decoded leniently, with substitution characters for malformed input.
sub _flush_pending_bytes {
    $charset ||= $DEFAULT_CHARSET;
    return unless length $unencoded_buffer;
    $root->parse( decode( $charset, $unencoded_buffer ) );
    $unencoded_buffer = '';
    return;
}

my $targeturl = 'http://www.ras.ru/about.aspx?_Language=ru';

# $targeturl = shift;
my $response =
  $ua->get( $targeturl, ':content_cb' => \&process_lwp_response );

if ( $response->is_success ) {

    # LWP traps exceptions thrown by the content callback and reports
    # them in this header; the tree is then built from a partial body.
    my $callback_error = $response->header('X-Died');
    warn "Content processing aborted: $callback_error\n"
      if defined $callback_error;

    _flush_pending_bytes();
    $root->eof();

    # original code
    my @paras = $root->find_by_tag_name('p');
    foreach my $h (@paras) {
        foreach my $item_r ( $h->content_refs_list ) {
            next if ref $$item_r;

            ### proprietary JavaScript/HTML inserted with substitution
        }
    }    # end foreach
    print $root->as_HTML;
}
else {
    die $response->status_line;
}