#!/usr/bin/perl -w
#
# Fetch a page over HTTP, store both its decoded text and its raw body in an
# XML file, then load that file back with XML::LibXML.
#
# PHASE 1 writes the file named by $file; PHASE 2 parses it.

use strict;
use warnings;

use LWP::UserAgent;
use MIME::Base64 qw(encode_base64);
use XML::Generator;
use XML::LibXML;

###############################
## PHASE 1: generate XML file

my $url  = 'http://www.w3.org/Press/1998/XSL-WD.html.ja';
my $file = 'test';

my $ua       = LWP::UserAgent->new();
my $response = $ua->get($url);
die "Couldn't fetch URL: '$url'" unless $response->is_success();

# Pull the charset out of the Content-Type header ourselves.  Only read $1
# when this particular match succeeded; otherwise $1 may still hold a capture
# from some earlier, unrelated match.
my $content_type = $response->header('Content-Type');
my $charset;
if (defined $content_type
    && $content_type =~ /charset\s*=\s*["']?([A-Za-z0-9_\-]+)/i)
{
    $charset = $1;
}

# HTTP::Message doesn't always seem to recognize Content-Type correctly,
# override
my %encoding_opts;
$encoding_opts{charset} = $charset if $charset;

# An empty body is a legitimate (false but defined) result, so test
# definedness rather than truth.
my $decoded = $response->decoded_content(%encoding_opts);
die "Cannot decode content: " . $@ unless defined $decoded;

# 'strict' is the documented conformance value that enables the special
# xmlcdata() and xml() generators used below.
my $gen = XML::Generator->new(pretty => 2, conformance => 'strict');

# The raw body is arbitrary bytes in the server's encoding.  It can contain
# characters that are illegal in XML 1.0 even inside CDATA (ISO-2022-JP uses
# ESC, 0x1B, for its shift sequences), so it is stored base64-encoded instead
# of verbatim.  Readers recover the bytes with MIME::Base64::decode_base64.
my %original_attrs = (transfer => 'base64');
$original_attrs{encoding} = $charset if defined $charset;

# A well-formed XML document has exactly one root element, so both copies of
# the content live under a single <document> wrapper.
my $xml = $gen->xml(
    $gen->document(
        $gen->parsed($gen->xmlcdata($decoded)),
        $gen->original(\%original_attrs, encode_base64($response->content())),
    )
);

open(my $out, '>:encoding(UTF-8)', $file)
    or die "Couldn't write to file: '$file': $!";
print {$out} $xml;
# Buffered write errors only surface at close time.
close($out) or die "Couldn't close file: '$file': $!";

###############################
## PHASE 2: load it

my $parser = XML::LibXML->new();
my $doc    = $parser->parse_file($file);

#### Failure observed when the raw body was embedded verbatim in CDATA:
#### test:129: parser error : CData section not finished XSL 1.0 ESC$B$N
#### ... 600+ more lines omitted ...