##
#!perl
# This package determines the local character set, and
# translates everything into wide perl strings.
package FromLocalFilter;
use base qw( HTTP::Proxy::BodyFilter );
use HTML::Parser;
use Encode;
use strict;
use warnings;
# Workaround for an HTML::Parser bug with UTF-8 sequences that contain
# an 0xA0 octet (http://rt.cpan.org/NoAuth/Bug.html?id=15068):
# collect every BMP character whose UTF-8 encoding includes \xA0 into
# one character class, so the body filter can substitute them away.
our $a0regexp = do {
    my @suspects = grep { encode(q[utf-8], $_) =~ /\xA0/ }
                   map  { chr }
                   (0x80 .. 0xD7FF, 0xE000 .. 0xFDCF, 0xFE00 .. 0xFFFD);
    my $class = join q{}, @suspects;
    qr/([$class])/;
};
# End HTML::Parser bug workaround; the only other
# line relating to working around this bug is below where
# I do an s/// substitution based on $a0regexp
# Activate a character set for the current document. The first charset
# seen wins: later, different charsets only produce a warning. Dies if
# Encode has no codec for the (canonicalised) name.
sub set_charset {
    my ($self, $cset) = @_;
    return unless $cset;

    # Canonicalise: prefer Encode's canonical name, else lowercase,
    # and normalise any whitespace to hyphens.
    $cset = Encode::resolve_alias($cset) || lc($cset);
    $cset =~ s/\s/-/g;

    if (my $active = $self->{charset}) {
        # A charset is already locked in for this document; complain
        # only when a conflicting one shows up.
        if ($active ne $cset) {
            warn join(" ", "Charset $cset attempted, but charset",
                      $active, 'already active for',
                      $self->{current_uri});
        }
        return;
    }

    $self->{charset}  = $cset;
    $self->{charseto} = find_encoding($cset);
    die "Unsupported character set: $cset" unless $self->{charseto};
}
# Extract the charset parameter from a Content-Type value (HTTP header
# or <meta http-equiv> content attribute) and activate it via
# set_charset().
#
# Accepts both bare and quoted parameter forms, e.g.
#   text/html; charset=utf-8
#   text/html; charset="iso-8859-1"
# BUGFIX: the previous pattern (/.*; *charset=(\S+)/) kept surrounding
# quotes and any trailing parameters in the captured name, which made
# find_encoding() fail and set_charset() die on valid headers; it was
# also case-sensitive, missing "Charset=".
sub set_charset_from_content_type {
    my ($self, $content_type) = @_;
    return unless $content_type;
    if ($content_type =~ /;\s*charset\s*=\s*["']?([^"';\s]+)/i) {
        $self->set_charset($1);
    }
}
# Fall back to the charset configured at init() time, but only when no
# charset has been detected for the current document yet.
sub default_charset {
    my ($self) = @_;
    $self->set_charset($self->{default_charset})
        unless $self->{charset};
}
# Filter construction hook: remember the charset to fall back on when
# neither the HTTP headers nor the document itself name one.
sub init {
    my ($self, $fallback) = @_;
    $self->{default_charset} = $fallback;
}
# Build a throwaway HTML::Parser whose only job is charset sniffing:
# it watches for a <meta http-equiv="Content-Type"> tag and hands its
# content attribute to set_charset_from_content_type(). Parsing is
# aborted at </head> (nothing later can declare a charset) or at end
# of document; either way default_charset() ensures some charset is
# active before the parser is discarded via free_parser().
#
# Returns the configured HTML::Parser instance.
sub get_parser {
    my ($self) = @_;
    my $mini_parser =
    HTML::Parser->new(api_version => 3,
        # argspec "tagname, attr": $_[0] = tag name, $_[1] = attr hashref.
        start_h => [
            sub { $_[0] eq 'meta' and
                  $_[1]->{'http-equiv'} and
                  lc($_[1]->{'http-equiv'}) eq 'content-type' and
                  $self->set_charset_from_content_type($_[1]->{'content'}) },
            "tagname, attr" ],
        # argspec "tagname,self": $_[1] is the parser itself, letting
        # the handler stop the parse from inside.
        end_h => [
            sub {
                if ($_[0] eq 'head') {
                    $self->default_charset;
                    $_[1]->eof; # abort parsing
                    $self->free_parser;
                }
            }, "tagname,self" ],
        # Whole document consumed without </head>: fall back and clean up.
        end_document_h => [
            sub {
                $self->default_charset;
                $self->free_parser;
            }, ""],
        # NOTE(review): default_h => [""] appears intended to discard
        # all other events; confirm HTML::Parser accepts "" here.
        default_h => [""] );
    # NOTE(review): utf8_mode(1) tells HTML::Parser its input is raw
    # undecoded UTF-8 octets — ties in with the 0xA0 workaround above;
    # confirm against the raw bytes fed from filter().
    $mini_parser->utf8_mode(1);
    $mini_parser;
}
# Discard the charset-sniffing mini parser once it has done its job.
sub free_parser {
    my ($self) = @_;
    undef $self->{mini_parser};
}
# Per-message setup: reset detection state, try the HTTP Content-Type
# header first, and arm the <meta>-sniffing mini parser.
sub begin {
    my ($self, $message) = @_;
    $self->{charset}     = '';
    $self->{current_uri} = $message->request->uri;
    my $ctype = $message->header('Content-Type');
    $self->set_charset_from_content_type($ctype) if $ctype;
    $self->{mini_parser}      = $self->get_parser();
    $self->{mini_parser_dist} = 0;
}
# Body-filter callback. Feeds not-yet-seen octets to the charset
# sniffer, then parks the chunk in $$buffer and, once a charset is
# known, emits the decoded (wide-character) text through $$dataref.
sub filter {
    my ( $self, $dataref, $message, $protocol, $buffer ) = @_;
    if ($self->{mini_parser}) {
        # The first {mini_parser_dist} octets of $$dataref were already
        # fed to the sniffer on a previous call (they were carried over
        # via $$buffer), so skip them.
        my $pstr = substr($$dataref, $self->{mini_parser_dist});
        $self->{mini_parser}->parse($pstr);
    }
    if ( !defined($buffer) ) {
        # Final chunk: the proxy supplies no buffer, so flush the
        # sniffer and fabricate a local buffer for the code below.
        if ($self->{mini_parser}) {$self->{mini_parser}->eof;}
        my $d=''; $buffer = \$d;
    }
    # Hold the raw octets back; nothing is passed on until a charset
    # has been decided.
    $$buffer = $$dataref;
    $$dataref = '';
    if ($self->{charset}) {
        # Encode::FB_QUIET consumes the cleanly decodable prefix of
        # $$buffer in place, leaving any trailing partial character
        # buffered for the next chunk.
        my $decoded = decode($self->{charseto}, $$buffer, Encode::FB_QUIET);
        # HTML::Parser bug workaround: (http://rt.cpan.org/NoAuth/Bug.html?id=15068)
        # NOTE(review): this rewrites each affected character as e.g.
        # "A0;" — a numeric entity like "&#xA0;" was probably intended;
        # verify the sprintf format.
        $decoded =~ s/$a0regexp/sprintf('%02X;',ord($1))/ge;
        $$dataref = $decoded;
    }
    # Remember how many raw octets remain buffered so the sniffer is
    # not fed them twice on the next call.
    $self->{mini_parser_dist} = length ($$buffer);
}
# Implements Appendix F.1 (Detection Without External Encoding Information)
# of the XML Recommendation (http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing)
#
# As a plus side, rewrites an initial xml declaration (or adds a new one)
# so that the named character set is utf-8 (which is accurate given the other
# transformations we want to apply)
package FromLocalXmlFilter;
use base qw( FromLocalFilter );
use Encode;
use strict;
use warnings;
# Appendix F.1 detection table. Each row is
#   [ octet pattern, definite charset, BOM octets to strip, provisional charset ]
# Rows with a definite charset matched a byte-order mark (which gets
# stripped); rows with only a provisional charset matched the pattern
# of '<' / '<?xm' in some encoding, and must be confirmed or refined
# by the XML declaration itself.
my $enctable =
    [
     # Byte-order marks:
     [qr/^ \x00 \x00 \xFE \xFF/x, 'UCS-4BE', 4, ''],
     [qr/^ \xFF \xFE \x00 \x00/x, 'UCS-4LE', 4, ''],
     [qr/^ \x00 \x00 \xFF \xFE/x, 'UCS-42143', 4, ''], # unusual octet order
     [qr/^ \xFE \xFF \x00 \x00/x, 'UCS-43412', 4, ''], # unusual octet order
     [qr/^ \xFE \xFF/x, 'UTF-16BE', 2, ''],
     [qr/^ \xFF \xFE/x, 'UTF-16LE', 2, ''],
     [qr/^ \xEF \xBB \xBF/x, 'UTF-8', 3, ''],
     # '<' / '<?xm' as encoded text (no BOM):
     [qr/^ \x00 \x00 \x00 \x3C/x, '', 0, 'UCS-4BE'],
     [qr/^ \x3C \x00 \x00 \x00/x, '', 0, 'UCS-4LE'],
     [qr/^ \x00 \x00 \x3C \x00/x, '', 0, 'UCS-42143'],
     [qr/^ \x00 \x3C \x00 \x00/x, '', 0, 'UCS-43412'],
     [qr/^ \x00 \x3C \x00 \x3F/x, '', 0, 'UTF-16BE'],
     [qr/^ \x3C \x00 \x3F \x00/x, '', 0, 'UTF-16LE'],
     [qr/^ \x3C \x3F \x78 \x6D/x, '', 0, 'ASCII'],
     [qr/^ \x4C \x6F \xA7 \x94/x, '', 0, 'cp37'], # EBCDIC
     # default: no row matches; the caller falls back to default_charset
    ];
# The XML filter sniffs charsets from raw octets itself, so the HTML
# <meta>-scanning mini parser of the base class is never wanted.
sub get_parser { undef }
# Per-message setup: clear the two sniffing phase flags and the
# provisional charset, then let the base class do its own reset.
sub begin {
    my $self = shift;
    @{$self}{qw(init4 initgt provisional_charset)} = (0, 0, '');
    $self->SUPER::begin(@_);
}
# Body-filter callback implementing Appendix F.1 charset detection:
#
#   Phase 1 (init4):  once at least four octets are available, sniff
#                     them against $enctable to pick a provisional
#                     charset (stripping a BOM when one is found).
#   Phase 2 (initgt): once the first '>' is decodable, rewrite (or
#                     insert) the XML declaration so it names utf-8,
#                     matching the recoding the filter chain performs.
#
# Until each phase has enough data, octets are parked in $$buffer so
# the proxy re-presents them on the next call.
sub filter {
    my $self = shift;
    my ( $dataref, $message, $protocol, $buffer ) = @_;
    if (! $self->{init4}) {
        if (length($$dataref) >= 4) {
            $self->{init4} = 1;
            for my $entry (@$enctable) {
                my ($pattern, $charset, $bom_len, $provisional) = @$entry;
                next unless $$dataref =~ $pattern;
                $self->set_charset($charset);        # no-op when charset is ''
                substr($$dataref, 0, $bom_len, '');  # strip the BOM, if any
                $self->{provisional_charset} = $charset || $provisional;
                last;   # first (most specific) row wins
            }
            # BUGFIX: this fallback used to sit *inside* the loop above,
            # so the default charset was activated as soon as the first
            # table row failed to match, defeating the detection table
            # and triggering "already active" warnings. It must only run
            # after every row has been tried.
            if (! $self->{provisional_charset} ) {
                $self->default_charset;
                $self->{provisional_charset} = $self->{charset};
            }
        } else {
            # Fewer than four octets so far: come back later.
            if ($buffer) {
                $$buffer .= $$dataref;
                $$dataref = '';
                return;
            }
        }
    }
    if (! $self->{initgt}) {
        my $predec = $$dataref;
        # FB_QUIET consumes the decodable prefix of $predec in place,
        # leaving any trailing partial character behind in $predec.
        my $provstuff = decode($self->{provisional_charset}, $predec,
                               Encode::FB_QUIET);
        if ($provstuff =~ />/) {
            $self->{initgt} = 1;
            if ($provstuff =~
                /^(\<\? xml \s+
                version \s* = \s* (?:'[^']*'|"[^"]*") \s+)
                (?:encoding \s* = \s* ('[^']*'|"[^"]*"|\S+))?
                (.*? \?\>)/x) {
                # BUGFIX: record where the declaration ends *before* the
                # s///'s below run — they clobbered @+ whenever the
                # encoding value was quoted, corrupting the rewrite.
                my $decl_end = $+[0];
                my ($preenc, $cset, $postenc) = ($1,$2,$3);
                $cset =~ s/^["']// if $cset;
                $cset =~ s/['"]$// if $cset;
                $self->set_charset($cset);
                $self->default_charset();
                substr($provstuff, 0, $decl_end,
                       qq[${preenc}encoding="utf-8"$postenc]);
            } else {
                # No declaration at all: insert one naming utf-8.
                # BUGFIX: the inserted text had been reduced to a bare
                # newline (its markup was evidently lost); restore the
                # declaration promised by the package comment.
                $self->default_charset();
                substr($provstuff, 0, 0,
                       qq[<?xml version="1.0" encoding="utf-8"?>\n]);
            }
            $$dataref = encode($self->{provisional_charset}, $provstuff) . $predec;
        } else {
            # No '>' yet — the declaration may be incomplete: come back later.
            if ($buffer) {
                $$buffer .= $$dataref;
                $$dataref = '';
                return;
            }
        }
    }
    $self->SUPER::filter(@_);
}
# This takes wide perl strings and outputs them as UTF8
package ToUtf8Filter;
use base qw( HTTP::Proxy::BodyFilter );
use Encode;
use strict;
use warnings;
# Serialise the wide-character document back into raw UTF-8 octets for
# delivery to the client.
sub filter {
    my ( $self, $dataref ) = @_;
    $$dataref = encode('utf-8', $$dataref);
}
# This makes the header say that the document is in utf8, even if the
# text says otherwise. (As it may after our filtering)
package utf8HeaderFilter;
use base qw( HTTP::Proxy::HeaderFilter );
use strict;
use warnings;
# Header-filter callback: for (X)HTML responses, force the advertised
# charset to utf-8, since the body filters recode everything to utf-8
# regardless of what the document originally claimed.
sub filter {
    my ($self, $headers, $message) = @_;
    ### Debugging:
    ### print "Header: ", $headers->header('Content-Type'), "\n";
    my $ctype = $headers->header('Content-Type');
    return unless $ctype;
    if ($ctype =~ m[^(application/xhtml(\+xml)?|text/html?)\s*($|;)]) {
        $message->header('Content-Type' => "$1; charset=utf-8");
    }
}
# This is only for debugging - if enabled, it prints out what
# url is attempting to be loaded
package outFilter;
use base qw( HTTP::Proxy::HeaderFilter );
use strict;
use warnings;
# Debugging aid: log each request URI as it passes through the proxy.
sub filter {
    my ($self, $headers, $message) = @_;
    my $uri = $message->uri;
    print "Going for: $uri\n";
}
# This is another proxy filter used for debugging
# Inserted into a filter chain, it logs the text as it passes through
package MonitorBodyFilter;
use base qw( HTTP::Proxy::BodyFilter );
use strict;
use warnings;
use Data::Dumper;
# Debugging tap: dump each chunk flowing through the filter chain on a
# single line, labelled with the name given to init() ('$dataref' by
# default).
sub filter {
    my ( $self, $dataref, $message, $protocol, $buffer ) = @_;
    local $Data::Dumper::Indent = 0;  # one-line output
    local $Data::Dumper::Useqq  = 1;  # readable escapes for binary data
    my $label = $self->{name} || '$dataref';
    print Data::Dumper->Dump([$$dataref], [$label]), "\n";
}
# Remember the label used when dumping chunks; returns the filter
# object, mirroring the original behaviour.
sub init {
    my $self = shift;
    ($self->{name}) = @_;
    return $self;
}
# This filter removes any Content-Type information (including
# charset info) from an included tag, because at this
# point that information's inaccruate anyway.
package RemoveContentTypeMeta;
use HTML::Parser;
use HTTP::Proxy::BodyFilter::htmlparser;
use strict;
use warnings;
# Build a read-write htmlparser body filter that copies every token
# through verbatim EXCEPT <meta http-equiv="Content-Type"> tags, which
# are silently dropped (their charset claim is stale after recoding).
sub new {
    my $parser = HTML::Parser->new(api_version => 3);
    $parser->handler(default => sub { $_[0]->{output} .= $_[1]; }, 'self,text');
    $parser->handler(start =>
        sub {
            my ($p, $text, $tag, $attr) = @_;
            my $http_equiv = $attr->{'http-equiv'};
            # Swallow only the Content-Type meta tag; pass all else on.
            return if $tag eq 'meta' and $http_equiv
                  and lc($http_equiv) eq 'content-type';
            $p->{output} .= $text;
        }, 'self,text,tag,attr');
    return HTTP::Proxy::BodyFilter::htmlparser->new($parser, rw => 1);
}
# This makes certain everything has closing tags, but does
# it in a slightly extreme fashion by passing everything into
# an HTML::TreeBuilder instance.
#
# This means it must wait for the entire document to be absorbed
# before it does anything. However, this is a good example of
# how to use HTML::TreeBuilder in a proxy filter.
#
# A less extreme example would use a simple HTML::Parser to handle
# closing any open elements before other block-level
# elements begin, and so forth.
##
# Signature easter egg: an obfuscated "JAPH" one-liner. It splits six
# words into per-word character arrays in @/, then repeatedly pops one
# character off each array, translating 'X' -> newline and '_' -> space
# before printing. Net output: "Just another perl hacker.\n".
@/=map{[/./g]}qw/.h_nJ Xapou cets krht ele_ r_ra/;
map{y/X_/\n /;print}map{pop@$_}@/for@/