HTML::StripScripts::Parser
--> uses HTML::Parser to tokenise the HTML
--> uses callbacks in HTML::StripScripts to filter the XSS and tidy the HTML
--> uses callbacks in HTML::StripScripts::LibXML to build a DOM tree
####
# Build a plain HTML::StripScripts filter whose rule for <a> tags is
# handled by the a_callback() sub.
$hss = HTML::StripScripts->new(
    {
        Rules => {
            a => \&a_callback,
        },
    }
);
# Example tag callback for <a> elements, as used with plain
# HTML::StripScripts. It is handed the filter object and a hashref
# describing the element; this example inspects neither and always
# returns 1.
sub a_callback {
    my $filter  = shift;
    my $element = shift;

    # $element has the form:
    #   {
    #       tag     => 'a',
    #       attr    => { href => '/index.html' },
    #       content => 'Go to Home page',
    #   }

    return 1;
}
####
# Example tag callback for <a> elements, in the form seen when using
# HTML::StripScripts::LibXML: the element hashref carries a list of
# child nodes under 'children'. This example inspects nothing and
# always returns 1.
sub a_callback {
    my $filter  = shift;
    my $element = shift;

    # $element has the form:
    #   {
    #       tag      => 'a',
    #       attr     => { href => '/index.html' },
    #       children => [
    #           XML::LibXML::Text     --> 'Go to ',
    #           XML::LibXML::Element  --> 'b'
    #               with child Text   --> 'Home',
    #           XML::LibXML::Text     --> ' page',
    #       ],
    #   }

    return 1;
}
####
package HTML::StripScripts::LibXML;

use strict;

# Module version. Declared with "our" rather than the obsolete "vars"
# pragma; the package variable and its value are unchanged.
our $VERSION = '0.10';
=head1 NAME
HTML::StripScripts::LibXML - XSS filter - outputs a LibXML Document or DocumentFragment
=head1 SYNOPSIS
use HTML::StripScripts::LibXML();
my $hss = HTML::StripScripts::LibXML->new(
{
Context => 'Document', ## HTML::StripScripts configuration
Rules => { ... },
},
strict_comment => 1, ## HTML::Parser options
strict_names => 1,
);
$hss->parse_file("foo.html");
$xml_doc = $hss->filtered_document;
OR
$xml_doc = $hss->filter_html($html);
=head1 DESCRIPTION
This class provides an easy interface to C<HTML::StripScripts>, using
C<HTML::Parser> to parse the HTML, and returns an XML::LibXML::Document
or XML::LibXML::DocumentFragment.
See L<HTML::Parser> for details of how to customise how the raw HTML is parsed
into tags, and L<HTML::StripScripts> for details of how to customise the way
those tags are filtered. This module is a subclass of
L<HTML::StripScripts::Parser>.
=cut
=head1 DIFFERENCES FROM HTML::StripScripts
=over
=item CONTEXT
HTML::StripScripts::LibXML still allows you to specify the C<Context> of the
HTML (Document, Flow, Inline, NoTags). If C<Context> is C<Document>, then it
returns an C<XML::LibXML::Document> object, otherwise it returns an
C<XML::LibXML::DocumentFragment> object.
=item TAG CALLBACKS
HTML::StripScripts allows you to use tag callbacks, for instance:
$hss = HTML::StripScripts->new({
Rules => { a => \&a_callback }
});
sub a_callback {
my ($filter,$element) = @_;
# where $element = {
# tag => 'a',
# attr => { href => '/index.html' },
# content => 'Go to Home page',
# }
return 1;
}
HTML::StripScripts::LibXML still gives you tag callbacks, but they look like
this:
sub a_callback {
my ($filter,$element) = @_;
# where $element = {
# tag => 'a',
# attr => { href => '/index.html' },
# children => [
# XML::LibXML::Text --> 'Go to ',
# XML::LibXML::Element --> 'b'
# with child Text --> 'Home',
# XML::LibXML::Text --> ' page',
# ],
# }
return 1;
}
=item SUBCLASSING
The subs C