HTML::StripScripts::Parser --> uses HTML::Parser to tokenise the HTML --> uses callbacks in HTML::StripScripts to filter the XSS and tidy the HTML --> uses callbacks in HTML::StripScripts::LibXML to build a DOM tree #### $hss = HTML::StripScripts->new({ Rules => { a => \&a_callback } }); sub a_callback { my ($filter,$element) = @_; # where $element = { # tag => 'a', # attr => { href => '/index.html' }, # content => 'Go to Home page', # } return 1; } #### sub a_callback { my ($filter,$element) = @_; # where $element = { # tag => 'a', # attr => { href => '/index.html' }, # children => [ # XML::LibXML::Text --> 'Go to ', # XML::LibXML::Element --> 'b' # with child Text --> 'Home', # XML::LibXML::Text --> ' page', # ], # } return 1; } #### package HTML::StripScripts::LibXML; use strict; use vars qw($VERSION); $VERSION = '0.10'; =head1 NAME HTML::StripScripts::LibXML - XSS filter - outputs a LibXML Document or DocumentFragment =head1 SYNOPSIS use HTML::StripScripts::LibXML(); my $hss = HTML::StripScripts::LibXML->new( { Context => 'Document', ## HTML::StripScripts configuration Rules => { ... }, }, strict_comment => 1, ## HTML::Parser options strict_names => 1, ); $hss->parse_file("foo.html"); $xml_doc = $hss->filtered_document; OR $xml_doc = $hss->filter_html($html); =head1 DESCRIPTION This class provides an easy interface to C, using C to parse the HTML, and returns an XML::LibXML::Document or XML::LibXML::DocumentFragment. See L for details of how to customise how the raw HTML is parsed into tags, and L for details of how to customise the way those tags are filtered. This module is a subclass of L. =cut =head1 DIFFERENCES FROM HTML::StripScripts =over =item CONTEXT HTML::StripScripts::LibXML still allows you to specify the C of the HTML (Document, Flow, Inline, NoTags). If C is C, then it returns an C object, otherwise it returns an C object. =item TAG CALLBACKS HTML::StripScripts allows you to use tag callbacks, for instance: $hss = HTML::StripScripts->new({ Rules => { a => \&a_callback } }); sub a_callback { my ($filter,$element) = @_; # where $element = { # tag => 'a', # attr => { href => '/index.html' }, # content => 'Go to Home page', # } return 1; } HTML::StripScripts::LibXML still gives you tag callbacks, but they look like this: sub a_callback { my ($filter,$element) = @_; # where $element = { # tag => 'a', # attr => { href => '/index.html' }, # children => [ # XML::LibXML::Text --> 'Go to ', # XML::LibXML::Element --> 'b' # with child Text --> 'Home', # XML::LibXML::Text --> ' page', # ], # } return 1; } =item SUBCLASSING The subs C, C and C are not called. Instead, this module uses C which handles the tag callback, (and depending on the result of the tag callback) creates an element and adds its child nodes. Then it adds the element to the list of children for the parent tag. =back =head1 CONSTRUCTORS =over =item new ( {CONFIG}, [PARSER_OPTIONS] ) Creates a new C object. See L for details. =back =cut use base 'HTML::StripScripts::Parser'; use XML::LibXML(); use HTML::Entities(); #=================================== sub output_start_document { #=================================== my ($self) = @_; $self->{_hsxXML} = XML::LibXML::Document->new(); return; } #=================================== sub output_end_document { #=================================== my ($self) = @_; my $top = $self->{_hssStack}[0]; my $document = delete $self->{_hsxXML}; if ( $top->{CTX} ne 'Document' ) { $document = $document->createDocumentFragment(); } foreach my $child ( @{ $top->{CHILDREN} } ) { $document->addChild($child); } $top->{CONTENT} = $document; return; } #=================================== sub output_start { } *output_end = \&output_start; *output_declaration = \&output_start; *output_process = \&output_start; *output = \&output_start; #=================================== my $Entities = { 'amp' => '&', 'lt' => '<', 'gt' => '>', 'quot' => '"', '#39' => "'", }; #=================================== sub output_text { #=================================== my ( $self, $text ) = @_; HTML::Entities::_decode_entities( $text, $Entities ); push @{ $self->{_hssStack}[0]{CHILDREN} }, $self->{_hsxXML}->createTextNode($text); return; } #=================================== sub output_comment { #=================================== my ( $self, $comment ) = @_; $comment =~ s/^\s*\s*$//g; push @{ $self->{_hssStack}[0]{CHILDREN} }, $self->{_hsxXML}->createComment($comment); return; } #=================================== sub output_stack_entry { #=================================== my ( $self, $tag ) = @_; my %entry; $tag->{CHILDREN} ||= []; @entry{qw(tag attr children)} = @{$tag}{qw(NAME ATTR CHILDREN)}; if ( my $tag_callback = $tag->{CALLBACK} ) { $tag_callback->( $self, \%entry ) or return; } if ( my $tagname = $entry{tag} ) { my $element = $self->{_hsxXML}->createElement($tagname); my $attrs = $entry{attr}; foreach my $name ( sort keys %$attrs ) { $element->setAttribute( $name => $attrs->{$name} ); } unless ( $tag->{CTX} eq 'EMPTY' ) { foreach my $children ( @{ $entry{children} } ) { $element->addChild($children); } } push @{ $self->{_hssStack}[0]{CHILDREN} }, $element; } else { push @{ $self->{_hssStack}[0]{CHILDREN} }, @{ $entry{children} }; } $tag->{CHILDREN} = []; } =head1 BUGS AND LIMITATIONS =over =item API - BETA This is the first draft of this module, and currently there are no configuration options for the XML. I would welcome feedback from XML users as to how I could improve the interface. For this reason, the API may change. =back =head1 SEE ALSO L, L, L =head1 AUTHOR Clinton Gormley Eclint@traveljury.comE =head1 COPYRIGHT Copyright (C) 2007 Clinton Gormley. All Rights Reserved. =head1 LICENSE This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1;