package MarkupHandler;
use Carp;
use HTML::Entities qw(encode_entities);
use Exporter;
@ISA = 'Exporter';
@EXPORT_OK = qw(
ret_bal_tag_handlers ret_escape_code
ret_ok_attr_val ret_scrubber
);
use strict;
use vars qw(@open_tags);
# Gets the value of a tag attribute
sub get_attr_val {
my $t_ref = shift;
if (
$$t_ref =~ /
\G\s*=\s*(
[^\s>"'][^\s>]* | # Unquoted
"[^"]*" | # Double-quoted
'[^']*' # Single-quoted
)
/gx
) {
return $1;
}
else {
return ('', "no value found");
}
}
sub ret_bal_tag_handlers {
my @out = ();
while (@_) {
my $tag = shift;
my $name = $tag;
if (ref($tag)) {
if (exists $tag->{name}) {
$name = $tag->{name};
}
else {
confess("Tags must have name attributes");
}
}
else {
$tag = {name => $name};
}
my $attribs = shift;
if (@$attribs) {
push @out, "<$name", ret_tag_open($name, @$attribs);
}
else {
push @out, "<$name>", ret_const_tag_open($name);
}
# Rewrite handler?
if (exists $tag->{pre}) {
push @out, wrap_handler($tag->{pre}, pop(@out), '');
}
push @out, "$name>", ret_tag_close($name);
if (exists $tag->{post}) {
push @out, wrap_handler('', pop(@out), $tag->{post});
}
}
return @out;
}
# Many tags have no attributes allowed. Handle
# them efficiently.
sub ret_const_tag_open {
my $tag = shift;
return sub {
push @open_tags, $tag;
return "<$tag>";
};
}
# Returns the basic "literal escape" that you see for
# code.
sub ret_escape_code {
my $end_pat = shift;
my $name = shift;
return (
$name,
sub {
my $t_ref = shift;
if ($$t_ref =~ m=\G(.*?)$end_pat=gs) {
return "
" . encode_entities($1) . "
";
}
else {
return show_err("Unmatched $name tag found");
}
}
);
}
# Generate an attribute handler based on an "ok value" test.
# Note that quotes on the attribute will exist in the
# value passed to the ok test.
sub ret_ok_attr_val {
my $ok_test = shift;
wrap_handler(
'', \&get_attr_val,
sub {
my $text = shift;
if ($ok_test->($text)) {
$text;
}
else {
return ('', "Illegal val '$text'");
}
}
);
}
# Pass a list of case/handler pairs, returns an anonymous
# sub that processes those pairs.
sub ret_scrubber {
my %handler = @_;
# Sanity check
foreach my $case (keys %handler) {
unless (UNIVERSAL::isa($handler{$case}, 'CODE')) {
carp("Case '$case' dropped - handlers must be functions");
delete $handler{$case};
}
}
$handler{pre} ||= sub {return '';};
$handler{post} ||= sub {
return join '', map "$_>", reverse @open_tags;
};
# Sorted in reverse so that '
($t_ref);
if (@err) {
pos($$t_ref) = 0;
$scrubbed .= show_err(@err);
}
else {
$scrubbed .= $chunk;
# Obscure bug fix. You cannot match 2 zero
# length patterns in a row, this resets the
# flag so you can.
pos($$t_ref) = pos($$t_ref);
}
}
unless (pos($$t_ref)) {
if (length($$t_ref) == $pos) {
# EXIT HERE #
return $scrubbed . $handler{post}->($t_ref);
}
else {
my $char = substr($$t_ref, $pos, 1);
pos($$t_ref) = $pos + 1;
$scrubbed .= encode_entities($char);
}
}
}
confess("I have no idea how I got here!");
}
}
# Returns a sub that closes a tag
sub ret_tag_close {
my $tag = shift;
return sub {
my @searched;
while (@open_tags) {
my $open = pop(@open_tags);
push @searched, $open;
if ($open eq $tag) {
# Close em!
return join '', map "$_>", @searched;
}
}
# No you cannot close a tag you didn't open!
@open_tags = reverse @searched;
pos(${$_[0]}) = 0;
return show_err("Unmatched close tag $tag>");
};
}
# The general open tag
sub ret_tag_open {
my $tag = shift;
my %attr_test;
foreach (@_) {
if (ref($_)) {
foreach my $attrib (keys %$_) {
$attr_test{lc($attrib)} = $_->{$attrib};
}
}
else {
$attr_test{lc($_)} = \&get_attr_val;
}
}
return sub {
my $t_ref = shift;
my $text = "<$tag";
while ($$t_ref =~ /\G(?:\s+(\w+)|\s*>)/g) {
if (defined($1)) {
my $attrib = lc($1);
if (exists $attr_test{$attrib}) {
my ($chunk, @err) = $attr_test{$attrib}->($t_ref);
if (@err) {
return show_err(
"While processing '$attrib' in <$tag>:", @err
);
}
else {
$text .= " $attrib=$chunk";
}
}
else {
pos($$t_ref) = 0;
return show_err(
"Tag '$tag' cannot accept attribute '$attrib'"
);
}
}
else {
$text .= ">";
push @open_tags, $tag;
return $text;
}
}
return show_err("Unended <$tag> detected");
};
}
sub show_err {
if (wantarray()) {
return ('', @_);
}
else {
my $err = encode_entities(join ' ', grep length($_), @_);
return "$err
";
}
}
sub wrap_handler {
my $pre = shift() || sub {''};
my $fn = shift();
my $post = shift() || sub {@_};
return sub {
my $t_ref = shift;
my ($text, @err) = $pre->($t_ref);
if (@err) {
return show_err($text, @err);
}
(my $chunk, @err) = $fn->($t_ref);
@err ?
show_err("$text$chunk", @err) :
$post->("$text$chunk");
};
}
1;
####
use strict;
use vars qw($table_depth);
$table_depth = 0;
use MarkupHandler qw(
ret_bal_tag_handlers ret_escape_code ret_ok_attr_val ret_scrubber
);
my $href_test = ret_ok_attr_val(
sub {
shift() =~ m-^'?"?(http://|ftp://|mailto:|#)-i;
}
);
my @bal_tags = (
a => [
qw(name target),
{'href' => $href_test}
],
font => [qw(color size)],
ol => ['type'],
p => ['align'],
{
name => 'table',
pre => sub {
if (5 > $table_depth++) {
return '';
}
else {
$table_depth--;
return ('', "Exceeded maximum table depth\n");
}
},
post => sub {
if (0 > --$table_depth) {
$table_depth = 0;
return ("", "Table depth should not be below 0");
}
@_;
},
}, [qw(bgcolor border cellpadding cellspacing width)],
in_table_tags(
colgroup => [ qw(align span valign width) ],
td => [qw(align bgcolor colspan height rowspan valign width)],
tr => [qw(align valign width)],
),
map {$_, []} qw(
b big blockquote br dd dl dt em h1 h2 h3 h4 h5 h6 hr
i li pre small strike strong sub sup tt u ul
),
);
my @handlers = (
ret_bal_tag_handlers(@bal_tags),
ret_escape_code(qr(\[/code\]), "[code]"),
);
my $scrubber = ret_scrubber(@handlers);
my $text = <<'EOT';
Hello world. <<;
[code] This is ...see the being escaped? [/code]
Hello world br>ub>
hello world
hello
EOT
print $scrubber->(\$text);
sub in_table_tags {
my @out;
while (@_) {
my $name = shift;
push @out, {
name => $name,
pre => sub {
$table_depth ? '' : ('', "Cannot open <$name>, not in table");
},
}, shift();
}
return @out;
}
|