Here is the code that does this. $q is an Everything::CGI object <which is just like a CGI object except that escapeHTML() also turns ] and [ into entities>. Otherwise, there is very little Everything-specific code so it should be easy to adapt for use in your favorite chatterbox client.
######################################################################
# sub
# tagApprove
#
# purpose
# determines whether or not a tag (and it's specified attributes
+)
# are approved or not. If not, returns a false value.
# Otherwise, cleans the arguments in-place and returns a true
# value. Used by htmlScreen.
#
sub tagApprove
{
my( $close, $tag, $attr, $APPROVED )= @_;
if( exists $APPROVED->{lc($tag)} ) {
$tag = lc($tag);
} elsif( exists $APPROVED->{uc($tag)} ) {
$tag = uc($tag);
} else {
return !1;
}
if( $close ) {
$_[2]= '';
return 1;
}
my $cleanattr= "";
$attr .= " ";
foreach ( split ",", $APPROVED->{$tag} ) {
next if "1" eq $_;
if( "/" eq $_ ) {
$cleanattr .= " ".$_;
last;
} elsif( $attr =~ /\b$_\s*(=\s*('[^'<>]*'|"[^"<>]*"|([^<>'"\s\[\]
+]+)\s))?/i
) {
$cleanattr .= " ".$_;
if( $3 ) {
$cleanattr .= "='$3'";
} elsif( $1 ) {
$cleanattr .= "=".$2;
}
}
}
for( $cleanattr ) {
s/\[/[/g;
s/]/]/g;
}
$_[2]= $cleanattr;
return 1;
}
######################################################################
+#######
# sub
# htmlScreen
#
# purpose
# screen out html tags from a chunk of text
# returns the text with any unapproved tags escaped.
#
# params
# text -- the text to filter
# APPROVED -- ref to hash where approved tags are keys. Null me
+ans
# all HTML will be escaped out.
#
BEGIN
{
my %block; # Block-level tags
my %nonest; # Tags that form linear siblings rather than nest.
{
my @list= ( 'h1'..'h6',
qw[ dl ul ol pre p div blockquote form hr table ] );
@block{ @list }= (1) x @list;
@list= qw( li tr td th p );
@nonest{ @list }= (1) x @list;
}
sub htmlScreen {
my( $html, $APPROVED )= @_;
$APPROVED ||= {};
my $htmlNest= $VARS->{htmlnest} || ($q->param('htmlnest'))[-1];
my %depth;
my $block= 1;
my @nesting;
my $closeTil= sub {
my( $name, $all )= @_;
my $html= '';
my $add= '';
my $extra= !$name;
while( @nesting && $extra ne $name ) {
$extra= pop @nesting;
$add= $html;
$html .= "</$extra>";
pop @{$depth{$extra}};
$block-- if $block{$extra};
}
$add= $html if $all;
if( $add && ($q->param('htmlerror'))[-1] ) {
$html= qq(<font color="#808080" class="htmlerror">)
. $q->escapeHTML($add)
. "</font>" . $html;
}
return $html;
};
## $html =~ s#<\s*(/?)(\w+)(.*?)\># tagApprove($1,$2,$3,$APPROVED)
+ #gse;
$html =~ s{
<
( # $1: whole of "tag"
!--
(.*?-) # $2: comment body; split "--"s
- (?= > )
|
\s*
(/?) # $3: "" or "/" (for end tag)
\s*
(\w+) # $4: tag name
( # $5: rest of tag contents
(?:
[^<>'"\[\]]+
| "[^"<>]*"
| '[^'<>]*'
)*
)
(?= > )
|
)
(>?) # $6: "" or ">", closing of tag
}{
my( $tag, $cmnt, $close, $name, $attrs, $gt )=
( $1, $2, $3, lc($4), $5, $6 );
if( defined($cmnt) ) {
$cmnt =~ s/-(?=-)/- /g if $htmlNest;
"<!--$cmnt->";
} elsif( ! $gt || ! tagApprove($close,$name,$attrs,$APPROVED)
+ ) {
$q->escapeHTML( "<$1$gt" );
} elsif( ! $htmlNest || $attrs =~ m#/$# ) {
"<$close$name$attrs>";
} elsif( ! $close ) {
my $html= '';
my $clean= "<$name$attrs>";
if( $nonest{$name} && $depth{$name}
&& $block == $depth{$name}[-1] ) {
$html .= $closeTil->( $name, 1 );
}
$block++ if $block{$name};
$html .= $clean;
push @{$depth{$name}}, $block;
push @nesting, $name;
$html;
} else {
if( $block{$name} && $depth{$name} && @{$depth{$name}}
or $depth{$name} && $block == $depth{$name}[-1]
) {
$closeTil->( $name );
} else {
$q->escapeHTML( "<$tag$gt" );
}
}
}gsex;
$html .= $closeTil->('',1) if @nesting;
return $html;
}
}