use Benchmark 'cmpthese';
my $data = join'',<DATA>;
print untag($data),
"\n\n\n",
'X' x 79,
"\n\n\n",
untagg($data),
"\n\n\n",
'X' x 79,
"\n\n\n",;
warn "benchmarking the dumb way";
cmpthese(-3,
{
regex => sub { untag($data);},
parse => sub { untagg($data); },
});
warn "benchmarking the smart way";
warn "benchmarking the smart way";
use HTML::Parser;
my $p = HTML::Parser->new( api_version => 3);
my $ret ="";
$p->handler(default =>
sub { $ret .= $_[0] if $_[1] eq 'text'},'text,event');
cmpthese(-3,
{
regex => sub { untag($data);},
parse => sub { $p->parse($data); },
});
sub untagg {
local $_ = $_[0] || $_;
require HTML::Parser;
my $p = HTML::Parser->new( api_version => 3);
my $ret ="";
$p->handler(default => sub { $ret .= $_[0] if $_[1] eq 'text'}
,'text,event');
$p->parse($_);
return($ret);
}
sub untag {
local $_ = $_[0] || $_;
# ALGORITHM:
# find < ,
# comment <!-- ... -->,
# or comment <? ... ?> ,
# or one of the start tags which require correspond
# end tag plus all to end tag
# or if \s or ="
# then skip to next "
# else [^>]
# >
s{
< # open tag
(?: # open group (A)
(!--) | # comment (1) or
(\?) | # another comment (2) or
(?i: # open group (B) for /i
( TITLE | # one of start tags
SCRIPT | # for which
APPLET | # must be skipped
OBJECT | # all content
STYLE # to correspond
) # end tag (3)
) | # close group (B), or
([!/A-Za-z]) # one of these chars, remember in (4)
) # close group (A)
(?(4) # if previous case is (4)
(?: # open group (C)
(?! # and next is not : (D)
[\s=] # \s or "="
["`'] # with open quotes
) # close (D)
[^>] | # and not close tag or
[\s=] # \s or "=" with
`[^`]*` | # something in quotes ` or
[\s=] # \s or "=" with
'[^']*' | # something in quotes ' or
[\s=] # \s or "=" with
"[^"]*" # something in quotes "
)* # repeat (C) 0 or more times
| # else (if previous case is not (4))
.*? # minimum of any chars
) # end if previous char is (4)
(?(1) # if comment (1)
(?<=--) # wait for "--"
) # end if comment (1)
(?(2) # if another comment (2)
(?<=\?) # wait for "?"
) # end if another comment (2)
(?(3) # if one of tags-containers (3)
</ # wait for end
(?i:\3) # of this tag
(?:\s[^>]*)? # skip junk to ">"
) # end if (3)
> # tag closed
}{}gsx; # STRIP THIS TAG
return $_ ? $_ : "";
}
__DATA__
u h a h
<html>
<head>
<title>This title contains Perl but does not get changed.</title>
</head>
<body>
<p>This is some text containing the term 'perl'.</p>
<ol>
<li>Unix</li>
<li>Perl</li>
<li>Linux</li>
</ol>
<p>Notice how the term perl in the following link doesn't change, but
+the text does.
<a href="http://www.perlmonks.org">Perlmonks.org</a></p>
</body>
</html>
> < >
< > <
! ] [ ] [ ] [ ] [ - <!-- --> 2 3 4 5 5
<<a href<<a>>
<!-- foo bar -->
<SCRIPT language="javascript">
// this is valid html
// whether you like it or not
// same goes for older browsers
</SCRIPT>
And the results are ;)
|