with the same attributes
my $replaced = 0;
while (1) {
CHILD: foreach my $child ($tag->content_list()) {
next unless ref($child) and $child->tag() eq $tag->tag();
foreach my $attr ($tag->all_attr_names()) {
next if $attr =~ /^_/;
next CHILD if $tag->attr($attr) ne $child->attr($attr);
};
$child->replace_with_content->delete();
$replaced++;
}
last unless $replaced;
$replaced = 0;
}
return 0;
}
);
# merge ...
{
my $fun;
$fun = sub {
my $tag = $_[0];
return 0 if $tag->content_list() != 1;
my ($child) = $tag->content_list();
return 0 unless ref($child) and $child->tag() eq 'font';
foreach my $attr ($tag->all_attr_names()) {
next if $attr =~ /^_/;
if (! defined($child->attr($attr))) {
$child->attr($attr, $tag->attr($attr))
}
};
$tag->replace_with_content->delete();
$child->look_down( '_tag' => 'font', $fun);
return 0;
};
$root->look_down( '_tag' => 'font', $fun);
}
# replace s with whitespace content by
foreach my $tag ($root->look_down( '_tag' => 'p',
sub {
foreach ($_[0]->content_list()) {
return 0
if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 is a
or ref($_);
}
return 1;
}
)) {
my $style = $tag->attr("style");
if ($style ne '' and $style =~ /\bMARGIN:\s*0\w+(?:\s+\d+\w+\s+0\w|;|$)/i and !$tag->content_list()) {
$tag->delete();
} else {
$tag->replace_with(HTML::Element->new('br'))->delete();
}
}
#remove s and s with only whitespace content (again, after replacing whitespace
by
)
$root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|kbd|var)$',
sub {
my $tag = $_[0];
while (1) {
foreach ($tag->content_list()) {
return 0
if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 is a
or ref($_) and $_->tag() ne 'br';
}
my $parent = $tag->parent();
$tag->replace_with_content->delete();
$tag = $parent or return 0;
return 0 unless $parent->tag() =~ m'^(?:span|small|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|kbd|var)$';
}
return 0;
}
);
$html = $root->as_HTML(undef, undef, {});
$root->delete();
}
for ($html) {
s{^\s*\s*}{}si;
s{\s*
\s*$}{}si;
s{
\s+\n?}{
\n}g;
#remove excess
s between paragraphs
s{\n{3,}}{\n\n}gs;
s{(?:
\n?){3,}}{
\n
\n}gs;
#remove excess
s between paragraphs
# 1 while s{(?:\s+|
)*
(?:\s+|
)*
}{
}gi;
#remove empty paragraphs at the end of the text
s{(?:\s+| |
)+()?$}{$1}si;
};
return $html;
}
sub DeWordifyHTML {
my $html = shift;
return '' if $html eq '';
for ($html) {
s{<\?xml:namespace [^>]+>}{}g; # remove
s{<\w[\w\d\-]*:\w[\w\d\-]*(?:\s+(?:[^">]+|"[^"]*")*)?>}{}g; # remove
s{\w[\w\d\-]*:\w[\w\d\-]*>}{}g; # remove
s{(}{$1 $2'>}g; # fix
tr/‘’“”/''""/; # use ordinary quotes
s/…/.../g; # use three dots instead of the three-dot character
s/(?:[•·]|) ?/- /g; # don't use fancy dots
}
my $result = ''; # just for sure
$parser ||= HTML::Parser->new( api_version => 3,
marked_sections => 1,
boolean_attribute_value => undef,
);
# remove mso- styles and classes, tabstops from styles
my $sub = sub {
my ($tagname, $attr, $attrseq, $text) = @_;
return '' if $tagname =~ /:/;
my $leave = 1;
if ($attr->{class} and $attr->{class} =~ /^mso/i) {
$leave = 0;
delete $attr->{class};
@$attrseq = grep $_ ne 'class', @$attrseq;
}
if ($attr->{style}) {
my @style = split /\s*;\s*/, $attr->{style};
my $count = scalar(@style);
@style = grep !/^(mso-.*|tab-stops.*|MARGIN-TOP: 0in)$/i, @style;
if ($count > scalar(@style)) { # we removed some
if (@style) {
$attr->{style} = join '; ', @style;
} else {
delete $attr->{style};
@$attrseq = grep $_ ne 'style', @$attrseq;
}
$leave = 0;
}
}
if ($leave) {
$result .= $text
} else {
$result .= _html_tag($tagname, $attr, $attrseq);
}
};
$parser->handler(start => $sub, "tagname, attr, attrseq, text");
$parser->handler(default => sub {$result .= $_[0]}, "text");
$parser->parse($html)->eof;
return $result
}