sub DeMoronizeHTML { my $html = shift; return '' if $html eq ''; for ($html) { # try to replace dashed paragraphs by

... s{
·(?: )+\s*(.*?)
}{$1}gis; s{(?)(?!

}gis; s{<(/?)LÍ>}{<$1LI>}g; # add
after # s{
from within

\s*

}{

}gi; s{

\s*

}{}gi; } { my $root = HTML::TreeBuilder->new(); $root->parse_content($html); $root = $root->guts(); #$root->elementify(); #$h->delete_ignorable_whitespace(); # remove s and s with no attributes foreach my $tag ($root->look_down( '_tag' => qr'^(?:span|font)$', style => undef, sub { return ($_[0]->all_attr_names() <= 3); } )) { $tag->replace_with_content->delete() }; #remove s and s with only whitespace content $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|kbd|var)$', sub { my $tag = $_[0]; while (1) { foreach ($tag->content_list()) { return 0 if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 is a or ref($_) and $_->tag() ne 'br'; } my $parent = $tag->parent(); $tag->replace_with_content->delete(); $tag = $parent or return 0; return 0 unless $parent->tag() =~ m'^(?:span|small|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|kbd|var)$'; } return 0; } ); $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|big|cite|kbd|var)$', sub { my $tag = $_[0]; #merge s that are followed by a with the same attributes RIGHT: while (my $next = $tag->right()) { last if !$next or !ref($next) or $next->tag() ne $tag->tag(); foreach my $attr ($tag->all_attr_names()) { next if $attr =~ /^_/; last RIGHT if $tag->attr($attr) ne $next->attr($attr) }; $tag->push_content($next->detach_content()); $next->delete(); } #remove s that contain just a with the same attributes my $replaced = 0; while (1) { CHILD: foreach my $child ($tag->content_list()) { next unless ref($child) and $child->tag() eq $tag->tag(); foreach my $attr ($tag->all_attr_names()) { next if $attr =~ /^_/; next CHILD if $tag->attr($attr) ne $child->attr($attr); }; $child->replace_with_content->delete(); $replaced++; } last unless $replaced; $replaced = 0; } return 0; } ); # merge ... { my $fun; $fun = sub { my $tag = $_[0]; return 0 if $tag->content_list() != 1; my ($child) = $tag->content_list(); return 0 unless ref($child) and $child->tag() eq 'font'; foreach my $attr ($tag->all_attr_names()) { next if $attr =~ /^_/; if (! defined($child->attr($attr))) { $child->attr($attr, $tag->attr($attr)) } }; $tag->replace_with_content->delete(); $child->look_down( '_tag' => 'font', $fun); return 0; }; $root->look_down( '_tag' => 'font', $fun); } # replace

s with whitespace content by
foreach my $tag ($root->look_down( '_tag' => 'p', sub { foreach ($_[0]->content_list()) { return 0 if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 is a or ref($_); } return 1; } )) { my $style = $tag->attr("style"); if ($style ne '' and $style =~ /\bMARGIN:\s*0\w+(?:\s+\d+\w+\s+0\w|;|$)/i and !$tag->content_list()) { $tag->delete(); } else { $tag->replace_with(HTML::Element->new('br'))->delete(); } } #remove s and s with only whitespace content (again, after replacing

whitespace

by
) $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|kbd|var)$', sub { my $tag = $_[0]; while (1) { foreach ($tag->content_list()) { return 0 if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 is a or ref($_) and $_->tag() ne 'br'; } my $parent = $tag->parent(); $tag->replace_with_content->delete(); $tag = $parent or return 0; return 0 unless $parent->tag() =~ m'^(?:span|small|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|kbd|var)$'; } return 0; } ); $html = $root->as_HTML(undef, undef, {}); $root->delete(); } for ($html) { s{^\s*

\s*}{}si; s{\s*

\s*$}{}si; s{\s+\n?}{
\n}g; #remove excess
s between paragraphs s{\n{3,}}{\n\n}gs; s{(?:
\n?){3,}}{
\n
\n}gs; #remove excess
s between paragraphs # 1 while s{

(?:\s+|
)*(?:\s+|
)*

}{

}gi; #remove empty paragraphs at the end of the text s{(?:\s+| |
)+()?$}{$1}si; }; return $html; } sub DeWordifyHTML { my $html = shift; return '' if $html eq ''; for ($html) { s{<\?xml:namespace [^>]+>}{}g; # remove s{<\w[\w\d\-]*:\w[\w\d\-]*(?:\s+(?:[^">]+|"[^"]*")*)?>}{}g; # remove s{}{}g; # remove s{(}{$1 $2'>}g; # fix tr/‘’“”/''""/; # use ordinary quotes s/…/.../g; # use three dots instead of the three-dot character s/(?:[•·]|) ?/- /g; # don't use fancy dots } my $result = ''; # just for sure $parser ||= HTML::Parser->new( api_version => 3, marked_sections => 1, boolean_attribute_value => undef, ); # remove mso- styles and classes, tabstops from styles my $sub = sub { my ($tagname, $attr, $attrseq, $text) = @_; return '' if $tagname =~ /:/; my $leave = 1; if ($attr->{class} and $attr->{class} =~ /^mso/i) { $leave = 0; delete $attr->{class}; @$attrseq = grep $_ ne 'class', @$attrseq; } if ($attr->{style}) { my @style = split /\s*;\s*/, $attr->{style}; my $count = scalar(@style); @style = grep !/^(mso-.*|tab-stops.*|MARGIN-TOP: 0in)$/i, @style; if ($count > scalar(@style)) { # we removed some if (@style) { $attr->{style} = join '; ', @style; } else { delete $attr->{style}; @$attrseq = grep $_ ne 'style', @$attrseq; } $leave = 0; } } if ($leave) { $result .= $text } else { $result .= _html_tag($tagname, $attr, $attrseq); } }; $parser->handler(start => $sub, "tagname, attr, attrseq, text"); $parser->handler(default => sub {$result .= $_[0]}, "text"); $parser->parse($html)->eof; return $result }