in reply to Cleaning up HTML
sub DeMoronizeHTML { my $html = shift; return '' if $html eq ''; for ($html) { # try to replace dashed paragraphs by <UL><LI>... s{<p style="[^"]*"><span style="[^"]*"><span><FONT size=\d+>·< +/FONT><SPAN style="[^"]*">(?: )+\s*</SPAN></SPAN></SPAN>(<SPAN s +tyle="[^"]*"><FONT size=\d+>.*?</FONT></SPAN>)</P>}{<LÍ>$1</LÍ>}gis; s{(?<!</LÍ>)<LÍ}{<ul><LÍ}gis; s{</LÍ>(?!<LÍ)}{</LÍ></ul>}gis; s{<(/?)LÍ>}{<$1LI>}g; # add <BR> after </UL> # s{</ul><p\b}{</ul><BR><p}gis; s{style="TEXT-ALIGN: center" align="center"}{align="center"}gi +; #remove <p> from within <LI> s{<li>\s*<p>}{<li>}gi; s{</p>\s*</li>}{</li>}gi; } { my $root = HTML::TreeBuilder->new(); $root->parse_content($html); $root = $root->guts(); #$root->elementify(); #$h->delete_ignorable_whitespace(); # remove <span>s and <font>s with no attributes foreach my $tag ($root->look_down( '_tag' => qr'^(?:span|font) +$', style => undef, sub { return ($_[0]->all_attr_names() <= 3); } )) { $tag->replace_with_content->delete() }; #remove <span>s and <font>s with only whitespace content $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong| +b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite| +kbd|var)$', sub { my $tag = $_[0]; while (1) { foreach ($tag->content_list()) { return 0 if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 + is a or ref($_) and $_->tag() ne 'br'; } my $parent = $tag->parent(); $tag->replace_with_content->delete(); $tag = $parent or return 0; return 0 unless $parent->tag() =~ m'^(?:span|small +|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockq +uote|center|cite|kbd|var)$'; } return 0; } ); $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong| +b|i|font|u|sub|sup|tt|code|em|big|cite|kbd|var)$', sub { my $tag = $_[0]; #merge <span>s that are followed by a <span> with the same att +ributes RIGHT: while (my $next = $tag->right()) { last if !$next or !ref($next) or $next->tag() ne $ +tag->tag(); foreach my $attr ($tag->all_attr_names()) { next if $attr =~ /^_/; last RIGHT if $tag->attr($attr) ne $next->attr +($attr) }; $tag->push_content($next->detach_content()); $next->delete(); } #remove <span>s that contain just a <span> with the same attri +butes my $replaced = 0; while (1) { CHILD: foreach my $child ($tag->content_list()) { next unless ref($child) and $child->tag() eq $ +tag->tag(); foreach my $attr ($tag->all_attr_names()) { next if $attr =~ /^_/; next CHILD if $tag->attr($attr) ne $child- +>attr($attr); }; $child->replace_with_content->delete(); $replaced++; } last unless $replaced; $replaced = 0; } return 0; } ); # merge <font ... ><font ...>...</font></font> { my $fun; $fun = sub { my $tag = $_[0]; return 0 if $tag->content_list() != 1; my ($child) = $tag->content_list(); return 0 unless ref($child) and $child->tag() eq 'font +'; foreach my $attr ($tag->all_attr_names()) { next if $attr =~ /^_/; if (! defined($child->attr($attr))) { $child->attr($attr, $tag->attr($attr)) } }; $tag->replace_with_content->delete(); $child->look_down( '_tag' => 'font', $fun); return 0; }; $root->look_down( '_tag' => 'font', $fun); } # replace <p>s with whitespace content by <BR/> foreach my $tag ($root->look_down( '_tag' => 'p', sub { foreach ($_[0]->content_list()) { return 0 if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 + is a or ref($_); } return 1; } )) { my $style = $tag->attr("style"); if ($style ne '' and $style =~ /\bMARGIN:\s*0\w+(?:\s+\d+\ +w+\s+0\w|;|$)/i and !$tag->content_list()) { $tag->delete(); } else { $tag->replace_with(HTML::Element->new('br'))->delete() +; } } #remove <span>s and <font>s with only whitespace content (agai +n, after replacing <P>whitespace</p> by <BR>) $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong| +b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite| +kbd|var)$', sub { my $tag = $_[0]; while (1) { foreach ($tag->content_list()) { return 0 if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 + is a or ref($_) and $_->tag() ne 'br'; } my $parent = $tag->parent(); $tag->replace_with_content->delete(); $tag = $parent or return 0; return 0 unless $parent->tag() =~ m'^(?:span|small +|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockq +uote|center|cite|kbd|var)$'; } return 0; } ); $html = $root->as_HTML(undef, undef, {}); $root->delete(); } for ($html) { s{^\s*<div>\s*}{}si; s{\s*</div>\s*$}{}si; s{<br\s*/?>\s+\n?}{<br/>\n}g; #remove excess <BR>s between paragraphs s{\n{3,}}{\n\n}gs; s{(?:<br/?>\n?){3,}}{<br/>\n<br/>\n}gs; #remove excess <BR>s between paragraphs # 1 while s{<P>(?:\s+|<BR/?>)*<P\b}{<P}gi; # 1 while s{</P>(?:\s+|<BR/?>)*</P>}{</P>}gi; #remove empty paragraphs at the end of the text s{(?:\s+| |<br/?>)+(</div>)?$}{$1}si; }; return $html; } sub DeWordifyHTML { my $html = shift; return '' if $html eq ''; for ($html) { s{<\?xml:namespace [^>]+>}{}g; + # remove <?xml:namespace ...> s{<\w[\w\d\-]*:\w[\w\d\-]*(?:\s+(?:[^">]+|"[^"]*")*)?>}{}g; + # remove <o:p> s{</\w[\w\d\-]*:\w[\w\d\-]*>}{}g; + # remove </o:p> s{(<SPAN style='FONT: \d+pt)\s*'([^']+)''>}{$1 $2'>}g; + # fix <span style='FONT: xpt 'FontName''> tr/‘’“”/''""/; # use ordinary quot +es s/…/.../g; # use three dots instead + of the three-dot character s/(?:[•·]|) ?/- /g; # don't use fancy dots } my $result = ''; # just for sure $parser ||= HTML::Parser->new( api_version => 3, marked_sections => 1, boolean_attribute_value => undef, ); # remove mso- styles and classes, tabstops from styles my $sub = sub { my ($tagname, $attr, $attrseq, $text) = @_; return '' if $tagname =~ /:/; my $leave = 1; if ($attr->{class} and $attr->{class} =~ /^mso/i) { $leave = 0; delete $attr->{class}; @$attrseq = grep $_ ne 'class', @$attrseq; } if ($attr->{style}) { my @style = split /\s*;\s*/, $attr->{style}; my $count = scalar(@style); @style = grep !/^(mso-.*|tab-stops.*|MARGIN-TOP: 0in)$/i, +@style; if ($count > scalar(@style)) { # we removed some if (@style) { $attr->{style} = join '; ', @style; } else { delete $attr->{style}; @$attrseq = grep $_ ne 'style', @$attrseq; } $leave = 0; } } if ($leave) { $result .= $text } else { $result .= _html_tag($tagname, $attr, $attrseq); } }; $parser->handler(start => $sub, "tagname, attr, attrseq, text"); $parser->handler(default => sub {$result .= $_[0]}, "text"); $parser->parse($html)->eof; return $result }
I guess you can tell I have to work with a fairly (censored) "HTML" at times.
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^2: Cleaning up HTML
by Anonymous Monk on Feb 14, 2016 at 18:15 UTC | |
by Jenda (Abbot) on Mar 02, 2016 at 15:50 UTC |