in reply to Cleaning up HTML

This probably does too much so feel free to dissect it and use what you need:
sub DeMoronizeHTML { my $html = shift; return '' if $html eq ''; for ($html) { # try to replace dashed paragraphs by <UL><LI>... s{<p style="[^"]*"><span style="[^"]*"><span><FONT size=\d+>·< +/FONT><SPAN style="[^"]*">(?:&nbsp;)+\s*</SPAN></SPAN></SPAN>(<SPAN s +tyle="[^"]*"><FONT size=\d+>.*?</FONT></SPAN>)</P>}{<LÍ>$1</LÍ>}gis; s{(?<!</LÍ>)<LÍ}{<ul><LÍ}gis; s{</LÍ>(?!<LÍ)}{</LÍ></ul>}gis; s{<(/?)LÍ>}{<$1LI>}g; # add <BR> after </UL> # s{</ul><p\b}{</ul><BR><p}gis; s{style="TEXT-ALIGN: center" align="center"}{align="center"}gi +; #remove <p> from within <LI> s{<li>\s*<p>}{<li>}gi; s{</p>\s*</li>}{</li>}gi; } { my $root = HTML::TreeBuilder->new(); $root->parse_content($html); $root = $root->guts(); #$root->elementify(); #$h->delete_ignorable_whitespace(); # remove <span>s and <font>s with no attributes foreach my $tag ($root->look_down( '_tag' => qr'^(?:span|font) +$', style => undef, sub { return ($_[0]->all_attr_names() <= 3); } )) { $tag->replace_with_content->delete() }; #remove <span>s and <font>s with only whitespace content $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong| +b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite| +kbd|var)$', sub { my $tag = $_[0]; while (1) { foreach ($tag->content_list()) { return 0 if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 + is a &nbsp; or ref($_) and $_->tag() ne 'br'; } my $parent = $tag->parent(); $tag->replace_with_content->delete(); $tag = $parent or return 0; return 0 unless $parent->tag() =~ m'^(?:span|small +|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockq +uote|center|cite|kbd|var)$'; } return 0; } ); $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong| +b|i|font|u|sub|sup|tt|code|em|big|cite|kbd|var)$', sub { my $tag = $_[0]; #merge <span>s that are followed by a <span> with the same att +ributes RIGHT: while (my $next = $tag->right()) { last if !$next or !ref($next) or $next->tag() ne $ +tag->tag(); foreach my $attr ($tag->all_attr_names()) { next if $attr =~ /^_/; last RIGHT if $tag->attr($attr) ne $next->attr +($attr) }; $tag->push_content($next->detach_content()); $next->delete(); } #remove <span>s that contain just a <span> with the same attri +butes my $replaced = 0; while (1) { CHILD: foreach my $child ($tag->content_list()) { next unless ref($child) and $child->tag() eq $ +tag->tag(); foreach my $attr ($tag->all_attr_names()) { next if $attr =~ /^_/; next CHILD if $tag->attr($attr) ne $child- +>attr($attr); }; $child->replace_with_content->delete(); $replaced++; } last unless $replaced; $replaced = 0; } return 0; } ); # merge <font ... ><font ...>...</font></font> { my $fun; $fun = sub { my $tag = $_[0]; return 0 if $tag->content_list() != 1; my ($child) = $tag->content_list(); return 0 unless ref($child) and $child->tag() eq 'font +'; foreach my $attr ($tag->all_attr_names()) { next if $attr =~ /^_/; if (! defined($child->attr($attr))) { $child->attr($attr, $tag->attr($attr)) } }; $tag->replace_with_content->delete(); $child->look_down( '_tag' => 'font', $fun); return 0; }; $root->look_down( '_tag' => 'font', $fun); } # replace <p>s with whitespace content by <BR/> foreach my $tag ($root->look_down( '_tag' => 'p', sub { foreach ($_[0]->content_list()) { return 0 if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 + is a &nbsp; or ref($_); } return 1; } )) { my $style = $tag->attr("style"); if ($style ne '' and $style =~ /\bMARGIN:\s*0\w+(?:\s+\d+\ +w+\s+0\w|;|$)/i and !$tag->content_list()) { $tag->delete(); } else { $tag->replace_with(HTML::Element->new('br'))->delete() +; } } #remove <span>s and <font>s with only whitespace content (agai +n, after replacing <P>whitespace</p> by <BR>) $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong| +b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite| +kbd|var)$', sub { my $tag = $_[0]; while (1) { foreach ($tag->content_list()) { return 0 if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0 + is a &nbsp; or ref($_) and $_->tag() ne 'br'; } my $parent = $tag->parent(); $tag->replace_with_content->delete(); $tag = $parent or return 0; return 0 unless $parent->tag() =~ m'^(?:span|small +|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockq +uote|center|cite|kbd|var)$'; } return 0; } ); $html = $root->as_HTML(undef, undef, {}); $root->delete(); } for ($html) { s{^\s*<div>\s*}{}si; s{\s*</div>\s*$}{}si; s{<br\s*/?>\s+\n?}{<br/>\n}g; #remove excess <BR>s between paragraphs s{\n{3,}}{\n\n}gs; s{(?:<br/?>\n?){3,}}{<br/>\n<br/>\n}gs; #remove excess <BR>s between paragraphs # 1 while s{<P>(?:\s+|<BR/?>)*<P\b}{<P}gi; # 1 while s{</P>(?:\s+|<BR/?>)*</P>}{</P>}gi; #remove empty paragraphs at the end of the text s{(?:\s+|&nbsp;|<br/?>)+(</div>)?$}{$1}si; }; return $html; } sub DeWordifyHTML { my $html = shift; return '' if $html eq ''; for ($html) { s{<\?xml:namespace [^>]+>}{}g; + # remove <?xml:namespace ...> s{<\w[\w\d\-]*:\w[\w\d\-]*(?:\s+(?:[^">]+|"[^"]*")*)?>}{}g; + # remove <o:p> s{</\w[\w\d\-]*:\w[\w\d\-]*>}{}g; + # remove </o:p> s{(<SPAN style='FONT: \d+pt)\s*'([^']+)''>}{$1 $2'>}g; + # fix <span style='FONT: xpt 'FontName''> tr/‘’“”/''""/; # use ordinary quot +es s/…/.../g; # use three dots instead + of the three-dot character s/(?:[•·]|&#61623;) ?/- /g; # don't use fancy dots } my $result = ''; # just for sure $parser ||= HTML::Parser->new( api_version => 3, marked_sections => 1, boolean_attribute_value => undef, ); # remove mso- styles and classes, tabstops from styles my $sub = sub { my ($tagname, $attr, $attrseq, $text) = @_; return '' if $tagname =~ /:/; my $leave = 1; if ($attr->{class} and $attr->{class} =~ /^mso/i) { $leave = 0; delete $attr->{class}; @$attrseq = grep $_ ne 'class', @$attrseq; } if ($attr->{style}) { my @style = split /\s*;\s*/, $attr->{style}; my $count = scalar(@style); @style = grep !/^(mso-.*|tab-stops.*|MARGIN-TOP: 0in)$/i, +@style; if ($count > scalar(@style)) { # we removed some if (@style) { $attr->{style} = join '; ', @style; } else { delete $attr->{style}; @$attrseq = grep $_ ne 'style', @$attrseq; } $leave = 0; } } if ($leave) { $result .= $text } else { $result .= _html_tag($tagname, $attr, $attrseq); } }; $parser->handler(start => $sub, "tagname, attr, attrseq, text"); $parser->handler(default => sub {$result .= $_[0]}, "text"); $parser->parse($html)->eof; return $result }

I guess you can tell I have to work with a fairly (censored) "HTML" at times.

Replies are listed 'Best First'.
Re^2: Cleaning up HTML
by Anonymous Monk on Feb 14, 2016 at 18:15 UTC
    I get an error on the conditional,
     $result .= _html_tag($tagname, $attr, $attrseq);
    sub DeWordifyHTML() says,
     Undefined subroutine &main::_html_tag called at ...
    I resplaced the "sub" at the top and the curly at the bottom to run, starting with,
     #!/usr/bin/perl
     use HTML::Parser ();
            open(my $html, "<", $ARGV[0])...
    Google and inspection of perl5/5.20/HTML/Parser.pm didn't help. I'm assuming the _html_tag() sub has been replaced by something else.

    Do you have an update for this? I realized it's 9 year old code, but I thought I'd ask.

    Thanks, Art

      It's incomplete. Sorry.

      sub _html_tag { my ( $tag, $attr, $attrseq) = @_; my $html; $html = "<$tag"; if ($attrseq and ref($attrseq) eq 'ARRAY') { foreach my $key (@$attrseq) { if (defined $attr->{$key}) { $html .= " $key="._arg_escape($attr->{$key}); } else { $html .= ' '.$key; } } } elsif ($attr and ref($attr)) { foreach my $key (keys %$attr) { if (defined $attr->{$key}) { $html .= " $key="._arg_escape($attr->{$key}); } else { $html .= ' '.$key; } } } $html .= ">"; return $html; } sub _arg_escape { my $arg = shift; return qq{"$arg"} if ($arg !~ /"/); return qq{'$arg'} if ($arg !~ /'/); $arg =~ s/"/&dblquote;/g; return qq{"$arg"}; }

      Not sure it's complete like this. I never got around to releasing this as a module. If it's not complete, either download the Jenda.Rex zip from http://jenda.krynicky.cz/#Jenda.Rex, extract the .pm and dissect that (remove all references to Win32::OLE (needed only when the module is wrapped as a COM DLL for use in VB(script)) and Win32::Registry (only used to find out the code page used by the system), remove the whole package JendaRex::CSVParser, ...) or send me a message with your email and I'll send you the module.

      Jenda
      Enoch was right!
      Enjoy the last years of Rome.