Re: Cleaning up HTML

This probably does too much so feel free to dissect it and use what you need:

sub DeMoronizeHTML {
    my $html = shift;
    return '' if $html eq '';

    for ($html) {
        # try to replace dashed paragraphs by <UL><LI>...
        s{<p style="[^"]*"><span style="[^"]*"><span><FONT size=\d+>·<
+/FONT><SPAN style="[^"]*">(?:&nbsp;)+\s*</SPAN></SPAN></SPAN>(<SPAN s
+tyle="[^"]*"><FONT size=\d+>.*?</FONT></SPAN>)</P>}{<LÍ>$1</LÍ>}gis;
        s{(?<!</LÍ>)<LÍ}{<ul><LÍ}gis;
        s{</LÍ>(?!<LÍ)}{</LÍ></ul>}gis;
        s{<(/?)LÍ>}{<$1LI>}g;

        # add <BR> after </UL>
        # s{</ul><p\b}{</ul><BR><p}gis;

        s{style="TEXT-ALIGN: center" align="center"}{align="center"}gi
+;

        #remove <p> from within <LI>
        s{<li>\s*<p>}{<li>}gi;
        s{</p>\s*</li>}{</li>}gi;
    }

    {
        my $root = HTML::TreeBuilder->new();
        $root->parse_content($html);
        $root = $root->guts();
        #$root->elementify();

        #$h->delete_ignorable_whitespace();

        # remove <span>s and <font>s with no attributes
        foreach my $tag ($root->look_down( '_tag' => qr'^(?:span|font)
+$', style => undef,
            sub {
                return ($_[0]->all_attr_names() <= 3);
            }
        )) {
            $tag->replace_with_content->delete()
        };

        #remove <span>s and <font>s with only whitespace content
        $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|
+b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|
+kbd|var)$',
            sub {
                my $tag = $_[0];
                while (1) {
                    foreach ($tag->content_list()) {
                        return 0
                            if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0
+ is a &nbsp;
                            or ref($_) and $_->tag() ne 'br';
                    }
                    my $parent = $tag->parent();
                    $tag->replace_with_content->delete();
                    $tag = $parent or return 0;
                    return 0 unless $parent->tag() =~ m'^(?:span|small
+|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockq
+uote|center|cite|kbd|var)$';
                }
                return 0;
            }
        );

        $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|
+b|i|font|u|sub|sup|tt|code|em|big|cite|kbd|var)$',
            sub {
                my $tag = $_[0];
        #merge <span>s that are followed by a <span> with the same att
+ributes
RIGHT:            while (my $next = $tag->right()) {
                    last if !$next or !ref($next) or $next->tag() ne $
+tag->tag();
                    foreach my $attr ($tag->all_attr_names()) {
                        next if $attr =~ /^_/;
                        last RIGHT if $tag->attr($attr) ne $next->attr
+($attr)
                    };
                    $tag->push_content($next->detach_content());
                    $next->delete();
                }
        #remove <span>s that contain just a <span> with the same attri
+butes
                my $replaced = 0;
                while (1) {
CHILD:                foreach my $child ($tag->content_list()) {
                        next unless ref($child) and $child->tag() eq $
+tag->tag();
                        foreach my $attr ($tag->all_attr_names()) {
                            next if $attr =~ /^_/;
                            next CHILD if $tag->attr($attr) ne $child-
+>attr($attr);
                        };
                        $child->replace_with_content->delete();
                        $replaced++;
                    }
                    last unless $replaced;
                    $replaced = 0;
                }
                return 0;
            }
        );

        # merge <font ... ><font ...>...</font></font>
        {
        my $fun;
        $fun = sub {
                my $tag = $_[0];
                return 0 if $tag->content_list() != 1;
                my ($child) = $tag->content_list();
                return 0 unless ref($child) and $child->tag() eq 'font
+';

                foreach my $attr ($tag->all_attr_names()) {
                    next if $attr =~ /^_/;
                    if (! defined($child->attr($attr))) {
                        $child->attr($attr, $tag->attr($attr))
                    }
                };

                $tag->replace_with_content->delete();
                $child->look_down( '_tag' => 'font', $fun);
                return 0;
            };

        $root->look_down( '_tag' => 'font', $fun);
        }

        # replace <p>s with whitespace content by <BR/>
        foreach my $tag ($root->look_down( '_tag' => 'p',
                sub {
                    foreach ($_[0]->content_list()) {
                        return 0
                            if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0
+ is a &nbsp;
                            or ref($_);
                    }
                    return 1;
                }
            )) {
            my $style = $tag->attr("style");
            if ($style ne '' and $style =~ /\bMARGIN:\s*0\w+(?:\s+\d+\
+w+\s+0\w|;|$)/i and !$tag->content_list()) {
                $tag->delete();
            } else {
                $tag->replace_with(HTML::Element->new('br'))->delete()
+;
            }
        }

        #remove <span>s and <font>s with only whitespace content (agai
+n, after replacing <P>whitespace</p> by <BR>)
        $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|
+b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|
+kbd|var)$',
            sub {
                my $tag = $_[0];
                while (1) {
                    foreach ($tag->content_list()) {
                        return 0
                            if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0
+ is a &nbsp;
                            or ref($_) and $_->tag() ne 'br';
                    }
                    my $parent = $tag->parent();
                    $tag->replace_with_content->delete();
                    $tag = $parent or return 0;
                    return 0 unless $parent->tag() =~ m'^(?:span|small
+|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockq
+uote|center|cite|kbd|var)$';
                }
                return 0;
            }
        );

        $html = $root->as_HTML(undef, undef, {});
        $root->delete();
    }

    for ($html) {
        s{^\s*<div>\s*}{}si;
        s{\s*</div>\s*$}{}si;
        s{<br\s*/?>\s+\n?}{<br/>\n}g;

        #remove excess <BR>s between paragraphs
        s{\n{3,}}{\n\n}gs;
        s{(?:<br/?>\n?){3,}}{<br/>\n<br/>\n}gs;

        #remove excess <BR>s between paragraphs
#            1 while s{<P>(?:\s+|<BR/?>)*<P\b}{<P}gi;
#            1 while s{</P>(?:\s+|<BR/?>)*</P>}{</P>}gi;

        #remove empty paragraphs at the end of the text
        s{(?:\s+|&nbsp;|<br/?>)+(</div>)?$}{$1}si;
    };

    return $html;
}

sub DeWordifyHTML {
    my $html = shift;
    return '' if $html eq '';

    for ($html) {
        s{<\?xml:namespace [^>]+>}{}g;                                
+            # remove <?xml:namespace ...>
        s{<\w[\w\d\-]*:\w[\w\d\-]*(?:\s+(?:[^">]+|"[^"]*")*)?>}{}g;   
+ # remove <o:p>
        s{</\w[\w\d\-]*:\w[\w\d\-]*>}{}g;                             
+           # remove </o:p>
        s{(<SPAN style='FONT: \d+pt)\s*'([^']+)''>}{$1 $2'>}g;        
+        # fix <span style='FONT: xpt 'FontName''>

        tr/‘’“”/''""/;                             # use ordinary quot
+es
        s/…/.../g;                            # use three dots instead
+ of the three-dot character
        s/(?:[•·]|&#61623;) ?/- /g;    # don't use fancy dots
    }

    my $result = ''; # just for sure
    $parser ||= HTML::Parser->new( api_version => 3,
        marked_sections => 1,
        boolean_attribute_value => undef,
    );
    # remove mso- styles and classes, tabstops from styles
    my $sub = sub {
        my ($tagname, $attr, $attrseq, $text) = @_;
        return '' if $tagname =~ /:/;
        my $leave = 1;
        if ($attr->{class} and $attr->{class} =~ /^mso/i) {
            $leave = 0;
            delete $attr->{class};
            @$attrseq = grep $_ ne 'class', @$attrseq;
        }
        if ($attr->{style}) {
            my @style = split /\s*;\s*/, $attr->{style};
            my $count = scalar(@style);
            @style = grep !/^(mso-.*|tab-stops.*|MARGIN-TOP: 0in)$/i, 
+@style;
            if ($count > scalar(@style)) { # we removed some
                if (@style) {
                    $attr->{style} = join '; ', @style;
                } else {
                    delete $attr->{style};
                    @$attrseq = grep $_ ne 'style', @$attrseq;
                }
                $leave = 0;
            }
        }
        if ($leave) {
            $result .= $text
        } else {
            $result .= _html_tag($tagname, $attr, $attrseq);
        }
    };
    $parser->handler(start =>  $sub, "tagname, attr, attrseq, text");
    $parser->handler(default => sub {$result .= $_[0]}, "text");

    $parser->parse($html)->eof;
    return $result
}
[download]

I guess you can tell I have to work with a fairly (censored) "HTML" at times.

Jenda
Support Denmark!
Defend the free world!

Comment on Re: Cleaning up HTML Download Code

Replies are listed 'Best First'.
Re^2: Cleaning up HTML by Anonymous Monk on Feb 14, 2016 at 18:15 UTC
I get an error on the conditional, $result .= _html_tag($tagname, $attr, $attrseq); sub DeWordifyHTML() says, Undefined subroutine &main::_html_tag called at ... I resplaced the "sub" at the top and the curly at the bottom to run, starting with, #!/usr/bin/perl use HTML::Parser (); open(my $html, "<", $ARGV[0])... Google and inspection of perl5/5.20/HTML/Parser.pm didn't help. I'm assuming the _html_tag() sub has been replaced by something else. Do you have an update for this? I realized it's 9 year old code, but I thought I'd ask. Thanks, Art	[reply]
Re^3: Cleaning up HTML by Jenda (Abbot) on Mar 02, 2016 at 15:50 UTC
It's incomplete. Sorry. sub _html_tag { my ( $tag, $attr, $attrseq) = @_; my $html; $html = "<$tag"; if ($attrseq and ref($attrseq) eq 'ARRAY') { foreach my $key (@$attrseq) { if (defined $attr->{$key}) { $html .= " $key="._arg_escape($attr->{$key}); } else { $html .= ' '.$key; } } } elsif ($attr and ref($attr)) { foreach my $key (keys %$attr) { if (defined $attr->{$key}) { $html .= " $key="._arg_escape($attr->{$key}); } else { $html .= ' '.$key; } } } $html .= ">"; return $html; } sub _arg_escape { my $arg = shift; return qq{"$arg"} if ($arg !~ /"/); return qq{'$arg'} if ($arg !~ /'/); $arg =~ s/"/&dblquote;/g; return qq{"$arg"}; } [download] Not sure it's complete like this. I never got around to releasing this as a module. If it's not complete, either download the Jenda.Rex zip from http://jenda.krynicky.cz/#Jenda.Rex, extract the .pm and dissect that (remove all references to Win32::OLE (needed only when the module is wrapped as a COM DLL for use in VB(script)) and Win32::Registry (only used to find out the code page used by the system), remove the whole package JendaRex::CSVParser, ...) or send me a message with your email and I'll send you the module. Jenda Enoch was right! Enjoy the last years of Rome.	[reply] [d/l]