in reply to Cleaning up HTML

Here's my stab. It's a multi pass approach so while there is more of it I believe it could be easier to extend/test. I've just noticed it doesn't address point two in the OP.

Warning: extremely beta and needs more testing. :-)

#!/bin/perl5 use strict; use warnings; use Data::Dumper; use HTML::TokeParser::Simple; $Data::Dumper::Indent = 1; $Data::Dumper::Sortkeys = 1; my ($p, $t, $html, $clean); $html = do{local $/;<DATA>}; open my $out, q{>}, q{clean.html} or die qq{cant open to read: $!\n}; print $out qq{dirty:\n $html\n}; # fix broken nesting $p = HTML::TokeParser::Simple->new(\$html); my (@stack); my %stand_alone = (br => undef, hr =>undef); while ($t = $p->get_token){ if ($t->is_end_tag){ my $tag = pop @stack; $clean .= qq{</$tag>}; next; } if ($t->is_start_tag and not exists $stand_alone{$t->get_tag}){ push @stack, $t->get_tag; } $clean .= $t->as_is; } print $out qq{clean 1:\n*$clean*\n}; # fix font tags $html = $clean; $p = HTML::TokeParser::Simple->new(\$html) or die qq{parse failed\n}; $clean = q{}; while ($t = $p->get_token){ if ($t->is_start_tag(q{font})){ $clean .= fix_font(); } else{ $clean .= $t->as_is; } } print $out qq{clean 2:\n *$clean*\n}; # fix br $html = $clean; $p = HTML::TokeParser::Simple->new(\$html); $clean = q{}; while ($t = $p->get_token){ if ($t->is_start_tag(q{a})){ $clean .= fix_br(); } else{ $clean .= $t->as_is; } } print $out qq{clean 3:\n $clean\n}; # fix empty tags $html = $clean; $clean = q{}; my @strip_empty = qw{b i}; $clean = q{}; my ($in_tag, $start, $end); my $fragment = q{}; for my $tag (@strip_empty){ my $p = HTML::TokeParser::Simple->new(\$html); while (my $t = $p->get_token){ if ($t->is_start_tag($tag)){ $in_tag++; $start = $t->as_is; next; } if ($in_tag){ if ($t->is_end_tag($tag)){ $in_tag--; if ($fragment =~ /\S/){ $clean .= join q{}, $start, $fragment, $t->as_is; $fragment = q{}; } } else{ $fragment .= $t->as_is; } next; } $clean .= $t->as_is; } $html = $clean; } print $out qq{clean 4:\n $clean}; sub fix_br { my (@fragment, @tags); my $start = $t->as_is; while ($t = $p->get_token){ last if $t->is_end_tag(q{a}); my $as_is = $t->as_is; my $tag = $t->get_tag; $tag ||= q{no tag}; push @fragment, { as_is => $as_is, tag => $tag, }; push @tags, $tag if $t->get_tag; } my $end = $t->as_is; my $found_br; if ($tags[-1] eq q{br}){ $found_br++; @fragment = grep{not $_->{tag} eq q{br}} @fragment; } my $fixed = join q{}, map{$_->{as_is}} @fragment; return join q{}, $start, $fixed, $end, $found_br?q{<br />}:q{}; } sub fix_font { if ($t->get_attr(q{face}) eq q{Verdana} and $t->get_attr(q{size}) eq + q{1}){ $t->delete_attr(q{face}); $t->delete_attr(q{size}); } my $start = $t->as_is; my $attr = $t->get_attr; my $fragment; while ($t = $p->get_token){ last if $t->is_end_tag(q{font}); $fragment .= $t->as_is; } if (not $fragment =~ /\S/){ # empty font tags? return q{}; } if (keys %{$attr}){ return join q{}, $start, $fragment, $t->as_is; } else{ return $fragment; } } __DATA__ <font color="#0000ff" face="Verdana" size="1"> </font> <p align="center"> <a href="#"> <font color="#0000ff" face="Verdana" size="1">&euro; 750aa</font> <br /> </a> <b><i>bad nesting</b></i> <b></b> <b>bold</b> </p>
When faced with similar task I've found it useful to use a wrapper around H::T::S which helps avoid either declaring the parser object as a global or passing it around between subs. It's then easier to introduce layers of abstraction which is, for me at least, a big help.