#!/bin/perl5 use strict; use warnings; use Data::Dumper; use HTML::TokeParser::Simple; $Data::Dumper::Indent = 1; $Data::Dumper::Sortkeys = 1; my ($p, $t, $html, $clean); $html = do{local $/;}; open my $out, q{>}, q{clean.html} or die qq{cant open to read: $!\n}; print $out qq{dirty:\n $html\n}; # fix broken nesting $p = HTML::TokeParser::Simple->new(\$html); my (@stack); my %stand_alone = (br => undef, hr =>undef); while ($t = $p->get_token){ if ($t->is_end_tag){ my $tag = pop @stack; $clean .= qq{}; next; } if ($t->is_start_tag and not exists $stand_alone{$t->get_tag}){ push @stack, $t->get_tag; } $clean .= $t->as_is; } print $out qq{clean 1:\n*$clean*\n}; # fix font tags $html = $clean; $p = HTML::TokeParser::Simple->new(\$html) or die qq{parse failed\n}; $clean = q{}; while ($t = $p->get_token){ if ($t->is_start_tag(q{font})){ $clean .= fix_font(); } else{ $clean .= $t->as_is; } } print $out qq{clean 2:\n *$clean*\n}; # fix br $html = $clean; $p = HTML::TokeParser::Simple->new(\$html); $clean = q{}; while ($t = $p->get_token){ if ($t->is_start_tag(q{a})){ $clean .= fix_br(); } else{ $clean .= $t->as_is; } } print $out qq{clean 3:\n $clean\n}; # fix empty tags $html = $clean; $clean = q{}; my @strip_empty = qw{b i}; $clean = q{}; my ($in_tag, $start, $end); my $fragment = q{}; for my $tag (@strip_empty){ my $p = HTML::TokeParser::Simple->new(\$html); while (my $t = $p->get_token){ if ($t->is_start_tag($tag)){ $in_tag++; $start = $t->as_is; next; } if ($in_tag){ if ($t->is_end_tag($tag)){ $in_tag--; if ($fragment =~ /\S/){ $clean .= join q{}, $start, $fragment, $t->as_is; $fragment = q{}; } } else{ $fragment .= $t->as_is; } next; } $clean .= $t->as_is; } $html = $clean; } print $out qq{clean 4:\n $clean}; sub fix_br { my (@fragment, @tags); my $start = $t->as_is; while ($t = $p->get_token){ last if $t->is_end_tag(q{a}); my $as_is = $t->as_is; my $tag = $t->get_tag; $tag ||= q{no tag}; push @fragment, { as_is => $as_is, tag => $tag, }; push @tags, $tag if $t->get_tag; } my $end = $t->as_is; my $found_br; if ($tags[-1] eq q{br}){ $found_br++; @fragment = grep{not $_->{tag} eq q{br}} @fragment; } my $fixed = join q{}, map{$_->{as_is}} @fragment; return join q{}, $start, $fixed, $end, $found_br?q{
}:q{}; } sub fix_font { if ($t->get_attr(q{face}) eq q{Verdana} and $t->get_attr(q{size}) eq q{1}){ $t->delete_attr(q{face}); $t->delete_attr(q{size}); } my $start = $t->as_is; my $attr = $t->get_attr; my $fragment; while ($t = $p->get_token){ last if $t->is_end_tag(q{font}); $fragment .= $t->as_is; } if (not $fragment =~ /\S/){ # empty font tags? return q{}; } if (keys %{$attr}){ return join q{}, $start, $fragment, $t->as_is; } else{ return $fragment; } } __DATA__

€ 750aa
bad nesting bold