use strict;
use warnings;
use HTML::TreeBuilder;
my $inline = qr/^(b|i|s|del|font)$/;
my $block = qr/^(p|table)$/;
my $html = <<'HTML';
paragraph
€ 750aa
foo
HTML
print cleanupHtml($html);
sub cleanupHtml {
my $root = HTML::TreeBuilder->new;
$root->parse_content(shift);
$root->elementify();
$root = cleanupElt($root);
my $str = $root->as_HTML( undef, ' ', {} );
$str =~ s!^.*?(.*).*!$1!s;
return $str;
}
sub cleanupElt {
my $elt = shift;
return unless ref $elt;
expelBr($elt) if $elt->{_tag} eq 'a';
my @elts = $elt->content_list();
if ( $elt->{_tag} =~ $inline
&& @elts == 1
&& ref $elts[0]
&& $elts[0]->{_tag} =~ $block )
{ # Invert order of inline and block elements
my @nested = $elts[0]->detach_content();
$elt->replace_with( $elts[0] );
$elts[0]->push_content($elt);
$elt->push_content(@nested);
$elt = $elts[0];
@elts = $elt->content_list();
}
$_->replace_with_content()->delete() for grep { removeElt($_) } @elts;
return $elt if exists $elt->{_implicit};
return undef if !exists $elt->{_content} || !@{ $elt->{_content} };
return $elt;
}
sub expelBr {
my $elt = shift;
return unless exists $elt->{_content};
for my $index ( 0, -1 ) {
next unless ref $elt->{_content}[$index];
my $br = $elt->{_content}[$index];
next unless $br->{_tag} eq 'br';
$index == 0
? $br->detach()->preinsert($br)
: $br->detach()->postinsert($br);
}
}
sub removeElt {
my $elt = shift;
return unless ref $elt;
$elt = cleanupElt($elt);
return 1 unless $elt;
return 0 unless $elt->{_tag} =~ $inline;
return 1 if $elt->{_tag} eq 'font' and removeFont($elt);
return !exists $elt->{_content};
}
sub removeFont {
my $elt = shift;
delete $elt->{face} if exists $elt->{face} and lc $elt->{face} eq 'verdana';
delete $elt->{size} if exists $elt->{size} and $elt->{size} eq '1';
return !grep { /^[^_]/ } keys %$elt;
}