The following is not smaller, but it does handle all the items on the wish list and should be easier to expand than your current code. Because it's using HTML::TreeBuilder and there doesn't seem to be an option to render HTML without a header and body I've stripped that stuff off using a regex, but the result is sorta ugly looking.
use strict; use warnings; use HTML::TreeBuilder; my $inline = qr/^(b|i|s|del|font)$/; my $block = qr/^(p|table)$/; my $html = <<'HTML'; <FONT color="#0000ff" face="Verdana" size="1"> <p>paragraph</p> </font> <P align="center"><a href="#"><br/> <font color="#0000ff" face="Verdana" size="1">€ 750aa</font> <B><i>foo</b></i> <font face="Verdana" size="1"><b><i></i></b></font> <br /></a> </p> HTML print cleanupHtml($html); sub cleanupHtml { my $root = HTML::TreeBuilder->new; $root->parse_content(shift); $root->elementify(); $root = cleanupElt($root); my $str = $root->as_HTML( undef, ' ', {} ); $str =~ s!^.*?<body>(.*)</body>.*!$1!s; return $str; } sub cleanupElt { my $elt = shift; return unless ref $elt; expelBr($elt) if $elt->{_tag} eq 'a'; my @elts = $elt->content_list(); if ( $elt->{_tag} =~ $inline && @elts == 1 && ref $elts[0] && $elts[0]->{_tag} =~ $block ) { # Invert order of inline and block elements my @nested = $elts[0]->detach_content(); $elt->replace_with( $elts[0] ); $elts[0]->push_content($elt); $elt->push_content(@nested); $elt = $elts[0]; @elts = $elt->content_list(); } $_->replace_with_content()->delete() for grep { removeElt($_) } @e +lts; return $elt if exists $elt->{_implicit}; return undef if !exists $elt->{_content} || !@{ $elt->{_content} } +; return $elt; } sub expelBr { my $elt = shift; return unless exists $elt->{_content}; for my $index ( 0, -1 ) { next unless ref $elt->{_content}[$index]; my $br = $elt->{_content}[$index]; next unless $br->{_tag} eq 'br'; $index == 0 ? $br->detach()->preinsert($br) : $br->detach()->postinsert($br); } } sub removeElt { my $elt = shift; return unless ref $elt; $elt = cleanupElt($elt); return 1 unless $elt; return 0 unless $elt->{_tag} =~ $inline; return 1 if $elt->{_tag} eq 'font' and removeFont($elt); return !exists $elt->{_content}; } sub removeFont { my $elt = shift; delete $elt->{face} if exists $elt->{face} and lc $elt->{face} eq +'verdana'; delete $elt->{size} if exists $elt->{size} and $elt->{size} eq +'1'; return !grep { /^[^_]/ } keys %$elt; }
Prints:
<p><font color="#0000ff">paragraph</font></p> <p align="center"> <br /><a href="#"><font color="#0000ff">€ 750aa</font> < +b><i>foo</i></b> </a><br /> </p>
For reference the same sample run through your code renders as:
<font color="#0000ff"> <p>paragraph</p> </font> <P align="center"><a href="#"><br/> <font color="#0000ff">€ 750aa</font> <B><i>foo</b></i> </a><br /> </p>
In reply to Re: Cleaning up HTML
by GrandFather
in thread Cleaning up HTML
by bart
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |