in reply to Cleaning up HTML
The following is not smaller, but it does handle all the items on the wish list and should be easier to expand than your current code. Because it's using HTML::TreeBuilder and there doesn't seem to be an option to render HTML without a header and body I've stripped that stuff off using a regex, but the result is sorta ugly looking.
use strict; use warnings; use HTML::TreeBuilder; my $inline = qr/^(b|i|s|del|font)$/; my $block = qr/^(p|table)$/; my $html = <<'HTML'; <FONT color="#0000ff" face="Verdana" size="1"> <p>paragraph</p> </font> <P align="center"><a href="#"><br/> <font color="#0000ff" face="Verdana" size="1">€ 750aa</font> <B><i>foo</b></i> <font face="Verdana" size="1"><b><i></i></b></font> <br /></a> </p> HTML print cleanupHtml($html); sub cleanupHtml { my $root = HTML::TreeBuilder->new; $root->parse_content(shift); $root->elementify(); $root = cleanupElt($root); my $str = $root->as_HTML( undef, ' ', {} ); $str =~ s!^.*?<body>(.*)</body>.*!$1!s; return $str; } sub cleanupElt { my $elt = shift; return unless ref $elt; expelBr($elt) if $elt->{_tag} eq 'a'; my @elts = $elt->content_list(); if ( $elt->{_tag} =~ $inline && @elts == 1 && ref $elts[0] && $elts[0]->{_tag} =~ $block ) { # Invert order of inline and block elements my @nested = $elts[0]->detach_content(); $elt->replace_with( $elts[0] ); $elts[0]->push_content($elt); $elt->push_content(@nested); $elt = $elts[0]; @elts = $elt->content_list(); } $_->replace_with_content()->delete() for grep { removeElt($_) } @e +lts; return $elt if exists $elt->{_implicit}; return undef if !exists $elt->{_content} || !@{ $elt->{_content} } +; return $elt; } sub expelBr { my $elt = shift; return unless exists $elt->{_content}; for my $index ( 0, -1 ) { next unless ref $elt->{_content}[$index]; my $br = $elt->{_content}[$index]; next unless $br->{_tag} eq 'br'; $index == 0 ? $br->detach()->preinsert($br) : $br->detach()->postinsert($br); } } sub removeElt { my $elt = shift; return unless ref $elt; $elt = cleanupElt($elt); return 1 unless $elt; return 0 unless $elt->{_tag} =~ $inline; return 1 if $elt->{_tag} eq 'font' and removeFont($elt); return !exists $elt->{_content}; } sub removeFont { my $elt = shift; delete $elt->{face} if exists $elt->{face} and lc $elt->{face} eq +'verdana'; delete $elt->{size} if exists $elt->{size} and $elt->{size} eq +'1'; return !grep { /^[^_]/ } keys %$elt; }
Prints:
<p><font color="#0000ff">paragraph</font></p> <p align="center"> <br /><a href="#"><font color="#0000ff">€ 750aa</font> < +b><i>foo</i></b> </a><br /> </p>
For reference the same sample run through your code renders as:
<font color="#0000ff"> <p>paragraph</p> </font> <P align="center"><a href="#"><br/> <font color="#0000ff">€ 750aa</font> <B><i>foo</b></i> </a><br /> </p>
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^2: Cleaning up HTML
by bart (Canon) on Dec 21, 2007 at 11:49 UTC |