Re: Cleaning up HTML

Here's my stab. It's a multi pass approach so while there is more of it I believe it could be easier to extend/test. I've just noticed it doesn't address point two in the OP.

Warning: extremely beta and needs more testing. :-)

#!/bin/perl5

use strict;
use warnings;
use Data::Dumper;
use HTML::TokeParser::Simple;
$Data::Dumper::Indent = 1;
$Data::Dumper::Sortkeys = 1;

my ($p, $t, $html, $clean);
$html = do{local $/;<DATA>};

open my $out, q{>}, q{clean.html} or die qq{cant open to read: $!\n};

print $out qq{dirty:\n $html\n};

# fix broken nesting

$p = HTML::TokeParser::Simple->new(\$html);
my (@stack);
my %stand_alone = (br => undef, hr =>undef);
while ($t = $p->get_token){
    if ($t->is_end_tag){
        my $tag = pop @stack;
        $clean .= qq{</$tag>};
        next;
    }
    if ($t->is_start_tag and not exists $stand_alone{$t->get_tag}){
        push @stack, $t->get_tag;
    }
    $clean .= $t->as_is;
}

print $out qq{clean 1:\n*$clean*\n};

# fix font tags

$html = $clean;
$p = HTML::TokeParser::Simple->new(\$html)
    or die qq{parse failed\n};
$clean = q{};
while ($t = $p->get_token){
  if ($t->is_start_tag(q{font})){
    $clean .= fix_font();
  }
  else{
    $clean .= $t->as_is;
  }
}

print $out qq{clean 2:\n *$clean*\n};
# fix br

$html = $clean;
$p = HTML::TokeParser::Simple->new(\$html);
$clean = q{};
while ($t = $p->get_token){
  if ($t->is_start_tag(q{a})){
    $clean .= fix_br();
  }
  else{
    $clean .= $t->as_is;
  }
}

print $out qq{clean 3:\n $clean\n};

# fix empty tags

$html = $clean;
$clean = q{};
my @strip_empty = qw{b i};
$clean = q{};
my ($in_tag, $start, $end);
my $fragment = q{};

for my $tag (@strip_empty){
    my $p = HTML::TokeParser::Simple->new(\$html);
    while (my $t = $p->get_token){
      if ($t->is_start_tag($tag)){
        $in_tag++;
        $start = $t->as_is;
        next;
      }
      if ($in_tag){
        if ($t->is_end_tag($tag)){
          $in_tag--;
          if ($fragment =~ /\S/){
                $clean .= join q{}, $start, $fragment, $t->as_is;
                $fragment = q{};
              }
        }
        else{
            $fragment .= $t->as_is;
            }
            next;
      }
      $clean .= $t->as_is;
    }
    $html = $clean;
}

print $out qq{clean 4:\n $clean};

sub fix_br {
  my (@fragment, @tags);
  my $start = $t->as_is;
  while ($t = $p->get_token){
    last if $t->is_end_tag(q{a});
    my $as_is = $t->as_is;
    my $tag = $t->get_tag;
    $tag ||= q{no tag};
    push @fragment, {
      as_is => $as_is,
      tag   => $tag,
    };
    push @tags, $tag if $t->get_tag;
  }
  my $end = $t->as_is;
  my $found_br;
  if ($tags[-1] eq q{br}){
    $found_br++;
    @fragment = grep{not $_->{tag} eq q{br}} @fragment;
  }
  my $fixed = join q{}, map{$_->{as_is}} @fragment;
  return join q{}, $start, $fixed, $end, $found_br?q{<br />}:q{};
}

sub fix_font {
  if ($t->get_attr(q{face}) eq q{Verdana} and $t->get_attr(q{size}) eq
+ q{1}){
    $t->delete_attr(q{face});
    $t->delete_attr(q{size});
  }
  my $start = $t->as_is;
  my $attr = $t->get_attr;
  my $fragment;
  while ($t = $p->get_token){
    last if $t->is_end_tag(q{font});
    $fragment .= $t->as_is;
  }
  if (not $fragment =~ /\S/){ # empty font tags?
    return  q{};
  }
  if (keys %{$attr}){
    return join q{}, $start, $fragment, $t->as_is;
  }
  else{
    return $fragment;
  }
}

__DATA__
<font color="#0000ff" face="Verdana" size="1">
</font>
<p align="center">
<a href="#">
<font color="#0000ff" face="Verdana" size="1">&euro; 750aa</font>
<br />
</a>
<b><i>bad nesting</b></i>
<b></b>
<b>bold</b>
</p>
[download]

When faced with similar task I've found it useful to use a wrapper around H::T::S which helps avoid either declaring the parser object as a global or passing it around between subs. It's then easier to introduce layers of abstraction which is, for me at least, a big help.

Comment on Re: Cleaning up HTML Download Code