I'm currently using HTML::TableExtract to pull data (as HTML) out of HTML table cells. My problem is that the data contains a lot of cruft — typical leftovers from the WYSIWYG tools used to edit the HTML (incl. MS Word) — and I'd like to clean it up. That includes:
I'm currently using a custom parser based on HTML::TokeParser::Simple, but
What do you recommend? Can HTML::Element actually even manage tag soup, or does it require properly nested tags? How easy is it to remove or swap tag layers (to change the order of nesting)?
p.s. Here's the cleanup tool I wrote. It is not as complete as my wishlist.
use strict;
use warnings;

use HTML::TokeParser::Simple;

# Build an empty text token.
#
# Used as a placeholder for tags that are dropped from the stream: it keeps
# the token list homogeneous (every element still answers is_text, is_tag,
# as_is, ...) while contributing nothing to the serialized output.
sub dummy {
    return HTML::TokeParser::Simple::Token::Text->new([ T => '' ]);
}

# Clean up WYSIWYG-style cruft in an HTML fragment.
#
# Parameters:
#   $html - the HTML fragment, as a string.
#
# Returns:
#   The cleaned-up HTML as a single string.
#
# What is cleaned up:
#   * face="Verdana" and size="1" are stripped from <font> tags; a <font>
#     left with no attributes is removed together with its closing tag.
#   * <br> is moved in front of any start tags that immediately precede it,
#     except <p>, so it ends up outside inline elements.
#   * When a closing tag other than </p> is seen, trailing whitespace and
#     <br> tokens are moved behind it, and an element that turns out to be
#     empty is dropped entirely.
sub cleanup_html {
    my ($html) = @_;

    my $parser = HTML::TokeParser::Simple->new(string => $html);
    my @out;     # tokens emitted so far, in output order
    my @font;    # one flag per open <font>: true if the tag was kept

    while (my $t = $parser->get_token) {
        if ($t->is_start_tag('font')) {
            if (($t->get_attr('face') || '') eq 'Verdana') {
                $t->delete_attr('face');
            }
            if (($t->get_attr('size') || '') eq '1') {
                $t->delete_attr('size');
            }
            if (%{ $t->get_attr }) {
                push @font, 1;
            }
            else {
                # No attributes left: the tag is pointless, drop it and
                # remember to drop the matching </font> as well.
                push @font, 0;
                $t = dummy();
            }
        }
        elsif ($t->is_end_tag('font')) {
            # pop on an empty stack yields undef, so an unmatched </font>
            # is dropped too.
            $t = dummy() unless pop @font;
        }

        my @append = ($t);

        if ($t->is_tag('br')) {
            # Hoist the <br> in front of the start tags directly before it.
            # The check is on the *previous* token: the original tested the
            # <br> token itself, which made the "not <p>" guard a no-op.
            @append = ();
            while (my $prev = pop @out) {
                if ($prev->is_start_tag and $prev->get_tag ne 'p') {
                    unshift @append, $prev;
                }
                else {
                    push @out, $prev;
                    last;
                }
            }
            unshift @append, $t;
        }
        elsif ($t->is_end_tag and $t->get_tag ne 'p') {
            my $tag = $t->get_tag;
            while (my $prev = pop @out) {
                unshift @append, $prev;
                if ($prev->is_text) {
                    # Real content: stop walking back.
                    last if $prev->as_is =~ /\S/;
                }
                elsif ($prev->is_tag('br')) {
                    # Move the <br> from before the closing tag to after it.
                    shift @append;
                    push @append, $prev;
                }
                elsif ($prev->is_start_tag($tag)) {
                    # Reached our own start tag without finding content:
                    # the element is empty, discard everything collected.
                    @append = ();
                    last;
                }
                elsif (@out and $out[-1]->is_tag) {
                    # Guarded with @out: without it, running out of tokens
                    # here called a method on undef and died.
                    last;
                }
            }
        }

        push @out, @append;
    }

    return join '', map { $_->as_is } @out;
}

# Demo input. The euro sign is passed through as the raw bytes of this
# source file (no "use utf8"), so it is printed unchanged.
my $html = "<font color=\"#0000ff\" face=\"Verdana\" size=\"1\">\n</font>\n"
         . "<p align=\"center\"><a href=\"#\">"
         . "<font color=\"#0000ff\" face=\"Verdana\" size=\"1\">€ 750aa</font>"
         . "<br /></a></p>";
print cleanup_html($html);
In reply to Cleaning up HTML by bart
| For: | Use: |
|------|------|
| & | &amp;amp; |
| < | &amp;lt; |
| > | &amp;gt; |
| [ | &amp;#91; |
| ] | &amp;#93; |