# Use HTML::Tidy inside a Moo object.
#
# This fragment relies on names supplied by the enclosing package: Moo's
# `has`, the HTML::Tidy class, an `InstanceOf` type constraint (presumably
# from Types::Standard -- confirm), and the `state` feature
# (use feature 'state' or use v5.10 and later).

# Lazily built HTML::Tidy instance used by _clean_html.
has tidy => (
    is      => 'rw',
    lazy    => 1,
    builder => '_build_tidy',
    isa     => InstanceOf ["HTML::Tidy"],
);

# Builder for the `tidy` attribute.
#
# Returns a new HTML::Tidy object configured for XHTML output with the tidy
# generator mark switched off, and with a list of message texts registered
# through ignore() so they are not reported.
sub _build_tidy {
    my $self = shift;

    my $tidy = HTML::Tidy->new(
        {
            #doctype     => 'omit',
            output_xhtml => 1,
            tidy_mark    => 0,
        }
    );

    # NOTE(review): several of these texts look as if an angle-bracketed tag
    # name was lost from them (for example 'missing declaration' and
    # 'inserting implicit '). Confirm them against the messages tidy emits.
    $tidy->ignore(
        text => 'missing declaration',
        text => 'inserting implicit ',
        text => 'inserting missing \'title\' element',
        text => 'missing ',
        text => ' is not approved by W3C',
        text => 'plain text isn\'t allowed in elements',
        text => ' previously mentioned',
    );

    return $tidy;
}

# Run $html through the shared tidy object and return a cleaned version.
#
# Parameters:
#   $self - the object providing the `tidy` attribute
#   $html - the markup to check
#
# Returns:
#   $html unchanged when tidy reports no (un-ignored) messages; otherwise
#   the content found between <body> and </body> in tidy's cleaned output.
#   If no body element can be located, the full cleaned output is returned
#   rather than a stale or undefined regex capture.
sub _clean_html {
    my ( $self, $html ) = @_;

    my $tidy = $self->tidy;
    $tidy->clear_messages();

    # "1" is passed as the filename argument of parse().
    $tidy->parse( "1", $html );

    # Nothing to report: hand the input back untouched.
    return $html unless $tidy->messages;

    my $cleaned = $tidy->clean($html);

    # Only trust $1 when this very match succeeded. The opening <body> anchor
    # keeps the doctype/html/head lines out of the capture.
    if ( defined $cleaned
        && $cleaned =~ m{ <body [^>]* > \n? (.*?) \n? </body> }xsi )
    {
        return $1;
    }

    return $cleaned;
}

####

# Cheap balance check: does every opening tag have a matching closing tag?
#
# Called as a plain function; the markup is read from the first argument.
# The argument is copied before any editing, so the caller's variable is
# never modified and read-only literals are accepted.
#
# Returns 1 when every tag outside the "may stay unclosed" list has as many
# closing as opening occurrences (tag names compared case-insensitively),
# 0 as soon as one such tag is found unbalanced. Undefined or empty input
# contains no tags and therefore yields 1.
sub _is_html_clean {

    # Tags that are allowed to appear without a closing partner. Kept in a
    # state variable so the hash is built once and persists across calls.
    state $is_unbalanced = {
        area       => 1,
        base       => 1,
        basefont   => 1,
        bgsound    => 1,
        br         => 1,
        col        => 1,
        colgroup   => 1,
        embed      => 1,
        frame      => 1,
        hr         => 1,
        img        => 1,
        input      => 1,
        isindex    => 1,
        li         => 1,
        link       => 1,
        marquee    => 1,
        meta       => 1,
        p          => 1,
        '!doctype' => 1,
    };

    # Copy first: the elements of @_ alias the caller's variables.
    my $html = $_[0];
    return 1 unless defined $html && length $html;

    # Remove commented sections before looking at tags, so markup inside a
    # comment is never counted.
    $html =~ s{<!--.*?-->}{}gs;

    # Remove every self-closing tag, not just the last one on each line.
    $html =~ s{<[^<>]+/>}{}g;

    # Count opening (+1) and closing (-1) occurrences per lower-cased name.
    # A tag name may be followed by any whitespace, not only a space.
    my %count_for;
    for my $tag ( $html =~ m{<(\S+?)[\s>]}g ) {
        my $name = lc $tag;
        if ( $name =~ s{\A/}{} ) {
            $count_for{$name}--;    # closing tag, leading slash removed
        }
        else {
            $count_for{$name}++;
        }
    }

    for my $name ( keys %count_for ) {
        next if $is_unbalanced->{$name};    # allowed to stay unclosed
        return 0 if $count_for{$name} != 0; # stop at the first mismatch
    }

    return 1;    # everything that must be paired is paired
}