# Convert straight ASCII quotes in the text of HTML documents into
# typographic ("curly") quotes.
#
# For every file named on the command line the script:
#   1. parses it into an HTML::TreeBuilder tree and dumps the tree to STDOUT,
#   2. asks on STDIN where the converted document should be written,
#   3. rewrites the text nodes in place and writes the serialised tree to
#      that file.
#
# A trace of the tree walk (tag names, content lists before/after) goes to
# STDERR.
#
# NOTE(review): there is no `use utf8`, so the curly-quote literals below are
# raw bytes in whatever encoding this source file is saved in (presumably
# UTF-8), and no encoding layer is set on the output handle. Confirm this
# matches the encoding of the documents being processed.

use strict;
use warnings;
use diagnostics;

use HTML::TreeBuilder;
use HTML::Entities;
use HTML::Element;

sub traverse;

# Elements whose contents are left untouched by the tree walk.
# Declared before the main loop so it is populated by the time traverse()
# is first called.
my %skip_contents_of = map { $_ => 1 } qw(head script img object applet);

# The prompt below has no trailing newline; unbuffer STDOUT so it is visible
# before the script blocks on STDIN.
$| = 1;

foreach my $file_name (@ARGV) {
    my $tree = HTML::TreeBuilder->new;

    # parse_file() returns undef (with $! set) when the file cannot be opened.
    if ( !$tree->parse_file($file_name) ) {
        warn "Cannot parse '$file_name': $!\n";
        $tree->delete;
        next;
    }
    $tree->dump;

    print "\n\nWhere would you like to put the output file? ";
    my $output = <STDIN>;
    die "No output file name supplied for '$file_name'\n"
        unless defined $output;

    # Drop the trailing newline; also trim surrounding blanks, which the
    # former two-argument open() used to strip implicitly.
    $output =~ s/\A\s+|\s+\z//g;
    die "Empty output file name supplied for '$file_name'\n"
        if $output eq '';

    # Three-argument open with a lexical handle: the user-supplied name can
    # no longer smuggle in an open mode.
    open my $out_fh, '>', $output
        or die "Cannot open '$output' for writing: $!";

    # traverse() prints the result to the currently selected handle.
    # Restore the previous handle afterwards so the dump and the prompt for
    # the next file still reach STDOUT.
    my $previous_fh = select $out_fh;
    traverse($tree);
    select $previous_fh;

    $tree = $tree->delete;
    close $out_fh or die "Cannot close '$output': $!";
}

# traverse(@nodes)
#
# Walks each HTML::Element in @nodes depth-first and replaces straight quotes
# in its text nodes with curly quotes, in place. Plain strings passed in
# @nodes are ignored: a text node can only be updated through its parent's
# content list. Contents of the elements listed in %skip_contents_of are not
# visited.
#
# When a node has no parent (i.e. it is the root of the tree) the whole tree
# is serialised and printed to the currently selected filehandle.
#
# A trace is written to STDERR. Returns nothing.
sub traverse {
    foreach my $node (@_) {
        next unless ref $node;

        print STDERR $node->tag(), "\n\n";

        if ( !$skip_contents_of{ $node->tag() } ) {
            my @before = $node->content_list();
            print STDERR "before: @before\n";

            # content_refs_list() hands back references INTO the content
            # list, so assigning through them really changes the tree
            # (content_list() only returns copies of the text).
            foreach my $item_ref ( $node->content_refs_list() ) {
                if ( ref ${$item_ref} ) {
                    traverse( ${$item_ref} );
                }
                else {
                    ${$item_ref} = smarten_quotes( ${$item_ref} );
                }
            }

            my @after = $node->content_list();
            print STDERR "after: @after\n";
        }

        if ( !$node->parent ) {
            # as_HTML($entities, $indent_char, \%optional_end_tags):
            #   ""    - do not entity-encode anything,
            #   undef - no indentation,
            #   {}    - no end tag is optional, emit them all.
            my $html = $node->as_HTML( "", undef, {} );
            $html =~ s/>\n/>/g;
            print $html, "\n\n";
        }
    }
    return;
}

# smarten_quotes($text)
#
# Returns a copy of $text with straight single and double quotes replaced by
# curly quotes. The rules run in order, and each one relies on the earlier
# ones having already consumed their quotes, so the order matters. Rules
# that match a whitespace character next to a quote replace that character
# with a single space.
sub smarten_quotes {
    my ($text) = @_;

    # Leading-apostrophe contractions.
    $text =~ s/'em\s/’em /g;
    $text =~ s/'tis\s/’tis /g;
    $text =~ s/'twas\s/’twas /g;
    $text =~ s/'Twas\s/’Twas /g;
    $text =~ s/'Tis\s/’Tis /g;

    # Single quotes next to whitespace or at the very start.
    $text =~ s/'\s/’ /g;
    $text =~ s/^'/‘/g;
    $text =~ s/(\s)'/$1‘/g;

    # Adjacent double + single quote pairs.
    $text =~ s/"'/“‘/g;
    $text =~ s/'"/’”/g;

    # Double quotes next to whitespace or at the very start.
    $text =~ s/\s"/ “/g;
    $text =~ s/^"/“/g;
    $text =~ s/"\s/” /g;

    # Quotes at the very end.
    $text =~ s/'$/’/g;
    $text =~ s/"$/”/g;

    # Quotes directly after a comma or full stop are closing quotes.
    $text =~ s/(,|\.)'/$1’/g;
    $text =~ s/(,|\.)"/$1”/g;

    # Whatever single quotes are left inside a word are apostrophes.
    $text =~ s/(\S)'(\S)/$1’$2/g;

    return $text;
}