#!/usr/bin/perl -w =head1 COPYRIGHT Copyright 2001 Jason Henry Parker This program is Free Software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut use strict; use integer; use Carp qw(carp croak); use HTML::TreeBuilder; my $tree = new HTML::TreeBuilder; $tree->parse_file( shift||"index.html" ); $tree->add_scores(penalise => 1, detach => 1); my $everything = sub { 1 }; my @contents = $tree->look_down( $everything ); # TODO: extend this to use a Schwartzian transform instead # of recalculating depth() * score() over and over and over. # # What this does is it takes all contents under $tree, # and sorts by depth x score on all objects that are # HTML::Elements (or subclasses). @contents = sort { $b->depth() * $b->score() <=> $a->depth() * $a->score() } grep { defined $_ and ref $_ and $_->isa("HTML::Element") } @contents; my $best = $contents[0]; # Look for the `body' tag. This is probably way overkill. # Since we bother to do this (ostensibly to hand back # `valid' HTML, we should really go to the trouble of # looking up from $best to find a table tag, and so on and # so forth. Feh. my $body = $tree->look_down( sub { my $x = shift; defined $x and ref $x and $x->isa("HTML::Element") and $x->tag eq 'body'; } ); $body->detach_content(); $body->push_content($best); # Use # print $tree->as_Lisp_form(),"\n"; # for a view of the HTML where you can see the scores on # each node. # # Fortunately, attributes beginning with "_" are stripped # out. print $tree->as_HTML(undef, " "),"\n"; exit 0; ### Everything below here is a subroutine. # This is but a simple accessor method for our added # attribute. The code would be much simpler if a perl 5.6 # lvalue sub could be used here instead. sub HTML::Element::score { shift->attr('_score', @_); } # The real work happens here. This method recursively adds # scores to the parse tree; if the `detach' argument is # supplied, negatively scored nodes are removed, but these # usually won't be generated unless the `penalise' option is # also added. sub HTML::Element::add_scores { my ($self, %args) = @_; my $sub; $sub = sub { my $self = shift; if (!defined $self) { carp "undefined value passed to add_scores()\n"; return undef; } if (ref $self) { if ($self->isa("HTML::Element")) { $self->score(0); foreach ($self->content_list()) { $self->score($self->score + $sub->($_)); } if ($args{penalise}) { $self->penalise(); } if ($args{detach} and $self->score <= 0) { my $t = $self->detach(); if (defined $t and $t->isa("HTML::Element")) { $t->normalize_content(); } } return $self->score; } else { carp "unknown ref type passed to add_scores()\n"; return undef; } } else { # $self is not a reference return length $self; } }; $sub->($self); } # Punish content-obsuring nodes, reward content-rich nodes. # TODO: make this use HTML::Tagset TODO: make this sub # changable at run-time to suit specific sites. sub HTML::Element::penalise { my $self = shift; my $tag = $self->tag; my $score = $self->score; ## These elements are considered Just Plain Evil. They ## almost always obscure content. if ($tag eq 'script' or $tag eq 'span' or $tag eq 'form') { # $score = - $score can make a score positive again $score = -abs($score); } elsif ($tag eq 'p') { ## these elements often are, or contain, useful content. $score += 50; } elsif ($tag eq 'a') { ## a tags can be a pain; we could be seeing an ## off-site link to supporting documentation, ## or we could be seeing a mess of navigation links. $score = 1; } $self->score($score); }