#!/usr/bin/perl
use strict;
use HTML::TokeParser::Simple;
my $src;
{ # read the entire HTML input stream as one contiguous string:
local $/ = undef;
$src = <>;
}
my $htm = HTML::TokeParser::Simple->new( \$src );
my $targetlen = int( 0.15 * length( $src ));
# this is a flawed attempt to select 15% of original content
my $outtext = '';
my $outlen = 0;
my @tagstack;
while ( my $tkn = $htm->get_token )
{
if ( $tkn->is_start_tag ) { # this is a start tag
print $tkn->as_is;
next if ( $$tkn[1] =~ /^(img|hr|meta|link|br)$/ );
# img,hr,meta,link tags don't span text content
my $tagname = $tkn->return_tag;
push @tagstack, $tagname
unless ( $tagname =~ /^p$/i and $tagstack[$#tagstack] =~ /^p$/i );
}
elsif ( $tkn->is_end_tag ) { # this is an end tag
print $tkn->as_is;
my $tagname = $tkn->return_tag;
if ( grep /^$tagname$/i, @tagstack ) {
while ( $tagstack[$#tagstack] !~ /^$tagname$/i ) {
pop @tagstack;
}
pop @tagstack;
}
}
elsif ( $tkn->is_text ) { # this is text content
my $txttkn = $tkn->as_is;
$txttkn =~ s/\s+/ /g;
my $txtlen = length( $txttkn );
if ( $txtlen > $targetlen ) {
my $cut = rindex( $txttkn, ' ', $targetlen );
$txttkn = substr( $txttkn, 0, $cut );
print "\n$txttkn\n";
last;
}
print "\n$txttkn\n";
$targetlen -= $txtlen;
}
}
while ( @tagstack ) {
printf "%s>\n", pop @tagstack;
}