in reply to getting the first n printable words from a string of HTML

I have a solution, which solves the problem a differnet way:

#!/usr/cpan/bin/perl use HTML::TokeParser; use Data::Dumper; my $full_text = '<P><img src="/images/logo.png" alt="logo" /> <div cla +ss="nice_colours"><a href="http://edina.ac.uk/">EDINA</a> and <a href +="http://mimas.ac.uk/">MIMAS</A> are pleased to announce a new set of + <a href="http://digimap.ac.uk">EDINA Digimap</a> training dates</div +> (Modules One and Two only) at the <B>Universities of <I>Middlesex</ +I> and <I>Edinburgh</B></I>. <!-- This is a comment --> <P>More details of Digimap training on the <a href="http://edina.ac.uk +/events/">events page</A>.'; my $count = 25; my $p = HTML::TokeParser->new(\$full_text); my @display_elements = (); my $count_of_words = ""; my @tag_stack = (); my @problem_tag_stack = (); # The plan is to get one token at a time, and process it # If the token is a start-tag, add the tag to the # tag-stack, and add the raw text to the display-list # If the token is an end-tag, then it should match the top # tag on the tag-stack, so we pop that off (to show # it's not outstanding), and add the raw text to the # display-list # If the token is a comment-tag, we just skip it # If the token is text, we add it to the display-list, # counting the words as we do so - stopping once we # have $words listed. # # Once we have the requisit number of words, we then close # all the elements still left in the tag-stack # while ($token_ref = $p->get_token) { last unless ($count_of_words - $count); # drop out when at $count if ($token_ref->[0] =~ /T/i) { # we have some text, so count it and stack it my @local_words = split /\b/, $token_ref->[1]; # split on boundries foreach $_ (@local_words) { push @display_elements, $_; $count_of_words++ if /\w+/; # only count when "words" are present last unless ($count_of_words - $count); # drop out when at $count } }; # end of text if ($token_ref->[0] =~ /S/i) { # We have the start of a tag next if ($token_ref->[1] =~ /^img/i); # push the raw HTML onto display-list push @display_elements, pop @$token_ref; # push a reference to the closing tag & closing # element onto the tag stack push @tag_stack, $token_ref->[1]; }; # end of start tag if ($token_ref->[0] =~ /e/i) { # We have the end of a tag # push the raw HTML onto display-list push @display_elements, pop @$token_ref; # the raw HTML # now to pop it off the stack (hopefully) my $tag = $token_ref->[1]; my $top_tag = pop @tag_stack; push @tag_stack, $top_tag unless ($tag eq $top_tag) ; }; # end of end tag } # Now we need to close any outstanding tags, in order # We have a list of element names, so now we close them foreach (@tag_stack) { my $tag = "<\/$_>"; push @display_elements, $tag; } $text = join '', @display_elements; print "Full text: $full_text\n\nLeader: $text\n";

-- Ian Stuart
A man depriving some poor village, somewhere, of a first-class idiot.