Re: getting the first n printable words from a string of HTML

I have a solution, which solves the problem a different way:
#!/usr/cpan/bin/perl

use HTML::TokeParser;
use Data::Dumper;

# Sample HTML to truncate.  It is deliberately untidy: mixed-case tags, an
# unclosed <P>, a comment, and mis-nested <B>/<I>.  The literal is built from
# short pieces so that no source line needs wrapping; the only newlines in the
# value are the two written explicitly as "\n".
my $full_text =
    '<P><img src="/images/logo.png" alt="logo" /> '
  . '<div class="nice_colours">'
  . '<a href="http://edina.ac.uk/">EDINA</a> and '
  . '<a href="http://mimas.ac.uk/">MIMAS</A> '
  . 'are pleased to announce a new set of '
  . '<a href="http://digimap.ac.uk">EDINA Digimap</a> training dates</div>'
  . ' (Modules One and Two only) at the '
  . '<B>Universities of <I>Middlesex</I> and <I>Edinburgh</B></I>.' . "\n"
  . '<!-- This is a comment -->' . "\n"
  . '<P>More details of Digimap training on the '
  . '<a href="http://edina.ac.uk/events/">events page</A>.';

# Number of printable words to keep in the leader.
my $count = 25;

# Tokenizer reading the HTML held in $full_text (handed over by reference).
my $p = HTML::TokeParser->new(\$full_text);

# Pieces of the output (raw tag text and text fragments), in output order.
my @display_elements = ();
# Number of words emitted so far.  Starts at numeric 0 (not "") because it is
# only ever used arithmetically and compared against $count.
my $count_of_words = 0;
# Names of the elements that have been opened and not yet closed,
# outermost first.
my @tag_stack = ();
# NOTE(review): nothing else in the visible script reads or writes this
# array; kept only so the set of declared names is unchanged.
my @problem_tag_stack = ();

# The plan is to get one token at a time, and process it
# If the token is a start-tag, add the tag to the
#    tag-stack, and add the raw text to the display-list
# If the token is an end-tag, then it should match the top
#    tag on the tag-stack, so we pop that off (to show
#    it's not outstanding), and add the raw text to the
#    display-list
# If the token is a comment-tag, we just skip it
# If the token is text, we add it to the display-list,
#    counting the words as we do so - stopping once we
#    have $count words listed.
#
# Once we have the requisite number of words, we then close
#    all the elements still left in the tag-stack
#

# Walk the token stream, copying markup and text into @display_elements
# until $count words have been emitted.
#
# Token layouts relied on below (see the HTML::TokeParser documentation):
#   ["S", $tag, $attr, $attrseq, $text]   start tag
#   ["E", $tag, $text]                    end tag
#   ["T", $text, $is_data]                text
# so the raw source text of a start or end tag is always the last element.
# Every other token type (comments, declarations, ...) is skipped.

# Elements that never take an end tag.  They are still copied to the output
# but must not go on the tag stack, otherwise a bogus closing tag (e.g.
# "</br>") would be generated for them once the word limit is reached.
my %is_void_element = map { $_ => 1 }
  qw(area base br col embed hr img input link meta param source track wbr);

TOKEN:
while (my $token_ref = $p->get_token) {
  # drop out when at $count (>= also copes with a non-integer or
  # already-exceeded limit, where an exact-equality test would never fire)
  last TOKEN if $count_of_words >= $count;

  my $type = $token_ref->[0];

  if ($type eq 'T') {

    # we have some text, so count it and stack it; splitting on word
    # boundaries keeps the whitespace and punctuation between words as
    # separate pieces, so the original spacing survives
    for my $piece (split /\b/, $token_ref->[1]) {
      push @display_elements, $piece;
      # only count pieces that actually contain word characters
      $count_of_words++ if $piece =~ /\w/;
      # stop the whole scan, not just this text node, once at $count
      last TOKEN if $count_of_words >= $count;
    }
  }
  elsif ($type eq 'S') {
    # We have the start of a tag
    my $tag = $token_ref->[1];

    # images are dropped from the leader altogether
    next TOKEN if $tag =~ /^img/i;

    # push the raw HTML onto display-list (read, not popped, so the
    # token itself is left intact)
    push @display_elements, $token_ref->[-1];

    # remember the element name so it can be closed later
    push @tag_stack, $tag unless $is_void_element{ lc $tag };
  }
  elsif ($type eq 'E') {
    # We have the end of a tag

    # push the raw HTML onto display-list
    push @display_elements, $token_ref->[-1];

    # Mark the element as closed: remove the most recently opened entry
    # with the same name.  Searching down the stack (rather than looking
    # only at the top) copes with mis-nested markup such as
    # <B><I>...</B></I>; a stray end tag with no matching start leaves the
    # stack untouched instead of corrupting it.
    my $tag = $token_ref->[1];
    OPEN_TAG:
    for my $depth (reverse 0 .. $#tag_stack) {
      next OPEN_TAG
        unless defined($tag_stack[$depth]) && $tag_stack[$depth] eq $tag;
      splice @tag_stack, $depth, 1;
      last OPEN_TAG;
    }
  }
}

# Now we need to close any outstanding tags.  @tag_stack lists the elements
# still open, outermost first, so their closing tags have to be emitted in
# the opposite order (innermost first) for the result to be properly nested.
push @display_elements, _closing_tags(@tag_stack);

# Build the closing tags for a list of open element names.
#   Takes:   element names, outermost first.
#   Returns: the matching "</name>" strings, innermost first.
# Undefined or empty names are skipped so that no bare "</>" is produced.
sub _closing_tags {
  my @open_tags = @_;
  return map { "</$_>" }
         reverse
         grep { defined($_) && length($_) } @open_tags;
}

# Glue the collected pieces back together and show the truncated leader
# beneath the untouched input.  $text is declared lexically rather than
# left as an implicit package global.
my $text = join '', @display_elements;
print "Full text: $full_text\n\nLeader: $text\n";
[download]
-- Ian Stuart
A man depriving some poor village, somewhere, of a first-class idiot.
Comment on Re: getting the first n printable words from a string of HTML Download Code