#!/usr/cpan/bin/perl use HTML::TokeParser; use Data::Dumper; my $full_text = '

logo

EDINA and MIMAS are pleased to announce a new set of EDINA Digimap training dates
(Modules One and Two only) at the Universities of Middlesex and Edinburgh.

More details of Digimap training on the events page.'; my $count = 25; my $p = HTML::TokeParser->new(\$full_text); my @display_elements = (); my $count_of_words = ""; my @tag_stack = (); my @problem_tag_stack = (); # The plan is to get one token at a time, and process it # If the token is a start-tag, add the tag to the # tag-stack, and add the raw text to the display-list # If the token is an end-tag, then it should match the top # tag on the tag-stack, so we pop that off (to show # it's not outstanding), and add the raw text to the # display-list # If the token is a comment-tag, we just skip it # If the token is text, we add it to the display-list, # counting the words as we do so - stopping once we # have $words listed. # # Once we have the requisit number of words, we then close # all the elements still left in the tag-stack # while ($token_ref = $p->get_token) { last unless ($count_of_words - $count); # drop out when at $count if ($token_ref->[0] =~ /T/i) { # we have some text, so count it and stack it my @local_words = split /\b/, $token_ref->[1]; # split on boundries foreach $_ (@local_words) { push @display_elements, $_; $count_of_words++ if /\w+/; # only count when "words" are present last unless ($count_of_words - $count); # drop out when at $count } }; # end of text if ($token_ref->[0] =~ /S/i) { # We have the start of a tag next if ($token_ref->[1] =~ /^img/i); # push the raw HTML onto display-list push @display_elements, pop @$token_ref; # push a reference to the closing tag & closing # element onto the tag stack push @tag_stack, $token_ref->[1]; }; # end of start tag if ($token_ref->[0] =~ /e/i) { # We have the end of a tag # push the raw HTML onto display-list push @display_elements, pop @$token_ref; # the raw HTML # now to pop it off the stack (hopefully) my $tag = $token_ref->[1]; my $top_tag = pop @tag_stack; push @tag_stack, $top_tag unless ($tag eq $top_tag) ; }; # end of end tag } # Now we need to close any outstanding tags, in order # We have a list of element names, so now we close them foreach (@tag_stack) { my $tag = "<\/$_>"; push @display_elements, $tag; } $text = join '', @display_elements; print "Full text: $full_text\n\nLeader: $text\n";