use HTML::Parser;
use Data::Dumper;
# Sample input for parse_string().
# NOTE(review): the inline <tagN> markup is reconstructed from the sample
# output shown after the "####" separator at the end of this file; the string
# previously contained no tags at all, so every word would have been filed
# under 'no_tag'. Confirm against the original post.
my $tag_string = 'word1 <tag0>word2 <tag1>word3 word4</tag1> word5</tag0> word6 '
    . '<tag2>word7 word8</tag2> word9 <tag3>word10</tag3> word11';
my $tags = parse_string( $tag_string );

# Sort hash keys so the dump is reproducible from run to run.
local $Data::Dumper::Sortkeys = 1;
print Dumper( $tags );
# parse_string( $string )
#
# Splits the text of an HTML-like string into whitespace-separated words and
# groups them by the innermost tag that encloses them.
#
# Parameters:
#   $string - the markup to parse.
#
# Returns:
#   A hash reference mapping a tag name to an array reference of the words
#   found directly inside that tag, in document order. Words that are not
#   inside any open tag are stored under the key 'no_tag'.
#
# Side effects:
#   Prints "<TAG> is not valid" to STDOUT for every tag name whose number of
#   start tags differs from its number of end tags. Tags that are never
#   closed in the input are therefore reported.
sub parse_string {
    my $string = shift;

    my %tags;          # tag name => [ words seen directly inside that tag ]
    my %balance;       # tag name => (start tags seen) - (end tags seen)
    my @open_tags;     # stack of currently open tag names, innermost last

    # process start tag event
    my $start = sub {
        my ($tag) = @_;
        $balance{$tag}++;
        # A stack (rather than a single "previous tag" scalar) keeps the
        # enclosing tag correct for any nesting depth.
        push @open_tags, $tag;
    };

    # process text event (also used for everything without its own handler)
    my $text = sub {
        my ($chunk) = @_;
        $chunk =~ s/^\s+//;
        $chunk =~ s/\s+$//;
        return if not length $chunk;

        my $current_tag = @open_tags ? $open_tags[-1] : 'no_tag';
        push @{ $tags{$current_tag} }, split( m/\s+/, $chunk );
    };

    # process end tag event
    my $end = sub {
        my ($tag) = @_;
        $balance{$tag}--;
        # Popping an empty stack is a no-op, so a stray end tag cannot push
        # the nesting depth below zero.
        pop @open_tags;
    };

    my $parser = HTML::Parser->new(
        api_version => 3,
        start_h     => [ $start, "tagname" ],
        text_h      => [ $text,  "text" ],
        end_h       => [ $end,   "tagname" ],
        default_h   => [ $text,  "text" ],
    );
    $parser->parse($string);
    $parser->eof;

    # check each tag has an end tag; a non-zero balance also catches cases a
    # parity check would miss, such as two start tags and no end tag
    for my $tag ( sort keys %balance ) {
        if ( $balance{$tag} != 0 ) {
            print "<$tag> is not valid\n";
        }
    }

    return \%tags;
}
#### Sample output (Data::Dumper dump of the hash reference returned by parse_string):
$VAR1 = {
'no_tag' => [
'word1',
'word6',
'word9',
'word11'
],
'tag0' => [
'word2',
'word5'
],
'tag1' => [
'word3',
'word4'
],
'tag2' => [
'word7',
'word8'
],
'tag3' => [
'word10'
]
};