This is fairly verbose, but I like to use
HTML::Parser for parsing non-standard tagged text because it is very flexible and can be fine-tuned. It is also good as a learning exercise.
use strict;
use warnings;
use HTML::Parser;
use Data::Dumper;
# Sample input: free text mixed with nested, non-HTML tags. The literal is
# split with the concatenation operator so that no line break ends up inside
# the string itself (a break inside "</tag0>" would corrupt that end tag).
my $tag_string = 'word1 <tag0> word2 <tag1>word3 word4</tag1> word5 </tag0> '
    . 'word6 <tag2>word7 word8</tag2> word9 <tag3>word10</tag3> word11';

# Group the words by enclosing tag and dump the resulting hash reference.
my $tags = parse_string( $tag_string );
print Dumper( $tags );
# parse_string($string)
#
# Splits tagged text into whitespace-separated words and groups them by the
# innermost tag that encloses them. Words outside any tag are collected under
# the key 'no_tag'.
#
# Parameters:
#   $string - the tagged text to parse.
#
# Returns:
#   A hash reference mapping each tag name (or 'no_tag') to an array
#   reference of the words found directly inside that tag, in input order.
#
# Side effects:
#   Prints "<TAG> is not valid" to the currently selected output handle for
#   every tag whose start and end tags do not balance.
sub parse_string {
    my $string = shift;

    my %tags;          # tag name => array ref of words directly inside it
    my %check_tags;    # tag name => (start tags seen) - (end tags seen)
    my @open_tags;     # stack of currently open tags, innermost last

    # Process start tag event: the new tag becomes the innermost one.
    my $on_start = sub {
        my ($tag) = @_;
        $check_tags{$tag}++;
        push @open_tags, $tag;
    };

    # Process text event: file each word under the innermost open tag.
    my $on_text = sub {
        my ($text) = @_;
        $text =~ s/^\s+//;
        $text =~ s/\s+$//;
        return if not length $text;

        my $current_tag = @open_tags ? $open_tags[-1] : 'no_tag';
        push @{ $tags{$current_tag} }, split( m/\s+/, $text );
    };

    # Process end tag event: close back to the matching start tag. A stack is
    # needed here because a single "previous tag" variable cannot restore the
    # right parent once tags are nested more than two levels deep.
    my $on_end = sub {
        my ($tag) = @_;
        $check_tags{$tag}--;

        for my $i ( reverse 0 .. $#open_tags ) {
            next if $open_tags[$i] ne $tag;

            # Drop the matching tag and anything left open inside it.
            splice @open_tags, $i;
            last;
        }

        # An end tag with no matching open tag leaves the stack untouched;
        # it is still reported by the balance check below.
    };

    my $parser = HTML::Parser->new(
        api_version => 3,
        start_h     => [ $on_start, "tagname" ],
        text_h      => [ $on_text,  "text" ],
        end_h       => [ $on_end,   "tagname" ],
        # Any event without a dedicated handler is treated as plain text.
        default_h   => [ $on_text,  "text" ],
    );
    $parser->parse($string);
    $parser->eof;

    # Check each tag has an end tag: start and end counts must cancel out.
    # Sorted so the report order is stable between runs.
    for my $tag ( sort keys %check_tags ) {
        if ( $check_tags{$tag} != 0 ) {
            print "<$tag> is not valid\n";
        }
    }

    return \%tags;
}
Output:
$VAR1 = {
'no_tag' => [
'word1',
'word6',
'word9',
'word11'
],
'tag0' => [
'word2',
'word5'
],
'tag1' => [
'word3',
'word4'
],
'tag2' => [
'word7',
'word8'
],
'tag3' => [
'word10'
]
};