Re: Parsing string with tags

This is fairly verbose, but I like to use HTML::Parser for parsing non-standard tagged text because it is very flexible and can be fine-tuned. It is also good as a learning exercise.

use HTML::Parser;
use Data::Dumper;

# Sample input: plain words interleaved with nested and sibling tags.
# The literal is built from two concatenated pieces so that no line-wrap
# artifact (newline or '+') can end up inside the data being parsed.
my $tag_string = 'word1 <tag0> word2 <tag1>word3 word4</tag1> word5 </tag0> '
               . 'word6 <tag2>word7 word8</tag2> word9 <tag3>word10</tag3> word11';

# Group the words by enclosing tag and dump the resulting structure.
my $tags = parse_string( $tag_string );
print Dumper( $tags );

# Parse a string containing HTML-like tags and group its words by the
# innermost tag that encloses them.
#
# Takes:   the string to parse.
# Returns: a hash reference mapping each tag name (or 'no_tag' for words
#          outside any tag) to an array reference of the whitespace-separated
#          words found directly inside that tag, in document order.
#
# Also prints a "<tag> is not valid" line for every tag name whose number of
# start tags differs from its number of end tags.
sub parse_string {
    my $string = shift;

    my %tags;
    my %open_count;     # start tags seen, per tag name
    my %close_count;    # end tags seen, per tag name
    my @open_tags;      # stack of currently open tags, innermost last

    # process start tag event
    my $start = sub {
        my ($tag) = @_;
        $open_count{$tag}++;
        push @open_tags, $tag;
    };

    # process text event
    my $text = sub {
        my ($chunk) = @_;
        $chunk =~ s/^\s+//;
        $chunk =~ s/\s+$//;
        return if not length $chunk;

        # Words belong to the innermost open tag; a stack (rather than a
        # single "previous tag" variable) keeps this correct at any depth.
        my $current_tag = @open_tags ? $open_tags[-1] : 'no_tag';
        push( @{ $tags{$current_tag} }, split(m/\s+/, $chunk) );
    };

    # process end tag event
    my $end = sub {
        my ($tag) = @_;
        $close_count{$tag}++;

        # Close the innermost open occurrence of this tag, together with any
        # unclosed tags nested inside it. An end tag that matches nothing on
        # the stack leaves the nesting untouched; it is still reported by the
        # validity check below.
        for my $i (reverse 0 .. $#open_tags) {
            if ($open_tags[$i] eq $tag) {
                splice @open_tags, $i;
                last;
            }
        }
    };

    # Events without a more specific handler are routed to the text handler
    # as well (default_h).
    my $parser = HTML::Parser->new(
        api_version => 3,
        start_h     => [$start, "tagname"],
        text_h      => [$text,  "text"],
        end_h       => [$end,   "tagname"],
        default_h   => [$text,  "text"],
    );

    $parser->parse($string);
    $parser->eof;

    # check each tag has an end tag: start and end counts must agree.
    # Comparing the two counts (instead of testing the parity of their sum)
    # also catches a tag that was opened twice and never closed.
    my %seen = map { $_ => 1 } (keys %open_count, keys %close_count);
    for my $tag (sort keys %seen) {
        my $opened = $open_count{$tag}  || 0;
        my $closed = $close_count{$tag} || 0;
        if ($opened != $closed) {
            print "<$tag> is not valid\n";
        }
    }

    return \%tags;
}
[download]

Output:

$VAR1 = {
    'no_tag' => [
        'word1',
        'word6',
        'word9',
        'word11'
    ],
    'tag0' => [
        'word2',
        'word5'
    ],
    'tag1' => [
        'word3',
        'word4'
    ],
    'tag2' => [
        'word7',
        'word8'
    ],
    'tag3' => [
        'word10'
    ]
};
[download]

Comment on Re: Parsing string with tags Select or Download Code


Keep It Simple, Stupid
	PerlMonks