It's much easier to parse a structure if you mark where each section ends as well as where it starts. You could do something like the following:
use strict;
use warnings;
use Data::Dumper;
# Read the whole embedded DATA section as one string, parse it into a
# nested structure with main(), and print that structure for inspection.
my $input = do { local $/; <DATA> };    # undef $/ => read everything at once
my $structure = main($input);
print Dumper($structure);
# Parse a complete document into a hash of pages.
#
# Takes the document text as its first argument and returns a hash
# reference with one entry per "START PAGE [name] ... END PAGE" section.
# A section is keyed by its name when it has one, otherwise by a running
# counter (1, 2, ...) of the unnamed pages seen so far.  Each value is
# whatever page() returns for the text between the markers.
sub main {
    my ($s) = @_;
    my $c = 0;
    my %hash;
    while ($s =~ /START PAGE(?: (\w+))?\s+(.*?)\s+END PAGE/gs) {
        # Copy the captures immediately so the key does not depend on $1
        # still being intact after page() has run its own matches.
        my ($name, $body) = ($1, $2);

        # Test definedness rather than truth: a page named "0" is named.
        my $key = defined $name ? $name : ++$c;
        $hash{$key} = page($body);
    }
    return \%hash;
}
# Parse the body of one page into a hash of questions.
#
# Takes the page text as its first argument and returns a hash reference
# with one entry per "START QUESTION [name] ... END QUESTION" section.
# A section is keyed by its name when it has one, otherwise by a running
# counter (1, 2, ...) of the unnamed questions seen so far.  Each value is
# whatever question() returns for the text between the markers.
sub page {
    my ($s) = @_;
    my $c = 0;
    my %hash;
    while ($s =~ /START QUESTION(?: (\w+))?\s+(.*?)\s+END QUESTION/gs) {
        # Copy the captures immediately so the key does not depend on $1
        # still being intact after question() has run its own matches.
        my ($name, $body) = ($1, $2);

        # Test definedness rather than truth: a question named "0" is named.
        my $key = defined $name ? $name : ++$c;
        $hash{$key} = question($body);
    }
    return \%hash;
}
# Parse the body of one question.
#
# Takes the question text as its first argument and returns a hash
# reference with these keys:
#   label   - the text following "LABEL " up to the end of that line, or
#             undef when the text has no LABEL line.
#   choices - an array reference with one [first_field, rest] pair per line
#             of the "START CHOICES ... END CHOICES" block, in source order.
#             The key is absent when there is no such block or it is empty.
sub question {
    my ($s) = @_;
    my %hash;

    ($hash{'label'}) = $s =~ /LABEL (.*)/;

    # Only read $1 when the CHOICES match succeeded; after a failed match
    # $1 would still hold the label captured above.
    if ($s =~ /START CHOICES\s+(.*?)\s+END CHOICES/s) {
        for my $line (split / *\n */, $1) {
            # Split on the first space only, so the choice text may itself
            # contain spaces.
            push @{ $hash{'choices'} }, [ split / /, $line, 2 ];
        }
    }

    return \%hash;
}
__DATA__
START PAGE p1
START QUESTION 4B
LABEL Do you like your pie with ice cream?
START CHOICES
1 Yes
2 No
END CHOICES
END QUESTION
START QUESTION 4C
LABEL Do you like your pie with whipped cream?
START CHOICES
1 Yes
2 No
END CHOICES
END QUESTION
END PAGE
I've made the choices into arrays instead of hashes, to preserve order.