in reply to Parsing newick trees
# Parse::NewickSimple.pm 20oct08waw =comment Parsing newick trees by citromatik (Chaplain) on Oct 17, 2008 at 15:27 UTC (#717769=perlquestion) (PerlMonks) Hi all I'm trying to write a script that parses recursively simple newick binary trees. For example, having: (A,(B,C)) I want to obtain a data structure that represents the binary tree, like: $VAR1 = { 'LEFT' => { 'VALUE' => 'A' }, 'RIGHT' => { 'LEFT' => { 'VALUE' => 'B' }, 'RIGHT' => { 'VALUE' => 'C' } } }; ... citromatik =cut package Parse::NewickSimple; { # begin package closure use warnings; use strict; our $VERSION = '0.1.0'; our @EXPORT = (); # no default exports our @EXPORT_OK = qw(parse); use Exporter; our @ISA = qw(Exporter); BEGIN { # begin closure for initialized debug control my %db = map { $_ => undef } qw{ -PARSE_TRACE }; sub db_onoff { my $onoff = shift; $db{$_} = $onoff for grep { exists $db{$_} or die "unknown debug flag $_" } @_; } sub DBon (;*****************) { db_onoff( 1, @_ ? @_ : keys %db) } sub DBoff (;*****************) { db_onoff(undef, @_ ? @_ : keys %db) } sub DB (;*****************) { return scalar grep $db{$_}, @_ } sub Any (;*****************) { return DB @_ } sub All (;*****************) { return @_ == DB @_ } } # end closure for debug control { # begin closure for parse depth tracking my $depth; # recursion depth sub fresh_parse { $depth = 0 } sub enter_parse () { ++$depth } sub leave_parse ($) { --$depth; $_[0] } sub parse_depth () { $depth } } # end closure for parse depth tracking BEGIN { # begin closure for parsing and initialized error tracking my $error = ''; # parse error string sub error () { $error } sub parse { my ($expr, # string, simple newick expression to parse ) = @_; fresh_parse; local $@ = ''; my $hr_parsed = eval { _parse($expr) }; $error = $@; return $hr_parsed; } } # end closure for parsing and parse error tracking sub _parse { my ($expr, # string: expression to parse ) = @_; DB -PARSE_TRACE and printf "%2d -> `%s' \n", parse_depth, $expr; # CAUTION: these regexes MUST NOT capture. my $value_subexpr = qr{ [a-zA-Z]+ }oxms; my $pure_value = qr{ \A $value_subexpr \z }oxms; # return expression if it is a pure value in a newick expression return $expr if $expr =~ $pure_value and parse_depth; enter_parse; # not a pure value; gotta do some work # NOTE: for reliable performance, these regexes # should be mutually exclusive. my $opener = qr{ \( }oxms; my $closer = qr{ \) }oxms; my $separator = qr{ , }oxms; my $ws = qr{ \s+ }oxms; my $not_open_or_close = qr{ (?! $opener | $closer ) . }oxms; use re 'eval'; my $parenthetic_subexpr; # MUST define and initialize SEPARATELY!?!? (5.8 or 5.8.2 bug?) $parenthetic_subexpr = qr{ $opener (?: (?> $not_open_or_close+ ) # no backtracking | (??{ $parenthetic_subexpr }) # recurse )+ $closer }oxms; no re 'eval'; # regex mutual exclusivity no longer important. my $subexpr = qr{ (?: $value_subexpr | $parenthetic_subexpr ) }oxms; # ok to capture now. # NOTE: 5.10 named capture would be nice here. my ($left, $right) = $expr =~ qr{ # return $1, $2 captures \A $ws? # start of string, maybe some whitespace $opener $ws? # ($subexpr) $ws? # $1 captures left subexpression $separator $ws? # ($subexpr) $ws? # $2 captures right subexpression $closer $ws? # \z # end of string }oxms or die "depth ", parse_depth, ": failed parsing `$expr' \n"; # return anon. hash reference to parsed (sub)expression return leave_parse { LEFT => _parse($left ), RIGHT => _parse($right), }; } # end sub _parse() } # end Parse::NewickSimple package closure 1; # flag inclusion success
|
|---|