in reply to Break string into array

It's basically the same problem as recursive algorithm for nested data structures

Update: Actually, it differs more than I first thought, because you said you can't have both OR and AND in the same list.

# make_parser.pl
#
# Builds Parser.pm from a Parse::RecDescent grammar for boolean search
# queries such as
#
#     dogs OR cats OR "flying fish" OR (shrimp AND squid)
#
# A successful parse yields a nested array ref whose first element is the
# operator and whose remaining elements are the operands, e.g.
#
#     [ 'OR', 'dogs', 'cats', 'flying fish', [ 'AND', 'shrimp', 'squid' ] ]
#
# A single list may not mix AND and OR; parentheses are required to nest
# one kind of list inside the other.

use strict;
use warnings;

use Parse::RecDescent qw( );

my $grammar = <<'__EOI__';

   # Startup action: helper code made available to the rule actions.
   # dequote strips the surrounding double quotes from a STRING token
   # and removes one level of backslash escaping.
   {
      use strict;
      use warnings;

      sub dequote {
         my ($s) = @_;
         $s =~ s/^"//;
         $s =~ s/"\z//;
         $s =~ s/\\(.)/$1/sg;
         return $s;
      }
   }

   # Entry point: one list that must consume the whole input.
   parse     : list /\Z/ { $item[1] }

   # The first term is handed to list_ as an argument, so the operator
   # that follows it decides which kind of list is being built.
   list      : term list_[ $item[1] ]

   # <commit> stops the parser from falling through to the other
   # alternatives once an operator keyword has been seen.
   list_     : "AND" <commit> and_list { [ $item[1] => $arg[0], @{$item[3]} ] }
             | "OR"  <commit> or_list  { [ $item[1] => $arg[0], @{$item[3]} ] }
             | { $arg[0] }

   and_list  : term and_list_ { [ $item[1], @{$item[2]} ] }
   and_list_ : "AND" <commit> term and_list_ { [ $item[3], @{$item[4]} ] }
             | { [] }

   or_list   : term or_list_ { [ $item[1], @{$item[2]} ] }
   or_list_  : "OR" <commit> term or_list_ { [ $item[3], @{$item[4]} ] }
             | { [] }

   term      : IDENT
             | STRING
             | '(' list ')' { $item[2] }

   IDENT     : /\w+/

   # The character class excludes the double quote as well as the
   # backslash. Without that, the greedy match ran on to the LAST quote
   # in the input and merged several quoted phrases into one string.
   STRING    : /"(?:\\.|[^\\"])*"/s { dequote($item[1]) }

__EOI__

# Writes Parser.pm into the current directory.
Parse::RecDescent->Precompile($grammar, 'Parser')
   or die("Bad grammar\n");
# test.pl
#
# Parses a sample query with the precompiled Parser module (generated by
# make_parser.pl) and dumps the resulting tree to standard output.

use strict;
use warnings;

use Data::Dumper qw( Dumper );
use Parser       qw( );

my $query = 'dogs OR cats OR "flying fish" OR (shrimp AND squid)';

my $query_parser = Parser->new();

# The start rule is called in list context and its result is handed
# straight to Dumper.
my @parsed = $query_parser->parse($query);

print(Dumper(@parsed));
>perl make_parser.pl >perl test.pl $VAR1 = [ 'OR', 'dogs', 'cats', 'flying fish', [ 'AND', 'shrimp', 'squid' ] ];

Update: Small changes to grammar to eliminate backtracking.

Replies are listed 'Best First'.
Re^2: Break string into array
by Anonymous Monk on Sep 18, 2009 at 08:48 UTC
    Here is how that might look with Regexp::Grammars
    #!/usr/bin/perl --
    #
    # Parses a boolean search query with Regexp::Grammars, then reshapes the
    # raw match hash into a nested [ OPERATOR, operand, ... ] array.
    # The tree is dumped three times: raw, with the "" keys removed, and
    # after reordering.

    use strict;
    use warnings;

    use Data::Dumper ();

    my $s = q[dogs OR cats OR "flying fish" OR (shrimp AND squid)];

    # The grammar extension is loaded inside the do block so that it only
    # applies to this one regex.
    my $parser = do {
        use Regexp::Grammars;
        qr{
            # <logfile: - >
            <[TERM]>*

            <rule: TERM>
                <OP> | <MATCH=IDENT> | <MATCH=STRING> | <LIST>

            <rule: STRING>
                "([^"]+?)"

            <rule: OP>
                AND|OR

            <rule: IDENT>
                \w+

            <rule: LIST>
                \( <[TERM]>* \)
        }xs
    };

    # Prints one structure in the compact, double-quoted Dumper format,
    # followed by an empty line.
    my $show = sub {
        my ($data) = @_;
        print Data::Dumper->new( [$data] )->Indent(1)->Useqq(1)->Dump, "\n";
    };

    if ( $s =~ $parser ) {

        # Copy the match result out of %/ and then clear %/ itself.
        my %tree = %/;    # / <- keeps naive syntax highlighters balanced
        undef %/;         # / <- same

        $show->( \%tree );

        kek( \%tree );    # kill empty key
        $show->( \%tree );

        my $ordered = reorder_terms( \%tree );    # consumes %tree
        $show->($ordered);
    }

    # reorder_terms( $ref )
    #
    # Takes a hash ref holding a TERM array and returns an array ref made of
    # the first operator seen followed by every operand in order. Hash
    # entries in TERM must carry either an OP key (an operator) or a LIST
    # key (a nested list, converted recursively); anything that is not a
    # reference is taken as an operand. The input hash is emptied before
    # returning. Dies when the TERM key is missing or false, or when a hash
    # entry has neither OP nor LIST.
    sub reorder_terms {
        my ($ref) = @_;

        die "uh oh, no TERM key" unless $ref->{TERM};

        my ( @operators, @operands );

        for my $node ( @{ $ref->{TERM} } ) {
            if ( !ref $node ) {
                push @operands, $node;
                next;
            }

            if ( $node->{OP} ) {
                push @operators, delete $node->{OP};
            }
            elsif ( $node->{LIST} ) {
                push @operands, reorder_terms( delete $node->{LIST} );
            }
            else {
                die "uh oh, no OP or LIST key";
            }
        }

        undef %$ref;

        # Only the first operator is kept; the alternative of returning
        # every operator, [ @operators, @operands ], is left disabled.
        return [ $operators[0], @operands ];
    }

    # kek( $ref )
    #
    # Walks a structure of nested hash and array refs and deletes the ""
    # key from every hash it reaches. Returns nothing.
    sub kek {
        my ($ref) = @_;

        my $kind = ref $ref;
        my @children;

        if ( $kind eq 'HASH' ) {
            delete $ref->{""};
            @children = values %$ref;
        }
        elsif ( $kind eq 'ARRAY' ) {
            @children = @$ref;
        }

        # Only references can contain further hashes.
        kek($_) for grep { ref } @children;

        return;
    }
    __END__
    $VAR1 = {
      "" => "dogs OR cats OR \"flying fish\" OR (shrimp AND squid)",
      "TERM" => [
        "dogs",
        {
          "" => " OR",
          "OP" => "OR"
        },
        "cats",
        {
          "" => " OR",
          "OP" => "OR"
        },
        "\"flying fish\"",
        {
          "" => " OR",
          "OP" => "OR"
        },
        {
          "" => " (shrimp AND squid)",
          "LIST" => {
            "" => "(shrimp AND squid)",
            "TERM" => [
              "shrimp",
              {
                "" => " AND",
                "OP" => "AND"
              },
              "squid"
            ]
          }
        }
      ]
    };

    $VAR1 = {
      "TERM" => [
        "dogs",
        {
          "OP" => "OR"
        },
        "cats",
        {
          "OP" => "OR"
        },
        "\"flying fish\"",
        {
          "OP" => "OR"
        },
        {
          "LIST" => {
            "TERM" => [
              "shrimp",
              {
                "OP" => "AND"
              },
              "squid"
            ]
          }
        }
      ]
    };

    $VAR1 = [
      "OR",
      "dogs",
      "cats",
      "\"flying fish\"",
      [
        "AND",
        "shrimp",
        "squid"
      ]
    ];
    Uncomment the <logfile: - > directive to get debug output. See also KinoSearch::Docs::Cookbook::CustomQueryParser and Text::Query.

      Not only does your solution allow

      my $s = q[dogs OR cats AND "flying fish" OR (shrimp AND squid)];
      but it also parses that to the same result as
      my $s = q[dogs OR cats OR "flying fish" OR (shrimp AND squid)];

      And it also allows

      my $s = q[OR dogs OR cats OR "flying fish" OR (shrimp AND squid)];

      Finally, quoted strings are left quoted. A parser should return the value a string literal denotes, not the literal itself. If you want to differentiate between quoted and unquoted terms, you'll need to add that information to the parse tree.

      $VAR1 = [ "OR", [ term => "dogs" ], [ term => "cats" ], [ phrase => "flying fish" ], [ "AND", [ term => "shrimp" ], [ term => "squid" ], ] ];
        That is all true, but it does satisfy the OP's requirement/example :)