Your code and the solutions already presented won't handle nested parens correctly. For example,
NOT ("test(s)"[MESH] AND ("A"[MESH] OR "B"[MESH]))
will fail. The solution below works better (although it uses an "experimental" regexp feature.
#/usr/bin/perl use strict; use warnings; my $string = 'NOT ("test(s)"[MESH] AND ("A"[MESH] OR "B"[MESH]))'; my $parens_guts; # Can't combine this line with the next one. $parens_guts = qr/ (?: "[^"]*" | \( (??{ $parens_guts }) \) | [^"()] )* /sx; $string =~ s/ \s* NOT \s* (?: "[^"]*"\[MESH\] | \( (??{ $parens_guts }) \) ) //gsx; print("[$string]$/");
I could write a Parse::RecDescent solution if you don't want to use the "experimental" (??{ ... }).
Update: I went and did the Parse::RecDescent version for fun at lunch.
make_parser.pl -- Run this once to create MeshGrammar.pm
#!/usr/bin/perl # make_parser.pl use strict; use warnings; use Parse::RecDescent (); my $grammar = <<'__EOI__'; parse : expr eof { $item[1] } eof : /^\Z/ expr : expr_(s?) term { [ (map{@$_}@{$item[1]}), $item[2] ] } expr_ : term binop { [ $item[1], $item[2] ] } term : unary term { [ $item[1], $item[2] ] } | /"[^"]*"\[MESH\]/ { [ 'MESH', $item[1] ] } | '(' expr ')' { [ 'PAREN', $item[2] ] } binop : 'OR' { $item[1] } | 'AND' { $item[1] } | 'NOT' { $item[1] } # unary : 'NOT' { $item[1] } # I don't know what to do with it, # so I'm not supporting it. unary : { undef } __EOI__ $::RD_HINT = 1; # $::RD_TRACE = 1; rename('MeshGrammar.pm', 'MeshGrammar.pm.bak'); Parse::RecDescent->Precompile($grammar, 'MeshGrammar') or die("Bad grammar.\n");
remove_not.pl
#!/usr/bin/perl # remove_not.pl use strict; use warnings; use MeshGrammar (); sub process_term { our $output; local *output = \$_[0]; my $term = $_[1]; if ($term->[0] eq 'MESH') { $output .= $term->[1]; return; } if ($term->[0] eq 'PAREN') { $output .= '('; process_expr($output, $term->[1]); $output .= ')'; return; } if ($term->[0] eq 'NOT') { warn("WARNING: Behaviour for unary NOT not defined. Unary NOT no +t removed.\n"); $output .= $term->[0]; $output .= ' '; process_term($output, $term->[1]); return; } die(); # Should never reach here. } sub process_expr { our $output; local *output = \$_[0]; my $expr = $_[1]; my $i = 0; my $n = @$expr; my $term; my $op; $term = $expr->[$i++]; process_term($output, $term); while ($i != $n) { $op = $expr->[$i++]; $term = $expr->[$i++]; next if ($op eq 'NOT'); $output .= ' '; $output .= $op; $output .= ' '; process_term($output, $term); } } my $parser = MeshGrammar->new(); # The following can be put in a loop. my $string = <<'__EOI__'; ("Immunologic and Biological Factors"[MESH] OR "Immunosuppressive Agen +ts"[MESH] OR "Transplantation Immunology"[MESH] OR "Allergy and Immun +ology"[MESH] OR "Graft vs Host Disease"[MESH]) NOT ("Foo"[MESH] OR "B +ar"[MESH]) AND ("Kidney Transplantation"[MESH] OR "Liver Transplantat +ion"[MESH] OR "Heart Transplantation"[MESH]) NOT ("My Term"[MESH] OR +"Blah"[MESH]) NOT "foobar"[MESH] __EOI__ my $tree = $parser->parse($string) or die("Bad text.\n"); # require Data::Dumper; # print Data::Dumper::Dumper($tree); my $output = ''; process_expr($output, $tree); print($output, $/);
In reply to Re: refine regex
by ikegami
in thread refine regex
by rsiedl
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |