use strict;
use warnings 'all';

# The phrase we are testing
my $input = '"this is a phrase" +one -two .three, "another" phrase "unbalalanced bit, ""remove';
print $input, "\n";

# Grab the chunks and sort them into phrases and terms
my ($phrases, $terms) = parse_query($input);

# Display the phrases and terms
for my $phrase (@{$phrases}) {
    print 'phrase: ', $phrase, "\n";
}
for my $term (@{$terms}) {
    print "term: '", $term, "'\n";
}

#
# Sub parse_query
#
# Walks the query string from left to right and cuts it into chunks:
# a balanced "quoted phrase", a run of unquoted text, or an unbalanced
# quote together with everything that follows it.
#
# Balanced quoted chunks become phrases; everything else is handed to
# cleanup() as terms. Phrases that are blank after cleanup (for example
# an empty "" pair) are dropped.
#
# Takes:   the query string (undef is treated as an empty query)
# Returns: a two element list: (\@phrases, \@terms)
#
sub parse_query {
    my ($query) = @_;

    my (@phrases, @terms);
    return (\@phrases, \@terms) unless defined $query;

    # \G with /gc resumes each match where the previous one stopped, so
    # the three alternatives consume the string without gaps or overlap.
    while ($query =~ m/\G("[^"]*"|[^"]+|"[^"]*)/gc) {
        my $chunk = $1;

        # Only a chunk that both starts and ends with a quote is a phrase.
        # Testing just the last character would misfile a lone trailing
        # quote as an (empty) phrase.
        if ($chunk =~ m/\A"[^"]*"\z/) {
            my $phrase = cleanup('phrase', $chunk);
            push @phrases, $phrase if $phrase =~ m/\S/;
        }
        else {
            push @terms, cleanup('term', $chunk);
        }
    }

    return (\@phrases, \@terms);
}

#
# Sub cleanup
#
# Removes quotes and collapses runs of whitespace into a single space.
# In the case of a term it also replaces all punctuation (other than
# a + or a -) with a space and splits the result on whitespace.
#
# Takes:   the context ('phrase', anything else is handled as a term)
#          and the raw chunk of text
# Returns: the cleaned string for a phrase, the list of words for a
#          term, or nothing when no context is given
#
# The chunk is copied before it is edited, so the caller's variable is
# left untouched.
#
sub cleanup {
    my ($context, $text) = @_;

    return unless defined $context;
    $text = '' unless defined $text;

    $text =~ tr/"//d;

    if ($context eq 'phrase') {
        $text =~ s/\s+/ /g;
        return $text;
    }

    $text =~ s/[^\w+-]+/ /g;

    # split ' ' skips leading whitespace, so no separate trim is needed
    return split ' ', $text;
}