in reply to Re: Problems counting regex matches
in thread Problems counting regex matches
... this won't work for all cases though: if you have two matching words in the neighbourhood of the same (currency|foreign exchange), just one will be counted. For example, in "Currency revenue sales growth", you'll just get "revenue" ...
The following works for overlapping matches. It also needs 5.10+ because in addition to (?|pattern), it uses (*FAIL) from the Special Backtracking Control Verbs introduced in that version. The variation that only counts occurrences may be a little faster.
# Find (and count) every pairing of a "revenue/sales/growth" word with a
# "currency/foreign exchange" term that lies within $max_between words of
# it, in either order, INCLUDING overlapping pairings.
#
# Technique: the whole pattern sits inside a lookahead that always ends in
# (*FAIL).  Each time the engine reaches the embedded code block a pairing
# has been found and is recorded; the forced failure then makes the engine
# backtrack into every other way of matching from that position before
# moving on to the next one.

use 5.010;    # for (?|pattern) and (*FAIL)
use warnings FATAL => 'all';
use strict;

# factored regexes
# Atomic groups (?> ... ) stop the engine from backtracking into a keyword
# once it has matched.
my $r_s_g = qr{ (?i) (?> \b (?: revenues? | sales | growth ) \b) }xms;
my $c_fe  = qr{ (?i) (?> \b (?: currency | foreign \W exchange) \b) }xms;

# One word plus the non-word characters that follow it.
my $word = qr{ \b \w+ \W+ (?: \b | \Z) }xms;

# Up to $max_between intervening words, fewest first.
my $max_between = 4;
my $near        = qr{ (?:$word){0,$max_between}? }xms;

# the test text
my $s = <<'EOT';
Currency revenue and sales growth in foreign exchange Sales and Revenues
EOT

print qq{[[$s]] \n\n};

# extract matches 'in context'
# Package variables are used for the accumulators touched from inside the
# (?{ ... }) blocks, as in the original post.
our @matches;

# Permit (?{ ... }) blocks in patterns that also interpolate variables.
use re 'eval';

# (?| ... ) is a branch reset: both alternatives number their captures
# $1 and $2, so the code block does not care which order matched.
$s =~ m{
    (?=
        (?| ($r_s_g) \W+ $near ($c_fe)
          | ($c_fe)  \W+ $near ($r_s_g)
        )
        (?{ push @matches, [ $1, $2 ] })
        (*FAIL)
    )
}xmsg;

print qq{'$_->[0]' ... '$_->[1]' \n} for @matches;
print qq{\n};

# just count matches
# Start at 0 so that a text with no pairings prints "0 matches" instead of
# dying on an uninitialized value under FATAL warnings.
our $n_matches = 0;

$s =~ m{
    (?=
        (?| $r_s_g \W+ $near $c_fe
          | $c_fe  \W+ $near $r_s_g
        )
        (?{ ++$n_matches })
        (*FAIL)
    )
}xmsg;

print qq{$n_matches matches \n};
Output:
[[Currency revenue and sales growth in foreign exchange Sales and Revenues
]]

'Currency' ... 'revenue'
'Currency' ... 'sales'
'Currency' ... 'growth'
'revenue' ... 'foreign exchange'
'sales' ... 'foreign exchange'
'growth' ... 'foreign exchange'
'foreign exchange' ... 'Sales'
'foreign exchange' ... 'Revenues'

8 matches
|
|---|