in reply to Efficient regex matching with qr//; Can I do better?
use warnings; use strict; use Data::Dumper; MAIN: { # begin main loop # define hashes for text and patterns. my %text = ( text_id_1 => 'four score and foo bar baz seven years', text_id_2 => 'to fee fie foe or not', text_id_3 => 'to wibble or not to wobble', text_id_4 => 'text with no patterns present', text_id_5 => 'text with fee fie foe and wobble present', # ...etc. ); my %patterns = ( 'fee fie foe' => 'hi_lvl_id_1', 'foo bar baz' => 'hi_lvl_id_2', 'wibble' => 'hi_lvl_id_3', 'wobble' => 'hi_lvl_id_4', 'quux' => 'hi_lvl_id_5', # pattern present in no text # ...etc. ); # preprocess patterns hash for data extraction. while (my ($string, $id) = each %patterns) { $patterns{$string} = { hi_lvl_id => $id, regex => qr{ \b \Q$string\E \b }xms, parts => [ split /\s/, $string ], }; } # initialize output hashes. my %pattern_ids = map { $_ => '' } keys %text; my %part_ids = map { $_ => {} } keys %text; match(\%patterns, \%text, \%pattern_ids, \%part_ids); print "%pattern_ids: \n", Dumper \%pattern_ids; print "%part_ids: \n", Dumper \%part_ids; } # end main loop # subroutines ################################## sub match { my ($hr_patterns, # ref. to input hash: patterns $hr_text, # ref. to input hash: text strings $hr_pattern_ids, # ref. to output hash: identified pattern ids $hr_part_ids, # ref. to output hash: identified part ids ) = @_; PATTERN: for my $hr_pattern (values %$hr_patterns) { TEXT: while (my($text_id, $text) = each %$hr_text) { next TEXT unless $text =~ $hr_pattern->{regex}; $hr_pattern_ids->{$text_id} .= ":$hr_pattern->{hi_lvl_id}"; my @parts = @{ $hr_pattern->{parts} }; # for convenience @{ $hr_part_ids->{$text_id} }{@parts} = (0) x @parts; } # end while TEXT loop } # end for PATTERN loop }
Output:
%pattern_ids: $VAR1 = { 'text_id_3' => ':hi_lvl_id_3:hi_lvl_id_4', 'text_id_5' => ':hi_lvl_id_1:hi_lvl_id_4', 'text_id_2' => ':hi_lvl_id_1', 'text_id_4' => '', 'text_id_1' => ':hi_lvl_id_2' }; %part_ids: $VAR1 = { 'text_id_3' => { 'wibble' => 0, 'wobble' => 0 }, 'text_id_5' => { 'foe' => 0, 'wobble' => 0, 'fie' => 0, 'fee' => 0 }, 'text_id_2' => { 'foe' => 0, 'fie' => 0, 'fee' => 0 }, 'text_id_4' => {}, 'text_id_1' => { 'bar' => 0, 'baz' => 0, 'foo' => 0 } };
Perhaps more advantageous would be moritz's suggestion to compile all the patterns into one massive alternation, but I would quail at the thought of a regex that might easily exceed a million characters. But if it works...
|
|---|