=comment ASSUMPTIONS: - all text strings and patterns are composed of alphanumeric characters separated by spaces; - all pattern strings are unique (this is implied by the fact that they are used as keys in a hash). this approach will work if all the text strings can be concatenated together into a string that will fit into memory. (if not, relentless disk thrashing will ensue, along with an orders-of-magnitude speed penalty.) =cut use warnings; use strict; MAIN: { # begin main loop # define hashes for text and patterns. my %text = ( text_id_1 => 'four score and foo bar baz seven years', text_id_2 => 'to fee fie foe or not', text_id_3 => 'to wibble or not to wobble', text_id_4 => 'text with no patterns present', text_id_5 => 'text with fee fie foe and wobble present', text_id_6 => 'text with two wobble present wobble', # ...etc. ); # separator char not found in any text string or pattern string. my $sep = ':'; # anything not a separator is a text string or pattern string char. my $non_sep = qr{ [^$sep] }xms; # concatenation of all text strings. CAUTION: size! # also build hash associating text string endpoints with text ids. my $all_text = ''; my %text_endpoints; for my $text_id (keys %text) { $all_text .= $text{$text_id}; $text_endpoints{ length $all_text } = $text_id; $all_text .= $sep; } my %patterns = ( 'fee fie foe' => 'hi_lvl_id_1', 'foo bar baz' => 'hi_lvl_id_2', 'wibble' => 'hi_lvl_id_3', 'wobble' => 'hi_lvl_id_4', 'quux' => 'hi_lvl_id_5', # pattern present in no text # ...etc. ); # preprocess patterns hash for data extraction. while (my ($string, $id) = each %patterns) { $patterns{$string} = { hi_lvl_id => $id, parts => [ split /\s/, $string ], # capturing parentheses wrapped in non-capturing # look-ahead allows overlapping pattern capture. # this would be critical in the case of more than one # occurrence of a pattern string in a text string. regex => qr{ (?= ( \Q$string\E $non_sep* ) ) }xms, }; } # initialize output hashes. (this could be a single hash.) my %pattern_ids = map { $_ => '' } keys %text; my %part_ids = map { $_ => {} } keys %text; match(\$all_text, \%patterns, \%text_endpoints, \%pattern_ids, \%part_ids); use Data::Dumper; print "%pattern_ids: \n", Dumper \%pattern_ids; print "%part_ids: \n", Dumper \%part_ids; } # end main loop # subroutines ################################## sub match { my ($sr_all_text, # ref. to input scalar: all strings together $hr_patterns, # ref. to input hash: patterns $hr_text_endpoints, # ref. to input hash: text endpoints $hr_pattern_ids, # ref. to output hash: identified pattern ids $hr_part_ids, # ref. to output hash: identified part ids ) = @_; PATTERN: for my $hr_pattern (values %$hr_patterns) { TEXT: while (${$sr_all_text} =~ m{ $hr_pattern->{regex} }xmsg) { # get text id from text endpoint in @+ array. my $text_id = $hr_text_endpoints->{$+[1]}; $hr_pattern_ids->{$text_id} .= ":$hr_pattern->{hi_lvl_id}"; my @parts = @{ $hr_pattern->{parts} }; # for convenience @{ $hr_part_ids->{$text_id} }{@parts} = (0) x @parts; } # end while TEXT loop } # end for PATTERN loop }