=comment

ASSUMPTIONS:
  - all text strings and patterns are composed of
    alphanumeric characters separated by spaces;
  - all pattern strings are unique (this is implied by
    the fact that they are used as keys in a hash).

this approach will work if all the text strings can be
concatenated together into a string that will fit into
memory.  (if not, relentless disk thrashing will ensue, along
with an orders-of-magnitude speed penalty.)

=cut


use warnings;
use strict;


MAIN: {  # begin main loop

# define hashes for text and patterns.
my %text = (
    text_id_1 => 'four score and foo bar baz seven years',
    text_id_2 => 'to fee fie foe or not',
    text_id_3 => 'to wibble or not to wobble',
    text_id_4 => 'text with no patterns present',
    text_id_5 => 'text with fee fie foe and wobble present',
    text_id_6 => 'text with two wobble present wobble',
    # ...etc.
    );

# separator char not found in any text string or pattern string.
my $sep = ':';

# anything not a separator is a text string or pattern string char.
my $non_sep = qr{ [^$sep] }xms;

# concatenation of all text strings.  CAUTION: size!
# also build hash associating text string endpoints with text ids.

my $all_text = '';
my %text_endpoints;

for my $text_id (keys %text) {
    $all_text .= $text{$text_id};
    $text_endpoints{ length $all_text } = $text_id;
    $all_text .= $sep;
    }

my %patterns = (
    'fee fie foe' => 'hi_lvl_id_1',
    'foo bar baz' => 'hi_lvl_id_2',
    'wibble'      => 'hi_lvl_id_3',
    'wobble'      => 'hi_lvl_id_4',
    'quux'        => 'hi_lvl_id_5',  # pattern present in no text
    # ...etc.
    );

# preprocess patterns hash for data extraction.
while (my ($string, $id) = each %patterns) {
    $patterns{$string} = {
        hi_lvl_id => $id,
        parts     => [ split /\s/, $string ],
        # capturing parentheses wrapped in non-capturing
        # look-ahead allows overlapping pattern capture.
        # this would be critical in the case of more than one
        # occurrence of a pattern string in a text string.
        regex => qr{ (?= ( \Q$string\E $non_sep* ) ) }xms,
        };
    }

# initialize output hashes.  (this could be a single hash.)
my %pattern_ids = map { $_ => '' } keys %text;
my %part_ids    = map { $_ => {} } keys %text;


match(\$all_text, \%patterns, \%text_endpoints,
      \%pattern_ids, \%part_ids);


use Data::Dumper;

print "%pattern_ids: \n", Dumper \%pattern_ids;
print "%part_ids: \n",    Dumper \%part_ids;

}  # end main loop


# subroutines ##################################

sub match {

    my ($sr_all_text,    # ref. to input scalar: all strings together
        $hr_patterns,    # ref. to input hash: patterns
        $hr_text_endpoints, # ref. to input hash: text endpoints
        $hr_pattern_ids, # ref. to output hash: identified pattern ids
        $hr_part_ids,    # ref. to output hash: identified part ids
        ) = @_;

    PATTERN:
    for my $hr_pattern (values %$hr_patterns) {

        TEXT:
        while (${$sr_all_text} =~ m{ $hr_pattern->{regex} }xmsg) {

          # get text id from text endpoint in @+ array.
          my $text_id = $hr_text_endpoints->{$+[1]};

          $hr_pattern_ids->{$text_id} .= ":$hr_pattern->{hi_lvl_id}";

          my @parts = @{ $hr_pattern->{parts} };  # for convenience
          @{ $hr_part_ids->{$text_id} }{@parts} = (0) x @parts;

          }  # end while TEXT loop

        }  # end for PATTERN loop

    }