in reply to Best method to eliminate substrings from array

Here's an approach based on index. This is presented in a unit-test format (see How to ask better questions using Test::More and sample data); if you have a test case that's problematic, please let me know it using the test-case syntax detailed in the code below (see the  @Tests array). I haven't tried this code with an input list of a million 250-character part number groups, but I think it should fit into a laptop-ish memory footprint! I've made no attempt to Benchmark this; you're welcome to do so, and if you do, please let us know the results!

File toss_included_substrings_1.pl:

use warnings; use strict; use Test::More 'no_plan'; use Test::NoWarnings; use Data::Dump qw(dd); note "perl version: $]"; my @Tests = ( # test case syntax: # [ [ qw(input list) ] => [ qw(output list) ], 'optional comment' ] 'degenerate cases', [ [] => [] ], [ [ '' ] => [ '' ] ], [ [ '', '' ] => [ '', '' ] ], 'single item lists', [ [qw(1F2)] => [qw(1F2)] ], [ [qw(1F2|3Z4)] => [qw(1F2|3Z4)] ], 'copies of items - ARE THESE CORRECT?', [ [qw(123 123) ] => [qw(123 123) ], '???' ], [ [qw(123 123 123) ] => [qw(123 123 123) ], '???' ], [ [qw(123|456 123|456) ] => [qw(123|456 123|456)], '???' ], 'no subsets', [ [qw(1F2|3Z4 2F3) ] => [qw(1F2|3Z4 2F3)] ], [ [qw(2F3 1F2|3Z4) ] => [qw(2F3 1F2|3Z4)] ], [ [qw(1F2|3Z4 2F3) ] => [qw(1F2|3Z4 2F3)] ], [ [qw(1F2|3Z4 2F3|4Z5)] => [qw(1F2|3Z4 2F3|4Z5)] ], q{no subsets - embedded 'part number' substrings}, [ [qw(345 23456 1234567)] => [qw(345 23456 1234567)] ], [ [qw(1234567 23456 345)] => [qw(1234567 23456 345)] ], [ [qw(345 23456|1234567)] => [qw(345 23456|1234567)] ], [ [qw(23456|1234567 345)] => [qw(23456|1234567 345)] ], 'subsets are present', [ [ qw(123|456 987|123|456 123|456|789) ] => [ qw( 987|123|456 123|456|789) ] ], [ [ qw(987|123|456 123|456 123|456|789) ] => [ qw(987|123|456 123|456|789) ] ], [ [ qw(987|123|456 123|456|789 123|456) ] => [ qw(987|123|456 123|456|789 ) ] ], [ [ qw( 2N0472|6N8595|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|2757 +803 3419308|3514531|3525716|3557019|3586192|3635776|3783741 3T3625|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854 3T3625|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721 3T3628|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|1336 +934 4N4906|6N6481|9L1366|1189902|1413983|8B2026|1M3381|7K3377 4N4906|6N6481|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 6N7936|6N5049|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|2757 +803 6Y0248|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|1336 +934 6Y0248|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377 6Y0248|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152 ) ] => [ qw( 2N0472|6N8595|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|2757 +803 3419308|3514531|3525716|3557019|3586192|3635776|3783741 3T3625|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721 3T3628|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|1336 +934 4N4906|6N6481|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 6N7936|6N5049|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|2757 +803 6Y0248|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788 +|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|1336 +934 ) ], 'from catemp pm#11101979' ], 'oddball subsets', [ [ qw(123 123|123) ] => [ qw(123|123) ] ], [ [ qw(123 123|123 123|123|123) ] => [ qw(123|123|123) ] ], ); VECTOR: for my $ar_vector (@Tests) { if (not ref $ar_vector) { note $ar_vector; next VECTOR; } my ($ar_groups, $ar_expected, $comment) = @$ar_vector; $comment = '' unless defined $comment; my $ar_got = no_subgroups($ar_groups); is_deeply $ar_got, $ar_expected, $comment; } # end for VECTOR note "test set finished"; done_testing; exit; # subroutines ###################################################### use List::MoreUtils qw(uniq); sub no_subgroups { # return all groups not present in larger groups my ($ar_groups # part number groups ) = @_; my $guard = "\x00"; my $sep = '|'; my $str_grps = join "$sep$guard$sep", '', uniq(@$ar_groups), ''; return [ # return array reference map $_->[0], grep index($str_grps, $_->[1], 1 + index($str_grps, $_->[1])) +< 0, # map { print "=== '$_->[1]' ", index($str_grps, $_->[1], lengt +h($_->[1]) + index($str_grps, $_->[1])), "\n"; $_; } # for debug map [ $_, "$sep$_$sep" ], @$ar_groups ]; }

Update: Note on Output Order. The output order of the  no_subgroups() function is similar to that for List::MoreUtils::uniq() (also in recent versions of List::Util): identical to the input order except that "subgroup" items are removed from the stream. The one question I have concerns identical repeats of part numbers or part number groups. The | My current code allows repeated identical items to pass through unmolested (update: the OPed code suggests this specification). In the notation of the test code:
    [ [ qw(foo foo) ] => [ qw(foo foo) ] ]
But whether an item can be a subgroup of itself is unclear to me. Two other possibilities for handling such repeats would be:
    [ [ qw(foo foo) ] => [ qw(foo) ] ] Repeats made unique a la uniq();
    [ [ qw(foo foo) ] => [ ] ] Repeats removed because they are subgroups of themselves.
Either of these latter two behaviors can be achieved simply by changing the position and number of calls to  uniq() within the  no_subgroups() function. If you're interested in this, I can post further info.


Give a man a fish:  <%-{-{-{-<