Re: Best method to eliminate substrings from array (updated)

Here's an approach based on index. This is presented in a unit-test format (see How to ask better questions using Test::More and sample data); if you have a test case that's problematic, please let me know it using the test-case syntax detailed in the code below (see the @Tests array). I haven't tried this code with an input list of a million 250-character part number groups, but I think it should fit into a laptop-ish memory footprint! I've made no attempt to Benchmark this; you're welcome to do so, and if you do, please let us know the results!

File toss_included_substrings_1.pl:

use warnings;
use strict;

use Test::More 'no_plan';
use Test::NoWarnings;

use Data::Dump qw(dd);

note "perl version: $]";

my @Tests = (
  # test case syntax:
  #  [ [ qw(input list) ] => [ qw(output list) ], 'optional comment' ]
  'degenerate cases',
  [ [] => [] ],  [ [ '' ] => [ '' ] ],  [ [ '', '' ] => [ '', '' ] ],

  'single item lists',
  [ [qw(1F2)] => [qw(1F2)] ],    [ [qw(1F2|3Z4)] => [qw(1F2|3Z4)] ],

  'copies of items - ARE THESE CORRECT?',
  [ [qw(123  123)         ] => [qw(123  123)        ], '???' ],
  [ [qw(123  123  123)    ] => [qw(123  123  123)   ], '???' ],
  [ [qw(123|456  123|456) ] => [qw(123|456  123|456)], '???' ],

  'no subsets',
  [ [qw(1F2|3Z4  2F3)    ] => [qw(1F2|3Z4  2F3)]     ],
  [ [qw(2F3  1F2|3Z4)    ] => [qw(2F3  1F2|3Z4)]     ],
  [ [qw(1F2|3Z4  2F3)    ] => [qw(1F2|3Z4  2F3)]     ],
  [ [qw(1F2|3Z4  2F3|4Z5)] => [qw(1F2|3Z4  2F3|4Z5)] ],
  q{no subsets - embedded 'part number' substrings},
  [ [qw(345  23456  1234567)] => [qw(345  23456  1234567)] ],
  [ [qw(1234567  23456  345)] => [qw(1234567  23456  345)] ],
  [ [qw(345   23456|1234567)] => [qw(345   23456|1234567)] ],
  [ [qw(23456|1234567   345)] => [qw(23456|1234567   345)] ],

  'subsets are present',
  [ [ qw(123|456  987|123|456  123|456|789) ]
    =>
    [ qw(         987|123|456  123|456|789) ]
    ],
  [ [ qw(987|123|456  123|456  123|456|789) ]
    =>
    [ qw(987|123|456           123|456|789) ]
    ],
  [ [ qw(987|123|456  123|456|789  123|456) ]
    =>
    [ qw(987|123|456  123|456|789         ) ]
    ],
  [ [ qw(
      2N0472|6N8595|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|2757
+803
      3419308|3514531|3525716|3557019|3586192|3635776|3783741
      3T3625|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854
      3T3625|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721
      3T3628|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|1336
+934
      4N4906|6N6481|9L1366|1189902|1413983|8B2026|1M3381|7K3377
      4N4906|6N6481|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
      6N7936|6N5049|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|2757
+803
      6Y0248|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|1336
+934
      6Y0248|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377
      6Y0248|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152
      ) ]
    =>
    [ qw(
      2N0472|6N8595|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|2757
+803
      3419308|3514531|3525716|3557019|3586192|3635776|3783741
      3T3625|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721
      3T3628|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|1336
+934
      4N4906|6N6481|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
      6N7936|6N5049|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|2757
+803
      6Y0248|6T7765|9L1366|1189902|1413983|8B2026|1M3381|7K3377|3H5788
+|1F7854|8W1152|8R0721|9C5344|6W6672|9G7101|3023908|6Y1352|4P0489|1336
+934
      ) ], 'from catemp pm#11101979'
      ],

  'oddball subsets',
  [ [ qw(123  123|123)              ] => [ qw(123|123)     ] ],
  [ [ qw(123  123|123  123|123|123) ] => [ qw(123|123|123) ] ],
  );

VECTOR:
for my $ar_vector (@Tests) {

    if (not ref $ar_vector) {
        note $ar_vector;
        next VECTOR;
        }

    my ($ar_groups, $ar_expected, $comment) = @$ar_vector;
    $comment = '' unless defined $comment;

    my $ar_got = no_subgroups($ar_groups);

    is_deeply $ar_got, $ar_expected, $comment;

    }  # end for VECTOR

note "test set finished";

done_testing;

exit;

# subroutines ######################################################

use List::MoreUtils qw(uniq);

sub no_subgroups {  # return all groups not present in larger groups

    my ($ar_groups  # part number groups
        ) = @_;

    my $guard = "\x00";
    my $sep   = '|';

    my $str_grps = join "$sep$guard$sep", '', uniq(@$ar_groups), '';

    return [  # return array reference
        map  $_->[0],
        grep index($str_grps, $_->[1], 1 + index($str_grps, $_->[1])) 
+< 0,
      # map  { print "=== '$_->[1]' ", index($str_grps, $_->[1], lengt
+h($_->[1]) + index($str_grps, $_->[1])), "\n";  $_; }  # for debug
        map  [ $_, "$sep$_$sep" ],
        @$ar_groups
        ];

    }
[download]

Update: Note on Output Order. The output order of the no_subgroups() function is similar to that for List::MoreUtils::uniq() (also in recent versions of List::Util): identical to the input order except that "subgroup" items are removed from the stream. The one question I have concerns identical repeats of part numbers or part number groups. ~~The~~ | My current code allows repeated identical items to pass through unmolested (update: the OPed code suggests this specification). In the notation of the test code:
[ [ qw(foo foo) ] => [ qw(foo foo) ] ]
But whether an item can be a subgroup of itself is unclear to me. Two other possibilities for handling such repeats would be:
[ [ qw(foo foo) ] => [ qw(foo) ] ] Repeats made unique a la uniq();
[ [ qw(foo foo) ] => [ ] ] Repeats removed because they are subgroups of themselves.
Either of these latter two behaviors can be achieved simply by changing the position and number of calls to uniq() within the no_subgroups() function. If you're interested in this, I can post further info.

Give a man a fish: <%-{-{-{-<

Comment on Re: Best method to eliminate substrings from array (updated) Select or Download Code