Re: An efficient way to gather a common portion of several strings' beginnings

Here's my solution (probably not the best, but at least O(n)), along with testing of all the others except atcroft's, which has many failures. (These failures may have a simple fix, but I've no time right now.)

File common_base_string_1.pl:

use 5.010;

use warnings;
use strict;

use List::Util qw(minstr maxstr);
use Data::Dump qw(pp);

use Test::More
  # tests => ?? + 1  # Test::NoWarnings adds 1 test
    'no_plan'
    ;
use Test::NoWarnings;


use constant STR => qw(abcdefXyzzy  abcdefFoobar  abcdefWibble  abcdef
+Zot);


FUNT:  # functions under test
for my $func_name (qw(
  common_base_anomalousmonk  common_base_browseruk  common_base_hippo
  common_base_grandfather
  ),
  # qw(common_base_atcroft),
  ) {

  note qq{\n---- testing $func_name() ---- \n\n};
  *common_base = do { no strict 'refs';  *$func_name };

  VECTOR:
  for my $ar_vector (
    q{empty list: degenerate case returns undef in scalar context},
    [ undef ],

    q{all these should have ZERO length common base},
    [ '', '' ],  [ '', '', '' ],  [ '', 'x', 'y' ],  [ '', 'abc', 'xyz
+' ],
    [ '', STR, '' ],  [ '', STR, 'x' ],
    [ '', '', STR ],  [ '', 'x', STR ],
    [ '', STR, '', STR ],

    q{all these should have NON-ZERO length common base},
    [ 'a', 'a' ],  [ 'a', 'a', 'a' ],  [ 'abcdef', STR ],

    q{strings containing nulls},
    [ qq{\000}, qq{\000} ],    [ qq{\000}, qq{\000}, qq{\000} ],
    [ qq{\000}, qq{\000},     qq{\000\000} ],
    [ qq{\000}, qq{\000\000}, qq{\000}     ],
    [ qq{\000}, qq{\000}, qq{\000\000}, qq{\000\000\000} ],
    [ qq{\000}, qq{\000\000\000}, qq{\000\000}, qq{\000} ],
    [ qq{\000z}, qq{\000z} ],  [ qq{\000}, qq{\000x}, qq{\000y} ],
    [ qq{\000z\000}, qq{\000z\000} ],
    [ qq{\000z\000}, qq{\000z\000y\000}, qq{\000z\000x\000} ],
    [ qq{\000}, qq{\000x\000}, qq{\000y\000} ],
    [ qq{\000\000}, qq{\000\000}, qq{\000\000\000} ],
  # [ qq{}, qq{}, qq{} ],

    ) {

    if (not ref $ar_vector) {
      note $ar_vector;
      next VECTOR;
      }

    my ($expected_common_base, @strings) = @$ar_vector;

    # my $cmnt_strings  = join q{', '}, @strings;
    #    $cmnt_strings  = qq{'$cmnt_strings'} if length $cmnt_strings;
    # my $cmnt_expected = $expected_common_base // 'undef';
    #    $cmnt_expected = qq{'$cmnt_expected'} if defined $expected_co
+mmon_base;

    is common_base(@strings), $expected_common_base,
     # qq{($cmnt_strings) -> $cmnt_expected}
       sprintf '%s -> %s', pp(@strings), pp($expected_common_base)
       ;

    }  # end for VECTOR

  }  # end for FUNT


# functions under test #############################################

sub common_base_anomalousmonk {

  return unless @_;  # degenerate case: empty list

  my $min = minstr @_;
  my $max = maxstr @_;

  return
    ($min ^ $max) =~ m{ [^\x00] }xms ? substr $min, 0, $-[0] : $min
    ;

  }


sub common_base_browseruk {

    my (@strings) = @_;

    # additional: deal with degenerate and corner cases
    return             unless @strings;
    return $strings[0] unless @strings > 1;

    # match  m[(^\0+)]  changed to  m[(^\0*)]  to avoid assigning
    # undef to  $common  if there is no common base substring;
    # empty string assigned in this case.

  # my( $mask ) = ( $strings[ 0 ] ^ $strings[ 1 ] ) =~ m[(^\0+)];
    my( $mask ) = ( $strings[ 0 ] ^ $strings[ 1 ] ) =~ m[(^\0*)];
    my $common = substr $strings[ 0 ], 0, length $mask;

    for my $i ( 2 .. $#strings ) {
        if( substr( $strings[ $i ], 0, length $common ) ne $common ) {
          # ( $mask ) = ( $strings[ 0 ] ^ $strings[ $i ] ) =~ m[(^\0+)
+];
            ( $mask ) = ( $strings[ 0 ] ^ $strings[ $i ] ) =~ m[(^\0*)
+];
            $common = substr $strings[ 0 ], 0, length $mask;
        }
    }

    return $common;

    }


sub common_base_hippo {

    # additional: deal with degenerate and corner cases
    return       unless @_;
    return $_[0] unless @_ > 1;

    my @strings = sort @_;

    return
        ($strings[0] ^ $strings[-1]) =~ m{ [^\x00] }xms
        ? substr $strings[0], 0, $-[0] : $strings[0]
        ;

    }


sub common_base_grandfather {

    my (@strings) = @_;

    # additional: deal with degenerate and corner cases
    return             unless @strings;
    return $strings[0] unless @strings > 1;

    my $common = $strings[0];

    # match  m/^\0+/  changed to  m/^\0*/  to avoid assigning
    # undef to  $common  if there is no common base substring;
    # empty string assigned in this case.

    for my $str (@strings[1 .. $#strings]) {
      # ($common ^ $str) =~ m/^\0+/;  # no common base substring: fail
        ($common ^ $str) =~ m/^\0*/;
        $common = substr $str, 0, $+[0] if $+[0] < length $common;
        }

    return $common;

    }


sub common_base_atcroft {

    my (@string) = @_;

    # additional: deal with degenerate and corner cases
    return            unless @string;
    return $string[0] unless @string > 1;

    my $common = shift @string;

    while ( my $teststr = shift @string
      and length $common ) {
      # if ( $common = substr( $teststr, 0, length $common ) {
        if ( $common = substr  $teststr, 0, length $common ) {
            next;
        }
        my $flag = length $common;
        while ( $flag ) {
          # if ( $common eq substr( $teststr, 0, $flag ) {
            if ( $common eq substr  $teststr, 0, $flag ) {
                $flag = 0;
            }
            else {
                $flag--;
                $common = substr( $common, 0, $flag );
            }
        }
    }

    return $common;

    }
[download]

Output:

c:\@Work\Perl\monks\igoryonya>perl common_base_string_1.pl
#
# ---- testing common_base_anomalousmonk() ----
#
# empty list: degenerate case returns undef in scalar context
ok 1 - () -> undef
# all these should have ZERO length common base
ok 2 - "" -> ""
ok 3 - ("", "") -> ""
ok 4 - ("x", "y") -> ""
ok 5 - ("abc", "xyz") -> ""
ok 6 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot", ""
+) -> ""
ok 7 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot", "x
+") -> ""
ok 8 - ("", "abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot"
+) -> ""
ok 9 - ("x", "abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot
+") -> ""
ok 10 - (
#   "abcdefXyzzy",
#   "abcdefFoobar",
#   "abcdefWibble",
#   "abcdefZot",
#   "",
#   "abcdefXyzzy",
#   "abcdefFoobar",
#   "abcdefWibble",
#   "abcdefZot",
# ) -> ""
# all these should have NON-ZERO length common base
ok 11 - "a" -> "a"
ok 12 - ("a", "a") -> "a"
ok 13 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot") -
+> "abcdef"
# strings containing nulls
ok 14 - "\0" -> "\0"
ok 15 - ("\0", "\0") -> "\0"
ok 16 - ("\0", "\0\0") -> "\0"
ok 17 - ("\0\0", "\0") -> "\0"
ok 18 - ("\0", "\0\0", "\0\0\0") -> "\0"
ok 19 - ("\0\0\0", "\0\0", "\0") -> "\0"
ok 20 - "\0z" -> "\0z"
ok 21 - ("\0x", "\0y") -> "\0"
ok 22 - "\0z\0" -> "\0z\0"
ok 23 - ("\0z\0y\0", "\0z\0x\0") -> "\0z\0"
ok 24 - ("\0x\0", "\0y\0") -> "\0"
ok 25 - ("\0\0", "\0\0\0") -> "\0\0"
#
# ---- testing common_base_browseruk() ----
#
# empty list: degenerate case returns undef in scalar context
ok 26 - () -> undef
# all these should have ZERO length common base
ok 27 - "" -> ""
ok 28 - ("", "") -> ""
ok 29 - ("x", "y") -> ""
ok 30 - ("abc", "xyz") -> ""
ok 31 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot", "
+") -> ""
ok 32 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot", "
+x") -> ""
ok 33 - ("", "abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot
+") -> ""
ok 34 - ("x", "abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZo
+t") -> ""
ok 35 - (
#   "abcdefXyzzy",
#   "abcdefFoobar",
#   "abcdefWibble",
#   "abcdefZot",
#   "",
#   "abcdefXyzzy",
#   "abcdefFoobar",
#   "abcdefWibble",
#   "abcdefZot",
# ) -> ""
# all these should have NON-ZERO length common base
ok 36 - "a" -> "a"
ok 37 - ("a", "a") -> "a"
ok 38 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot") -
+> "abcdef"
# strings containing nulls
ok 39 - "\0" -> "\0"
ok 40 - ("\0", "\0") -> "\0"
ok 41 - ("\0", "\0\0") -> "\0"
not ok 42 - ("\0\0", "\0") -> "\0"
#   Failed test '("\0\0", "\0") -> "\0"'
#   at common_base_string_1.pl line 102.
#          got: '  '
#     expected: ' '
ok 43 - ("\0", "\0\0", "\0\0\0") -> "\0"
not ok 44 - ("\0\0\0", "\0\0", "\0") -> "\0"
#   Failed test '("\0\0\0", "\0\0", "\0") -> "\0"'
#   at common_base_string_1.pl line 102.
#          got: '   '
#     expected: ' '
ok 45 - "\0z" -> "\0z"
ok 46 - ("\0x", "\0y") -> "\0"
ok 47 - "\0z\0" -> "\0z\0"
ok 48 - ("\0z\0y\0", "\0z\0x\0") -> "\0z\0"
ok 49 - ("\0x\0", "\0y\0") -> "\0"
ok 50 - ("\0\0", "\0\0\0") -> "\0\0"
#
# ---- testing common_base_hippo() ----
#
# empty list: degenerate case returns undef in scalar context
ok 51 - () -> undef
# all these should have ZERO length common base
ok 52 - "" -> ""
ok 53 - ("", "") -> ""
ok 54 - ("x", "y") -> ""
ok 55 - ("abc", "xyz") -> ""
ok 56 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot", "
+") -> ""
ok 57 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot", "
+x") -> ""
ok 58 - ("", "abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot
+") -> ""
ok 59 - ("x", "abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZo
+t") -> ""
ok 60 - (
#   "abcdefXyzzy",
#   "abcdefFoobar",
#   "abcdefWibble",
#   "abcdefZot",
#   "",
#   "abcdefXyzzy",
#   "abcdefFoobar",
#   "abcdefWibble",
#   "abcdefZot",
# ) -> ""
# all these should have NON-ZERO length common base
ok 61 - "a" -> "a"
ok 62 - ("a", "a") -> "a"
ok 63 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot") -
+> "abcdef"
# strings containing nulls
ok 64 - "\0" -> "\0"
ok 65 - ("\0", "\0") -> "\0"
ok 66 - ("\0", "\0\0") -> "\0"
ok 67 - ("\0\0", "\0") -> "\0"
ok 68 - ("\0", "\0\0", "\0\0\0") -> "\0"
ok 69 - ("\0\0\0", "\0\0", "\0") -> "\0"
ok 70 - "\0z" -> "\0z"
ok 71 - ("\0x", "\0y") -> "\0"
ok 72 - "\0z\0" -> "\0z\0"
ok 73 - ("\0z\0y\0", "\0z\0x\0") -> "\0z\0"
ok 74 - ("\0x\0", "\0y\0") -> "\0"
ok 75 - ("\0\0", "\0\0\0") -> "\0\0"
#
# ---- testing common_base_grandfather() ----
#
# empty list: degenerate case returns undef in scalar context
ok 76 - () -> undef
# all these should have ZERO length common base
ok 77 - "" -> ""
ok 78 - ("", "") -> ""
ok 79 - ("x", "y") -> ""
ok 80 - ("abc", "xyz") -> ""
ok 81 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot", "
+") -> ""
ok 82 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot", "
+x") -> ""
ok 83 - ("", "abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot
+") -> ""
ok 84 - ("x", "abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZo
+t") -> ""
ok 85 - (
#   "abcdefXyzzy",
#   "abcdefFoobar",
#   "abcdefWibble",
#   "abcdefZot",
#   "",
#   "abcdefXyzzy",
#   "abcdefFoobar",
#   "abcdefWibble",
#   "abcdefZot",
# ) -> ""
# all these should have NON-ZERO length common base
ok 86 - "a" -> "a"
ok 87 - ("a", "a") -> "a"
ok 88 - ("abcdefXyzzy", "abcdefFoobar", "abcdefWibble", "abcdefZot") -
+> "abcdef"
# strings containing nulls
ok 89 - "\0" -> "\0"
ok 90 - ("\0", "\0") -> "\0"
ok 91 - ("\0", "\0\0") -> "\0"
not ok 92 - ("\0\0", "\0") -> "\0"
#   Failed test '("\0\0", "\0") -> "\0"'
#   at common_base_string_1.pl line 102.
#          got: '  '
#     expected: ' '
ok 93 - ("\0", "\0\0", "\0\0\0") -> "\0"
not ok 94 - ("\0\0\0", "\0\0", "\0") -> "\0"
#   Failed test '("\0\0\0", "\0\0", "\0") -> "\0"'
#   at common_base_string_1.pl line 102.
#          got: '   '
#     expected: ' '
ok 95 - "\0z" -> "\0z"
ok 96 - ("\0x", "\0y") -> "\0"
ok 97 - "\0z\0" -> "\0z\0"
ok 98 - ("\0z\0y\0", "\0z\0x\0") -> "\0z\0"
ok 99 - ("\0x\0", "\0y\0") -> "\0"
ok 100 - ("\0\0", "\0\0\0") -> "\0\0"
ok 101 - no warnings
1..101
# Looks like you failed 4 tests of 101.
[download]

Give a man a fish: <%-{-{-{-<

Comment on Re: An efficient way to gather a common portion of several strings' beginnings Select or Download Code

Replies are listed 'Best First'.
Re^2: An efficient way to gather a common portion of several strings' beginnings by james28909 (Deacon) on Nov 15, 2015 at 20:49 UTC
You left me out of that test, I feel so unappreciated ;( haha :P	[reply]