Re: Script to reduce tokens to minimal unique characters

Here's an approach that also generates a hash of unique leading substrings for a set of strings. This hash is then used to generate a set of regexes for unique matching along the lines of QM's final thought above. This is more complex, but IMHO more flexible than some of the other approaches. YMMV.

File LeadingDistinct.pm:

# LeadingDistinct.pm                                        15jun15waw

# for each token in a set of tokens, find minimum leading
# substring that uniquely distinguishes the token
# from all other tokens in the set.

# works with Perl 5.8.9

package LeadingDistinct;

use warnings;
use strict;

use List::Util       qw(max);
use List::MoreUtils  qw(uniq);


sub extract {

    my (@tokens,
        ) = @_;

    @tokens = ('', uniq(sort @tokens), '');

    return
        map { $tokens[$_], distinguish(@tokens[ $_-1, $_, $_+1 ]) }
        1 .. $#tokens-1
        ;

    }


sub distinguish {

    my ($before,
        $word,
        $after,
        ) = @_;  # contiguous sequence of three alpha-sorted words

    return substr $word, 0, 1 + max diff($word, $before), diff($word, 
+$after);

    }


sub diff {

    my ($w1,
        $w2,
        ) = @_;

    my $x = $w1 ^ $w2;

    $x =~ s{ [^\x00] .* \z }{}xms;

    return length $x;

    }

1;
[download]

File LeadingDistinct.t:

# LeadingDistinct.t       test LeadingDistinct.pm           15jun15waw

use warnings;
use strict;

use Test::More
  # tests => ?? + 1  # Test::NoWarnings adds 1 test
    'no_plan'
    ;
use Test::NoWarnings;


# normal tokens (alpha and underscore).
use constant TOKENS => qw/ab abcd abcdef  ghi ghij ghijk
    lnm lnmopq lnmopqrst  uvw xyz  dcba edcba  xyzzy xyzzz
    pqrst pqrtu
    report_time report_day reset read
    /;

use constant TOKENS_AND_UNIQUE => (
    ab          => 'ab',
    abcd        => 'abcd',
    abcdef      => 'abcde',
    dcba        => 'd',
    edcba       => 'e',
    ghi         => 'ghi',
    ghij        => 'ghij',
    ghijk       => 'ghijk',
    lnm         => 'lnm',
    lnmopq      => 'lnmopq',
    lnmopqrst   => 'lnmopqr',
    pqrst       => 'pqrs',
    pqrtu       => 'pqrt',
    read        => 'rea',
    report_day  => 'report_d',
    report_time => 'report_t',
    reset       => 'res',
    uvw         => 'u',
    xyz         => 'xyz',
    xyzzy       => 'xyzzy',
    xyzzz       => 'xyzzz',
    );

# some tokens to test handling of metacharacters.
use constant WEIRD_TOKENS => qw(foo? foo?* foo+++  foo?*+ foo???);

use constant WEIRD_TOKENS_AND_UNIQUE => (
    'foo+++' => 'foo+',
    'foo?'   => 'foo?',
    'foo?*'  => 'foo?*',
    'foo?*+' => 'foo?*+',
    'foo???' => 'foo??',
    );


BEGIN { use_ok 'LeadingDistinct'; }

is_deeply { LeadingDistinct::extract(TOKENS) }, { TOKENS_AND_UNIQUE },
    "normal chars: alphas & underscores";

is_deeply { LeadingDistinct::extract(TOKENS, WEIRD_TOKENS) },
          { TOKENS_AND_UNIQUE, WEIRD_TOKENS_AND_UNIQUE },
    "normal chars and some metacharacters";
[download]

Output:

c:\@Work\Perl\monks\gator456>perl LeadingDistinct.t
ok 1 - use LeadingDistinct;
ok 2 - normal chars: alphas & underscores
ok 3 - normal chars and some metacharacters
ok 4 - no warnings
1..4
[download]

File extract_min_uniq_2.pl:

# extract_min_uniq_2.pl                                     15jun15waw


use warnings;
use strict;

use LeadingDistinct;

use Data::Dump qw(dd);

use constant DEBUGGING => 0;
use constant {
    PR_DB1 => 1 && DEBUGGING,
    PR_DB2 => 1 && DEBUGGING,
    PR_DB3 => 1 && DEBUGGING,
    };

my @tokens = qw/report_time report_day reset read/;

my %min_distinct = LeadingDistinct::extract(@tokens);
dd \%min_distinct if PR_DB1;

for my $k (keys %min_distinct) {
    $min_distinct{$k} = max_optional_rx($k, $min_distinct{$k});
    }
dd \%min_distinct if PR_DB2;

my ($find) =
    map qr{ (?<! [[:alpha:]_]) (?: $_) (?! [[:alpha:]_]) }xms,
    join '|',
    values %min_distinct
    ;
dd $find if PR_DB3;

while (<DATA>) {
    next unless m{ \A ($find) }xms;
    print "$1 \n";
    }


# subroutines ######################################################

sub max_optional_rx {

    my ($whole,   # full word
        $unique,  # distinctive characters at start of word
        ) = @_;

    my @tail = split //, substr $whole, length $unique;
    my $opt_tail = optional_tail(@tail);

    return qr{ \Q$unique\E $opt_tail }xms;

    }

sub optional_tail {

    my $rx = '';
    return $rx unless @_;

    $rx = quotemeta shift @_;
    for my $c (@_) {
        $c = quotemeta $c;
        $rx .= " (?: $c)?";
        }

    $rx = qr{ $rx }xms;
    return "$rx?";

    }


__DATA__
report_t
  14:09:33 PDT
report_d
Fri Jun 12 2015
report (should not show up)

res
Resetting the time

report_time
  00:00:00

rea foo.bar
Info: reading file foo.bar
[download]

Output:

c:\@Work\Perl\monks\gator456>perl extract_min_uniq_2.pl
report_t
report_d
res
report_time
rea
[download]

Update: Here's a simpler (and I expect faster, but I've done no Benchmark-ing) definition of the LeadingDistinct::diff() subroutine. The (assumed) speed-up won't be significant unless you're processing millions of symbols. Tested.

sub diff {
    ($_[0] ^ $_[1]) =~ m{ \A \x00* }xms;
    return $+[0];
    }
[download]

Give a man a fish: <%-(-(-(-<

Comment on Re: Script to reduce tokens to minimal unique characters Select or Download Code