comment on

Here's an approach that also generates a hash of unique leading substrings for a set of strings. This hash is then used to generate a set of regexes for unique matching along the lines of QM's final thought above. This is more complex, but IMHO more flexible than some of the other approaches. YMMV.

File LeadingDistinct.pm:

# LeadingDistinct.pm                                        15jun15waw

# for each token in a set of tokens, find minimum leading
# substring that uniquely distinguishes the token
# from all other tokens in the set.

# works with Perl 5.8.9

package LeadingDistinct;

use warnings;
use strict;

use List::Util       qw(max);
use List::MoreUtils  qw(uniq);


sub extract {

    my (@tokens,
        ) = @_;

    @tokens = ('', uniq(sort @tokens), '');

    return
        map { $tokens[$_], distinguish(@tokens[ $_-1, $_, $_+1 ]) }
        1 .. $#tokens-1
        ;

    }


sub distinguish {

    my ($before,
        $word,
        $after,
        ) = @_;  # contiguous sequence of three alpha-sorted words

    return substr $word, 0, 1 + max diff($word, $before), diff($word, 
+$after);

    }


sub diff {

    my ($w1,
        $w2,
        ) = @_;

    my $x = $w1 ^ $w2;

    $x =~ s{ [^\x00] .* \z }{}xms;

    return length $x;

    }

1;
[download]

File LeadingDistinct.t:

# LeadingDistinct.t       test LeadingDistinct.pm           15jun15waw

use warnings;
use strict;

use Test::More
  # tests => ?? + 1  # Test::NoWarnings adds 1 test
    'no_plan'
    ;
use Test::NoWarnings;


# normal tokens (alpha and underscore).
use constant TOKENS => qw/ab abcd abcdef  ghi ghij ghijk
    lnm lnmopq lnmopqrst  uvw xyz  dcba edcba  xyzzy xyzzz
    pqrst pqrtu
    report_time report_day reset read
    /;

use constant TOKENS_AND_UNIQUE => (
    ab          => 'ab',
    abcd        => 'abcd',
    abcdef      => 'abcde',
    dcba        => 'd',
    edcba       => 'e',
    ghi         => 'ghi',
    ghij        => 'ghij',
    ghijk       => 'ghijk',
    lnm         => 'lnm',
    lnmopq      => 'lnmopq',
    lnmopqrst   => 'lnmopqr',
    pqrst       => 'pqrs',
    pqrtu       => 'pqrt',
    read        => 'rea',
    report_day  => 'report_d',
    report_time => 'report_t',
    reset       => 'res',
    uvw         => 'u',
    xyz         => 'xyz',
    xyzzy       => 'xyzzy',
    xyzzz       => 'xyzzz',
    );

# some tokens to test handling of metacharacters.
use constant WEIRD_TOKENS => qw(foo? foo?* foo+++  foo?*+ foo???);

use constant WEIRD_TOKENS_AND_UNIQUE => (
    'foo+++' => 'foo+',
    'foo?'   => 'foo?',
    'foo?*'  => 'foo?*',
    'foo?*+' => 'foo?*+',
    'foo???' => 'foo??',
    );


BEGIN { use_ok 'LeadingDistinct'; }

is_deeply { LeadingDistinct::extract(TOKENS) }, { TOKENS_AND_UNIQUE },
    "normal chars: alphas & underscores";

is_deeply { LeadingDistinct::extract(TOKENS, WEIRD_TOKENS) },
          { TOKENS_AND_UNIQUE, WEIRD_TOKENS_AND_UNIQUE },
    "normal chars and some metacharacters";
[download]

Output:

c:\@Work\Perl\monks\gator456>perl LeadingDistinct.t
ok 1 - use LeadingDistinct;
ok 2 - normal chars: alphas & underscores
ok 3 - normal chars and some metacharacters
ok 4 - no warnings
1..4
[download]

File extract_min_uniq_2.pl:

# extract_min_uniq_2.pl                                     15jun15waw


use warnings;
use strict;

use LeadingDistinct;

use Data::Dump qw(dd);

use constant DEBUGGING => 0;
use constant {
    PR_DB1 => 1 && DEBUGGING,
    PR_DB2 => 1 && DEBUGGING,
    PR_DB3 => 1 && DEBUGGING,
    };

my @tokens = qw/report_time report_day reset read/;

my %min_distinct = LeadingDistinct::extract(@tokens);
dd \%min_distinct if PR_DB1;

for my $k (keys %min_distinct) {
    $min_distinct{$k} = max_optional_rx($k, $min_distinct{$k});
    }
dd \%min_distinct if PR_DB2;

my ($find) =
    map qr{ (?<! [[:alpha:]_]) (?: $_) (?! [[:alpha:]_]) }xms,
    join '|',
    values %min_distinct
    ;
dd $find if PR_DB3;

while (<DATA>) {
    next unless m{ \A ($find) }xms;
    print "$1 \n";
    }


# subroutines ######################################################

sub max_optional_rx {

    my ($whole,   # full word
        $unique,  # distinctive characters at start of word
        ) = @_;

    my @tail = split //, substr $whole, length $unique;
    my $opt_tail = optional_tail(@tail);

    return qr{ \Q$unique\E $opt_tail }xms;

    }

sub optional_tail {

    my $rx = '';
    return $rx unless @_;

    $rx = quotemeta shift @_;
    for my $c (@_) {
        $c = quotemeta $c;
        $rx .= " (?: $c)?";
        }

    $rx = qr{ $rx }xms;
    return "$rx?";

    }


__DATA__
report_t
  14:09:33 PDT
report_d
Fri Jun 12 2015
report (should not show up)

res
Resetting the time

report_time
  00:00:00

rea foo.bar
Info: reading file foo.bar
[download]

Output:

c:\@Work\Perl\monks\gator456>perl extract_min_uniq_2.pl
report_t
report_d
res
report_time
rea
[download]

Update: Here's a simpler (and I expect faster, but I've done no Benchmark-ing) definition of the LeadingDistinct::diff() subroutine. The (assumed) speed-up won't be significant unless you're processing millions of symbols. Tested.

sub diff {
    ($_[0] ^ $_[1]) =~ m{ \A \x00* }xms;
    return $+[0];
    }
[download]

Give a man a fish: <%-(-(-(-<

In reply to Re: Script to reduce tokens to minimal unique characters by AnomalousMonk
in thread Script to reduce tokens to minimal unique characters by Anonymous Monk

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.