# LeadingDistinct.pm 15jun15waw # for each token in a set of tokens, find minimum leading # substring that uniquely distinguishes the token # from all other tokens in the set. # works with Perl 5.8.9 package LeadingDistinct; use warnings; use strict; use List::Util qw(max); use List::MoreUtils qw(uniq); sub extract { my (@tokens, ) = @_; @tokens = ('', uniq(sort @tokens), ''); return map { $tokens[$_], distinguish(@tokens[ $_-1, $_, $_+1 ]) } 1 .. $#tokens-1 ; } sub distinguish { my ($before, $word, $after, ) = @_; # contiguous sequence of three alpha-sorted words return substr $word, 0, 1 + max diff($word, $before), diff($word, $after); } sub diff { my ($w1, $w2, ) = @_; my $x = $w1 ^ $w2; $x =~ s{ [^\x00] .* \z }{}xms; return length $x; } 1; #### # LeadingDistinct.t test LeadingDistinct.pm 15jun15waw use warnings; use strict; use Test::More # tests => ?? + 1 # Test::NoWarnings adds 1 test 'no_plan' ; use Test::NoWarnings; # normal tokens (alpha and underscore). use constant TOKENS => qw/ab abcd abcdef ghi ghij ghijk lnm lnmopq lnmopqrst uvw xyz dcba edcba xyzzy xyzzz pqrst pqrtu report_time report_day reset read /; use constant TOKENS_AND_UNIQUE => ( ab => 'ab', abcd => 'abcd', abcdef => 'abcde', dcba => 'd', edcba => 'e', ghi => 'ghi', ghij => 'ghij', ghijk => 'ghijk', lnm => 'lnm', lnmopq => 'lnmopq', lnmopqrst => 'lnmopqr', pqrst => 'pqrs', pqrtu => 'pqrt', read => 'rea', report_day => 'report_d', report_time => 'report_t', reset => 'res', uvw => 'u', xyz => 'xyz', xyzzy => 'xyzzy', xyzzz => 'xyzzz', ); # some tokens to test handling of metacharacters. use constant WEIRD_TOKENS => qw(foo? foo?* foo+++ foo?*+ foo???); use constant WEIRD_TOKENS_AND_UNIQUE => ( 'foo+++' => 'foo+', 'foo?' => 'foo?', 'foo?*' => 'foo?*', 'foo?*+' => 'foo?*+', 'foo???' => 'foo??', ); BEGIN { use_ok 'LeadingDistinct'; } is_deeply { LeadingDistinct::extract(TOKENS) }, { TOKENS_AND_UNIQUE }, "normal chars: alphas & underscores"; is_deeply { LeadingDistinct::extract(TOKENS, WEIRD_TOKENS) }, { TOKENS_AND_UNIQUE, WEIRD_TOKENS_AND_UNIQUE }, "normal chars and some metacharacters"; #### c:\@Work\Perl\monks\gator456>perl LeadingDistinct.t ok 1 - use LeadingDistinct; ok 2 - normal chars: alphas & underscores ok 3 - normal chars and some metacharacters ok 4 - no warnings 1..4 #### # extract_min_uniq_2.pl 15jun15waw use warnings; use strict; use LeadingDistinct; use Data::Dump qw(dd); use constant DEBUGGING => 0; use constant { PR_DB1 => 1 && DEBUGGING, PR_DB2 => 1 && DEBUGGING, PR_DB3 => 1 && DEBUGGING, }; my @tokens = qw/report_time report_day reset read/; my %min_distinct = LeadingDistinct::extract(@tokens); dd \%min_distinct if PR_DB1; for my $k (keys %min_distinct) { $min_distinct{$k} = max_optional_rx($k, $min_distinct{$k}); } dd \%min_distinct if PR_DB2; my ($find) = map qr{ (?) { next unless m{ \A ($find) }xms; print "$1 \n"; } # subroutines ###################################################### sub max_optional_rx { my ($whole, # full word $unique, # distinctive characters at start of word ) = @_; my @tail = split //, substr $whole, length $unique; my $opt_tail = optional_tail(@tail); return qr{ \Q$unique\E $opt_tail }xms; } sub optional_tail { my $rx = ''; return $rx unless @_; $rx = quotemeta shift @_; for my $c (@_) { $c = quotemeta $c; $rx .= " (?: $c)?"; } $rx = qr{ $rx }xms; return "$rx?"; } __DATA__ report_t 14:09:33 PDT report_d Fri Jun 12 2015 report (should not show up) res Resetting the time report_time 00:00:00 rea foo.bar Info: reading file foo.bar #### c:\@Work\Perl\monks\gator456>perl extract_min_uniq_2.pl report_t report_d res report_time rea #### sub diff { ($_[0] ^ $_[1]) =~ m{ \A \x00* }xms; return $+[0]; }