in reply to Script to reduce tokens to minimal unique characters
Here's an approach that also generates a hash of unique leading substrings for a set of strings. This hash is then used to generate a set of regexes for unique matching along the lines of QM's final thought above. This is more complex, but IMHO more flexible than some of the other approaches. YMMV.
File LeadingDistinct.pm:
# LeadingDistinct.pm 15jun15waw # for each token in a set of tokens, find minimum leading # substring that uniquely distinguishes the token # from all other tokens in the set. # works with Perl 5.8.9 package LeadingDistinct; use warnings; use strict; use List::Util qw(max); use List::MoreUtils qw(uniq); sub extract { my (@tokens, ) = @_; @tokens = ('', uniq(sort @tokens), ''); return map { $tokens[$_], distinguish(@tokens[ $_-1, $_, $_+1 ]) } 1 .. $#tokens-1 ; } sub distinguish { my ($before, $word, $after, ) = @_; # contiguous sequence of three alpha-sorted words return substr $word, 0, 1 + max diff($word, $before), diff($word, +$after); } sub diff { my ($w1, $w2, ) = @_; my $x = $w1 ^ $w2; $x =~ s{ [^\x00] .* \z }{}xms; return length $x; } 1;
File LeadingDistinct.t:
Output:# LeadingDistinct.t test LeadingDistinct.pm 15jun15waw use warnings; use strict; use Test::More # tests => ?? + 1 # Test::NoWarnings adds 1 test 'no_plan' ; use Test::NoWarnings; # normal tokens (alpha and underscore). use constant TOKENS => qw/ab abcd abcdef ghi ghij ghijk lnm lnmopq lnmopqrst uvw xyz dcba edcba xyzzy xyzzz pqrst pqrtu report_time report_day reset read /; use constant TOKENS_AND_UNIQUE => ( ab => 'ab', abcd => 'abcd', abcdef => 'abcde', dcba => 'd', edcba => 'e', ghi => 'ghi', ghij => 'ghij', ghijk => 'ghijk', lnm => 'lnm', lnmopq => 'lnmopq', lnmopqrst => 'lnmopqr', pqrst => 'pqrs', pqrtu => 'pqrt', read => 'rea', report_day => 'report_d', report_time => 'report_t', reset => 'res', uvw => 'u', xyz => 'xyz', xyzzy => 'xyzzy', xyzzz => 'xyzzz', ); # some tokens to test handling of metacharacters. use constant WEIRD_TOKENS => qw(foo? foo?* foo+++ foo?*+ foo???); use constant WEIRD_TOKENS_AND_UNIQUE => ( 'foo+++' => 'foo+', 'foo?' => 'foo?', 'foo?*' => 'foo?*', 'foo?*+' => 'foo?*+', 'foo???' => 'foo??', ); BEGIN { use_ok 'LeadingDistinct'; } is_deeply { LeadingDistinct::extract(TOKENS) }, { TOKENS_AND_UNIQUE }, "normal chars: alphas & underscores"; is_deeply { LeadingDistinct::extract(TOKENS, WEIRD_TOKENS) }, { TOKENS_AND_UNIQUE, WEIRD_TOKENS_AND_UNIQUE }, "normal chars and some metacharacters";
c:\@Work\Perl\monks\gator456>perl LeadingDistinct.t ok 1 - use LeadingDistinct; ok 2 - normal chars: alphas & underscores ok 3 - normal chars and some metacharacters ok 4 - no warnings 1..4
File extract_min_uniq_2.pl:
Output:# extract_min_uniq_2.pl 15jun15waw use warnings; use strict; use LeadingDistinct; use Data::Dump qw(dd); use constant DEBUGGING => 0; use constant { PR_DB1 => 1 && DEBUGGING, PR_DB2 => 1 && DEBUGGING, PR_DB3 => 1 && DEBUGGING, }; my @tokens = qw/report_time report_day reset read/; my %min_distinct = LeadingDistinct::extract(@tokens); dd \%min_distinct if PR_DB1; for my $k (keys %min_distinct) { $min_distinct{$k} = max_optional_rx($k, $min_distinct{$k}); } dd \%min_distinct if PR_DB2; my ($find) = map qr{ (?<! [[:alpha:]_]) (?: $_) (?! [[:alpha:]_]) }xms, join '|', values %min_distinct ; dd $find if PR_DB3; while (<DATA>) { next unless m{ \A ($find) }xms; print "$1 \n"; } # subroutines ###################################################### sub max_optional_rx { my ($whole, # full word $unique, # distinctive characters at start of word ) = @_; my @tail = split //, substr $whole, length $unique; my $opt_tail = optional_tail(@tail); return qr{ \Q$unique\E $opt_tail }xms; } sub optional_tail { my $rx = ''; return $rx unless @_; $rx = quotemeta shift @_; for my $c (@_) { $c = quotemeta $c; $rx .= " (?: $c)?"; } $rx = qr{ $rx }xms; return "$rx?"; } __DATA__ report_t 14:09:33 PDT report_d Fri Jun 12 2015 report (should not show up) res Resetting the time report_time 00:00:00 rea foo.bar Info: reading file foo.bar
c:\@Work\Perl\monks\gator456>perl extract_min_uniq_2.pl report_t report_d res report_time rea
Update: Here's a simpler (and I expect faster, but I've done no Benchmark-ing) definition of the LeadingDistinct::diff() subroutine. The (assumed) speed-up won't be significant unless you're processing millions of symbols. Tested.
sub diff { ($_[0] ^ $_[1]) =~ m{ \A \x00* }xms; return $+[0]; }
Give a man a fish: <%-(-(-(-<
|
|---|