There.
A trie is not only smaller, it's faster than a hash in this situation.
Then I build a tree that differentiates word branches from non-word branches. There are comments, but they're rather cryptic. I included debug output to help understand.
use strict;
use warnings;
use constant DEBUG => 1;
use constant DICT => "2of4brif.txt";
my $trie;
sub load_dict {
#
# Constructs a trie from the dictionary.
#
open(my $fh, '<', DICT)
or die("Unable to open dictionary \"" . DICT . "\": $!\n");
while (<$fh>) {
chomp;
my $p = \$trie;
for ( split(//, $_), "\0\0" ) {
$p = \( $$p->{$_} );
}
}
}
sub words_from {
my ($str) = @_;
my @letters = split(//, $str);
my @lengths;
my $p = $trie;
my $i = 0;
for my $i ( 0 .. $#letters ) {
last if !exists( $p->{ $letters[$i] } );
$p = $p->{ $letters[$i] };
push @lengths, $i+1 if exists( $p->{ "\0\0" } );
}
return @lengths;
}
sub find_substrs {
my ($str) = @_;
my @w_substrs;
{
#
# First, construct the following structure from the input:
#
# p e n i s l a n d
# -------------------
# [p e n]
# [p e n i s]
# [i s]
# [i s l a n d]
# [l a n d]
# [a n]
# [a n d]
# -------------------
# 3 2 4 2
# 5 6 3
#
for my $i ( 0 .. length( $str )-1 ) {
$w_substrs[$i] = [ words_from( substr( $str, $i ) ) ];
}
}
if (DEBUG) {
require Data::Dumper;
Data::Dumper->import(qw( Dumper ));
no warnings 'once';
local $Data::Dumper::Terse = 1;
local $Data::Dumper::Indent = 0;
print( 'w_substrs: ', Dumper(\@w_substrs), "\n" );
}
my @n_substrs;
{
#
# Then, construct the following structure from the input:
#
# -------------------
# p e n i s l a n d
# -------------------
# [p] <-- Delete (Leads to nothing)
# [p e] <-- Delete (Leads to nothing)
# [p e n] <-- Delete and proceed (Word)
# [e] <-- Delete (Leads to nothing)
# [e n] <-- Keep (Leads to "is")
# [e n i] <-- Delete (Leads to nothing)
# [e n i s] <-- Delete and proceed (Word)
# [n] <-- Keep (Leads to "is")
# [n i] <-- Delete (Leads to nothing)
# [n i s] <-- Delete and proceed (Word)
# [s] <-- Keep (Leads to "land")
# [s l] <-- Keep (Leads to "an")
# [s l a] <-- Delete (Leads to nothing)
# [s l a n] <-- Delete and proceed (Word)
# [l] <-- Delete (Leads to nothing)
# [l a] <-- Delete (Leads to nothing)
# [l a n] <-- Delete and proceed (Word)
# [a] <-- Delete (Leads to nothing)
# [a n] <-- Delete and proceed (Word)
# [n] <-- Delete (Leads to nothing)
# [n d] <-- Keep (Leads to end)
# [d] <-- Keep (Leads to end)
# -------------------
# 2 1 1 1 2 1
# 2
#
# The actual implementation differs from above.
# While the worse case is O(N^2), the usual
# case is far more likely to resemble O(N).
#
my $j = @w_substrs;
for my $i ( reverse 0 .. $#w_substrs ) {
if ( @{$w_substrs[$i]} && $j-$i >= $w_substrs[$i][0] ) {
$n_substrs[$i] = [ ];
} elsif ( $j == @w_substrs ) {
$n_substrs[$i] = [ $j-$i ];
} else {
$n_substrs[$i] = [ map { $_+($j-$i) } 0, @{ $n_substrs[$j]
+ } ];
}
if ( @{$w_substrs[$i]} ) {
$j = $i;
}
}
}
if (DEBUG) {
require Data::Dumper;
Data::Dumper->import(qw( Dumper ));
no warnings 'once';
local $Data::Dumper::Terse = 1;
local $Data::Dumper::Indent = 0;
print( 'n_substrs: ', Dumper(\@n_substrs), "\n" );
}
return [ \@w_substrs, \@n_substrs ];
}
sub list_substrs {
my ($str, $substrs) = @_;
my ($w_substrs, $n_substrs) = @$substrs;
local *w_helper = sub {
my ($i) = @_;
my @results;
for my $l ( @{ $w_substrs->[$i] } ) {
my $substr = substr( $str, $i, $l );
if ($i + $l == @$w_substrs) {
push @results, [ $substr ];
} else {
push @results, map [ $substr, @$_ ], n_helper( $i + $l );
}
}
return @results;
};
local *n_helper = sub {
my ($i) = @_;
my @results = w_helper( $i );
for my $l ( @{ $n_substrs->[$i] } ) {
my $substr = "[" . substr( $str, $i, $l ) . "]";
if ($i + $l == @$n_substrs) {
push @results, [ $substr ];
} else {
push @results, map [ $substr, @$_ ], w_helper( $i + $l );
}
}
return @results;
};
return map join( ' ', @{$_->[0]} ),
sort { $a->[1] <=> $b->[1]
|| $a->[2] <=> $b->[2] }
map [ $_, scalar(grep /^\[/, @$_), scalar(@$_) ],
n_helper(0);
}
{
load_dict();
for my $input (qw( penisland zatxtaz xapenx )) {
print( "$input\n" );
print( ( "-" x length($input) ), "\n" );
my $substrs = find_substrs( $input );
for ( list_substrs( $input, $substrs ) ) {
print( "$_\n" );
}
print( "\n" );
}
}
penisland
---------
w_substrs: [[3,5],[],[],[2,6],[],[4],[2,3],[],[]]
n_substrs: [[],[2],[1],[],[1,2],[1],[],[2],[1]]
pen island
penis land
pen is land
penis [l] and
pen is [l] and
penis [l] an [d]
pen is [l] an [d]
zatxtaz
-------
w_substrs: [[],[2],[],[],[2],[],[]]
n_substrs: [[1],[],[2],[1],[],[2],[1]]
[z] at [x] ta [z]
xapenx
------
w_substrs: [[],[3],[3],[],[],[]]
n_substrs: [[1,2],[1],[],[3],[2],[1]]
[x] ape [nx]
[xa] pen [x]
|