#!/your/perl/here
use strict;
use warnings;
use Benchmark qw(:all);
use List::Util qw/shuffle/;
my @words = 'a'..'g';
# create random pathnames
my @paths;
for (1..(shift or 1000))
{
@words = shuffle(@words);
push @paths, 'C:\\' . join '\\', @words[0..rand(@words)];
}
# original, for test purposes
#my @paths = (
# 'C:\\foo\\bar\\baz',
# 'C:\\foo\\bar',
# 'C:\\car',
# 'C:\\c'
# );
my $t0 = new Benchmark;
my @nested = grep {
my $candidate = $_;
grep $candidate =~ /^\Q$_\E\\/, @paths
} @paths;
my $t1 = new Benchmark;
print "$_\n" for @nested;
print "Processing Time:", timestr(timediff($t1,$t0)), "\n";
exit;
For 1000, this takes about 30 seconds on my laptop (1.7GHz), about 100X more than the case for 100. Lots of handwaving would give you 30 minutes for 10K, and 2 days for 100K.
Changing my init to match:
#!/your/perl/here
use strict;
use warnings;
use Benchmark qw(:all);
use List::Util qw/shuffle/;
my @words = 'a'..'g';
## create random pathnames
my @paths;
for (1..(shift or 1000))
{
@words = shuffle(@words);
push @paths, 'C:\\' . join '\\', @words[0..rand(@words)];
}
#my @paths = (
# 'C:\\foo\\bar\\baz',
# 'C:\\foo\\bar',
# 'C:\\car',
# 'C:\\c'
# );
my $t0 = new Benchmark;
# remove duplicates
my %paths;
@paths{@paths} = (1) x @paths;
my @nested;
# create the tree of paths seen
my $tree = {};
foreach my $path ( keys %paths )
{
my @dirs = split /\\+/, $path;
# insert in hash
my $node = $tree;
foreach my $dir ( @dirs )
{
if ( exists( $node->{LEAF} ) )
{
push @nested, $path;
}
# create the next nested hash if needed
$node->{$dir} = {} unless exists $node->{$dir};
# move pointer to that hash
$node = $node->{$dir};
}
# if there are keys at this level (other than LEAF)
my $keys = keys %{$node};
if ( ( $keys > 1 )
or ( ( $keys == 1 )
and ( exists $node->{LEAF} ) ) )
{
push @nested, $path;
}
# mark this as a leaf
$node->{LEAF}=1;
}
my $t1 = new Benchmark;
print "$_\n" for @nested;
print "Processing Time:", timestr(timediff($t1,$t0)), "\n";
1000 takes 50ms (not counting the print, just as above). It seems to scale well, with 100K taking 1.1s, 1M taking 2.3s. (Maybe the input needs scaling better -- the redundancy is probably high based on the word count.)
-QM
--
Quantum Mechanics: The dreams stuff is made of
|