in reply to Re^2: Finding nested directories from a string list.
in thread Finding nested directories from a string list.

#!/your/perl/here use strict; use warnings; use Benchmark qw(:all); use List::Util qw/shuffle/; my @words = 'a'..'g'; # create random pathnames my @paths; for (1..(shift or 1000)) { @words = shuffle(@words); push @paths, 'C:\\' . join '\\', @words[0..rand(@words)]; } # original, for test purposes #my @paths = ( # 'C:\\foo\\bar\\baz', # 'C:\\foo\\bar', # 'C:\\car', # 'C:\\c' # ); my $t0 = new Benchmark; my @nested = grep { my $candidate = $_; grep $candidate =~ /^\Q$_\E\\/, @paths } @paths; my $t1 = new Benchmark; print "$_\n" for @nested; print "Processing Time:", timestr(timediff($t1,$t0)), "\n"; exit;
For 1000, this takes about 30 seconds on my laptop (1.7GHz), about 100X more than the case for 100. Lots of handwaving would give you 30 minutes for 10K, and 2 days for 100K.

Changing my init to match:

#!/your/perl/here use strict; use warnings; use Benchmark qw(:all); use List::Util qw/shuffle/; my @words = 'a'..'g'; ## create random pathnames my @paths; for (1..(shift or 1000)) { @words = shuffle(@words); push @paths, 'C:\\' . join '\\', @words[0..rand(@words)]; } #my @paths = ( # 'C:\\foo\\bar\\baz', # 'C:\\foo\\bar', # 'C:\\car', # 'C:\\c' # ); my $t0 = new Benchmark; # remove duplicates my %paths; @paths{@paths} = (1) x @paths; my @nested; # create the tree of paths seen my $tree = {}; foreach my $path ( keys %paths ) { my @dirs = split /\\+/, $path; # insert in hash my $node = $tree; foreach my $dir ( @dirs ) { if ( exists( $node->{LEAF} ) ) { push @nested, $path; } # create the next nested hash if needed $node->{$dir} = {} unless exists $node->{$dir}; # move pointer to that hash $node = $node->{$dir}; } # if there are keys at this level (other than LEAF) my $keys = keys %{$node}; if ( ( $keys > 1 ) or ( ( $keys == 1 ) and ( exists $node->{LEAF} ) ) ) { push @nested, $path; } # mark this as a leaf $node->{LEAF}=1; } my $t1 = new Benchmark; print "$_\n" for @nested; print "Processing Time:", timestr(timediff($t1,$t0)), "\n";
1000 takes 50ms (not counting the print, just as above). It seems to scale well, with 100K taking 1.1s, 1M taking 2.3s. (Maybe the input needs scaling better -- the redundancy is probably high based on the word count.)

-QM
--
Quantum Mechanics: The dreams stuff is made of