in reply to Re^2: Finding nested directories from a string list.
in thread Finding nested directories from a string list.
For 1000, this takes about 30 seconds on my laptop (1.7GHz), about 100X more than the case for 100. Lots of handwaving would give you 30 minutes for 10K, and 2 days for 100K.#!/your/perl/here use strict; use warnings; use Benchmark qw(:all); use List::Util qw/shuffle/; my @words = 'a'..'g'; # create random pathnames my @paths; for (1..(shift or 1000)) { @words = shuffle(@words); push @paths, 'C:\\' . join '\\', @words[0..rand(@words)]; } # original, for test purposes #my @paths = ( # 'C:\\foo\\bar\\baz', # 'C:\\foo\\bar', # 'C:\\car', # 'C:\\c' # ); my $t0 = new Benchmark; my @nested = grep { my $candidate = $_; grep $candidate =~ /^\Q$_\E\\/, @paths } @paths; my $t1 = new Benchmark; print "$_\n" for @nested; print "Processing Time:", timestr(timediff($t1,$t0)), "\n"; exit;
Changing my init to match:
1000 takes 50ms (not counting the print, just as above). It seems to scale well, with 100K taking 1.1s, 1M taking 2.3s. (Maybe the input needs scaling better -- the redundancy is probably high based on the word count.)#!/your/perl/here use strict; use warnings; use Benchmark qw(:all); use List::Util qw/shuffle/; my @words = 'a'..'g'; ## create random pathnames my @paths; for (1..(shift or 1000)) { @words = shuffle(@words); push @paths, 'C:\\' . join '\\', @words[0..rand(@words)]; } #my @paths = ( # 'C:\\foo\\bar\\baz', # 'C:\\foo\\bar', # 'C:\\car', # 'C:\\c' # ); my $t0 = new Benchmark; # remove duplicates my %paths; @paths{@paths} = (1) x @paths; my @nested; # create the tree of paths seen my $tree = {}; foreach my $path ( keys %paths ) { my @dirs = split /\\+/, $path; # insert in hash my $node = $tree; foreach my $dir ( @dirs ) { if ( exists( $node->{LEAF} ) ) { push @nested, $path; } # create the next nested hash if needed $node->{$dir} = {} unless exists $node->{$dir}; # move pointer to that hash $node = $node->{$dir}; } # if there are keys at this level (other than LEAF) my $keys = keys %{$node}; if ( ( $keys > 1 ) or ( ( $keys == 1 ) and ( exists $node->{LEAF} ) ) ) { push @nested, $path; } # mark this as a leaf $node->{LEAF}=1; } my $t1 = new Benchmark; print "$_\n" for @nested; print "Processing Time:", timestr(timediff($t1,$t0)), "\n";
-QM
--
Quantum Mechanics: The dreams stuff is made of
|
|---|