in reply to Re^6: Strategy for randomizing large files via sysseek
in thread Strategy for randomizing large files via sysseek
use strict;
use warnings;
use List::Util ();    # loaded without imports so it cannot clash with the local shuffle()
use Tie::File;
use Benchmark::Timer;

# Upper bound of the record numbers written to the test file.
# NOTE(review): declared with `our`, presumably so it can be preset from the
# command line with perl's -s switch (-N=...) -- confirm. Defaults to 100000
# when unset or false.
our $N ||= 100000;

# shuffle(\@items)
#
# Takes a reference to an array and returns a reference to a NEW array that
# holds every element of @items exactly once, in random order. The caller's
# array is not modified.
#
# The elements themselves are returned (not random positions), so the result
# is always a true permutation of the input; an empty input yields [].
sub shuffle {
    my ($items) = @_;
    return [ List::Util::shuffle(@$items) ];
}

# Build the test file: one zero-padded, 30-digit record per line for every
# number in 0 .. $N (that is, $N + 1 records).
open my $out_fh, '>', 'junk.dat' or die $!;
printf {$out_fh} "%030d\n", $_ for 0 .. $N;
close $out_fh or die "Cannot close junk.dat: $!";

# Expose the file as an array of records.
my @lines;
tie @lines, 'Tie::File', 'junk.dat' or die "Cannot tie junk.dat: $!";

# Valid record indices run from 0 to $#lines; going up to scalar(@lines)
# would add one index past the end and emit a spurious blank line.
my @index_list = (0 .. $#lines);

my $timer = Benchmark::Timer->new;

$timer->start("shuffling $N lines");
my $new_order = shuffle(\@index_list);
$timer->stop("shuffling $N lines");

$timer->start("Writing New ordered File");
open my $new_fh, '>', 'sortedJunk.dat'
    or die "Cannot create sortedJunk.dat: $!";
for my $line_num (@$new_order) {
    # Records are printed with an explicit "\n" appended, as in the
    # original loop.
    print {$new_fh} $lines[$line_num], "\n";
}
# Close inside the timed region so the final buffer flush is counted and
# any deferred write error is reported.
close $new_fh or die "Cannot close sortedJunk.dat: $!";
$timer->stop("Writing New ordered File");

$timer->report();

untie @lines;
Replies are listed 'Best First'.

Re^8: Strategy for randomizing large files via sysseek
by BrowserUk (Patriarch) on Sep 16, 2004 at 08:34 UTC