I wrote this module when I had to process a bunch of files and had 7 machines to do it in parallel. Obviously, the processing was fastest when I distributed equal amount of data to each of the machines.
=head1 NAME
C<FileSetPartitioner> - partitions a set of files into subsets of appr
+ox. equal size
=head1 SYNOPSIS
use FileSetPartitioner;
$fsp = new FileSetPartitioner (5);
$rlPartitions = $fsp->makePartition (\@myFiles);
$rlPartitions2 = $fsp->reusePartition (\@fileRelatedStuff);
=head1 DESCRIPTION
=head2 CONSTRUCTOR ARGUMENTS
=over 4
=item C<$numParts>
Number of subsets into which a set of files is to be split.
=item C<$maxSize>
Maximum size (bytes) of each subset. This may be violated if some of t
+he files are larger than C<$maxSize>.
B<NOTE:> Either C<$numParts> or C<$maxSize>, but not both, must be set
+ in the constructor.
=item C<$adaptSize>
If true, the maximum size of the subsets (either given explicitly in t
+he constructor or calculated by dividing the total size of all files
+by the number of requested subsets) will be replaced by the size of t
+he largest file in the set if this is greater than the original max.
+size.
=item C<$balanceSets>
If C<$balanceSets == 1>, the smallest sets will be joint if they are m
+uch smaller than the maximum subset size (resulting in sets that are
+larger than the max. subset size). If C<$balance == 2>, the maximum s
+ubset size will be adjusted to the size of the largest subset. This w
+ill lead to a more even distribution, but can give partitions that ar
+e quite off the originally requested parameters.
=back
=head2 METHODS
=over 4
=item C<makePartition ($rlFiles, $ext, $path)>
Tries to split the set of files C<$rlFiles> into subsets of equal size
+. The size and number of these subsets depend on the constructor argu
+ments C<$numParts>, C<$maxSize>, C<$adaptSize>, and C<$balanceSets>,
+see there. C<$ext> and C<$path> are prepended / appended to the filen
+ames in $rlFiles, if they are supplied - this facilitates processing
+of lists that contain only file name stems. Only one path and extensi
+on are supported.
C<makePartition> returns a reference to an array of arrays. The outer
+array contains the subsets, the innner one the files in one subset, i
+n the format in which they appear in the input list C<$rlFiles>. C<$r
+lFiles> is not changed.
Files with zero length are omitted and do not appear in any of the sub
+sets formed. A warning is output for each such file.
=item C<reusePartition ($rlItems)>
Splits the list C<$rlItems> in the same way as the last file list pass
+ed to C<makePartition>. The length of C<$rlItems> must be the same as
+ that of C<$rlFiles> passed to the last C<makePartition> call; otherw
+ise the method exits. This method is handy if you have additional inf
+ormation for each of the files in the original set stored in a separa
+te list. The return value is a reference to an array of arrays; the i
+nner arrays contain the entries of C<$rlItems> in the proper ordering
+. C<$rlItems> is not changed.
=back
=head1 AUTHOR
Robert Hecht
=cut
package FileSetPartitioner;
use strict;
use Carp qw(croak confess);
sub new {
my ($pkg, $numParts, $maxSize, $adaptSize, $balance) = @_;
die "Must set either number of required partitions or max size of
+one partition, but not both!\n" if (!($numParts || $maxSize) || ($num
+Parts && $maxSize));
bless {
_numPartitions => $numParts,
_maxSize => $maxSize,
_adaptSize => $adaptSize,
_balanceSets => $balance,
_lastSetSize => 0,
_rlLastPartition => []
}, $pkg;
}
sub makePartition {
my ($this, $rlFiles, $ext, $path) = @_;
#standardization for path and ext
$path = '' unless defined $path;
$path .= '/' if $path && $path !~ /\/$/;
$ext = defined $ext ? ".$ext" : '';
#handle path and extension
my @files = map {"$path$_$ext"} @$rlFiles;
#sort files for size
my ($totalsize, %indBySize);
$totalsize = $this->_getSizes (\@files, \%indBySize);
#compute max size of subset
my $maxSize = ($this->{_maxSize} ? $this->{_maxSize} :
int ($totalsize / $this->{_numPartitions})
+);
my (@partitions, @sortedSizes);
while (@sortedSizes = sort {$b <=> $a} keys %indBySize) {
my $size = shift @sortedSizes;
my @partition = ($indBySize{$size});
delete $indBySize{$size};
$maxSize = $size if $this->{_adaptSize} && ($size > $maxSize);
while (@sortedSizes && $size <= $maxSize) {
shift @sortedSizes while (@sortedSizes &&
($sortedSizes[0] + $size > $maxSize));
last unless @sortedSizes;
my $tmpSize = shift @sortedSizes;
$size += $tmpSize;
push @partition, $indBySize{$tmpSize};
delete $indBySize{$tmpSize};
}
push @partitions, {size => $size, partition => \@partition};
}
#balance sizes if required
my @sortedPartitions;
if ($this->{_balanceSets}) {
while (1) {
@sortedPartitions = sort {$a->{size} <=> $b->{size}} @part
+itions;
$maxSize = $sortedPartitions[-1]->{size} if $this->{_balan
+ceSets} > 1;
my $smallest = $sortedPartitions[0]->{size};
my $second = $sortedPartitions[1]->{size};
if (($smallest + $second - $maxSize) < ($maxSize - $smalle
+st)) {
my $smallestPart = shift @sortedPartitions;
$sortedPartitions[0]->{size} += $smallest;
push @{$sortedPartitions[0]->{partition}}, @{$smallest
+Part->{partition}};
}
last if $#sortedPartitions == $#partitions;
@partitions = @sortedPartitions;
};
@partitions = sort {$a->{size} <=> $b->{size}} @sortedPartitio
+ns;
}
#store the results
$this->{_lastSetSize} = scalar @$rlFiles;
$this->{_rlLastPartition} = \@partitions;
#get the files corresponding to the indices
return $this->_applyMapping ($rlFiles);
}
sub reusePartition {
my ($this, $rlItems) = @_;
#sanity check
my $lastSize = $this->{_lastSetSize};
if ($lastSize != @$rlItems) {
confess "Wrong Number of Items in List (", scalar @$rlItems,
" instead of $lastSize)!";
}
#apply mapping
return $this->_applyMapping ($rlItems);
}
sub _applyMapping {
my ($this, $rlItems) = @_;
my @result;
#get the entry of $rlItem with the corresponding index
foreach my $rlPartInd (@{$this->{_rlLastPartition}}) {
push (@result, [map {$rlItems->[$_]} @{$rlPartInd->{partition}
+}]);
}
return \@result;
}
sub _getSizes {
my ($this, $rlFiles, $rhIndBySize) = @_;
#sort files for size
my ($totalsize, $ind);
$ind = -1;
foreach my $file (@$rlFiles) {
$ind++;
if (!-e $file) {
croak ("File $file does not exist!\n");
}
my $size = -s $file;
if ($size == 0) {
warn ("File $file has size 0!\n");
next;
}
$totalsize += $size;
$size++ while exists $rhIndBySize->{$size};
$rhIndBySize->{$size} = $ind;
}
return $totalsize;
}
1;