=head1 NAME C - partitions a set of files into subsets of approx. equal size =head1 SYNOPSIS use FileSetPartitioner; $fsp = new FileSetPartitioner (5); $rlPartitions = $fsp->makePartition (\@myFiles); $rlPartitions2 = $fsp->reusePartition (\@fileRelatedStuff); =head1 DESCRIPTION =head2 CONSTRUCTOR ARGUMENTS =over 4 =item C<$numParts> Number of subsets into which a set of files is to be split. =item C<$maxSize> Maximum size (bytes) of each subset. This may be violated if some of the files are larger than C<$maxSize>. B Either C<$numParts> or C<$maxSize>, but not both, must be set in the constructor. =item C<$adaptSize> If true, the maximum size of the subsets (either given explicitly in the constructor or calculated by dividing the total size of all files by the number of requested subsets) will be replaced by the size of the largest file in the set if this is greater than the original max. size. =item C<$balanceSets> If C<$balanceSets == 1>, the smallest sets will be joint if they are much smaller than the maximum subset size (resulting in sets that are larger than the max. subset size). If C<$balance == 2>, the maximum subset size will be adjusted to the size of the largest subset. This will lead to a more even distribution, but can give partitions that are quite off the originally requested parameters. =back =head2 METHODS =over 4 =item C Tries to split the set of files C<$rlFiles> into subsets of equal size. The size and number of these subsets depend on the constructor arguments C<$numParts>, C<$maxSize>, C<$adaptSize>, and C<$balanceSets>, see there. C<$ext> and C<$path> are prepended / appended to the filenames in $rlFiles, if they are supplied - this facilitates processing of lists that contain only file name stems. Only one path and extension are supported. C returns a reference to an array of arrays. The outer array contains the subsets, the innner one the files in one subset, in the format in which they appear in the input list C<$rlFiles>. C<$rlFiles> is not changed. Files with zero length are omitted and do not appear in any of the subsets formed. A warning is output for each such file. =item C Splits the list C<$rlItems> in the same way as the last file list passed to C. The length of C<$rlItems> must be the same as that of C<$rlFiles> passed to the last C call; otherwise the method exits. This method is handy if you have additional information for each of the files in the original set stored in a separate list. The return value is a reference to an array of arrays; the inner arrays contain the entries of C<$rlItems> in the proper ordering. C<$rlItems> is not changed. =back =head1 AUTHOR Robert Hecht =cut package FileSetPartitioner; use strict; use Carp qw(croak confess); sub new { my ($pkg, $numParts, $maxSize, $adaptSize, $balance) = @_; die "Must set either number of required partitions or max size of one partition, but not both!\n" if (!($numParts || $maxSize) || ($numParts && $maxSize)); bless { _numPartitions => $numParts, _maxSize => $maxSize, _adaptSize => $adaptSize, _balanceSets => $balance, _lastSetSize => 0, _rlLastPartition => [] }, $pkg; } sub makePartition { my ($this, $rlFiles, $ext, $path) = @_; #standardization for path and ext $path = '' unless defined $path; $path .= '/' if $path && $path !~ /\/$/; $ext = defined $ext ? ".$ext" : ''; #handle path and extension my @files = map {"$path$_$ext"} @$rlFiles; #sort files for size my ($totalsize, %indBySize); $totalsize = $this->_getSizes (\@files, \%indBySize); #compute max size of subset my $maxSize = ($this->{_maxSize} ? $this->{_maxSize} : int ($totalsize / $this->{_numPartitions})); my (@partitions, @sortedSizes); while (@sortedSizes = sort {$b <=> $a} keys %indBySize) { my $size = shift @sortedSizes; my @partition = ($indBySize{$size}); delete $indBySize{$size}; $maxSize = $size if $this->{_adaptSize} && ($size > $maxSize); while (@sortedSizes && $size <= $maxSize) { shift @sortedSizes while (@sortedSizes && ($sortedSizes[0] + $size > $maxSize)); last unless @sortedSizes; my $tmpSize = shift @sortedSizes; $size += $tmpSize; push @partition, $indBySize{$tmpSize}; delete $indBySize{$tmpSize}; } push @partitions, {size => $size, partition => \@partition}; } #balance sizes if required my @sortedPartitions; if ($this->{_balanceSets}) { while (1) { @sortedPartitions = sort {$a->{size} <=> $b->{size}} @partitions; $maxSize = $sortedPartitions[-1]->{size} if $this->{_balanceSets} > 1; my $smallest = $sortedPartitions[0]->{size}; my $second = $sortedPartitions[1]->{size}; if (($smallest + $second - $maxSize) < ($maxSize - $smallest)) { my $smallestPart = shift @sortedPartitions; $sortedPartitions[0]->{size} += $smallest; push @{$sortedPartitions[0]->{partition}}, @{$smallestPart->{partition}}; } last if $#sortedPartitions == $#partitions; @partitions = @sortedPartitions; }; @partitions = sort {$a->{size} <=> $b->{size}} @sortedPartitions; } #store the results $this->{_lastSetSize} = scalar @$rlFiles; $this->{_rlLastPartition} = \@partitions; #get the files corresponding to the indices return $this->_applyMapping ($rlFiles); } sub reusePartition { my ($this, $rlItems) = @_; #sanity check my $lastSize = $this->{_lastSetSize}; if ($lastSize != @$rlItems) { confess "Wrong Number of Items in List (", scalar @$rlItems, " instead of $lastSize)!"; } #apply mapping return $this->_applyMapping ($rlItems); } sub _applyMapping { my ($this, $rlItems) = @_; my @result; #get the entry of $rlItem with the corresponding index foreach my $rlPartInd (@{$this->{_rlLastPartition}}) { push (@result, [map {$rlItems->[$_]} @{$rlPartInd->{partition}}]); } return \@result; } sub _getSizes { my ($this, $rlFiles, $rhIndBySize) = @_; #sort files for size my ($totalsize, $ind); $ind = -1; foreach my $file (@$rlFiles) { $ind++; if (!-e $file) { croak ("File $file does not exist!\n"); } my $size = -s $file; if ($size == 0) { warn ("File $file has size 0!\n"); next; } $totalsize += $size; $size++ while exists $rhIndBySize->{$size}; $rhIndBySize->{$size} = $ind; } return $totalsize; } 1;