That was the sort of thing I was thinking of. I should have realised that it wasn't trivial when it was so hard to actually describe :-)
In the end I went with a healthy dose of external knowledge, and got exactly what I was looking for:
#!/usr/bin/perl
use Algorithm::Bucketizer;
use Number::Format qw(:subs);
@patterns = (
qr/(.*)\(*dis[ck]\s\d+\)*$/i,
qr/(.*)cd\s*\d+$/,
qr/^([^-]+\s\-\s).*$/,
);
$mediasize = 4.4 * 1000 * 1024 * 1024;
print "Buckets are ", format_bytes($mediasize), " each.\n";
# Create a bucketizer
my $b = Algorithm::Bucketizer->new( bucketsize => $mediasize, algorith
+m => "retry" );
while (<DATA>) {
chomp; chomp;
( $name, $asize, $path ) = split(/\t/);
print $path,"\n";
if($path eq "") { $skipped_size += $asize; } else { $sizes{$pa
+th} = $asize; }
}
print "\n";
$groupsize=0; $whole_line="";
foreach $line (sort keys %sizes)
{
$group = 0;
$mysize = $sizes{$line};
foreach $re (@patterns)
{
($result) = ($line =~ m/$re/);
if( ($result ne "") && ($result eq $prevresults{$re}))
{
$group=1;
}
$prevresults{$re} = $result;
}
# override the pattern matching, if we're about to create an u
+nfileable lump
if( ($groupsize + $mysize) > $mediasize) { $group = 0; }
if( (!$group) && ($groupsize>0))
{
print "COMMITING $groupcount ($groupsize): $whole_line
+\n";
$b->add_item($whole_line,$groupsize) || die($!);
$whole_line = "";
$groupsize = 0;
$groupcount = 0;
}
$whole_line .= $line."\t";
$groupcount++;
$groupsize += $mysize;
}
print "COMMITING $groupcount ($groupsize): $whole_line\n";
$b->add_item($whole_line,$groupsize) || die($!);
$b->optimize( algorithm => "random", maxtime => 60 );
for my $bucket ( $b->buckets() ) {
print "\n\n# Bucket ",$bucket->serial(), " has ", format_bytes($bu
+cket->level()), " in it.\n";
for my $item ( $bucket->items() ) {
$item =~ s/\t$//;
print " ",join("\n ",split(/\t/,$item)),"\n";
}
print "\n";
}
print "\n\nWe skipped ",format_bytes($skipped_size)," of Misc.\n\n";
__DATA__
The Creatures - A bestiary of 85907653 /var/music/WholeAlbums
+2/The Creatures - A bestiary of
The Cure - The Head On The Door 54978322 /var/music/WholeAlbums
+2/The Cure - The Head On The Door
The Disposable Heroes Of Hiphoprisy - Hypocracy Is The Greatest Luxury
+ 99314506 /var/music/WholeAlbums2/The Disposable Heroes Of Hi
+phoprisy - Hypocracy Is The Greatest Luxury
The Hives - Your New Favourite Band 33943504 /var/music/Who
+leAlbums2/The Hives - Your New Favourite Band
The Jimi Hendrix Experience - Electric Ladyland 110201733 /var/m
+usic/WholeAlbums2/The Jimi Hendrix Experience - Electric Ladyland
Prince - Emancipation - Disc 1 78945501 /var/music/WholeAlbums
+2/Prince - Emancipation - Disc 1
Prince - Emancipation - Disc 2 80622538 /var/music/WholeAlbums
+2/Prince - Emancipation - Disc 2
Prince - Emancipation - Disc 3 77673429 /var/music/WholeAlbums
+2/Prince - Emancipation - Disc 3
Talking Heads - Stop Making Sense 1992195 /var/music/WholeAlbums
+2/Talking Heads - Stop Making Sense
Talking Heads - The Name Of This Band Is Talking Heads 125134992
+ /var/music/WholeAlbums2/Talking Heads - The Name Of This Band Is Ta
+lking Heads
Tracy Chapman - Matters of the Heart 63415367 /var/music/Who
+leAlbums2/Tracy Chapman - Matters of the Heart
Turin Brakes - the Optimist LP 52981417 /var/music/WholeAlbums
+2/Turin Brakes - the Optimist LP
The Vines - Highly Evolved 127977590 /var/music/WholeAlbums
+2/The Vines - Highly Evolved
The Wannadies - Bagsy Me 47384936 /var/music/WholeAlbums
+2/The Wannadies - Bagsy Me
|