package Cube;
use strict;
use Data::Dumper;
my $glue = ":";
sub new {
my $proto = shift;
my $class = ref($proto) || $proto;
my $self = {};
bless ($self, $class);
$self->init(@_);
return $self;
}
sub init {
my ($self, @dims) = @_;
$self->{dims} = [sort @dims];
}
# set a cell to a constant value
sub set {
my ($self, $value, %cell) = @_;
$self->checkvalid(%cell);
my $addr = $self->cell2addr(%cell);
$self->{data}{$addr} = $value;
}
# return value of one cell
sub get {
my ($self, %cell) = @_;
$self->checkvalid(%cell);
my $addr = $self->cell2addr(%cell);
return $self->{data}{$addr};
}
# check if cell is empty
sub isempty {
my ($self, %cell) = @_;
$self->checkvalid(%cell);
my $addr = $self->cell2addr(%cell);
return !defined($self->{data}{$addr});
}
# return values of dimension as a hash
sub gethash {
my ($self, %const) = @_;
my ($dim) = &set_diff([$self->dims], [keys %const]);
my @dvals = $self->dimvals($dim);
my %x;
foreach (@dvals) {
$x{$_} = $self->get(%const, $dim=>$_);
}
return %x;
}
# return populated values of dimension as a hash
sub gethash_notempty {
my ($self, %const) = @_;
my ($dim) = &set_diff([$self->dims], [keys %const]);
my @dvals = $self->dimvals($dim);
my %x;
foreach (@dvals) {
my $val = $self->get(%const, $dim=>$_);
if (defined($val)) {$x{$_} = $val;}
}
return %x;
}
# return the dim structure of this cube
sub dims {
my $self = shift;
return @{$self->{dims}};
}
# get the entries for a certain dim
sub dimvals {
my ($self, $dim) = @_;
my $dimnum = $self->dimnum($dim);
my %vals;
foreach my $addr (keys %{$self->{data}}) {
my @dvals = split($glue,$addr);
$vals{$dvals[$dimnum]}++
}
return sort (keys %vals);
}
# reduce over
sub reduce {
my ($self, $aggref, @crushdims) = @_;
my %temp;
my $tot = scalar(keys %{$self->{data}});
my $done = 0;
foreach my $addr (keys %{$self->{data}}) {
my $reduced_addr = $self->reduce_address($addr, @crushdims);
push(@{$temp{$reduced_addr}}, $self->{data}{$addr});
$done++; if ($done % 5000 == 0) {
print "reducing ",int(100*$done/$tot), "\% done...\n";
}
}
my @dims = $self->dims();
my @reduced_dims = &set_diff(\@dims,\@crushdims);
my $n = Cube->new(@reduced_dims);
foreach my $ra (keys %temp) {
$n->{data}{$ra}= &{$aggref}(@{$temp{$ra}});
}
return $n;
}
# remap
sub remap {
my ($self, $newdim, $aggref, $remapref, $olddim) = @_;
my %temp;
foreach my $addr (keys %{$self->{data}}) {
my $remapped_addr =
$self->remap_address($addr, $newdim, $remapref, $olddim);
push(@{$temp{$remapped_addr}}, $self->{data}{$addr});
}
my @dims = $self->dims();
my $n = Cube->new(&set_diff(\@dims,[$olddim]), $newdim);
foreach my $ra (keys %temp) {
my $result = &{$aggref}(@{$temp{$ra}});
$n->{data}{$ra}=$result;
}
return $n;
}
# dump cube for debugging
sub dump {
my $self = shift;
my $result;
for my $addr (sort keys %{$self->{data}}) {
my %cell = $self->addr2cell($addr);
my $value = $self->{data}{$addr};
$result .= join(" ", map {"$_=$cell{$_}"} sort keys %cell) . " --
+> $value\n";
}
return $result;
}
# dump the dim structure for debugging
sub dumpdims {
my $self = shift;
my $result;
foreach ($self->dims) {
$result .= "$_ : " . join(" ", $self->dimvals($_)) . "\n";
}
return $result;
}
# save cube to a file
sub save {
my ($self, $fname) = @_;
open(F,">$fname") or die "can't write open $fname";
print F join("\t", $self->dims), "\n";
my $tot = scalar(keys %{$self->{data}});
my $done = 0;
foreach (keys %{$self->{data}}) {
print F $_, "\t", $self->{data}{$_}, "\n";
$done++; if ($done % 5000 == 0) {
print "saving to $fname ", int(100*$done/$tot), "\% done...\n"
+;
}
}
close(F);
}
sub restore {
my ($self, $fname) = @_;
open(F,$fname) or die "can't read open $fname";
print "restoring cube from $fname...\n";
while (<F>) {
chomp;
if ($. == 1) {
foreach (keys %{$self->{data}}) {delete $self->{data}{$_};};
$self->{dims} = [sort split];
next;
}
my ($addr, $val) = split;
$self->{data}{$addr} = $val;
}
close(F);
}
# --------- support functions ---------
# convert a cell to an address
sub cell2addr {
my ($self, %cell) = @_;
return join($glue, map {$cell{$_}} sort keys %cell);
}
# convert an address to a cell
sub addr2cell {
my ($self, $addr) = @_;
my @dims = $self->dims();
my @vals = split($glue,$addr);
my %cell;
for (my $i=0; $i<@dims; $i++) {
$cell{$dims[$i]} = $vals[$i];
}
return %cell;
}
# check if a cell is valid
sub checkvalid {
my ($self, %cell) = @_;
my %check;
foreach (@{$self->{dims}}) {$check{$_}++;}
foreach (keys %cell) {$check{$_}--;}
foreach (keys %check) {
if ($check{$_} ne "0") {
die "invalid cell!\n\t", Dumper(\%cell), "\n\t", Dumper($self-
+>{dims});
}
}
}
# get the indexnumber of a dimension
sub dimnum {
my ($self, $dim) = @_;
my @dims = sort @{$self->{dims}};
my %dims = map {($dims[$_],$_)} 0..(scalar(@dims)-1);
if (!defined($dims{$dim})) {
die "bad dim $dim in dimnum";
}
return $dims{$dim};
}
# compute a remapped address
sub remap_address {
my ($self, $addr, $newdim, $remapref, $olddim) = @_;
my %cell = $self->addr2cell($addr);
if (!defined($cell{$olddim})) {
die "invalid remap!\n\t", Dumper(\%cell), "\n\t", $olddim;
}
my $val = $cell{$olddim};
delete $cell{$olddim};
$cell{$newdim} = &{$remapref}($val);
return $self->cell2addr(%cell);
}
# compute a reduced address
sub reduce_address {
my ($self, $addr, @crushdims) = @_;
my %cell = $self->addr2cell($addr);
foreach (@crushdims) {
if (!defined($cell{$_})) {
die "invalid reduced address!\n\t", Dumper(@crushdims), "\n\t"
+, Dumper($self->{dims});
}
delete $cell{$_};
}
return $self->cell2addr(%cell);
}
#return set A - set B
sub set_diff {
my ($aref, $bref) = @_;
my @a = @{$aref};
my @b = @{$bref};
my %a;
foreach (@a) {$a{$_}++;}
foreach (@b) {
if (!defined($a{$_})) {
die "bad set_diff\n\t", Dumper($aref), "\n\t", Dumper($bref),
+"\n";
}
delete $a{$_};};
return keys %a;
}
1;
=head1 NAME
Cube - Implements a simple data cube.
=head1 SYNOPSIS
use Cube;
my $c = Cube->new(@dimlist);
my @dimlist = $c->dims;
$c->set($value, %cell);
my $value = $c->get(%cell);
my @dimvals = $c->dimsvals($dim);
my %x = $c->gethash(%partialcell);
my %y = $c->gethash_notempty(%partialcell);
my $p = $c->reduce($agg_func_ref, @reduction_dims);
my $k = $c->remap($newdim, $agg_func_ref, $remap_func_ref, $olddim);
print $c->dump, "\n";
print $c->dumpdims, "\n";
$c->save("savecube.txt");
my $d=Cube->new; $d->restore("savecube.txt");
=head1 DESCRIPTION
The Cube module implements a simple data cube.
A data cube is a hybrid of a high-dimensional matrix and a hash.
Like a matrix, a data cube stores data in cells indexed by
an arbitrary number of dimensions, allows the extraction
of submatrices, and permits easy aggregation and remapping of dimensio
+ns.
Like a hash, a data cube uses unordered strings rather than successive
+
integers as dimension values.
=head1 CUBE METHODS
The following methods are available.
=head2 new
my $c = Cube->new(@dimlist);
Creates a new cube with the given dimensions.
=head2 dims
my @dimlist = $c->dims;
Returns the current names of the dimensions.
=head2 set
$c->set($value, %cell);
Sets the value of a given cell.
=head2 get
my $value = $c->get(%cell);
Returns the value of a given cell.
=head2 dimvals
my @dimvals = $c->dimsvals($dim);
Returns the values of a given dimension.
=head2 gethash
my %x = $c->gethash(%partialcell);
Returns a slice of the cube as a hash. Every dimension but one
must be specified; the remaining dimension comes back as the
hash key.
=head2 gethash_nonempty
my %y = $c->gethash_notempty(%partialcell);
As above, but only nonempty hash values.
=head2 reduce
my $p = $c->reduce($agg_func_ref, @reduction_dims);
Collapse the cube by applying an aggregator function across
the reduction dimensions. Returns a new cube.
=head2 remap
my $k = $c->remap($newdim, $agg_func_ref, $remap_func_ref, $olddim);
Remap one dimension of the cube (eg changing units) by applying
a remap function and an aggregation function. Returns a new cube.
=head2 dump
print $c->dump, "\n";
Dump the cube for debugging.
=head2 dumpdims
print $c->dumpdims, "\n";
Dump the dimensions and the values for debugging.
=head2 save
$c->save("savecube.txt");
Save the cube as a textfile.
=head2 restore
my $d=Cube->new; $d->restore("savecube.txt");
Restore the cube from the textfile.
=head1 EXAMPLE
The following is a general example which demonstrates most of the feat
+ures of the module:
use strict;
use Cube;
use Data::Dumper;
print "Create a cube...\n\n";
my $c=Cube->new(qw(size color region));
print "Set some values...\n\n";
$c->set(5,size=>"medium", color=>"red", region=>"south");
$c->set(7,size=>"medium", color=>"blue", region=>"south");
$c->set(10,size=>"large", color=>"red", region=>"south");
$c->set(20,size=>"large", color=>"blue", region=>"south");
$c->set(30,size=>"small", color=>"red", region=>"south");
$c->set(40,size=>"small", color=>"blue", region=>"south");
$c->set(50,size=>"small", color=>"green", region=>"south");
print "Dump the cube...\n";
print $c->dump, "\n";
print "Retrieve a cell..\n";
print $c->get(size=>"small", color=>"green", region=>"south"), "\n\n
+";
print "Retrieve all the dimensions...\n";
print join(", ", $c->dims), "\n\n";
print "Retrieve all values for one dimensions...\n";
print join(", ", ($c->dimvals("size"))), "\n\n";
print "Retrieve a slice...\n";
my %x = $c->gethash(region=>'south', size=>'small');
print Dumper(\%x), "\n";
print "Reduce the cube by summing away two dimensions...\n";
my $p = $c->reduce(\&sum, qw(size color));
print $p->dump, "\n";
print "Remap the cube by changing units on a dimension...\n";
my $k = $c->remap("issmall", \&sum, \&is_small, "size");
print $k->dump, "\n";
sub sum {
my $tot = 0;
foreach (@_) {$tot += $_;}
return $tot;
}
sub is_small {
my $x = shift;
return 0 + ($x eq "small");
}
=head1 PORTABILITY
Cube.pm is pure perl.
=head1 BUGS
Values stored within a cube must not contain the glue character.
For speed, the module does not check this.
Since the module builds addresses using the values of the dimensions,
long values lead to more memory usage and slower execution times.
The module is slow.
=head1 AUTHOR
=for html <a href="http://perlmonks.com/index.pl?node_id=29594">nop</a
+>
In reply to Data cube
by nop
Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
Read Where should I post X? if you're not absolutely sure you're posting in the right place.
Please read these before you post! —
Posts may use any of the Perl Monks Approved HTML tags:
- a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
| |
For: |
|
Use: |
| & | | & |
| < | | < |
| > | | > |
| [ | | [ |
| ] | | ] |
Link using PerlMonks shortcuts! What shortcuts can I use for linking?
See Writeup Formatting Tips and other pages linked from there for more info.