package Cube; use strict; use Data::Dumper; my $glue = ":"; sub new { my $proto = shift; my $class = ref($proto) || $proto; my $self = {}; bless ($self, $class); $self->init(@_); return $self; } sub init { my ($self, @dims) = @_; $self->{dims} = [sort @dims]; } # set a cell to a constant value sub set { my ($self, $value, %cell) = @_; $self->checkvalid(%cell); my $addr = $self->cell2addr(%cell); $self->{data}{$addr} = $value; } # return value of one cell sub get { my ($self, %cell) = @_; $self->checkvalid(%cell); my $addr = $self->cell2addr(%cell); return $self->{data}{$addr}; } # check if cell is empty sub isempty { my ($self, %cell) = @_; $self->checkvalid(%cell); my $addr = $self->cell2addr(%cell); return !defined($self->{data}{$addr}); } # return values of dimension as a hash sub gethash { my ($self, %const) = @_; my ($dim) = &set_diff([$self->dims], [keys %const]); my @dvals = $self->dimvals($dim); my %x; foreach (@dvals) { $x{$_} = $self->get(%const, $dim=>$_); } return %x; } # return populated values of dimension as a hash sub gethash_notempty { my ($self, %const) = @_; my ($dim) = &set_diff([$self->dims], [keys %const]); my @dvals = $self->dimvals($dim); my %x; foreach (@dvals) { my $val = $self->get(%const, $dim=>$_); if (defined($val)) {$x{$_} = $val;} } return %x; } # return the dim structure of this cube sub dims { my $self = shift; return @{$self->{dims}}; } # get the entries for a certain dim sub dimvals { my ($self, $dim) = @_; my $dimnum = $self->dimnum($dim); my %vals; foreach my $addr (keys %{$self->{data}}) { my @dvals = split($glue,$addr); $vals{$dvals[$dimnum]}++ } return sort (keys %vals); } # reduce over sub reduce { my ($self, $aggref, @crushdims) = @_; my %temp; my $tot = scalar(keys %{$self->{data}}); my $done = 0; foreach my $addr (keys %{$self->{data}}) { my $reduced_addr = $self->reduce_address($addr, @crushdims); push(@{$temp{$reduced_addr}}, $self->{data}{$addr}); $done++; if ($done % 5000 == 0) { print "reducing ",int(100*$done/$tot), "\% done...\n"; } } my @dims = $self->dims(); my @reduced_dims = &set_diff(\@dims,\@crushdims); my $n = Cube->new(@reduced_dims); foreach my $ra (keys %temp) { $n->{data}{$ra}= &{$aggref}(@{$temp{$ra}}); } return $n; } # remap sub remap { my ($self, $newdim, $aggref, $remapref, $olddim) = @_; my %temp; foreach my $addr (keys %{$self->{data}}) { my $remapped_addr = $self->remap_address($addr, $newdim, $remapref, $olddim); push(@{$temp{$remapped_addr}}, $self->{data}{$addr}); } my @dims = $self->dims(); my $n = Cube->new(&set_diff(\@dims,[$olddim]), $newdim); foreach my $ra (keys %temp) { my $result = &{$aggref}(@{$temp{$ra}}); $n->{data}{$ra}=$result; } return $n; } # dump cube for debugging sub dump { my $self = shift; my $result; for my $addr (sort keys %{$self->{data}}) { my %cell = $self->addr2cell($addr); my $value = $self->{data}{$addr}; $result .= join(" ", map {"$_=$cell{$_}"} sort keys %cell) . " --> $value\n"; } return $result; } # dump the dim structure for debugging sub dumpdims { my $self = shift; my $result; foreach ($self->dims) { $result .= "$_ : " . join(" ", $self->dimvals($_)) . "\n"; } return $result; } # save cube to a file sub save { my ($self, $fname) = @_; open(F,">$fname") or die "can't write open $fname"; print F join("\t", $self->dims), "\n"; my $tot = scalar(keys %{$self->{data}}); my $done = 0; foreach (keys %{$self->{data}}) { print F $_, "\t", $self->{data}{$_}, "\n"; $done++; if ($done % 5000 == 0) { print "saving to $fname ", int(100*$done/$tot), "\% done...\n"; } } close(F); } sub restore { my ($self, $fname) = @_; open(F,$fname) or die "can't read open $fname"; print "restoring cube from $fname...\n"; while () { chomp; if ($. == 1) { foreach (keys %{$self->{data}}) {delete $self->{data}{$_};}; $self->{dims} = [sort split]; next; } my ($addr, $val) = split; $self->{data}{$addr} = $val; } close(F); } # --------- support functions --------- # convert a cell to an address sub cell2addr { my ($self, %cell) = @_; return join($glue, map {$cell{$_}} sort keys %cell); } # convert an address to a cell sub addr2cell { my ($self, $addr) = @_; my @dims = $self->dims(); my @vals = split($glue,$addr); my %cell; for (my $i=0; $i<@dims; $i++) { $cell{$dims[$i]} = $vals[$i]; } return %cell; } # check if a cell is valid sub checkvalid { my ($self, %cell) = @_; my %check; foreach (@{$self->{dims}}) {$check{$_}++;} foreach (keys %cell) {$check{$_}--;} foreach (keys %check) { if ($check{$_} ne "0") { die "invalid cell!\n\t", Dumper(\%cell), "\n\t", Dumper($self->{dims}); } } } # get the indexnumber of a dimension sub dimnum { my ($self, $dim) = @_; my @dims = sort @{$self->{dims}}; my %dims = map {($dims[$_],$_)} 0..(scalar(@dims)-1); if (!defined($dims{$dim})) { die "bad dim $dim in dimnum"; } return $dims{$dim}; } # compute a remapped address sub remap_address { my ($self, $addr, $newdim, $remapref, $olddim) = @_; my %cell = $self->addr2cell($addr); if (!defined($cell{$olddim})) { die "invalid remap!\n\t", Dumper(\%cell), "\n\t", $olddim; } my $val = $cell{$olddim}; delete $cell{$olddim}; $cell{$newdim} = &{$remapref}($val); return $self->cell2addr(%cell); } # compute a reduced address sub reduce_address { my ($self, $addr, @crushdims) = @_; my %cell = $self->addr2cell($addr); foreach (@crushdims) { if (!defined($cell{$_})) { die "invalid reduced address!\n\t", Dumper(@crushdims), "\n\t", Dumper($self->{dims}); } delete $cell{$_}; } return $self->cell2addr(%cell); } #return set A - set B sub set_diff { my ($aref, $bref) = @_; my @a = @{$aref}; my @b = @{$bref}; my %a; foreach (@a) {$a{$_}++;} foreach (@b) { if (!defined($a{$_})) { die "bad set_diff\n\t", Dumper($aref), "\n\t", Dumper($bref), "\n"; } delete $a{$_};}; return keys %a; } 1; =head1 NAME Cube - Implements a simple data cube. =head1 SYNOPSIS use Cube; my $c = Cube->new(@dimlist); my @dimlist = $c->dims; $c->set($value, %cell); my $value = $c->get(%cell); my @dimvals = $c->dimsvals($dim); my %x = $c->gethash(%partialcell); my %y = $c->gethash_notempty(%partialcell); my $p = $c->reduce($agg_func_ref, @reduction_dims); my $k = $c->remap($newdim, $agg_func_ref, $remap_func_ref, $olddim); print $c->dump, "\n"; print $c->dumpdims, "\n"; $c->save("savecube.txt"); my $d=Cube->new; $d->restore("savecube.txt"); =head1 DESCRIPTION The Cube module implements a simple data cube. A data cube is a hybrid of a high-dimensional matrix and a hash. Like a matrix, a data cube stores data in cells indexed by an arbitrary number of dimensions, allows the extraction of submatrices, and permits easy aggregation and remapping of dimensions. Like a hash, a data cube uses unordered strings rather than successive integers as dimension values. =head1 CUBE METHODS The following methods are available. =head2 new my $c = Cube->new(@dimlist); Creates a new cube with the given dimensions. =head2 dims my @dimlist = $c->dims; Returns the current names of the dimensions. =head2 set $c->set($value, %cell); Sets the value of a given cell. =head2 get my $value = $c->get(%cell); Returns the value of a given cell. =head2 dimvals my @dimvals = $c->dimsvals($dim); Returns the values of a given dimension. =head2 gethash my %x = $c->gethash(%partialcell); Returns a slice of the cube as a hash. Every dimension but one must be specified; the remaining dimension comes back as the hash key. =head2 gethash_nonempty my %y = $c->gethash_notempty(%partialcell); As above, but only nonempty hash values. =head2 reduce my $p = $c->reduce($agg_func_ref, @reduction_dims); Collapse the cube by applying an aggregator function across the reduction dimensions. Returns a new cube. =head2 remap my $k = $c->remap($newdim, $agg_func_ref, $remap_func_ref, $olddim); Remap one dimension of the cube (eg changing units) by applying a remap function and an aggregation function. Returns a new cube. =head2 dump print $c->dump, "\n"; Dump the cube for debugging. =head2 dumpdims print $c->dumpdims, "\n"; Dump the dimensions and the values for debugging. =head2 save $c->save("savecube.txt"); Save the cube as a textfile. =head2 restore my $d=Cube->new; $d->restore("savecube.txt"); Restore the cube from the textfile. =head1 EXAMPLE The following is a general example which demonstrates most of the features of the module: use strict; use Cube; use Data::Dumper; print "Create a cube...\n\n"; my $c=Cube->new(qw(size color region)); print "Set some values...\n\n"; $c->set(5,size=>"medium", color=>"red", region=>"south"); $c->set(7,size=>"medium", color=>"blue", region=>"south"); $c->set(10,size=>"large", color=>"red", region=>"south"); $c->set(20,size=>"large", color=>"blue", region=>"south"); $c->set(30,size=>"small", color=>"red", region=>"south"); $c->set(40,size=>"small", color=>"blue", region=>"south"); $c->set(50,size=>"small", color=>"green", region=>"south"); print "Dump the cube...\n"; print $c->dump, "\n"; print "Retrieve a cell..\n"; print $c->get(size=>"small", color=>"green", region=>"south"), "\n\n"; print "Retrieve all the dimensions...\n"; print join(", ", $c->dims), "\n\n"; print "Retrieve all values for one dimensions...\n"; print join(", ", ($c->dimvals("size"))), "\n\n"; print "Retrieve a slice...\n"; my %x = $c->gethash(region=>'south', size=>'small'); print Dumper(\%x), "\n"; print "Reduce the cube by summing away two dimensions...\n"; my $p = $c->reduce(\&sum, qw(size color)); print $p->dump, "\n"; print "Remap the cube by changing units on a dimension...\n"; my $k = $c->remap("issmall", \&sum, \&is_small, "size"); print $k->dump, "\n"; sub sum { my $tot = 0; foreach (@_) {$tot += $_;} return $tot; } sub is_small { my $x = shift; return 0 + ($x eq "small"); } =head1 PORTABILITY Cube.pm is pure perl. =head1 BUGS Values stored within a cube must not contain the glue character. For speed, the module does not check this. Since the module builds addresses using the values of the dimensions, long values lead to more memory usage and slower execution times. The module is slow. =head1 AUTHOR =for html nop