Category: Misc
Author/Contact Info nop
Description: Implements a data cube in perl. See Data cube in Perl? for details. Unlike the structure proposed in Data cube in Perl?, this version does not check its dimension values. See included pod documentation.
package Cube;

use strict;
use Data::Dumper;

my $glue = ":";

sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self  = {};
    bless ($self, $class);
    $self->init(@_);
    return $self;
}

sub init {
    my ($self, @dims) = @_;
    $self->{dims} = [sort @dims];
}

# set a cell to a constant value

sub set {
    my ($self, $value, %cell) = @_;
    $self->checkvalid(%cell);
    my $addr = $self->cell2addr(%cell);
    $self->{data}{$addr} = $value;
}

# return value of one cell

sub get {
    my ($self, %cell) = @_;
    $self->checkvalid(%cell);
    my $addr = $self->cell2addr(%cell);
    return $self->{data}{$addr};
}

# check if cell is empty

sub isempty {
    my ($self, %cell) = @_;
    $self->checkvalid(%cell);
    my $addr = $self->cell2addr(%cell);
    return !defined($self->{data}{$addr});
}

# return values of dimension as a hash

sub gethash {
    my ($self, %const) = @_;
    my ($dim) = &set_diff([$self->dims], [keys %const]);
    my @dvals = $self->dimvals($dim);
    my %x;
    foreach (@dvals) {
    $x{$_} = $self->get(%const, $dim=>$_);
    }
    return %x;
}

# return populated values of dimension as a hash

sub gethash_notempty {
    my ($self, %const) = @_;
    my ($dim) = &set_diff([$self->dims], [keys %const]);
    my @dvals = $self->dimvals($dim);
    my %x;
    foreach (@dvals) {
    my $val = $self->get(%const, $dim=>$_); 
    if (defined($val)) {$x{$_} = $val;}
    }
    return %x;
}

# return the dim structure of this cube

sub dims {
    my $self = shift;
    return @{$self->{dims}};
}

# get the entries for a certain dim

sub dimvals {
    my ($self, $dim) = @_;
    my $dimnum = $self->dimnum($dim);
    my %vals;
    foreach my $addr (keys %{$self->{data}}) {
    my @dvals = split($glue,$addr);
    $vals{$dvals[$dimnum]}++
    }
    return sort (keys %vals);
}

# reduce over

sub reduce {
    my ($self, $aggref, @crushdims) = @_;
    my %temp;
    my $tot = scalar(keys %{$self->{data}});
    my $done = 0;
    foreach my $addr (keys %{$self->{data}}) {
    my $reduced_addr = $self->reduce_address($addr, @crushdims);
    push(@{$temp{$reduced_addr}}, $self->{data}{$addr});
    $done++; if ($done % 5000 == 0) {
        print "reducing ",int(100*$done/$tot), "\% done...\n";
    }
    }
    my @dims = $self->dims();
    my @reduced_dims = &set_diff(\@dims,\@crushdims);
    my $n = Cube->new(@reduced_dims);
    foreach my $ra (keys %temp) {
    $n->{data}{$ra}= &{$aggref}(@{$temp{$ra}});
    }
    return $n;
}

# remap 

sub remap {
    my ($self, $newdim, $aggref, $remapref, $olddim) = @_;
    my %temp;
    foreach my $addr (keys %{$self->{data}}) {
    my $remapped_addr =
        $self->remap_address($addr, $newdim, $remapref, $olddim);
    push(@{$temp{$remapped_addr}}, $self->{data}{$addr});
    }
    my @dims = $self->dims();
    
    my $n = Cube->new(&set_diff(\@dims,[$olddim]), $newdim);
    foreach my $ra (keys %temp) {
    my $result = &{$aggref}(@{$temp{$ra}});
    $n->{data}{$ra}=$result;
    }
    return $n;
}

# dump cube for debugging

sub dump {
    my $self = shift;
    my $result;
    for my $addr (sort keys %{$self->{data}}) {
    my %cell = $self->addr2cell($addr);
    my $value = $self->{data}{$addr};
    $result .= join(" ", map {"$_=$cell{$_}"} sort keys %cell) .  " --
+> $value\n";
    }
    return $result;
}

# dump the dim structure for debugging

sub dumpdims {
    my $self = shift;
    my $result;
    foreach ($self->dims) {
    $result .= "$_ : " . join(" ", $self->dimvals($_)) .  "\n";
    }
    return $result;
}

# save cube to a file

sub save {
    my ($self, $fname) = @_;
    open(F,">$fname") or die "can't write open $fname";
    print F join("\t", $self->dims), "\n";
    my $tot = scalar(keys %{$self->{data}});
    my $done = 0;
    foreach (keys %{$self->{data}}) {
    print F $_, "\t", $self->{data}{$_}, "\n";
    $done++; if ($done % 5000 == 0) {
        print "saving to $fname ", int(100*$done/$tot), "\% done...\n"
+;
    }
    }
    close(F);
}

sub restore {
    my ($self, $fname) = @_;
    open(F,$fname) or die "can't read open $fname";
    print "restoring cube from $fname...\n";
    while (<F>) {
    chomp;
    if ($. == 1) {
        foreach (keys %{$self->{data}}) {delete $self->{data}{$_};};
        $self->{dims} = [sort split];        
        next;
    }
    my ($addr, $val) = split;
    $self->{data}{$addr} = $val;
    }
    close(F);
}

# --------- support functions ---------
    
# convert a cell to an address

sub cell2addr {
    my ($self, %cell) = @_;
    return join($glue, map {$cell{$_}} sort keys %cell);
}

# convert an address to a cell

sub addr2cell {
    my ($self, $addr) = @_;
    my @dims = $self->dims();
    my @vals = split($glue,$addr);
    my %cell;
    for (my $i=0; $i<@dims; $i++) {
    $cell{$dims[$i]} = $vals[$i];
    }
    return %cell;
}

# check if a cell is valid

sub checkvalid {
    my ($self, %cell) = @_;
    my %check;
    foreach (@{$self->{dims}}) {$check{$_}++;}
    foreach (keys %cell) {$check{$_}--;}
    foreach (keys %check) {
    if ($check{$_} ne "0") {
        die "invalid cell!\n\t", Dumper(\%cell), "\n\t", Dumper($self-
+>{dims});
    }
    }
}

# get the indexnumber of a dimension

sub dimnum {
    my ($self, $dim) = @_;
    my @dims = sort @{$self->{dims}};
    my %dims = map {($dims[$_],$_)} 0..(scalar(@dims)-1);
    if (!defined($dims{$dim})) {
    die "bad dim $dim in dimnum";
    }
    return $dims{$dim};
}

# compute a remapped address

sub remap_address {
    my ($self, $addr, $newdim, $remapref, $olddim) = @_;
    my %cell = $self->addr2cell($addr);
    if (!defined($cell{$olddim})) {
    die "invalid remap!\n\t", Dumper(\%cell), "\n\t", $olddim;
    }
    my $val = $cell{$olddim};
    delete $cell{$olddim};
    $cell{$newdim} = &{$remapref}($val);
    return $self->cell2addr(%cell);
}   
     
# compute a reduced address

sub reduce_address {
    my ($self, $addr, @crushdims) = @_;
    my %cell = $self->addr2cell($addr);
    foreach (@crushdims) {
    if (!defined($cell{$_})) {
        die "invalid reduced address!\n\t", Dumper(@crushdims), "\n\t"
+, Dumper($self->{dims});        
    }
    delete $cell{$_};
    }
    return $self->cell2addr(%cell);
}

#return set A - set B

sub set_diff {
    my ($aref, $bref) = @_;
    my @a = @{$aref};
    my @b = @{$bref};
    my %a;
    foreach (@a) {$a{$_}++;}
    foreach (@b) {
    if (!defined($a{$_})) {
        die "bad set_diff\n\t", Dumper($aref), "\n\t", Dumper($bref), 
+"\n";
    }
    delete $a{$_};};
    return keys %a;
}

1;

=head1 NAME

Cube - Implements a simple data cube.

=head1 SYNOPSIS

  use Cube;

  my $c = Cube->new(@dimlist);

  my @dimlist = $c->dims;

  $c->set($value, %cell);

  my $value = $c->get(%cell);

  my @dimvals = $c->dimsvals($dim);

  my %x = $c->gethash(%partialcell);

  my %y = $c->gethash_notempty(%partialcell);

  my $p = $c->reduce($agg_func_ref, @reduction_dims);

  my $k = $c->remap($newdim, $agg_func_ref, $remap_func_ref, $olddim);

  print $c->dump, "\n";

  print $c->dumpdims, "\n";

  $c->save("savecube.txt");

  my $d=Cube->new; $d->restore("savecube.txt");

=head1 DESCRIPTION

The Cube module implements a simple data cube.  
A data cube is a hybrid of a high-dimensional matrix and a hash.
Like a matrix, a data cube stores data in cells indexed by
an arbitrary number of dimensions, allows the extraction
of submatrices, and permits easy aggregation and remapping of dimensio
+ns.
Like a hash, a data cube uses unordered strings rather than successive
+ 
integers as dimension values.

=head1 CUBE METHODS

The following methods are available.

=head2 new

  my $c = Cube->new(@dimlist);

Creates a new cube with the given dimensions.

=head2 dims

  my @dimlist = $c->dims;

Returns the current names of the dimensions.

=head2 set

  $c->set($value, %cell);

Sets the value of a given cell. 

=head2 get

  my $value = $c->get(%cell);

Returns the value of a given cell.

=head2 dimvals

  my @dimvals = $c->dimsvals($dim);

Returns the values of a given dimension.

=head2 gethash

  my %x = $c->gethash(%partialcell);

Returns a slice of the cube as a hash.  Every dimension but one
must be specified; the remaining dimension comes back as the 
hash key.

=head2 gethash_nonempty

  my %y = $c->gethash_notempty(%partialcell);

As above, but only nonempty hash values.

=head2 reduce

  my $p = $c->reduce($agg_func_ref, @reduction_dims);

Collapse the cube by applying an aggregator function across
the reduction dimensions.  Returns a new cube.

=head2 remap

  my $k = $c->remap($newdim, $agg_func_ref, $remap_func_ref, $olddim);

Remap one dimension of the cube (eg changing units) by applying
a remap function and an aggregation function.  Returns a new cube.

=head2 dump

  print $c->dump, "\n";

Dump the cube for debugging.

=head2 dumpdims

  print $c->dumpdims, "\n";

Dump the dimensions and the values for debugging.

=head2 save

  $c->save("savecube.txt");

Save the cube as a textfile.

=head2 restore

  my $d=Cube->new; $d->restore("savecube.txt");

Restore the cube from the textfile.

=head1 EXAMPLE

The following is a general example which demonstrates most of the feat
+ures of the module:

  use strict;
  use Cube;
  use Data::Dumper;
  
  print "Create a cube...\n\n";
  my $c=Cube->new(qw(size color region));
  
  print "Set some values...\n\n";
  $c->set(5,size=>"medium", color=>"red", region=>"south");
  $c->set(7,size=>"medium", color=>"blue", region=>"south");
  $c->set(10,size=>"large", color=>"red", region=>"south");
  $c->set(20,size=>"large", color=>"blue", region=>"south");
  $c->set(30,size=>"small", color=>"red", region=>"south");
  $c->set(40,size=>"small", color=>"blue", region=>"south");
  $c->set(50,size=>"small", color=>"green", region=>"south");
  
  print "Dump the cube...\n";
  print $c->dump, "\n";
  
  print "Retrieve a cell..\n";
  print $c->get(size=>"small", color=>"green", region=>"south"), "\n\n
+";
  
  print "Retrieve all the dimensions...\n";
  print join(", ", $c->dims), "\n\n";
  
  print "Retrieve all values for one dimensions...\n";
  print join(", ", ($c->dimvals("size"))), "\n\n";
  
  print "Retrieve a slice...\n";
  my %x = $c->gethash(region=>'south',  size=>'small');
  print Dumper(\%x), "\n";
  
  print "Reduce the cube by summing away two dimensions...\n";
  my $p = $c->reduce(\&sum, qw(size color));
  print $p->dump, "\n";
  
  print "Remap the cube by changing units on a dimension...\n";
  my $k = $c->remap("issmall", \&sum, \&is_small, "size");
  print $k->dump, "\n";
  
  sub sum {
      my $tot = 0;
      foreach (@_) {$tot += $_;}
      return $tot;
  }
  
  sub is_small {
      my $x = shift;
      return 0 + ($x eq "small");
  }



=head1 PORTABILITY

Cube.pm is pure perl.

=head1 BUGS

Values stored within a cube must not contain the glue character.
For speed, the module does not check this.

Since the module builds addresses using the values of the dimensions,
long values lead to more memory usage and slower execution times.

The module is slow.

=head1 AUTHOR

=for html <a href="http://perlmonks.com/index.pl?node_id=29594">nop</a
+>