http://qs1969.pair.com?node_id=383177
Category: Math/Statistics
Author/Contact Info

jdporter


Description: Statistics::SGT - Simple Good-Turing Frequency Estimator
use Statistics::SGT;
The function Statistics::SGT::sgt takes a set of ( int:frequency, int:frequency_of_frequency ) pairs, and applies the Simple Good-Turing technique for estimating the probabilities corresponding to the observed frequencies, and P.0, the joint probability of all unobserved species.

The input should consist of a list of array-refs, each of which contains two positive integers: an observed frequency, and the frequency of that frequency.

No checks are made for linearity; the routine simply assumes that the requirements for using the SGT estimator are met.

The output is a list of ( int:frequency, [0,1.0]:estimated_probability ) pairs. The set of frequencies is the same as in the input array, plus the addition of a ( 0, P.0 ) element.

See also: http://www.grsampson.net/RGoodTur.html

=head1 NAME

Statistics::SGT - Simple Good-Turing Frequency Estimator

=head1 SYNOPSIS

  use Statistics::SGT;

=head1 DESCRIPTION

The function C<Statistics::SGT::sgt> takes a set of
I<S<( int:frequency, int:frequency_of_frequency )>> pairs, and
applies the Simple Good-Turing technique for estimating
the probabilities corresponding to the observed frequencies,
and P.0, the joint probability of all unobserved species.

The input should consist of a list of array-refs, each of which
contains two positive integers: an observed frequency, and the
frequency of that frequency.

No checks are made for linearity; the routine simply assumes that the
requirements for using the SGT estimator are met.

The output is a list of I<S<( int:frequency, [0,1.0]:estimated_probabi
+lity )>> pairs.
The set of frequencies is the same as in the input array,
plus the addition of a I<S<( 0, P.0 )>> element.

=head1 REFERENCES

This is a Perl adaptation of a C program originally written by:

B<Geoffrey Sampson,
School of Cognitive and Computing Sciences,
University of Sussex (England),
1995-06-27.>

The Simple Good-Turing technique was devised by William A. Gale of AT&
+T Bell Labs.
It is described in:

B<Gale, William A., and Geoffrey Sampson,
'Good-Turing Frequency Estimation Without Tears',
I<Journal of Quantitative Linguistics> 2.217-37, 1995.>

See also: http://www.grsampson.net/RGoodTur.html

=cut

package Statistics::SGT;

use Exporter;
use strict;
use vars qw( @ISA @EXPORT_OK $VERSION $MIN_INPUT $EPSILON );

@ISA = qw(Exporter);
@EXPORT_OK = qw( &sgt );

$VERSION = '0.01';

$MIN_INPUT = 5;

$EPSILON = 0.00000001; # use for n, in place of 0, to prevent divide-b
+y-zero.


{
  package SGT_::Smoother;

  sub slope { $_[0]{'slope'} }
  sub intercept { $_[0]{'intercept'} }

  sub smoothed { # double (int)
    my $self = shift;
    my $i = shift;
    exp( $self->intercept + $self->slope * log($i) );
  }

  sub new {
    my $pkg = shift;
    my $X_name = shift;
    my $Y_name = shift;
    my $ar = shift; # array ref (\@recs)
    ref($ar) && $ar =~ /\bARRAY\b/ or die "SGT_::Smoother::new($ar) --
+ arg not an arrayref!";
    @$ar > 0 or die "SGT_::Smoother::new($ar) -- array passed is empty
+!";

    my $meanX = 0.0;
    my $meanY = 0.0;
    for ( @$ar ) {
      $meanX += $_->{ $X_name };
      $meanY += $_->{ $Y_name };
    }
    $meanX /= scalar(@$ar);
    $meanY /= scalar(@$ar);

    my $XYs = 0.0;
    my $Xsquares = 0.0;
    for ( @$ar ) {
      $XYs += ($_->{ $X_name } - $meanX) * ($_->{ $Y_name } - $meanY);
      $Xsquares += square($_->{ $X_name } - $meanX);
    }
    $Xsquares != 0 or die "SGT_::Smoother::new: \$Xsquares is zero!";
    my $slope = $XYs / $Xsquares;
    bless {
      'slope'     => $slope,
      'intercept' => $meanY - $slope * $meanX,
    }, $pkg;
  }

  sub square($) { $_[0] ** 2.0 }

}


=head1 FUNCTIONS

=head2 sgt()

The input consists of a set of pairs of numbers ( r, n ).

Each pair can be in the form of an array, in which case the [0] elemen
+t is
taken for r, and the [1] element is taken for n.

Alternatively, it can take the form of a hash, in which case it must
contain an {'r'} element and an {'n'} element, which are used directly
+.

If called in a list context, this function returns a list of 'records'
+,
each of which is a hash, described below.  The array is ordered by
the [0] (r) element of each record.

If called in a scalar context, this function returns the records in
a hash, which is returned by reference.  This hash is keyed by the 'r'
element of each record.

If called in a void context, nothing is returned; this may be useful i
+f the
package variable $SET_OUTVALS is set to true prior to calling this fun
+ction.
If that variable is true, the values of the input array are modified i
+n the
following way:  if the value is an arrayref (r in [0] and n in [1]), t
+hen
the derived 'p' value is placed in position [2]; if the value is a has
+href,
then an element 'p' is added to the hash.

$SET_OUTVALS is false by default: the input array is not modified in a
+ny way.

The resulting records are hashrefs, each with the following elements:

   r
   n
   p       -- final derived result for this r

The records also contain some derived values only used internally to a
+lgorithm:

   log_r
   log_Z
   rStar
   j       -- the record's index in the record array.

This function throws an exception on any detected error.

=cut

#
#     in_*   *_AofA => [ [ ] ]
#    out_*   *_AofH => [ { } ]
#  inout_*   *_HofA => [ [ ] ]
#            *_HofH => { { } }
#
# rtn => ( AofA | AofH | HofA | HofH )
#

sub sgt {

  my %args = @_;

  my( $in_key, $out_key, $rtn_type ) = do {

    my @in_keys  = grep { /^(-?)in/i   } keys %args;
    my @out_keys = grep { /out_/i      } keys %args;
    my @rtn_keys = grep { /^(-?)rtn$/i } keys %args;

    my $rrrr = @in_keys ? 'many' : 'few';
    @in_keys == 1 or  die "Too $rrrr IN ARG specifiers! (must be exact
+ly 1)";
    @out_keys > 1 and die "Too many OUT ARG specifiers! (must be at mo
+st 1)";
    @rtn_keys > 1 and die "Too many RETURN  specifiers! (must be at mo
+st 1)";

    ( $in_keys[0], $out_keys[0], @rtn_keys ? $args{$rtn_keys[0]} : und
+ef );
  };


  my $in_arg_ref  = $args{$in_key};
  my $out_arg_ref = defined $out_key ? $args{$out_key} : undef ; # may
+ be undef

  # $rtn_type is a string of the form "AofH".

  ref($in_arg_ref) or die "IN ARG value must be either a array ref or 
+an hash ref!";

  if ( $out_arg_ref ) {
    ref($out_arg_ref) or die "OUT ARG value must be either a array ref
+ or an hash ref!";
  }


  # returns a vector of two single-character strings, e.g. ( 'a', 'h' 
+).
  my $extype = sub {
    my $kt = shift;
    defined $kt or return(' ',' '); # undef: no key.
    lc($kt) =~ /(?:^|_)([ah])of?([ah])$/ or die "Arg key type '$kt' do
+esn't end with _[AH]of[AH] !!!";
    ( $1, $2 )
  };

  my @in_arg_type  = $extype->( $in_key );
  my @out_arg_type = $extype->( $out_key );
  my @rtn_type     = $extype->( $rtn_type );

  my $in_arg_type  = join '', @in_arg_type;
  my $out_arg_type = join '', @out_arg_type;
     $rtn_type     = join '', @rtn_type;

  #
  # barf if the stupid user does something stupid like
  #   in_HofA => \@foo
  # or
  #   inout_AofH => \%foo
  #

  my %ul = qw( a ARRAY h HASH );

  $in_arg_ref =~ /\b$ul{$in_arg_type[0]}\b/ or die "Actual IN ARG valu
+e does not match specified type!
($ul{$in_arg_type[0]} specified, $in_arg_ref passed.";

  if ( $out_arg_ref ) {
    $out_arg_ref =~ /\b$ul{$out_arg_type[0]}\b/ or die "Actual OUT ARG
+ value does not match specified type!
($ul{$out_arg_type[0]} specified, $out_arg_ref passed.";
  }

  #
  # r and n are both supposed to be non-zero positive integers ("natur
+al numbers").
  #
  my %get_elem_rn = (
      'a' => sub {
        my $r = shift;
        @$r >= 2 or die "Too few elements in input element array";
        @$r;
      },

      'h' => sub {
        my $rec_hr = shift;
    my $default_r = shift;
    my $r = $rec_hr->{'r'};
        exists $rec_hr->{'n'} or die "Missing 'n' field in input eleme
+nt hash";
    if ( defined $r and defined $default_r ) {
      $r == $default_r or 
            die "'r' value in element ($r) != key of input hash ($defa
+ult_r) !!!";
    }
    unless ( defined $r ) { # what to do if 'r' not present
      if ( defined $default_r ) {
        $r = $default_r;
      }
      else {
        die "Missing 'r' field in input element hash";
      }
    }
        ( $r, $rec_hr->{'n'} );
      },
  );

  #
  # our working data structures.
  # %recs is the main thing; it is keyed by 'r' value.
  # @recs is essentially a perlish kludge for a doubly-linked list;
  #   these are the only operations it needs to support:
  #     prev/next; is_first/is_last;
  #   the elements of @recs are ordered by 'r' value.
  #
  my %recs;
  my @recs;

  #
  # if input is AofA, (r,n) is taken from [0,1]; both must be defined.
  # if input is AofH, (r,n) is taken from {'r','n'}; both must be defi
+ned.
  # if input is HofA, (r,n) is taken from [0,1], and [0] must equal th
+e hash key.
  # if input is HofH, (r,n) is taken from {'r','n'}, according to the 
+following rules:
  #    {'r'} may be missing, in which case (r) is taken from the hash 
+key;
  #    if {'r'} is present, it must equal the hash key.
  #
{
  my @keys = $in_arg_type[0] eq 'a' ? () : keys %$in_arg_ref;
  my @in_recs = $in_arg_type[0] eq 'a' ? @$in_arg_ref : @{$in_arg_ref}
+{@keys};


  for my $i ( 0 .. $#in_recs ) {

    my $el = $in_recs[$i];
    ref($el) or die "Input element not a reference! ($el)";
    $el =~ /\b$ul{$in_arg_type[1]}\b/ or die "Input element ref does n
+ot match specified type!
($ul{$in_arg_type[1]} specified, $in_arg_ref passed.";

    my( $r, $n ) = $get_elem_rn{$in_arg_type[1]}->($el, $keys[$i]); # 
+default r

    $recs{$r} = {
      'r' => $r,
      'n' => $n || $EPSILON,
      'log_r' => log($r),
    };
  }
}


  #
  # %recs has now been initialized from the input data.
  #

  exists $recs{1} or
    die "Error: no value '1' in input!";

  #@recs = sort { $a->{'r'} <=> $b->{'r'} } values %recs;
  @recs = @recs{ sort { $a <=> $b } keys %recs };

  @recs >= $MIN_INPUT or
    die "Not enough input values (\$Statistics\::SGT\::MIN_INPUT = $MI
+N_INPUT)";

  for my $j ( 0 .. $#recs ) {
    $recs[$j]{'j'} = $j;  # let records find themselves in the @recs a
+rray.
  }

  #
  # now we're done initializing our working structures (@recs, %recs) 
+from the input data;
  # commence to analyse the data:
  #

  for ( @recs ) {
    my $j = $_->{'j'};
    my $i = (
      $j == 0                   # first row?
        ? 0
        : $recs[$j - 1]{'r'}    # 'r' of previous row
    );

    my $k = (
      $j == $#recs              # last row?
        ? (2 * $_->{'r'} - $i)  # use the same delta as on the last st
+ep.
        : $recs[$j + 1]{'r'}    # 'r' of next row
    );

    ($k - $i) or die "Statistics::SGT::sgt(): divide by zero immanent!
+";
    $_->{'log_Z'} = log( 2 * $_->{'n'} / ($k - $i) );
  }

  my $smoother = new SGT_::Smoother 'log_r', 'log_Z', \@recs;

  my $indiffValsSeen = 0; # false
  for ( @recs ) {
    $smoother->smoothed($_->{'r'})
      or die "Statistics::SGT::sgt(): divide by zero immanent!";

    my $y = ($_->{'r'} + 1) * $smoother->smoothed($_->{'r'} + 1) / $sm
+oother->smoothed($_->{'r'});

    exists $recs{$_->{'r'} + 1} or
      $indiffValsSeen = 1; # true

    unless ( $indiffValsSeen ) {
      my $next_n = $recs{$_->{'r'} + 1}{'n'};
      my $x = ($_->{'r'} + 1) * $next_n / $_->{'n'}; # n=0 elements ha
+ve been removed from input.
      if (
        abs($x - $y)
      <=
        1.96 * sqrt(
            SGT_::Smoother::square($_->{'r'} + 1.0) *      $next_n
          /
            SGT_::Smoother::square($_->{'n'}      ) * (1 + $next_n / $
+_->{'n'})
        )
      ) {
        $indiffValsSeen = 1; # true
      }
      else {
        $_->{'rStar'} = $x;
      }
    }

    $indiffValsSeen and
      $_->{'rStar'} = $y;
  }


  my $bigN = 0; # int
  for ( @recs ) {
    $bigN += $_->{'r'} * $_->{'n'};
  }
  $bigN or die "divide-by-zero immanent! (bigN)";
  my $P_0 = $recs{1}{'n'} / $bigN; # double
  my $bigNprime = 0.0; # double
  for ( @recs ) {
    $bigNprime += $_->{'rStar'} * $_->{'n'};
  }
  $bigNprime or die "divide-by-zero immanent! (bigNprime)";
  for ( @recs ) {
    $_->{'p'} = (1 - $P_0) * $_->{'rStar'} / $bigNprime;
  }

#
# P_0 gets added to the output list, in position [0] and {0}.
# but first we check that there's nothing there already.
#
  exists $recs{0} and die "Already a P.0 element in the working data s
+et!";

  $recs{0} = { 'r' => 0, 'n' => 0, 'p' => $P_0 };
  unshift @recs, $recs{0};

  #
  # Now process the out param and/or return requests.
  #

  #
  # we already have AofH and HofH, suitable for returning.
  #

  # note that the default case is 'ah'... but '  ' also falls into it!
  # thus effectively making 'ah' the default return type.

  if ( $rtn_type eq 'ha' ) {
    return(
      wantarray
        ? ( map { $_ => [ @{$recs{$_}}{qw(r n p)} ] } keys %recs )
        : { map { $_ => [ @{$recs{$_}}{qw(r n p)} ] } keys %recs }
    );
  }
  elsif ( $rtn_type eq 'aa' ) {
    return(
      wantarray
        ? ( map { [ @{$_}{qw(r n p)} ] } @recs )
        : [ map { [ @{$_}{qw(r n p)} ] } @recs ]
    );
  }
  elsif ( $rtn_type eq 'hh' ) {
    return( wantarray ? %recs : \%recs );
  }
  else { # 'ah' and '  ' fall into this case:
    return( wantarray ? @recs : \@recs );
  }
}


=head1 AUTHOR, DATE, COPYRIGHT

Perl code by JDPORTER@cpan.org (John Porter), 2000-01-31.

This Perl module is free software, and may be modified and/or
redistributed under the same terms as Perl itself.

=cut

1;