#!/usr/bin/env perl

use strict;
use Math::Random::MT;
use Statistics::R;
use Statistics::ChiSquare;
use Data::Dumper;

my $mt = Math::Random::MT->new();

our $N          //= 1e5; # number of shuffles to make, after that a chi-squared test is performed
our $ASIZE      //= 4; # nunber of items to shuffle
our $T          //= 20; # number of chi-squared tests (implies a new shuffle over $N each time)
our $PVALUE_THRESHOLD //= 0.05; # we accept bias if p-value of test is less than this threshold

# for large ASIZE perl's Statistics::ChiSquared spits out 'I can't handle 24 choices without a better table.'
# (note 4! = 24)
# additionally, in order to make a decision whether a set of counts/frequencies
# of the combinations the shuffle produced we need not only the chi-square statistic
# (the larger the more bias) but also we need the p-value (confidence interval within which
# this statistic holds). Perl's does not produce a p-value
# so skip chi-squared calculation in perl all together.
use constant DO_R_CHI_SQUARE => 1; 
use constant DO_PERL_CHI_SQUARE => 1;

my %tests_to_do = (
        'bad-rand' => {
                'rand-sub' => sub { rand() < 0.5 ? 0 : rand( $_[0] ) },
                'chi-squared-results-R' => [],
                'chi-squared-results-perl' => [],
        },
        'standard-rand' => {
                'rand-sub' => sub { rand($_[0]) },
                'chi-squared-results-R' => [],
                'chi-squared-results-perl' => [],
        },
        'MT-rand' => {
                'rand-sub' => sub { $mt->rand($_[0]) },
                'shuffle-counts' => [],
                'chi-squared-results-R' => [],
                'chi-squared-results-perl' => [],
        }
);

sub     shuffle {
        my $randsub = shift;
        $a = $_ + $randsub->( @_ - $_ ),
        $b = $_[$_],
        $_[$_] = $_[$a],
        $_[$a] = $b
                for 0 .. $#_;
        return @_;
}

my @data = ( 1 .. $ASIZE );
for my $a_test_name (keys %tests_to_do){
        my $atest = $tests_to_do{$a_test_name};
        my $arandsub = $atest->{'rand-sub'};
        for( 1 .. $T ) {
                my %results;
                ++$results{ join '', shuffle($arandsub, @data ) } for 1 .. $N;
                my $freqs = [values %results];
                push(@{$atest->{'chi-squared-results-R'}},
                        do_chi_squared_test_in_R($freqs)
                ) if DO_R_CHI_SQUARE;
                push(@{$atest->{'chi-squared-results-perl'}},
                        do_chi_squared_test_in_perl($freqs)
                ) if DO_PERL_CHI_SQUARE;
        }
}
print "\n$0 : number of cases with bias detected\n";
for my $a_test_name (keys %tests_to_do){
        if( DO_R_CHI_SQUARE ){
                my $pvalue_less_than_05 = 0;
                my $pvalues_string = "";
                foreach my $R_results (@{$tests_to_do{$a_test_name}->{'chi-squared-results-R'}}){
                        $pvalue_less_than_05++ if( $R_results->[1] < $PVALUE_THRESHOLD );
                        $pvalues_string .= ",".$R_results->[1];
                }
                print "$0 : $a_test_name (using R) -> number of bias detected: $pvalue_less_than_05 : $pvalues_string\n";
        }
        if( DO_PERL_CHI_SQUARE ){
                my $pvalue_less_than_05 = 0;
                my $pvalues_string = "";
                foreach my $R_results (@{$tests_to_do{$a_test_name}->{'chi-squared-results-perl'}}){
                        $pvalue_less_than_05++ if( $R_results->[1] < $PVALUE_THRESHOLD );
                        $pvalues_string .= ",".$R_results->[1];
                }
                print "$0 : $a_test_name (using Perl) -> number of bias detected: $pvalue_less_than_05 : $pvalues_string\n";
        }
        print "--------------------\n";
}
#print Dumper(\%tests_to_do);
exit(0);

sub     do_chi_squared_test_in_R {
        my $freqs = $_[0]; # arrayref of counts/frequencies for each choice
        my $R = Statistics::R->new(); # should really re-use R obj but let's not complicated things now.
        $R->send('library(MASS)');
        $R->set('freqs', $freqs);
        $R->run('res <- chisq.test(as.table(freqs))');
        my $chisquare = $R->get('res$statistic');
        my $pvalue = $R->get('res$p.value');
        $R->stop();
        print "do_chi_squared_test_in_R() : counts: ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n";
        # returns [chi-squared-statistic, p-value]
        return [$chisquare, $pvalue]
}
sub     do_chi_squared_test_in_perl {
        my $freqs = $_[0]; # arrayref of counts/frequencies for each choice
        my ($chisquare, $pvalue) = @{chisquaredVal(@$freqs)};
        print "do_chi_squared_test_in_perl() : counts: ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n";
        # returns [chi-squared-statistic, p-value] (but p-value is not calc here
        return [$chisquare, $pvalue]
}
sub chisquaredVal {
        # BrowserUK modified Statistics::ChiSquared 
        # NOTE: for this to work change my to our in Statistics/ChiSquare.pm or subclass
        my @data = @_;
        @data = @{$data[0]} if(@data == 1 and ref($data[0]));
        return "There's no data!" if(!@data);
        return "Not enough data!" if(@data == 1);
#       return "Malformed data!" if(grep { /\D/ } @data);

        my $degrees_of_freedom = scalar(@data) - 1;
        my ($chisquare, $num_samples, $expected, $i) = (0, 0, 0, 0);
        if (! ref($Statistics::ChiSquare::chitable[$degrees_of_freedom])) {
                print STDERR "chisquaredVal() : I can't handle ".scalar(@data)." choices without a better table.\n";
                return [-1,1]; # an error obviously
        }
        foreach (@data) { $num_samples += $_ }
        $expected = $num_samples / scalar(@data);
        # return "There's no data!" unless $expected;
        foreach (@data) {
                $chisquare += (($_ - $expected) ** 2) / $expected;
        }

        # bliako modification: estimate a pvalue
        my $pvalue = -1;
        foreach (@{$Statistics::ChiSquare::chitable[$degrees_of_freedom]}) {
                if ($chisquare < $_) {
                    $pvalue = $Statistics::ChiSquare::chilevels[$degrees_of_freedom]->[$i];
                    last
                }
                $i++;
        }
        $pvalue = (@{$Statistics::ChiSquare::chilevels[$degrees_of_freedom]})[-1] unless $pvalue > -1;
        $pvalue /= 100; # 0-1 e.g. 5% -> 0.05
        return [$chisquare, $pvalue]
}