#!/usr/bin/env perl use strict; use Math::Random::MT; use Statistics::R; use Statistics::ChiSquare; use Data::Dumper; my $mt = Math::Random::MT->new(); our $N //= 1e5; # number of shuffles to make, after that a chi-squared test is performed our $ASIZE //= 4; # nunber of items to shuffle our $T //= 20; # number of chi-squared tests (implies a new shuffle over $N each time) our $PVALUE_THRESHOLD //= 0.05; # we accept bias if p-value of test is less than this threshold # for large ASIZE perl's Statistics::ChiSquared spits out 'I can't handle 24 choices without a better table.' # (note 4! = 24) # additionally, in order to make a decision whether a set of counts/frequencies # of the combinations the shuffle produced we need not only the chi-square statistic # (the larger the more bias) but also we need the p-value (confidence interval within which # this statistic holds). Perl's does not produce a p-value # so skip chi-squared calculation in perl all together. use constant DO_R_CHI_SQUARE => 1; use constant DO_PERL_CHI_SQUARE => 1; my %tests_to_do = ( 'bad-rand' => { 'rand-sub' => sub { rand() < 0.5 ? 0 : rand( $_[0] ) }, 'chi-squared-results-R' => [], 'chi-squared-results-perl' => [], }, 'standard-rand' => { 'rand-sub' => sub { rand($_[0]) }, 'chi-squared-results-R' => [], 'chi-squared-results-perl' => [], }, 'MT-rand' => { 'rand-sub' => sub { $mt->rand($_[0]) }, 'shuffle-counts' => [], 'chi-squared-results-R' => [], 'chi-squared-results-perl' => [], } ); sub shuffle { my $randsub = shift; $a = $_ + $randsub->( @_ - $_ ), $b = $_[$_], $_[$_] = $_[$a], $_[$a] = $b for 0 .. $#_; return @_; } my @data = ( 1 .. $ASIZE ); for my $a_test_name (keys %tests_to_do){ my $atest = $tests_to_do{$a_test_name}; my $arandsub = $atest->{'rand-sub'}; for( 1 .. $T ) { my %results; ++$results{ join '', shuffle($arandsub, @data ) } for 1 .. $N; my $freqs = [values %results]; push(@{$atest->{'chi-squared-results-R'}}, do_chi_squared_test_in_R($freqs) ) if DO_R_CHI_SQUARE; push(@{$atest->{'chi-squared-results-perl'}}, do_chi_squared_test_in_perl($freqs) ) if DO_PERL_CHI_SQUARE; } } print "\n$0 : number of cases with bias detected\n"; for my $a_test_name (keys %tests_to_do){ if( DO_R_CHI_SQUARE ){ my $pvalue_less_than_05 = 0; my $pvalues_string = ""; foreach my $R_results (@{$tests_to_do{$a_test_name}->{'chi-squared-results-R'}}){ $pvalue_less_than_05++ if( $R_results->[1] < $PVALUE_THRESHOLD ); $pvalues_string .= ",".$R_results->[1]; } print "$0 : $a_test_name (using R) -> number of bias detected: $pvalue_less_than_05 : $pvalues_string\n"; } if( DO_PERL_CHI_SQUARE ){ my $pvalue_less_than_05 = 0; my $pvalues_string = ""; foreach my $R_results (@{$tests_to_do{$a_test_name}->{'chi-squared-results-perl'}}){ $pvalue_less_than_05++ if( $R_results->[1] < $PVALUE_THRESHOLD ); $pvalues_string .= ",".$R_results->[1]; } print "$0 : $a_test_name (using Perl) -> number of bias detected: $pvalue_less_than_05 : $pvalues_string\n"; } print "--------------------\n"; } #print Dumper(\%tests_to_do); exit(0); sub do_chi_squared_test_in_R { my $freqs = $_[0]; # arrayref of counts/frequencies for each choice my $R = Statistics::R->new(); # should really re-use R obj but let's not complicated things now. $R->send('library(MASS)'); $R->set('freqs', $freqs); $R->run('res <- chisq.test(as.table(freqs))'); my $chisquare = $R->get('res$statistic'); my $pvalue = $R->get('res$p.value'); $R->stop(); print "do_chi_squared_test_in_R() : counts: ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n"; # returns [chi-squared-statistic, p-value] return [$chisquare, $pvalue] } sub do_chi_squared_test_in_perl { my $freqs = $_[0]; # arrayref of counts/frequencies for each choice my ($chisquare, $pvalue) = @{chisquaredVal(@$freqs)}; print "do_chi_squared_test_in_perl() : counts: ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n"; # returns [chi-squared-statistic, p-value] (but p-value is not calc here return [$chisquare, $pvalue] } sub chisquaredVal { # BrowserUK modified Statistics::ChiSquared # NOTE: for this to work change my to our in Statistics/ChiSquare.pm or subclass my @data = @_; @data = @{$data[0]} if(@data == 1 and ref($data[0])); return "There's no data!" if(!@data); return "Not enough data!" if(@data == 1); # return "Malformed data!" if(grep { /\D/ } @data); my $degrees_of_freedom = scalar(@data) - 1; my ($chisquare, $num_samples, $expected, $i) = (0, 0, 0, 0); if (! ref($Statistics::ChiSquare::chitable[$degrees_of_freedom])) { print STDERR "chisquaredVal() : I can't handle ".scalar(@data)." choices without a better table.\n"; return [-1,1]; # an error obviously } foreach (@data) { $num_samples += $_ } $expected = $num_samples / scalar(@data); # return "There's no data!" unless $expected; foreach (@data) { $chisquare += (($_ - $expected) ** 2) / $expected; } # bliako modification: estimate a pvalue my $pvalue = -1; foreach (@{$Statistics::ChiSquare::chitable[$degrees_of_freedom]}) { if ($chisquare < $_) { $pvalue = $Statistics::ChiSquare::chilevels[$degrees_of_freedom]->[$i]; last } $i++; } $pvalue = (@{$Statistics::ChiSquare::chilevels[$degrees_of_freedom]})[-1] unless $pvalue > -1; $pvalue /= 100; # 0-1 e.g. 5% -> 0.05 return [$chisquare, $pvalue] }