use Statistics::R;
sub do_chi_squared_test_in_R {
my $freqs = $_[0]; # arrayref of counts/frequencies for each choice
my $R = Statistics::R->new(); # should really re-use R obj but let's not complicated things now.
$R->send('library(MASS)');
$R->set('freqs', $freqs);
$R->run('res <- chisq.test(as.table(freqs))');
my $chisquare = $R->get('res$statistic'); # single quotes please
my $pvalue = $R->get('res$p.value');
$R->stop();
print "do_chi_squared_test_in_R : ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n";
# returns [chi-squared-statistic, p-value]
return [$chisquare, $pvalue]
}
####
my $pvalue = -1;
foreach (@{$Statistics::ChiSquare::chitable[$degrees_of_freedom]}) {
if ($chisquare < $_) {
$pvalue = $Statistics::ChiSquare::chilevels[$degrees_of_freedom]->[$i+1];
last
}
$i++;
}
$pvalue = (@{$Statistics::ChiSquare::chilevels[$degrees_of_freedom]})[-1] unless $pvalue > -1;
$pvalue /= 100; # 0-1 e.g. 5% -> 0.05
####
./browser_uk_test_script.pl : number of cases with bias detected
./browser_uk_test_script.pl : standard-rand (using R) -> number of bias detected: 1 / 20
(pvalues=,0.1063036,0.4628186,0.01704211,0.45147,0.6672964,0.1163614,0.4996122,0.06030822,0.764618,0.08935117,0.1812506,0.1615832,0.3479303,0.9551496,0.1814904,0.1601532,0.3921217,0.9140581,0.621188,0.6104723)
./browser_uk_test_script.pl : standard-rand (using Perl) -> number of bias detected: 1 / 20
(pvalues=,0.1,0.25,0.01,0.25,0.5,0.1,0.25,0.05,0.75,0.05,0.1,0.1,0.25,0.95,0.1,0.1,0.25,0.9,0.5,0.5)
--------------------
./browser_uk_test_script.pl : MT-rand (using R) -> number of bias detected: 2 / 20
(pvalues=,0.8734286,0.4596782,0.0376436,0.483486,0.3135795,0.4501071,0.3534776,0.6427224,0.1530366,0.1850991,0.7265003,0.6379416,0.9933123,0.01561197,0.7942019,0.2008258,0.8592597,0.2874155,0.211755,0.6454129)
./browser_uk_test_script.pl : MT-rand (using Perl) -> number of bias detected: 2 / 20
(pvalues=,0.75,0.25,0.01,0.25,0.25,0.25,0.25,0.5,0.1,0.1,0.5,0.5,0.99,0.01,0.75,0.1,0.75,0.25,0.1,0.5)
--------------------
./browser_uk_test_script.pl : bad-rand (using R) -> number of bias detected: 16 / 20
(pvalues=,0.00362283,6.262224e-05,4.627376e-06,1.718484e-06,0.07608554,0.002237529,0.01988714,0.01380236,0.07770095,0.04196283,0.01394789,6.257283e-05,0.608835,0.00027414,0.09803577,0.01257798,0.005433616,0.02350288,0.00299678,0.004870376)
./browser_uk_test_script.pl : bad-rand (using Perl) -> number of bias detected: 16 / 20
(pvalues=,0.01,0.01,0.01,0.01,0.05,0.01,0.01,0.01,0.05,0.01,0.01,0.01,0.5,0.01,0.05,0.01,0.01,0.01,0.01,0.01)
--------------------
####
./browser_uk_test_script.pl : number of cases with bias detected
./browser_uk_test_script.pl : standard-rand (using R) -> number of bias detected: 0 / 20
(pvalues=,0.5337195,0.5921474,0.1788502,0.7560645,0.1524058,0.9634717,0.3517389,0.1133675,0.8631978,0.1689414,0.1829178,0.9446986,0.7018559,0.3831006,0.422912,0.2526592,0.6182125,0.7644635,0.4913882,0.1258621)
./browser_uk_test_script.pl : standard-rand (using Perl) -> number of bias detected: 0 / 20
(pvalues=,0.5,0.5,0.1,0.75,0.1,0.95,0.25,0.1,0.75,0.1,0.1,0.9,0.5,0.25,0.25,0.25,0.5,0.75,0.25,0.1)
--------------------
./browser_uk_test_script.pl : bad-rand (using R) -> number of bias detected: 12 / 20
(pvalues=,0.06574112,0.00283819,6.942476e-06,5.380104e-05,1.697578e-05,0.001277564,0.01886317,0.0003957457,8.826589e-06,0.07862541,0.0001284624,0.2741576,0.1136995,0.1550829,0.007600639,0.00122368,0.0001251429,0.1258741,0.234761,0.1011256)
./browser_uk_test_script.pl : bad-rand (using Perl) -> number of bias detected: 12 / 20
(pvalues=,0.05,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.05,0.01,0.25,0.1,0.1,0.01,0.01,0.01,0.1,0.1,0.1)
--------------------
./browser_uk_test_script.pl : MT-rand (using R) -> number of bias detected: 1 / 20
(pvalues=,0.5276704,0.06869204,0.3610061,0.9570774,0.0468298,0.514135,0.3893473,0.9621195,0.8655001,0.4279424,0.2365762,0.6020179,0.2011017,0.7721139,0.310637,0.3746697,0.2099162,0.1343461,0.7025518,0.2929772)
./browser_uk_test_script.pl : MT-rand (using Perl) -> number of bias detected: 1 / 20
(pvalues=,0.5,0.05,0.25,0.95,0.01,0.5,0.25,0.95,0.75,0.25,0.1,0.5,0.1,0.75,0.25,0.25,0.1,0.1,0.5,0.25)
--------------------
####
./browser_uk_test_script.pl : number of cases with bias detected over 20 tests
./browser_uk_test_script.pl : MT-rand (using R) -> number of bias detected: 0 / 20
(pvalues=,0.4869334,0.4961692,0.8251134,0.2657584,0.84692,0.1296479,0.504212,0.9028209,0.8082847,0.2999607,0.154672,0.1660518,0.5143663,0.8120685,0.4452244,0.6561128,0.6123136,0.6994308,0.9302561,0.4757345)
./browser_uk_test_script.pl : MT-rand (using Perl) -> number of bias detected: 0 / 20
(pvalues=,0.25,0.25,0.75,0.25,0.75,0.1,0.5,0.9,0.75,0.25,0.1,0.1,0.5,0.75,0.25,0.5,0.5,0.5,0.9,0.25)
--------------------
./browser_uk_test_script.pl : standard-rand (using R) -> number of bias detected: 2 / 20
(pvalues=,0.3855934,0.4056266,0.9905815,0.9134483,0.007583821,0.374439,0.3795054,0.4661071,0.08190948,0.617658,0.9640541,0.9402085,0.883329,0.517288,0.04139933,0.3008889,0.4068793,0.963351,0.8351011,0.7187029)
./browser_uk_test_script.pl : standard-rand (using Perl) -> number of bias detected: 2 / 20
(pvalues=,0.25,0.25,0.99,0.9,0.01,0.25,0.25,0.25,0.05,0.5,0.95,0.9,0.75,0.5,0.01,0.25,0.25,0.95,0.75,0.5)
--------------------
./browser_uk_test_script.pl : bad-rand (using R) -> number of bias detected: 12 / 20
(pvalues=,0.0001945661,0.5148868,4.572803e-05,0.08857331,0.0002981317,0.1119708,0.09084117,0.005959205,0.2835337,5.089226e-08,0.0009614042,0.2044863,0.004434155,0.002847202,0.01542375,0.0105589,0.002659498,0.1273001,0.07932075,4.602725e-05)
./browser_uk_test_script.pl : bad-rand (using Perl) -> number of bias detected: 12 / 20
(pvalues=,0.01,0.5,0.01,0.05,0.01,0.1,0.05,0.01,0.25,0.01,0.01,0.1,0.01,0.01,0.01,0.01,0.01,0.1,0.05,0.01)
--------------------
####
do_chi_squared_test_in_R() : counts: 4317,4188,4099,4231,4258,4195,4076,4330,4135,4143,4152,4191,3995,4176,4113,4088,4171,4132,4115,4154,4165,4140,4246,4190 : x2=30.8192, pv=0.1273001
do_chi_squared_test_in_perl() : counts: 4317,4188,4099,4231,4258,4195,4076,4330,4135,4143,4152,4191,3995,4176,4113,4088,4171,4132,4115,4154,4165,4140,4246,4190 : x2=30.8192, pv=0.1
do_chi_squared_test_in_R() : counts: 4032,4106,4154,4136,4234,4065,4166,4039,4279,4210,4031,4171,4279,4291,4132,4190,4281,4129,4179,4128,4188,4172,4175,4233 : x2=33.10112, pv=0.07932075
do_chi_squared_test_in_perl() : counts: 4032,4106,4154,4136,4234,4065,4166,4039,4279,4210,4031,4171,4279,4291,4132,4190,4281,4129,4179,4128,4188,4172,4175,4233 : x2=33.10112, pv=0.05
####
#!/usr/bin/env perl
use strict;
use Math::Random::MT;
use Statistics::R;
use Statistics::ChiSquare;
use Data::Dumper;
my $mt = Math::Random::MT->new();
our $N //= 1e5; # number of shuffles to make, after that a chi-squared test is performed
our $ASIZE //= 4; # nunber of items to shuffle
our $T //= 20; # number of chi-squared tests (implies a new shuffle over $N each time)
our $PVALUE_THRESHOLD //= 0.05; # we accept bias if p-value of test is less than this threshold
# for large ASIZE perl's Statistics::ChiSquared spits out 'I can't handle 24 choices without a better table.'
# (note 4! = 24)
# additionally, in order to make a decision whether a set of counts/frequencies
# of the combinations the shuffle produced we need not only the chi-square statistic
# (the larger the more bias) but also we need the p-value (confidence interval within which
# this statistic holds). Perl's does not produce a p-value
# so skip chi-squared calculation in perl all together.
use constant DO_R_CHI_SQUARE => 1;
use constant DO_PERL_CHI_SQUARE => 1;
my %tests_to_do = (
'bad-rand' => {
'rand-sub' => sub { rand() < 0.5 ? 0 : rand( $_[0] ) },
'chi-squared-results-R' => [],
'chi-squared-results-perl' => [],
},
'standard-rand' => {
'rand-sub' => sub { rand($_[0]) },
'chi-squared-results-R' => [],
'chi-squared-results-perl' => [],
},
'MT-rand' => {
'rand-sub' => sub { $mt->rand($_[0]) },
'shuffle-counts' => [],
'chi-squared-results-R' => [],
'chi-squared-results-perl' => [],
}
);
sub shuffle {
my $randsub = shift;
$a = $_ + $randsub->( @_ - $_ ),
$b = $_[$_],
$_[$_] = $_[$a],
$_[$a] = $b
for 0 .. $#_;
return @_;
}
my @data = ( 1 .. $ASIZE );
for my $a_test_name (keys %tests_to_do){
my $atest = $tests_to_do{$a_test_name};
my $arandsub = $atest->{'rand-sub'};
for( 1 .. $T ) {
my %results;
++$results{ join '', shuffle($arandsub, @data ) } for 1 .. $N;
my $freqs = [values %results];
push(@{$atest->{'chi-squared-results-R'}},
do_chi_squared_test_in_R($freqs)
) if DO_R_CHI_SQUARE;
push(@{$atest->{'chi-squared-results-perl'}},
do_chi_squared_test_in_perl($freqs)
) if DO_PERL_CHI_SQUARE;
}
}
print "\n$0 : number of cases with bias detected\n";
for my $a_test_name (keys %tests_to_do){
if( DO_R_CHI_SQUARE ){
my $pvalue_less_than_05 = 0;
my $pvalues_string = "";
foreach my $R_results (@{$tests_to_do{$a_test_name}->{'chi-squared-results-R'}}){
$pvalue_less_than_05++ if( $R_results->[1] < $PVALUE_THRESHOLD );
$pvalues_string .= ",".$R_results->[1];
}
print "$0 : $a_test_name (using R) -> number of bias detected: $pvalue_less_than_05 : $pvalues_string\n";
}
if( DO_PERL_CHI_SQUARE ){
my $pvalue_less_than_05 = 0;
my $pvalues_string = "";
foreach my $R_results (@{$tests_to_do{$a_test_name}->{'chi-squared-results-perl'}}){
$pvalue_less_than_05++ if( $R_results->[1] < $PVALUE_THRESHOLD );
$pvalues_string .= ",".$R_results->[1];
}
print "$0 : $a_test_name (using Perl) -> number of bias detected: $pvalue_less_than_05 : $pvalues_string\n";
}
print "--------------------\n";
}
#print Dumper(\%tests_to_do);
exit(0);
sub do_chi_squared_test_in_R {
my $freqs = $_[0]; # arrayref of counts/frequencies for each choice
my $R = Statistics::R->new(); # should really re-use R obj but let's not complicated things now.
$R->send('library(MASS)');
$R->set('freqs', $freqs);
$R->run('res <- chisq.test(as.table(freqs))');
my $chisquare = $R->get('res$statistic');
my $pvalue = $R->get('res$p.value');
$R->stop();
print "do_chi_squared_test_in_R() : counts: ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n";
# returns [chi-squared-statistic, p-value]
return [$chisquare, $pvalue]
}
sub do_chi_squared_test_in_perl {
my $freqs = $_[0]; # arrayref of counts/frequencies for each choice
my ($chisquare, $pvalue) = @{chisquaredVal(@$freqs)};
print "do_chi_squared_test_in_perl() : counts: ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n";
# returns [chi-squared-statistic, p-value] (but p-value is not calc here
return [$chisquare, $pvalue]
}
sub chisquaredVal {
# BrowserUK modified Statistics::ChiSquared
# NOTE: for this to work change my to our in Statistics/ChiSquare.pm or subclass
my @data = @_;
@data = @{$data[0]} if(@data == 1 and ref($data[0]));
return "There's no data!" if(!@data);
return "Not enough data!" if(@data == 1);
# return "Malformed data!" if(grep { /\D/ } @data);
my $degrees_of_freedom = scalar(@data) - 1;
my ($chisquare, $num_samples, $expected, $i) = (0, 0, 0, 0);
if (! ref($Statistics::ChiSquare::chitable[$degrees_of_freedom])) {
print STDERR "chisquaredVal() : I can't handle ".scalar(@data)." choices without a better table.\n";
return [-1,1]; # an error obviously
}
foreach (@data) { $num_samples += $_ }
$expected = $num_samples / scalar(@data);
# return "There's no data!" unless $expected;
foreach (@data) {
$chisquare += (($_ - $expected) ** 2) / $expected;
}
# bliako modification: estimate a pvalue
my $pvalue = -1;
foreach (@{$Statistics::ChiSquare::chitable[$degrees_of_freedom]}) {
if ($chisquare < $_) {
$pvalue = $Statistics::ChiSquare::chilevels[$degrees_of_freedom]->[$i];
last
}
$i++;
}
$pvalue = (@{$Statistics::ChiSquare::chilevels[$degrees_of_freedom]})[-1] unless $pvalue > -1;
$pvalue /= 100; # 0-1 e.g. 5% -> 0.05
return [$chisquare, $pvalue]
}