use Statistics::R; sub do_chi_squared_test_in_R { my $freqs = $_[0]; # arrayref of counts/frequencies for each choice my $R = Statistics::R->new(); # should really re-use R obj but let's not complicated things now. $R->send('library(MASS)'); $R->set('freqs', $freqs); $R->run('res <- chisq.test(as.table(freqs))'); my $chisquare = $R->get('res$statistic'); # single quotes please my $pvalue = $R->get('res$p.value'); $R->stop(); print "do_chi_squared_test_in_R : ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n"; # returns [chi-squared-statistic, p-value] return [$chisquare, $pvalue] } #### my $pvalue = -1; foreach (@{$Statistics::ChiSquare::chitable[$degrees_of_freedom]}) { if ($chisquare < $_) { $pvalue = $Statistics::ChiSquare::chilevels[$degrees_of_freedom]->[$i+1]; last } $i++; } $pvalue = (@{$Statistics::ChiSquare::chilevels[$degrees_of_freedom]})[-1] unless $pvalue > -1; $pvalue /= 100; # 0-1 e.g. 5% -> 0.05 #### ./browser_uk_test_script.pl : number of cases with bias detected ./browser_uk_test_script.pl : standard-rand (using R) -> number of bias detected: 1 / 20 (pvalues=,0.1063036,0.4628186,0.01704211,0.45147,0.6672964,0.1163614,0.4996122,0.06030822,0.764618,0.08935117,0.1812506,0.1615832,0.3479303,0.9551496,0.1814904,0.1601532,0.3921217,0.9140581,0.621188,0.6104723) ./browser_uk_test_script.pl : standard-rand (using Perl) -> number of bias detected: 1 / 20 (pvalues=,0.1,0.25,0.01,0.25,0.5,0.1,0.25,0.05,0.75,0.05,0.1,0.1,0.25,0.95,0.1,0.1,0.25,0.9,0.5,0.5) -------------------- ./browser_uk_test_script.pl : MT-rand (using R) -> number of bias detected: 2 / 20 (pvalues=,0.8734286,0.4596782,0.0376436,0.483486,0.3135795,0.4501071,0.3534776,0.6427224,0.1530366,0.1850991,0.7265003,0.6379416,0.9933123,0.01561197,0.7942019,0.2008258,0.8592597,0.2874155,0.211755,0.6454129) ./browser_uk_test_script.pl : MT-rand (using Perl) -> number of bias detected: 2 / 20 (pvalues=,0.75,0.25,0.01,0.25,0.25,0.25,0.25,0.5,0.1,0.1,0.5,0.5,0.99,0.01,0.75,0.1,0.75,0.25,0.1,0.5) -------------------- ./browser_uk_test_script.pl : bad-rand (using R) -> number of bias detected: 16 / 20 (pvalues=,0.00362283,6.262224e-05,4.627376e-06,1.718484e-06,0.07608554,0.002237529,0.01988714,0.01380236,0.07770095,0.04196283,0.01394789,6.257283e-05,0.608835,0.00027414,0.09803577,0.01257798,0.005433616,0.02350288,0.00299678,0.004870376) ./browser_uk_test_script.pl : bad-rand (using Perl) -> number of bias detected: 16 / 20 (pvalues=,0.01,0.01,0.01,0.01,0.05,0.01,0.01,0.01,0.05,0.01,0.01,0.01,0.5,0.01,0.05,0.01,0.01,0.01,0.01,0.01) -------------------- #### ./browser_uk_test_script.pl : number of cases with bias detected ./browser_uk_test_script.pl : standard-rand (using R) -> number of bias detected: 0 / 20 (pvalues=,0.5337195,0.5921474,0.1788502,0.7560645,0.1524058,0.9634717,0.3517389,0.1133675,0.8631978,0.1689414,0.1829178,0.9446986,0.7018559,0.3831006,0.422912,0.2526592,0.6182125,0.7644635,0.4913882,0.1258621) ./browser_uk_test_script.pl : standard-rand (using Perl) -> number of bias detected: 0 / 20 (pvalues=,0.5,0.5,0.1,0.75,0.1,0.95,0.25,0.1,0.75,0.1,0.1,0.9,0.5,0.25,0.25,0.25,0.5,0.75,0.25,0.1) -------------------- ./browser_uk_test_script.pl : bad-rand (using R) -> number of bias detected: 12 / 20 (pvalues=,0.06574112,0.00283819,6.942476e-06,5.380104e-05,1.697578e-05,0.001277564,0.01886317,0.0003957457,8.826589e-06,0.07862541,0.0001284624,0.2741576,0.1136995,0.1550829,0.007600639,0.00122368,0.0001251429,0.1258741,0.234761,0.1011256) ./browser_uk_test_script.pl : bad-rand (using Perl) -> number of bias detected: 12 / 20 (pvalues=,0.05,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.05,0.01,0.25,0.1,0.1,0.01,0.01,0.01,0.1,0.1,0.1) -------------------- ./browser_uk_test_script.pl : MT-rand (using R) -> number of bias detected: 1 / 20 (pvalues=,0.5276704,0.06869204,0.3610061,0.9570774,0.0468298,0.514135,0.3893473,0.9621195,0.8655001,0.4279424,0.2365762,0.6020179,0.2011017,0.7721139,0.310637,0.3746697,0.2099162,0.1343461,0.7025518,0.2929772) ./browser_uk_test_script.pl : MT-rand (using Perl) -> number of bias detected: 1 / 20 (pvalues=,0.5,0.05,0.25,0.95,0.01,0.5,0.25,0.95,0.75,0.25,0.1,0.5,0.1,0.75,0.25,0.25,0.1,0.1,0.5,0.25) -------------------- #### ./browser_uk_test_script.pl : number of cases with bias detected over 20 tests ./browser_uk_test_script.pl : MT-rand (using R) -> number of bias detected: 0 / 20 (pvalues=,0.4869334,0.4961692,0.8251134,0.2657584,0.84692,0.1296479,0.504212,0.9028209,0.8082847,0.2999607,0.154672,0.1660518,0.5143663,0.8120685,0.4452244,0.6561128,0.6123136,0.6994308,0.9302561,0.4757345) ./browser_uk_test_script.pl : MT-rand (using Perl) -> number of bias detected: 0 / 20 (pvalues=,0.25,0.25,0.75,0.25,0.75,0.1,0.5,0.9,0.75,0.25,0.1,0.1,0.5,0.75,0.25,0.5,0.5,0.5,0.9,0.25) -------------------- ./browser_uk_test_script.pl : standard-rand (using R) -> number of bias detected: 2 / 20 (pvalues=,0.3855934,0.4056266,0.9905815,0.9134483,0.007583821,0.374439,0.3795054,0.4661071,0.08190948,0.617658,0.9640541,0.9402085,0.883329,0.517288,0.04139933,0.3008889,0.4068793,0.963351,0.8351011,0.7187029) ./browser_uk_test_script.pl : standard-rand (using Perl) -> number of bias detected: 2 / 20 (pvalues=,0.25,0.25,0.99,0.9,0.01,0.25,0.25,0.25,0.05,0.5,0.95,0.9,0.75,0.5,0.01,0.25,0.25,0.95,0.75,0.5) -------------------- ./browser_uk_test_script.pl : bad-rand (using R) -> number of bias detected: 12 / 20 (pvalues=,0.0001945661,0.5148868,4.572803e-05,0.08857331,0.0002981317,0.1119708,0.09084117,0.005959205,0.2835337,5.089226e-08,0.0009614042,0.2044863,0.004434155,0.002847202,0.01542375,0.0105589,0.002659498,0.1273001,0.07932075,4.602725e-05) ./browser_uk_test_script.pl : bad-rand (using Perl) -> number of bias detected: 12 / 20 (pvalues=,0.01,0.5,0.01,0.05,0.01,0.1,0.05,0.01,0.25,0.01,0.01,0.1,0.01,0.01,0.01,0.01,0.01,0.1,0.05,0.01) -------------------- #### do_chi_squared_test_in_R() : counts: 4317,4188,4099,4231,4258,4195,4076,4330,4135,4143,4152,4191,3995,4176,4113,4088,4171,4132,4115,4154,4165,4140,4246,4190 : x2=30.8192, pv=0.1273001 do_chi_squared_test_in_perl() : counts: 4317,4188,4099,4231,4258,4195,4076,4330,4135,4143,4152,4191,3995,4176,4113,4088,4171,4132,4115,4154,4165,4140,4246,4190 : x2=30.8192, pv=0.1 do_chi_squared_test_in_R() : counts: 4032,4106,4154,4136,4234,4065,4166,4039,4279,4210,4031,4171,4279,4291,4132,4190,4281,4129,4179,4128,4188,4172,4175,4233 : x2=33.10112, pv=0.07932075 do_chi_squared_test_in_perl() : counts: 4032,4106,4154,4136,4234,4065,4166,4039,4279,4210,4031,4171,4279,4291,4132,4190,4281,4129,4179,4128,4188,4172,4175,4233 : x2=33.10112, pv=0.05 #### #!/usr/bin/env perl use strict; use Math::Random::MT; use Statistics::R; use Statistics::ChiSquare; use Data::Dumper; my $mt = Math::Random::MT->new(); our $N //= 1e5; # number of shuffles to make, after that a chi-squared test is performed our $ASIZE //= 4; # nunber of items to shuffle our $T //= 20; # number of chi-squared tests (implies a new shuffle over $N each time) our $PVALUE_THRESHOLD //= 0.05; # we accept bias if p-value of test is less than this threshold # for large ASIZE perl's Statistics::ChiSquared spits out 'I can't handle 24 choices without a better table.' # (note 4! = 24) # additionally, in order to make a decision whether a set of counts/frequencies # of the combinations the shuffle produced we need not only the chi-square statistic # (the larger the more bias) but also we need the p-value (confidence interval within which # this statistic holds). Perl's does not produce a p-value # so skip chi-squared calculation in perl all together. use constant DO_R_CHI_SQUARE => 1; use constant DO_PERL_CHI_SQUARE => 1; my %tests_to_do = ( 'bad-rand' => { 'rand-sub' => sub { rand() < 0.5 ? 0 : rand( $_[0] ) }, 'chi-squared-results-R' => [], 'chi-squared-results-perl' => [], }, 'standard-rand' => { 'rand-sub' => sub { rand($_[0]) }, 'chi-squared-results-R' => [], 'chi-squared-results-perl' => [], }, 'MT-rand' => { 'rand-sub' => sub { $mt->rand($_[0]) }, 'shuffle-counts' => [], 'chi-squared-results-R' => [], 'chi-squared-results-perl' => [], } ); sub shuffle { my $randsub = shift; $a = $_ + $randsub->( @_ - $_ ), $b = $_[$_], $_[$_] = $_[$a], $_[$a] = $b for 0 .. $#_; return @_; } my @data = ( 1 .. $ASIZE ); for my $a_test_name (keys %tests_to_do){ my $atest = $tests_to_do{$a_test_name}; my $arandsub = $atest->{'rand-sub'}; for( 1 .. $T ) { my %results; ++$results{ join '', shuffle($arandsub, @data ) } for 1 .. $N; my $freqs = [values %results]; push(@{$atest->{'chi-squared-results-R'}}, do_chi_squared_test_in_R($freqs) ) if DO_R_CHI_SQUARE; push(@{$atest->{'chi-squared-results-perl'}}, do_chi_squared_test_in_perl($freqs) ) if DO_PERL_CHI_SQUARE; } } print "\n$0 : number of cases with bias detected\n"; for my $a_test_name (keys %tests_to_do){ if( DO_R_CHI_SQUARE ){ my $pvalue_less_than_05 = 0; my $pvalues_string = ""; foreach my $R_results (@{$tests_to_do{$a_test_name}->{'chi-squared-results-R'}}){ $pvalue_less_than_05++ if( $R_results->[1] < $PVALUE_THRESHOLD ); $pvalues_string .= ",".$R_results->[1]; } print "$0 : $a_test_name (using R) -> number of bias detected: $pvalue_less_than_05 : $pvalues_string\n"; } if( DO_PERL_CHI_SQUARE ){ my $pvalue_less_than_05 = 0; my $pvalues_string = ""; foreach my $R_results (@{$tests_to_do{$a_test_name}->{'chi-squared-results-perl'}}){ $pvalue_less_than_05++ if( $R_results->[1] < $PVALUE_THRESHOLD ); $pvalues_string .= ",".$R_results->[1]; } print "$0 : $a_test_name (using Perl) -> number of bias detected: $pvalue_less_than_05 : $pvalues_string\n"; } print "--------------------\n"; } #print Dumper(\%tests_to_do); exit(0); sub do_chi_squared_test_in_R { my $freqs = $_[0]; # arrayref of counts/frequencies for each choice my $R = Statistics::R->new(); # should really re-use R obj but let's not complicated things now. $R->send('library(MASS)'); $R->set('freqs', $freqs); $R->run('res <- chisq.test(as.table(freqs))'); my $chisquare = $R->get('res$statistic'); my $pvalue = $R->get('res$p.value'); $R->stop(); print "do_chi_squared_test_in_R() : counts: ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n"; # returns [chi-squared-statistic, p-value] return [$chisquare, $pvalue] } sub do_chi_squared_test_in_perl { my $freqs = $_[0]; # arrayref of counts/frequencies for each choice my ($chisquare, $pvalue) = @{chisquaredVal(@$freqs)}; print "do_chi_squared_test_in_perl() : counts: ".join(",",@$freqs)." : x2=$chisquare, pv=$pvalue\n"; # returns [chi-squared-statistic, p-value] (but p-value is not calc here return [$chisquare, $pvalue] } sub chisquaredVal { # BrowserUK modified Statistics::ChiSquared # NOTE: for this to work change my to our in Statistics/ChiSquare.pm or subclass my @data = @_; @data = @{$data[0]} if(@data == 1 and ref($data[0])); return "There's no data!" if(!@data); return "Not enough data!" if(@data == 1); # return "Malformed data!" if(grep { /\D/ } @data); my $degrees_of_freedom = scalar(@data) - 1; my ($chisquare, $num_samples, $expected, $i) = (0, 0, 0, 0); if (! ref($Statistics::ChiSquare::chitable[$degrees_of_freedom])) { print STDERR "chisquaredVal() : I can't handle ".scalar(@data)." choices without a better table.\n"; return [-1,1]; # an error obviously } foreach (@data) { $num_samples += $_ } $expected = $num_samples / scalar(@data); # return "There's no data!" unless $expected; foreach (@data) { $chisquare += (($_ - $expected) ** 2) / $expected; } # bliako modification: estimate a pvalue my $pvalue = -1; foreach (@{$Statistics::ChiSquare::chitable[$degrees_of_freedom]}) { if ($chisquare < $_) { $pvalue = $Statistics::ChiSquare::chilevels[$degrees_of_freedom]->[$i]; last } $i++; } $pvalue = (@{$Statistics::ChiSquare::chilevels[$degrees_of_freedom]})[-1] unless $pvalue > -1; $pvalue /= 100; # 0-1 e.g. 5% -> 0.05 return [$chisquare, $pvalue] }