in reply to Fisher's Exact Test
I was really desperate for a perl implemetation of FET, Text::NSP::Measures::2D::Fisher just didn't stand up in any way - the 'basic usage' they give doesn't even work, even once you have fixed their syntax errors and got it to compile!
So that was the end of that trail as far as i was concerned!use Text::NSP::Measures::2D::Fisher2::twotailed; my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10; $twotailed_value = calculateStatistic( n11=>$n11, n1p=>$n1p, np1=>$np1, npp=>$npp); if( ($errorCode = getErrorCode())) { print STDERR $errorCode." - ".getErrorMessage(); } else { print getStatisticName."value for bigram is ".$twotailed_value; }
So obviously i end up looking in perlmonks for something better though through! So, following up from resonator's comments, i made some test code for the posted code :
#! usr/bin/perl use warnings; use strict; use Test; # use a BEGIN block so we print our plan before MyModule is loaded BEGIN { plan tests => 40, todo => [], } use lib "$ENV{HOME}/Desktop/perl/lib"; use Statistics::FET; use Statistics::FishersExactTest; print "***888 - Original Statistics::FET - 888***\n"; ## two tailed print "\n***\ntwotailed a-b-c-d\n***\n"; ok( Statistics::FET::fishers_exact(100 , 345 , 150 , 450, 'true', ), q +r/^0.378973935/ ); ok( Statistics::FET::fishers_exact(150 , 345 , 150 , 500, 'true', ), q +r/^0.006634280/ ); ok( Statistics::FET::fishers_exact(110 , 345 , 250 , 500, 'true', ), q +r/^0.000737131/ ); ok( Statistics::FET::fishers_exact(11 , 34 , 25 , 50, 'true', ), qr/^0 +.410784065/ ); ok( Statistics::FET::fishers_exact(11 , 34 , 25 , 60, 'true', ), qr/^0 +.680962824/ ); print "\n***\ntwotailed (b-a-d-c)\n***\n"; ok( Statistics::FET::fishers_exact(345 , 100 , 450 , 150, 'true', ), q +r/^0.378973935/ ); ok( Statistics::FET::fishers_exact(345 , 150 , 500 , 150, 'true', ), q +r/^0.006634280/ ); ok( Statistics::FET::fishers_exact(345 , 110 , 500 , 250, 'true', ), q +r/^0.000737131/ ); ok( Statistics::FET::fishers_exact(34 , 11 , 50 , 25, 'true', ), qr/^0 +.410784065/ ); ok( Statistics::FET::fishers_exact(34 , 11 , 60 , 25, 'true', ), qr/^0 +.680962824/ ); ## one tailed ## - return 'Right' tail print "\n***\nright tailed (a-b-c-d)\n***\n"; ok( Statistics::FET::fishers_exact(100 , 345 , 150 , 450, ), qr/^0.846 +286197/ ); ok( Statistics::FET::fishers_exact(150 , 345 , 150 , 500, ), qr/^0.003 +674851/ ); ok( Statistics::FET::fishers_exact(110 , 345 , 250 , 500, ), qr/^0.999 +730774/ ); ok( Statistics::FET::fishers_exact(11 , 34 , 25 , 50, ), qr/^0.8923225 +47/ ); ok( Statistics::FET::fishers_exact(11 , 34 , 25 , 60, ), qr/^0.7893588 +18/ ); ## one tailed ## - return 'left' tail print "\n***\nleft tailed b-a-c-d\n***\n"; ok( Statistics::FET::fishers_exact(345 , 100 , 450 , 150, ), qr/^0.191 +219881/ ); ok( Statistics::FET::fishers_exact(345 , 150 , 500 , 150, ), qr/^0.997 +565862/ ); ok( Statistics::FET::fishers_exact(345 , 110 , 500 , 250, ), qr/^0.000 +436655/ ); ok( Statistics::FET::fishers_exact(34 , 11 , 50 , 25, ), qr/^0.2060577 +17/ ); ok( Statistics::FET::fishers_exact(34 , 11 , 60 , 25, ), qr/^0.3492183 +59/ ); print "***888 - bugfixed Statistics::FishersExactTest - 888***\n"; ## two tailed print "\n***\ntwotailed a-b-c-d\n***\n"; ok( Statistics::FishersExactTest::fishers_exact(100 , 345 , 150 , 450, + 'true', ), qr/^0.378973935/ ); ok( Statistics::FishersExactTest::fishers_exact(150 , 345 , 150 , 500, + 'true', ), qr/^0.006634280/ ); ok( Statistics::FishersExactTest::fishers_exact(110 , 345 , 250 , 500, + 'true', ), qr/^0.000737131/ ); ok( Statistics::FishersExactTest::fishers_exact(11 , 34 , 25 , 50, 'tr +ue', ), qr/^0.410784065/ ); ok( Statistics::FishersExactTest::fishers_exact(11 , 34 , 25 , 60, 'tr +ue', ), qr/^0.680962824/ ); print "\n***\ntwotailed (b-a-d-c)\n***\n"; ok( Statistics::FishersExactTest::fishers_exact(345 , 100 , 450 , 150, + 'true', ), qr/^0.378973935/ ); ok( Statistics::FishersExactTest::fishers_exact(345 , 150 , 500 , 150, + 'true', ), qr/^0.006634280/ ); ok( Statistics::FishersExactTest::fishers_exact(345 , 110 , 500 , 250, + 'true', ), qr/^0.000737131/ ); ok( Statistics::FishersExactTest::fishers_exact(34 , 11 , 50 , 25, 'tr +ue', ), qr/^0.410784065/ ); ok( Statistics::FishersExactTest::fishers_exact(34 , 11 , 60 , 25, 'tr +ue', ), qr/^0.680962824/ ); ## one tailed ## - return 'Right' tail print "\n***\nright tailed (a-b-c-d)\n***\n"; ok( Statistics::FishersExactTest::fishers_exact(100 , 345 , 150 , 450, + ), qr/^0.846286197/ ); ok( Statistics::FishersExactTest::fishers_exact(150 , 345 , 150 , 500, + ), qr/^0.003674851/ ); ok( Statistics::FishersExactTest::fishers_exact(110 , 345 , 250 , 500, + ), qr/^0.999730774/ ); ok( Statistics::FishersExactTest::fishers_exact(11 , 34 , 25 , 50, ), +qr/^0.892322547/ ); ok( Statistics::FishersExactTest::fishers_exact(11 , 34 , 25 , 60, ), +qr/^0.789358818/ ); ## one tailed ## - return 'left' tail print "\n***\nleft tailed b-a-c-d\n***\n"; ok( Statistics::FishersExactTest::fishers_exact(345 , 100 , 450 , 150, + ), qr/^0.191219881/ ); ok( Statistics::FishersExactTest::fishers_exact(345 , 150 , 500 , 150, + ), qr/^0.997565862/ ); ok( Statistics::FishersExactTest::fishers_exact(345 , 110 , 500 , 250, + ), qr/^0.000436655/ ); ok( Statistics::FishersExactTest::fishers_exact(34 , 11 , 50 , 25, ), +qr/^0.206057717/ ); ok( Statistics::FishersExactTest::fishers_exact(34 , 11 , 60 , 25, ), +qr/^0.349218359/ ); __DATA__ Fisher's Exact Test http://www.langsrud.com/fisher.htm # 1 + 6 + 11 + 16 ------------------------------------------ TABLE = [ 100 , 345 , 150 , 450 ] Left : p-value = 0.191219881514147 Right : p-value = 0.8462861978699979 2-Tail : p-value = 0.37897393517037725 # 2 + 7 + 12 + 17 ------------------------------------------ TABLE = [ 150 , 345 , 150 , 500 ] Left : p-value = 0.9975658620326552 Right : p-value = 0.0036748512483445626 2-Tail : p-value = 0.006634280583017804 # 3 + 8 + 13 + 18 ------------------------------------------ TABLE = [ 110 , 345 , 250 , 500 ] Left : p-value = 0.00043665577543502157 Right : p-value = 0.9997307748788292 2-Tail : p-value = 0.0007371317326469082 # 4 + 9 + 14 + 19 ------------------------------------------ TABLE = [ 11 , 34 , 25 , 50 ] Left : p-value = 0.20605771741014312 Right : p-value = 0.8923225471119328 2-Tail : p-value = 0.41078406593596795 # 5 + 10 + 15 + 20 ------------------------------------------ TABLE = [ 11 , 34 , 25 , 60 ] Left : p-value = 0.34921835927372824 Right : p-value = 0.7893588188856397 2-Tail : p-value = 0.6809628247605259 ------------------------------------------
And it seemed to backup what resonator said, but tlm's code did get the answer right for *some* of the tests, so i was encouraged and made the following changes (extremely minor!). Mainly (erm... only) adding in :
TEST: ## simplified test equation my $test = $a*$d - $b*$c; ## introduced switching around of input pairs if ($test < 0 && $ts){ ($a, $b, $c, $d, ) = ($b, $a, $d, $c,); goto TEST; }
right after the 'test' bit that had people confused!So, now it passes all tests!
perl Desktop/perl/lib/Statistics/FET_test_complete.pl 1..40 # Running under perl version 5.010000 for linux # Current time local: Wed Jul 15 11:39:44 2009 # Current time GMT: Wed Jul 15 10:39:44 2009 # Using Test.pm version 1.25 ***888 - Original Statistics::FET - 888*** *** twotailed a-b-c-d *** not ok 1 # Test 1 got: "1" (Desktop/perl/lib/Statistics/FET_test_complete.pl at + line 18) # Expected: "(?-xism:^0.378973935)" # Desktop/perl/lib/Statistics/FET_test_complete.pl line 18 is: ok( St +atistics::FET::fishers_exact(100 , 345 , 150 , 450, 'true', ), qr/^0. +378973935/ ); ok 2 not ok 3 # Test 3 got: "1" (Desktop/perl/lib/Statistics/FET_test_complete.pl at + line 20) # Expected: "(?-xism:^0.000737131)" # Desktop/perl/lib/Statistics/FET_test_complete.pl line 20 is: ok( St +atistics::FET::fishers_exact(110 , 345 , 250 , 500, 'true', ), qr/^0. +000737131/ ); not ok 4 # Test 4 got: "1" (Desktop/perl/lib/Statistics/FET_test_complete.pl at + line 21) # Expected: "(?-xism:^0.410784065)" # Desktop/perl/lib/Statistics/FET_test_complete.pl line 21 is: ok( St +atistics::FET::fishers_exact(11 , 34 , 25 , 50, 'true', ), qr/^0.4107 +84065/ ); not ok 5 # Test 5 got: "1" (Desktop/perl/lib/Statistics/FET_test_complete.pl at + line 22) # Expected: "(?-xism:^0.680962824)" # Desktop/perl/lib/Statistics/FET_test_complete.pl line 22 is: ok( St +atistics::FET::fishers_exact(11 , 34 , 25 , 60, 'true', ), qr/^0.6809 +62824/ ); *** twotailed (b-a-d-c) *** ok 6 not ok 7 # Test 7 got: "1" (Desktop/perl/lib/Statistics/FET_test_complete.pl at + line 27) # Expected: "(?-xism:^0.006634280)" # Desktop/perl/lib/Statistics/FET_test_complete.pl line 27 is: ok( St +atistics::FET::fishers_exact(345 , 150 , 500 , 150, 'true', ), qr/^0. +006634280/ ); ok 8 ok 9 ok 10 *** right tailed (a-b-c-d) *** ok 11 ok 12 ok 13 ok 14 ok 15 *** left tailed b-a-c-d *** ok 16 ok 17 ok 18 ok 19 ok 20 ***888 - bugfixed Statistics::FishersExactTest - 888*** *** twotailed a-b-c-d *** ok 21 ok 22 ok 23 ok 24 ok 25 *** twotailed (b-a-d-c) *** ok 26 ok 27 ok 28 ok 29 ok 30 *** right tailed (a-b-c-d) *** ok 31 ok 32 ok 33 ok 34 ok 35 *** left tailed b-a-c-d *** ok 36 ok 37 ok 38 ok 39 ok 40
So here is some updated code for anyone who is interested!
=head1 NAME Statistics::FET - Fisher's Exact Test statistic (2x2 case) =head1 SYNOPSIS use Statistics::FET 'fishers_exact'; =head1 DESCRIPTION This module exports only one function, C<fishers_exact>, which computes the one- or two-sided Fisher's Exact Test statistic for the 2 x 2 case. In the following documentation I will be referring to the following family of 2 x 2 contingency tables * | * | r1 = a+b --------+-------+---------- * | * | r2 = c+d --------+-------+---------- c1 | c2 | N = a+c | = b+d | = a+b+c+d The *'s mark the cells, N is the total number of points represented by the table, and r1, r2, c1, c2 are the marginals. As suggested by the equalities, the letters a, b, c, d refer to the various cells (reading them left-to-right, top-to-bottom). Depending on context, the letter c (for example) will refer *either* to the lower left cell of the table *or* to a specific value in that cell. Same for a, b, and d. In what follows, H(x) (or more precisely, H(x; r1, r2, c1)) refers to the hypergeometric expression r1!*r2!*c1!*c2! ------------------------------------- (r1+r2)!*x!*(r1-x)!*(c1-x)!*(r2-c1+x)! (I omit c2 from the parametrization of H because c2 = r1 + r2 - c1.) =head1 FUNCTION =over 4 =item fishers_exact( $a, $b, $c, $d, $two_sided ) The paramater C<$two_sided> is optional. If missing or false C<fishers_exact> computes the one-sided FET p-value. More specifically, it computes the sum of H(x; a+b, c+d, a+c) for x = a to x = min(a+b, a+c) - "the right side". (If you want "the left side", i +.e. the sum of H(x; a+b, c+d, a+c) for x = max(0, a-d) to x = a, then compute C<fishers_exact( $b, $a, $d, $c ) +> or C<fishers_exact( $c, $d, $a, $b )> (these two are equivalent).) If C<$two_sided> is true, the returned p-value will be for the two-sid +ed FET. =back =cut ## let the code begin... use strict; use warnings; package Statistics::FishersExactTest; use Exporter 'import'; use Math::Pari (); my $Tolerance = 1; $Tolerance /= 2 while 1 + $Tolerance/2 > 1; @Statistics::FET::EXPORT_OK = 'fishers_exact'; sub fishers_exact { my ( $a, $b, $c, $d, $ts ) = @_; TEST: ## simplified test equation my $test = $a*$d - $b*$c; ## introduced switching around of input pairs if ($test < 0 && $ts){ ($a, $b, $c, $d, ) = ($b, $a, $d, $c,); goto TEST; } if ($test < 0 and $ts){ die "Cannot compute two tailed FET for input values given \( ".(jo +in ", ", @_)."\)\n"; } # below here, $test < 0 implies !$ts; my $p_val; if ( $test < 0 ) { if ( $d < $a ) { $p_val = _fishers_exact( $d, $c, $b, $a, $ts, 1 ); } else { $p_val = _fishers_exact( $a, $b, $c, $d, $ts, 1 ); } } else { if ( $b < $c ) { $p_val = _fishers_exact( $b, $a, $d, $c, $ts, 0 ); } else { $p_val = _fishers_exact( $c, $d, $a, $b, $ts, 0 ); } } return $p_val; } sub _fishers_exact { my ( $a, $b, $c, $d, $ts, $complement ) = @_; die "Bad args\n" if $ts && $complement; my ( $aa, $bb, $cc, $dd ) = ( $a, $b, $c, $d ); my $first = my $delta = _single_term( $aa, $bb, $cc, $dd ); my $p_val = 0; { $p_val += $delta; last if $aa < 1; $delta *= ( ( $aa-- * $dd-- )/( ++$bb * ++$cc ) ); redo; } if ( $ts ) { my $m = $b < $c ? $b : $c; ($aa, $bb, $cc, $dd ) = ( $a + $m, $b - $m, $c - $m, $d + $m ); $delta = _single_term( $aa, $bb, $cc, $dd ); my $bound = -$Tolerance; while ( $bound <= ( $first - $delta )/$first && $aa > $a ) { $p_val += $delta; $delta *= ( ( $aa-- * $dd-- )/( ++$bb * ++$cc ) ); } } elsif ( $complement ) { $p_val = 1 - $p_val + $first; } return $p_val; } sub _single_term { my ( $a, $b, $c, $d ) = @_; my ( $r1, $r2 ) = ($a + $b, $c + $d); my ( $c1, $c2 ) = ($a + $c, $b + $d); my $N = $r1 + $r2; return exp( _ln_fact( $r1 ) + _ln_fact( $r2 ) + _ln_fact( $c1 ) + _ln_fact( $c2 ) - _ln_fact( $N ) - ( _ln_fact( $a ) + _ln_fact( $b ) + _ln_fact( $c ) + _ln_fact( $d ) ) ); } { my $two_pi; my $pi_over_3; my $half; BEGIN { $two_pi = Math::Pari::PARI( 2 * atan2 0, -1 ); $pi_over_3 = Math::Pari::PARI( atan2( 0, -1 )/3 ); $half = Math::Pari::PARI( 0.5 ); } sub _ln_fact { my $n = Math::Pari::PARI( shift() ); die "Bad args to _ln_fact: $n" if $n < 0; my $ln_fact; eval { $ln_fact = log Math::Pari::factorial( $n ); }; if ( $@ ) { die $@ unless $@ =~ /\QPARI: *** exponent overflow/; # Gosper's approximation; from # http://mathworld.wolfram.com/StirlingsApproximation.html $ln_fact = $half * log( $two_pi*$n + $pi_over_3 ) + $n * log( $n ) - $n; } return $ln_fact; } } 1; __END__
Further tests|opinions are always welcome! I was surprised there was not more FET mods there, at least that i could find!