# my hash is like this: (term is the key and the numbers are the values of each term)

# Example call: score one term pair against the two term => "n n n ..." hashes.
$MI_T = MI($hypo, $text, \%hash_es, \%hash_en);

# MI($string_es, $string_en, $hash_es, $hash_en [, $total])
#
# Computes a smoothed pointwise mutual-information contribution for a pair of
# terms.
#
#   $string_es, $string_en  terms to look up
#   $hash_es,   $hash_en    hash REFERENCES mapping a term to a string of
#                           whitespace-separated numbers
#   $total                  optional denominator used to turn counts into
#                           probabilities; defaults to 6939873, the constant
#                           the original code hard-coded (presumably the
#                           corpus size -- confirm with the data owner)
#
# Returns p(es,en) * log( p(es,en) / (p(es) * p(en)) ) using the natural log,
# or 0 when either term is unknown or has no numbers (the original died with
# "Illegal division by zero" in that case).
sub MI {
    my ($string_es, $string_en, $hash_es, $hash_en, $total) = @_;
    $total = 6939873 unless defined $total;

    # Look the terms up through the references that were passed in; the
    # original read the unrelated package hashes %hash_es / %hash_en.
    my @array_es = _term_values($hash_es, $string_es);
    my @array_en = _term_values($hash_en, $string_en);

    # No evidence for one of the terms: nothing to score, and continuing would
    # divide by zero below.
    return 0 unless @array_es && @array_en;

    my $prob_es = @array_es / $total;
    my $prob_en = @array_en / $total;

    my $prob_es_en = Intersection(\@array_es, \@array_en) / $total;

    # Blend the joint estimate with the independence estimate (weight 0.1) so
    # the argument of log() is strictly positive even with an empty overlap.
    $prob_es_en = ($prob_es_en + ($prob_es * $prob_en * 0.1)) / 1.1;

    return $prob_es_en * log($prob_es_en / ($prob_es * $prob_en));
}

# _term_values($hash_ref, $term)
#
# Returns the list of whitespace-separated numbers stored for $term, or an
# empty list when the term is undefined or not present in the hash.
sub _term_values {
    my ($hash_ref, $term) = @_;

    return unless defined $term;

    my $values = $hash_ref->{$term};
    return unless defined $values;

    return split ' ', $values;
}

# Intersection($refA, $refB)
#
# Returns how many distinct values occur in BOTH array references. A value
# repeated inside a single list is not counted as shared (the original counted
# it, because it only checked for a total occurrence count above 1).
sub Intersection {
    my ($refA, $refB) = @_;

    my %in_a;
    @in_a{ @$refA } = ();

    my $intersects = 0;
    for my $item (@$refB) {
        next unless exists $in_a{$item};
        delete $in_a{$item};    # count each shared value only once
        ++$intersects;
    }

    return $intersects;
}
while the number of numbers corresponding to each term can vary from 1 to n.
term1 1 3 5 8 9 15 90
term2 23 56 789 23 1 54 89
term3 23 345 677 456 23
....
termn 54 6 768 5678 34 56 78
In reply to improving the speed by perl_lover_always
| For: | Use: |
|------|------|
| & | &amp;amp; |
| < | &amp;lt; |
| > | &amp;gt; |
| [ | &amp;#91; |
| ] | &amp;#93; |