# After splitting up the message into different words, you call score with the
# words to retrieve the probability that the message is spam. Or at least I
# believe that's how it works. Oh, and by the way: the given code is a new
# dialect of Lisp that's being developed, named Arc.

use strict;
use warnings;

# Returns the smallest of the numeric arguments given
# (undef when called with no arguments).
sub min {
    my $min = shift;
    for my $candidate (@_) {
        $min = $candidate if $candidate < $min;
    }
    return $min;
}

# Returns the largest of the numeric arguments given
# (undef when called with no arguments).
sub max {
    my $max = shift;
    for my $candidate (@_) {
        $max = $candidate if $candidate > $max;
    }
    return $max;
}

my %good;     # number of occurrences for each word in normal email
my %bad;      # number of occurrences for each word in spam
my $ngood;    # number of good messages
my $nbad;     # number of bad messages

# score(@words)
#
# Returns the combined spam probability (a number between 0 and 1) for the
# words of one message.
#
# For every word a per-word probability is computed from %good, %bad, $ngood
# and $nbad as
#
#     bad_ratio / (good_ratio + bad_ratio)
#
# where each ratio is capped at 1 and the result is clamped to [.01, .99].
# Words seen fewer than 5 times (good occurrences counted double) are ignored.
# The per-word probabilities are then combined as
#
#     prod(p) / (prod(p) + prod(1 - p))
#
# With no usable words the result is 0.5.
sub score {
    my @words = @_;    # words in the email
    my @probs;         # per-word spam probabilities

    for my $word (@words) {
        # Good occurrences are weighted double -- presumably to bias the
        # filter against false positives; confirm against the Arc original.
        my $good_count = 2 * ( $good{$word} || 0 );
        my $bad_count  = $bad{$word} || 0;

        # Too few observations to say anything about this word.
        next if $good_count + $bad_count < 5;

        # An empty (or not yet initialised) corpus contributes a ratio of 0
        # instead of dying with a division by zero.
        my $good_ratio = $ngood ? min( 1, $good_count / $ngood ) : 0;
        my $bad_ratio  = $nbad  ? min( 1, $bad_count / $nbad )   : 0;

        my $ratio_sum = $good_ratio + $bad_ratio;
        next unless $ratio_sum > 0;

        push @probs, max( .01, min( .99, $bad_ratio / $ratio_sum ) );
    }

    my $prod = 1;
    $prod *= $_ for @probs;

    my $prod2 = 1;
    $prod2 *= ( 1 - $_ ) for @probs;

    return $prod / ( $prod + $prod2 );
}
elusion : http://matt.diephouse.com
In reply to Re: Bayesian Filtering for Spam
by elusion
in thread Bayesian Filtering for Spam
by oakbox
| For: | Use: |
| --- | --- |
| & | &amp; |
| < | &lt; |
| > | &gt; |
| [ | &#91; |
| ] | &#93; |