hiddenOx has asked for the wisdom of the Perl Monks concerning the following question:
#Developed by: Mahmoud Elsaid #Specs: Code to measure the closest word to a misspelt one using two a +lgorithms, #BIGRAMS: split the word to bigrams and compare the bigrams together. +Calculate intersection and give score #Levenshtein Distance: Calculate how many movements required from the +word to reach original. #"Damerau distance" was used too but gave the same results of levenesh +tein. # #This work is covered by creative commons(cc) thus it is agreed to giv +e the total #ability for anybody to copy, distribute, display, and perform the wor +k and to #make derivative works under the condition of: You must give the origi +nal author credit. #http://creativecommons.org/licenses/by/2.0/uk/ # # Last Modified: 29/4/2008 10:24 AM use strict; use warnings; no warnings 'uninitialized'; use re 'eval'; #we use Damerau distance too print "Press 1 to check single word \nOR \nPress 2 to check list of wo +rds from text file\n"; chomp(my $choice = <STDIN>); my $keyword; our $fileName; use Switch; switch ($choice){ case 1{ print "Please enter the word that you would like to check: \n"; chomp(my $keyword = <STDIN>); $fileName = $keyword; print("Processing Started, This will take a while, Please be patie +nt \n"); open(STDOUT, ">$keyword Results.txt"); main($keyword); close STDOUT; } case 2{ print "Enter the file name to read words from \n"; chomp(my $keyword = <STDIN>); open (my $listWord, $keyword) || die "cant open list file.txt\n"; print("Processing Started, This will take a while, Please be patie +nt \n"); open(STDOUT, ">$keyword Results.txt"); while (my $testWord = <$listWord>){ print STDERR "$testWord"; chomp($testWord); print ("Results for: $testWord \n \n"); main(my $testword); print ("\n \n"); } close STDOUT; close $listWord; } case 3{ print "Analysis: enter the file name to read words from \n"; chomp(my $keyword = <STDIN>); print "Enter the right word to analyse the system through \n"; chomp(my $rightWord = <STDIN>); open (my $listWord, $keyword) || die "cant open list file.txt\n"; my $bigramPoints = 0; my $distancePoints = 0; print("Processing Started, This will take a while, Please be patie +nt \n"); open(STDOUT, ">$keyword Results.txt"); while (my $testWord = <$listWord>){ print ("Results for: $testWord \n \n"); chomp($testWord); my($bigramSuccess, $distanceSuccess, ) = main(my $testword, $r +ightWord); $bigramPoints=$bigramPoints + $bigramSuccess; my $distancePoints = $distancePoints + $distanceSuccess; print ("\n \n"); } print "Numbers successfuly detected by Bigrams are $bigramPoints o +f total %testWord \n"; print "Numbers successfuly detected by Edit(levenshtein) Distance +are $distancePoints of total %testWord \n"; close $listWord; close STDOUT; } else{print"choice is wrong"} } open(FILE, "$fileName Results.txt") || die "cant open $fileName Resul +ts.txt\n"; while ( <FILE> ) { print STDERR; } close FILE; sub main{ my($keyword, $rightWord) = @_; $keyword = lc($keyword); $rightWord = lc($rightWord); my (@wordValid); my($bigramScore, @distanceMatches, %bigramScore, %editDistanceScore, % +damDistanceScore); @wordValid=($keyword =~ m/ \w/g ); if(@wordValid){ print "Illegal keyword $keyword, Please enter only alphanumerics w +ithout spaces"; }else{ open (my $dict, "dictionary.txt") || die "cant open dictionary.txt +\n"; while (my $dictionWord = <$dict>) { chomp($dictionWord); $bigramScore=bigram($keyword,$dictionWord); push @{$bigramScore{$bigramScore}}, $dictionWord; my $editDistanceScore; $editDistanceScore=editDistance($keyword,$dictionWord); push @{$editDistanceScore{$editDistanceScore}}, $dictionWord; } my $x = 0; print "Top bigram coefficients for $keyword:\n"; for my $bigramScore ((sort { $b <=> $a } keys %bigramScore)[0..4]) + { next if ! $bigramScore; print "$bigramScore: ", join " ", @{$bigramScore{$bigramScore}} +, "\n"; if ( grep { $_ eq $rightWord} @{$bigramScore{$bigramSco +re}} ) { $x++; } } my $y = 0; print "\n Top distance coefficients for $keyword:\n"; for my $editDistanceScore ((sort { $a <=> $b } keys %editDistanceS +core)[0..2]) { print "$editDistanceScore: ", join " ", @{$editDistanceScore{$e +ditDistanceScore}}, "\n"; if ( grep { $_ eq $rightWord} @{$editDistanceScore{$editDist +anceScore}} ) { $y++; } next if ! $editDistanceScore; } my @results; push (@results, $x); push (@results, $y); return(@results); } } return($x, $x); } } sub bigram{ my ($key, $dictionWord) = @_; my %seen; my $matches = 0; my @matches; my $match; my %count; my @pairs = $key =~ /(?=(..))/g; my $matcher = qr/(?=(@{[join "|", @pairs]}))/; $count{$_}++ foreach (@pairs); foreach $match ($dictionWord =~ /$matcher/g) { if ($seen{$match}++ < $count{$match}) { $matches ++; push @matches, $match; } } my $bigramCoef = 2 * $matches / (@pairs + length($dictionWord)-1); return($bigramCoef) } sub editDistance{ my ($w1, $w2) = @_; my ($len1, $len2) = (length $w1, length $w2); return $len2 if ($len1 == 0); return $len1 if ($len2 == 0); my %scoreMat; my %scoreMatLev; my $countOne; my $countTwo; for (my $countOne = 0; $countOne <= $len1; ++$countOne) { for (my $countTwo = 0; $countTwo <= $len2; ++$countTwo) { $scoreMat{$countOne}{$countTwo} = 0; $scoreMat{0}{$countTwo} = $countTwo; } $scoreMat{$countOne}{0} = $countOne; } my @chars1 = split(//, $w1); my @chars2 = split(//, $w2); for (my $countOne = 1; $countOne <= $len1; ++$countOne) { for (my $countTwo = 1; $countTwo <= $len2; ++$countTwo) { my $cost = ($chars1[$countOne-1] eq $chars2[$countTwo-1]) +? 0 : 1; $scoreMat{$countOne}{$countTwo} = minimalElement([$scoreMa +t{ $countOne-1}{$countTwo} + 1, # delet +ion $scoreMat{$countOne}{$countTwo-1} + 1, + #insertion $scoreMat{$countOne-1}{$countTwo-1} + +$cost]); #insertion # if($countOne > 1 && $countTwo >1 && ($chars1[$countOne] e +q $chars2[$countTwo - 1]) && ($chars1[$countOne - 1] eq $chars2[$coun +tTwo])){ # $scoreMat{$countOne}{$countTwo} = minimalElement([$sc +oreMat{$countOne}{$countTwo}, # $scoreMat{$countOne - 2}{$countTwo-2} + $cost]); # } } } return ($scoreMat{$len1}{$len2}); } sub minimalElement { my @list = @{$_[0]}; my $min = $list[0]; foreach my $count (@list) { $min = $count if ($count < $min); } return $min; } #Author:Mahmod
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Can't divide by zero bug?
by Corion (Patriarch) on Apr 29, 2008 at 07:30 UTC | |
|
Re: Can't divide by zero bug?
by CountZero (Bishop) on Apr 29, 2008 at 09:09 UTC | |
|
Re: Can't divide by zero bug? (no, Switch.pm bug)
by grinder (Bishop) on Apr 29, 2008 at 08:35 UTC | |
|
Re: Can't divide by zero bug?
by mscharrer (Hermit) on Apr 29, 2008 at 08:30 UTC |