#Developed by: Mahmoud Elsaid #Specs: Code to measure the closest word to a misspelt one using two algorithms, #BIGRAMS: split the word to bigrams and compare the bigrams together. Calculate intersection and give score #Levenshtein Distance: Calculate how many movements required from the word to reach original. #"Damerau distance" was used too but gave the same results of leveneshtein. # #This work is covered by creative commons(cc) thus it is agreed to give the total #ability for anybody to copy, distribute, display, and perform the work and to #make derivative works under the condition of: You must give the original author credit. #http://creativecommons.org/licenses/by/2.0/uk/ # # Last Modified: 29/4/2008 10:24 AM use strict; use warnings; no warnings 'uninitialized'; use re 'eval'; #we use Damerau distance too print "Press 1 to check single word \nOR \nPress 2 to check list of words from text file\n"; chomp(my $choice = ); my $keyword; our $fileName; use Switch; switch ($choice){ case 1{ print "Please enter the word that you would like to check: \n"; chomp(my $keyword = ); $fileName = $keyword; print("Processing Started, This will take a while, Please be patient \n"); open(STDOUT, ">$keyword Results.txt"); main($keyword); close STDOUT; } case 2{ print "Enter the file name to read words from \n"; chomp(my $keyword = ); open (my $listWord, $keyword) || die "cant open list file.txt\n"; print("Processing Started, This will take a while, Please be patient \n"); open(STDOUT, ">$keyword Results.txt"); while (my $testWord = <$listWord>){ print STDERR "$testWord"; chomp($testWord); print ("Results for: $testWord \n \n"); main(my $testword); print ("\n \n"); } close STDOUT; close $listWord; } case 3{ print "Analysis: enter the file name to read words from \n"; chomp(my $keyword = ); print "Enter the right word to analyse the system through \n"; chomp(my $rightWord = ); open (my $listWord, $keyword) || die "cant open list file.txt\n"; my $bigramPoints = 0; my $distancePoints = 0; print("Processing Started, This will take a while, Please be patient \n"); open(STDOUT, ">$keyword Results.txt"); while (my $testWord = <$listWord>){ print ("Results for: $testWord \n \n"); chomp($testWord); my($bigramSuccess, $distanceSuccess, ) = main(my $testword, $rightWord); $bigramPoints=$bigramPoints + $bigramSuccess; my $distancePoints = $distancePoints + $distanceSuccess; print ("\n \n"); } print "Numbers successfuly detected by Bigrams are $bigramPoints of total %testWord \n"; print "Numbers successfuly detected by Edit(levenshtein) Distance are $distancePoints of total %testWord \n"; close $listWord; close STDOUT; } else{print"choice is wrong"} } open(FILE, "$fileName Results.txt") || die "cant open $fileName Results.txt\n"; while ( ) { print STDERR; } close FILE; sub main{ my($keyword, $rightWord) = @_; $keyword = lc($keyword); $rightWord = lc($rightWord); my (@wordValid); my($bigramScore, @distanceMatches, %bigramScore, %editDistanceScore, %damDistanceScore); @wordValid=($keyword =~ m/ \w/g ); if(@wordValid){ print "Illegal keyword $keyword, Please enter only alphanumerics without spaces"; }else{ open (my $dict, "dictionary.txt") || die "cant open dictionary.txt\n"; while (my $dictionWord = <$dict>) { chomp($dictionWord); $bigramScore=bigram($keyword,$dictionWord); push @{$bigramScore{$bigramScore}}, $dictionWord; my $editDistanceScore; $editDistanceScore=editDistance($keyword,$dictionWord); push @{$editDistanceScore{$editDistanceScore}}, $dictionWord; } my $x = 0; print "Top bigram coefficients for $keyword:\n"; for my $bigramScore ((sort { $b <=> $a } keys %bigramScore)[0..4]) { next if ! $bigramScore; print "$bigramScore: ", join " ", @{$bigramScore{$bigramScore}}, "\n"; if ( grep { $_ eq $rightWord} @{$bigramScore{$bigramScore}} ) { $x++; } } my $y = 0; print "\n Top distance coefficients for $keyword:\n"; for my $editDistanceScore ((sort { $a <=> $b } keys %editDistanceScore)[0..2]) { print "$editDistanceScore: ", join " ", @{$editDistanceScore{$editDistanceScore}}, "\n"; if ( grep { $_ eq $rightWord} @{$editDistanceScore{$editDistanceScore}} ) { $y++; } next if ! $editDistanceScore; } my @results; push (@results, $x); push (@results, $y); return(@results); } } return($x, $x); } } sub bigram{ my ($key, $dictionWord) = @_; my %seen; my $matches = 0; my @matches; my $match; my %count; my @pairs = $key =~ /(?=(..))/g; my $matcher = qr/(?=(@{[join "|", @pairs]}))/; $count{$_}++ foreach (@pairs); foreach $match ($dictionWord =~ /$matcher/g) { if ($seen{$match}++ < $count{$match}) { $matches ++; push @matches, $match; } } my $bigramCoef = 2 * $matches / (@pairs + length($dictionWord)-1); return($bigramCoef) } sub editDistance{ my ($w1, $w2) = @_; my ($len1, $len2) = (length $w1, length $w2); return $len2 if ($len1 == 0); return $len1 if ($len2 == 0); my %scoreMat; my %scoreMatLev; my $countOne; my $countTwo; for (my $countOne = 0; $countOne <= $len1; ++$countOne) { for (my $countTwo = 0; $countTwo <= $len2; ++$countTwo) { $scoreMat{$countOne}{$countTwo} = 0; $scoreMat{0}{$countTwo} = $countTwo; } $scoreMat{$countOne}{0} = $countOne; } my @chars1 = split(//, $w1); my @chars2 = split(//, $w2); for (my $countOne = 1; $countOne <= $len1; ++$countOne) { for (my $countTwo = 1; $countTwo <= $len2; ++$countTwo) { my $cost = ($chars1[$countOne-1] eq $chars2[$countTwo-1]) ? 0 : 1; $scoreMat{$countOne}{$countTwo} = minimalElement([$scoreMat{ $countOne-1}{$countTwo} + 1, # deletion $scoreMat{$countOne}{$countTwo-1} + 1, #insertion $scoreMat{$countOne-1}{$countTwo-1} + $cost]); #insertion # if($countOne > 1 && $countTwo >1 && ($chars1[$countOne] eq $chars2[$countTwo - 1]) && ($chars1[$countOne - 1] eq $chars2[$countTwo])){ # $scoreMat{$countOne}{$countTwo} = minimalElement([$scoreMat{$countOne}{$countTwo}, # $scoreMat{$countOne - 2}{$countTwo-2} + $cost]); # } } } return ($scoreMat{$len1}{$len2}); } sub minimalElement { my @list = @{$_[0]}; my $min = $list[0]; foreach my $count (@list) { $min = $count if ($count < $min); } return $min; } #Author:Mahmod