Hello holy monks. I gave it a try, but using the following script I made just prints out every word, even if it does exist in the dictionary file. Could you point me to where I'm wrong ? I'd really appreciate it.
#!/usr/bin/perl
#This script is used to extract words not found in the dictionary file
+ from corpus data. For this, we use binary search. Linear source woul
+d take too long and use too much resources.
use strict;
use warnings;
#Use encode because of special characters.
use encoding "utf8";
use open IN => "utf8";
use open OUT => "utf8";
binmode STDIN => "utf8";
binmode STDOUT => "utf8";
my $wordlist = shift;
my @allwords; #array containing all dictionary words.
#First, I open the dictionary file. I then push all words into the all
+words array.
open WORDLIST, $wordlist;
while (<WORDLIST>){
chomp;
s/\r//;
my $word = $_;
push (@allwords,$word)
}
close WORDLIST;
#I then sort the array in alphabetic order.
my @sorted_wordlist = sort {$a cmp $b} @allwords;
#I create a subroutine to use binary search.
sub binary_search {
my ($array, $target) = @_;
#set arguments for future use : $array will be the sorted wordlist a
+nd $target, the word we will be looking for.
my ($low, $high) = (0, @$array - 1);
#Declare high and low indexes. Low index = 0 and high index = last i
+ndex of the array.
while ($low < $high) { # If high index is higher than the low index,
+ keep the window open.
my $cur = int($low+$high)/2; #Declare a middle, which is the total
+ of high index and low index /2.
if ($array->[$cur] lt $target) {
$low = $cur + 1; #If the target is too small, try lower.
} else {
$high = $cur; #Else, try higher.
}
}
}
# Open the corpus data.
while (<>){
chomp;
s/\r//;
my $corpus_word = $_; #Declare the read line as a corpus word.
my $index = binary_search (\@sorted_wordlist, $corpus_word); #use
+the binary search to find the index
if($index < @sorted_wordlist && $sorted_wordlist[$index] eq $corpu
+s_word){
#If found, do nothing.
} else{
print "$corpus_word\n"; #If not, print.
}
}
|