#!/usr/bin/perl
# Extract the words of a corpus that are not found in a dictionary file.
#
# Usage: perl this_script.pl DICTIONARY [CORPUS ...]
#
# DICTIONARY holds one word per line. The corpus (also one word per line) is
# read from the remaining file arguments, or from STDIN when none are given.
# Every corpus word that is missing from the dictionary is printed on its own
# line on STDOUT.
#
# The dictionary is sorted once and then probed with a binary search, because
# a linear scan for every corpus word would take too long on large inputs.

use strict;
use warnings;

# Treat STDIN/STDOUT/STDERR and every file opened in this script as UTF-8,
# because the data contains special characters. This replaces the deprecated
# "encoding" pragma, which is no longer supported by recent perls.
use open qw(:std :encoding(UTF-8));

# binary_search(\@sorted, $target)
#
# Look up $target in an array reference whose elements are sorted in ascending
# string order (the order produced by "sort { $a cmp $b }").
#
# Returns the index of a matching element, or nothing (undef in scalar
# context) when $target is absent. Index 0 is a valid hit, so callers must
# test the result with "defined", not for truth or equality with 0.
sub binary_search {
    my ($array, $target) = @_;

    # Inclusive search window; an empty array gives $high == -1 and the loop
    # below is skipped entirely.
    my ($low, $high) = (0, $#{$array});

    # "<=" matters: a window of exactly one element must still be examined.
    while ($low <= $high) {
        my $cur   = int(($low + $high) / 2);
        my $order = $array->[$cur] cmp $target;

        if ($order < 0) {
            $low = $cur + 1;     # Middle element sorts before the target: search the upper half.
        }
        elsif ($order > 0) {
            $high = $cur - 1;    # Middle element sorts after the target: search the lower half.
        }
        else {
            return $cur;         # Found it.
        }
    }

    return;                      # Not present.
}

# main($dictionary_path, @corpus_paths)
#
# Load and sort the dictionary, then print every corpus word that the
# dictionary does not contain. Dies when the dictionary argument is missing
# or the file cannot be opened.
sub main {
    my ($wordlist, @corpus_files) = @_;

    die "Usage: $0 DICTIONARY [CORPUS ...]\n" unless defined $wordlist;

    open my $dict_fh, '<:encoding(UTF-8)', $wordlist
        or die "Cannot open dictionary '$wordlist': $!\n";

    my @allwords;    # Every dictionary word, in file order.
    while (my $word = <$dict_fh>) {
        chomp $word;
        $word =~ s/\r\z//;    # Drop the CR left behind by CRLF line endings.
        push @allwords, $word;
    }
    close $dict_fh;

    # Binary search requires the list to be ordered with the same comparison
    # it uses itself (plain string comparison).
    my @sorted_wordlist = sort { $a cmp $b } @allwords;

    # "<>" reads the files named in @ARGV, or STDIN when @ARGV is empty.
    local @ARGV = @corpus_files;
    while (my $corpus_word = <>) {
        chomp $corpus_word;
        $corpus_word =~ s/\r\z//;

        # No index returned means the word is not in the dictionary.
        next if defined binary_search(\@sorted_wordlist, $corpus_word);
        print "$corpus_word\n";
    }

    return;
}

# Run only when executed as a script, so the file can also be loaded (e.g. by
# a test) without touching any input.
main(@ARGV) unless caller;

1;