in reply to Re^3: Check word presence WITHOUT hashes or grep
in thread Check word presence WITHOUT hashes or grep
#!/usr/bin/perl
# Print every corpus word that is not present in a dictionary file.
#
# Usage: perl this_script.pl DICTIONARY [CORPUS_FILE ...]
#
# DICTIONARY holds one word per line. The corpus (one word per line) is read
# from the remaining file arguments, or from STDIN when none are given. The
# dictionary is sorted once and each corpus word is then looked up with a
# binary search, so a lookup costs O(log n) string comparisons instead of a
# linear scan over the whole dictionary.
use strict;
use warnings;

# Treat every file opened in this script as UTF-8 text. This replaces the
# original `use encoding "utf8"`, which is deprecated and rejected by newer
# perls; the script contains no non-ASCII literals, so only the I/O layers
# are needed.
use open IO => ':encoding(UTF-8)';

# load_wordlist($path)
#   Read one word per line from $path and return a reference to the words
#   sorted with string comparison (cmp), the same ordering binary_search
#   relies on. Dies when the file cannot be opened.
sub load_wordlist {
    my ($path) = @_;

    open my $fh, '<', $path or die "Cannot open dictionary '$path': $!\n";

    my @words;
    while ( my $word = <$fh> ) {
        chomp $word;
        $word =~ s/\r//;    # tolerate CRLF line endings
        push @words, $word;
    }
    close $fh;

    return [ sort { $a cmp $b } @words ];
}

# binary_search($array, $target)
#   $array  - reference to an array of strings sorted ascending with cmp
#   $target - string to look for
#   Returns the lowest index whose element is not less than $target (the
#   "lower bound"). The result is scalar(@$array) when every element is less
#   than $target, and 0 for an empty array, so the caller must check that the
#   index is in range and that the element there equals $target.
sub binary_search {
    my ( $array, $target ) = @_;

    # Search the half-open window [$low, $high). Starting $high one past the
    # last index lets "greater than everything" be reported as scalar(@$array).
    my ( $low, $high ) = ( 0, scalar @{$array} );

    while ( $low < $high ) {

        # int() must wrap the whole quotient; int($low + $high) / 2 would
        # yield fractional indexes for odd sums.
        my $mid = $low + int( ( $high - $low ) / 2 );

        if ( $array->[$mid] lt $target ) {
            $low = $mid + 1;    # element too small: target lies to the right
        }
        else {
            $high = $mid;       # element >= target: keep $mid as a candidate
        }
    }

    return $low;
}

# main()
#   Script entry point: consume the dictionary path from @ARGV, then print
#   each corpus line that has no exact match in the dictionary.
sub main {
    my $wordlist = shift @ARGV;
    die "Usage: $0 DICTIONARY [CORPUS_FILE ...]\n" unless defined $wordlist;

    binmode STDIN,  ':encoding(UTF-8)';
    binmode STDOUT, ':encoding(UTF-8)';

    my $sorted_wordlist = load_wordlist($wordlist);

    # <> reads the remaining files named in @ARGV, or STDIN if there are none.
    while ( my $corpus_word = <> ) {
        chomp $corpus_word;
        $corpus_word =~ s/\r//;

        my $index = binary_search( $sorted_wordlist, $corpus_word );

        my $found = $index < @{$sorted_wordlist}
            && $sorted_wordlist->[$index] eq $corpus_word;

        print "$corpus_word\n" unless $found;
    }

    return;
}

# Run only when executed as a script, so the subs can be loaded on their own.
main() unless caller;

1;
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^5: Check word presence WITHOUT hashes or grep
by ysth (Canon) on May 01, 2008 at 03:38 UTC | |
by gojippo (Novice) on May 01, 2008 at 05:08 UTC | |
by ysth (Canon) on May 01, 2008 at 06:08 UTC | |
by gojippo (Novice) on May 01, 2008 at 06:55 UTC | |
|
Re^5: Check word presence WITHOUT hashes or grep
by GrandFather (Saint) on May 01, 2008 at 04:13 UTC |