in reply to Comparing Lines within a Word List
So you have a looming deadline and no idea how to solve your problem (every CS problem can be solved "using regular expressions in a Perl script"), so you come here to ask us to do your homework for you... well today is your lucky day :-P
My first thought was "spell checker", so I investigated how they work. Apparently one possible implementation is the use of a Trie. I've never worked with a trie before, so I decided to use this chance to try it out. Since it's the first time I'm working with tries, the code might have some bugs. I compared it to a plain, brute-force linear search regex match. In my first tests, the trie data structure takes roughly 5x the memory (as reported by Devel::Size), but on large word lists I saw a 100x to 700x speedup. (Note: My implementation of AnomalousMonk's xor+tr search seems to be roughly twice as fast as the regex implementation.)
If you want to understand what's going on in the code, the book Learning Perl is a good start, as is perldsc. Also, ask here.
(Protip: Don't get thrown out of your class or school for plagiarism and cite your sources.)
#!/usr/bin/env perl use warnings; use strict; use open qw/:std :utf8/; use Time::HiRes qw/gettimeofday tv_interval/; use Devel::Size qw/total_size/; die "Usage: $0 [WORDFILE]\n" unless @ARGV<2; my $WORDFILE = shift || '/usr/share/dict/words'; my %TRIE; # for trie_search() my @ALLWORDS; # for re_search() open my $ifh, '<:utf8', $WORDFILE or die $!; my $cnt=0; my $t0_build = [gettimeofday]; while (<$ifh>) { chomp; next if substr($_,-2) eq "'s"; push @ALLWORDS, $_; my $ptr = \%TRIE; # pointer into trie $ptr = ($$ptr{$_} ||= {}) for split //; $$ptr{"\0"}=1; # end-of-word marker $cnt++; } my $build_time_s = tv_interval($t0_build); close $ifh; print "Built a trie out of $cnt words in $build_time_s s\n"; print "Size of trie: ".total_size(\%TRIE) ." bytes\n"; print "Size of array: ".total_size(\@ALLWORDS)." bytes\n"; # just some stats, comment out if not needed use List::Util qw/max/; my $maxlen = max(map {length} @ALLWORDS); print "Longest word(s): $maxlen letters\n"; print "Those are:\n"; print "\t$_\n" for grep {length==$maxlen} @ALLWORDS; # user input loop my ($avg_trie_s,$avg_re_s,$avg_cnt) = (0,0,0); while(1) { print "Enter a word (blank=exit): "; chomp(my $word = <STDIN>); last unless $word; # time trie search my $t0_trie = [gettimeofday]; my @found = trie_search($word); my $trie_time_s = tv_interval($t0_trie); @found = sort @found; print "$_\n" for @found; print "--- trie: $trie_time_s s\n"; $avg_trie_s += $trie_time_s; # time regex search my $t0_re = [gettimeofday]; # swap re_search with xor_search here to test that my @re_found = re_search($word); my $trie_re_s = tv_interval($t0_re); @re_found = sort @re_found; print "$_\n" for @re_found; print "--- regex: $trie_re_s s\n"; $avg_re_s += $trie_re_s; $avg_cnt++; # compare results warn "Error: found/re_found array length mismatch\n" unless @found==@re_found; $found[$_] eq $re_found[$_] or warn "Error: element $_ mismatch: found=$found[$_], " ."re_found=$re_found[$_]\n" for 0..$#found; } if ($avg_cnt>1) { $avg_trie_s /= $avg_cnt; $avg_re_s /= $avg_cnt; print "Avg. trie time=$avg_trie_s s, ", "Avg. regex time=$avg_re_s s\n"; } sub trie_search { my ($word) = @_; my @found; my @lett = split //, $word; my $ptr = \%TRIE; # pointer into trie for my $i (0..$#lett) { # for each letter in the word # inspect all other options at letter $i for my $opt (keys %$ptr) { next if $opt eq "\0"; my $tptr = $$ptr{$opt}; # temp pointer # walk through the rest of the letters in the trie $tptr = $$tptr{$lett[$_]} or last for $i+1..$#lett; if ($tptr && $$tptr{"\0"}) { # is this a full word? # swap out the one letter substr(my $match = $word, $i, 1) = $opt; push @found, $match unless $match eq $word; } } $ptr = $$ptr{$lett[$i]}; # walk trie } return @found; } sub re_search { my ($word) = @_; my $re = join '|', # build a regex for this word map {quotemeta(substr($word,0,$_)).'.' .quotemeta(substr($word,$_+1))} 0 .. length($word)-1; $re = qr/^(?:$re)$/; my @found = grep {/$re/ && $_ ne $word} @ALLWORDS; return @found; } sub xor_search { my ($word) = @_; # by AnomalousMonk, http://perlmonks.org/?node_id=1161596 my @found = grep { length($_)==length($word) && ( ($word ^ $_) =~ tr/\x00//c ) == 1 } @ALLWORDS; return @found; }
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^2: Comparing Lines within a Word List
by dominick_t (Acolyte) on Apr 27, 2016 at 16:19 UTC | |
by poj (Abbot) on Apr 27, 2016 at 16:40 UTC | |
by Anonymous Monk on Apr 27, 2016 at 16:40 UTC |