comment on

I used the three word dictionary as you suggested. Dict-compare is only recognizing the last entry in the dictionary.

dict.gz = cat big dog

file.txt = fish bird big lizard mammal

"big" is recognized with -dictionary only when it is listed last.

I've used print statements in the dictionary (sub readdict) to confirm that it is being read correctly as well as in the glossary (sub findwords).

I'm pasting dict.gz and file.txt into my perl directory. Dict.gz is just a one-word-per line text file that is gzipped in gzip's directory and pasted into the perl directory.

Thanks again. If you have a tip on where I should try to debug I'll definitely try it.

#!/usr/bin/perl

#  POD can be found at the bottom of this script

use strict;
use warnings;
use Compress::Zlib;
use Getopt::Long;
use Pod::Usage;

my $VERSION = 0.81;
my $dictfile = 'dict.gz';

#  Process command-line options

my %cl_options = (
    help             =>     '',
    version         =>     '',
    token_debug         =>     '',
    glossary_output     =>     '',
    dictionary_output     =>     ''
);

GetOptions(
        'help|?'      => \$cl_options{help},
        'version'     => \$cl_options{version},
        'man'         => \$cl_options{man},
        'token-debug' => \$cl_options{token_debug}, 
        'glossary'    => \$cl_options{glossary_output},
        'dictionary'  => \$cl_options{dictionary_output}    
);

print "This is version $VERSION of $0.\n" if $cl_options{version};
exit(0) if ($cl_options{version});
pod2usage(-exitstatus => 0, -verbose => 1, -msg => "Help for $0") if $
+cl_options{help};
pod2usage(-exitstatus => 0, -verbose => 2, -msg => "Man page for $0") 
+if $cl_options{man};

my $file = shift;
my %dictionary = readdict(\$dictfile);
my %glossary;

findwords();

printlexicon(\%dictionary) if $cl_options{dictionary_output};
printlexicon(\%glossary) if $cl_options{glossary_output};


#  Readdict reads in the dictionary file defined above using
#  the Compress:Zlib CPAN module.  It returns a hash that is
#  used for all further dictionary operations.
#
sub readdict {
    my $dict = shift;
    my %dicthash;

    my $gz = gzopen($$dict, "rb") or die "Cannot open $$dict: $gzerrno
+\n" ;
    while ($gz->gzreadline($_) > 0) {
        chomp;
        $dicthash{lc($_)} = 0;
        print "Dictionary $_\n";
    }
    die "Error reading from $$dict: $gzerrno\n" if $gzerrno != Z_STREA
+M_END ;
    return %dicthash;
}

#  findwords() reads in a file and compares words found in the file
#  with the contents of the dictionary read in by the readdict
#  function.  It assigns counts to the elements of %dictionary and
#  creates %glossary elements and increases its values according to
#  the number of matches.

sub findwords {
    open my $if, "<", $file || die "Could not open $file: $!";
    while (<$if>) {
        chomp;
    my @elements = split(/[ '-]/,$_); # split on hyphens, too
        foreach my $element (@elements) {
            next if $element =~ /\d/; #  Don't need digits
            print "[$element]->" if $cl_options{token_debug};
            $element = lc($element);
            $element =~ s/[\s,!?._;Ťť)("'-]//g; 
            print "[$element]\n" if $cl_options{token_debug};
            next if $element eq '';
            if ( exists $dictionary{$element} ) {
                $dictionary{$element}++;
            } else {
                $glossary{$element}++;
                print "Text: @elements\n"; 
            }
        }
    }
}

#  Showmatches reads in a lexicon hash via a reference and prints all 
+words out 
#  that have been seen in the findwords() function along with a freque
+ncy count.
#
sub  printlexicon {
    my $lexicon = shift;
    my $counter = 0;
    foreach my $key (sort keys %$lexicon) {
        if ( $$lexicon{$key} > 0 ) {
            print $key . " : " . $$lexicon{$key} . "\n";
            $counter++;
        }
    }
    print "\n$counter entries total\n";
}

__END__
[download]

In reply to Re^4: Dict - Compare by drno
in thread Dict - Compare by drno

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.