#!/usr/bin/perl
# POD can be found at the bottom of this script.

use strict;
use warnings;

use Compress::Zlib;
use Getopt::Long;
use Pod::Usage;

my $VERSION  = 0.81;
my $dictfile = 'dict.gz';

# Process command-line options.
my %cl_options = (
    help              => '',
    version           => '',
    man               => '',
    token_debug       => '',
    glossary_output   => '',
    dictionary_output => '',
);

# GetOptions returns false after reporting an unknown or malformed option;
# stop with a usage message instead of carrying on with a bad command line.
GetOptions(
    'help|?'      => \$cl_options{help},
    'version'     => \$cl_options{version},
    'man'         => \$cl_options{man},
    'token-debug' => \$cl_options{token_debug},
    'glossary'    => \$cl_options{glossary_output},
    'dictionary'  => \$cl_options{dictionary_output},
) or pod2usage(2);

if ( $cl_options{version} ) {
    print "This is version $VERSION of $0.\n";
    exit(0);
}

pod2usage(-exitstatus => 0, -verbose => 1, -msg => "Help for $0")
    if $cl_options{help};
pod2usage(-exitstatus => 0, -verbose => 2, -msg => "Man page for $0")
    if $cl_options{man};

# The first remaining argument is the text file to analyse.  findwords()
# always opens it, so a missing argument is a usage error.
my $file = shift;
pod2usage(-exitstatus => 2, -verbose => 1, -msg => "No input file given")
    unless defined $file;

my %dictionary = readdict(\$dictfile);
my %glossary;

findwords();

printlexicon(\%dictionary) if $cl_options{dictionary_output};
printlexicon(\%glossary)   if $cl_options{glossary_output};

# readdict() reads the gzip-compressed dictionary file using the
# Compress::Zlib module.
#
#   $dict - reference to a scalar holding the path of the dictionary file
#
# Returns a hash (as a flat key/value list) with one key per line of the
# file, lower-cased, each mapped to a count of 0.  Dies if the file cannot
# be opened or if reading does not finish with Z_STREAM_END.
#
sub readdict {
    my $dict = shift;
    my %dicthash;

    my $gz = gzopen($$dict, "rb")
        or die "Cannot open $$dict: $gzerrno\n";

    # Read into a lexical rather than $_ so the caller's $_ is left alone.
    my $line;
    while ( $gz->gzreadline($line) > 0 ) {
        chomp $line;
        $dicthash{ lc($line) } = 0;

        # NOTE(review): printed for every dictionary line regardless of any
        # option; looks like leftover trace output -- confirm before gating.
        print "Dictionary $line\n";
    }

    # Any status other than Z_STREAM_END after the loop is a read error.
    die "Error reading from $$dict: $gzerrno\n"
        if $gzerrno != Z_STREAM_END;

    $gz->gzclose();

    return %dicthash;
}

# findwords() reads the input file and compares the words found in it with
# the dictionary loaded by readdict().  Words present in %dictionary have
# their count incremented there; every other word is counted in %glossary.
sub findwords {
    # Takes no arguments: works on the file-level $file, %dictionary,
    # %glossary and %cl_options.  Dies if $file cannot be opened.

    # Low-precedence `or` so the die guards the open() call itself; with
    # `||` the die was attached to $file and a failed open went unnoticed.
    open my $if, "<", $file or die "Could not open $file: $!";

    while ( my $line = <$if> ) {
        chomp $line;
        my @elements = split( /[ '-]/, $line );    # split on hyphens, too

        # $element aliases the entries of @elements, so the normalisation
        # below is visible in the "Text:" line printed further down.
        foreach my $element (@elements) {
            next if $element =~ /\d/;              # Don't need digits

            print "[$element]->" if $cl_options{token_debug};
            $element = lc($element);
            $element =~ s/[\s,!?._;«»)("'-]//g;    # strip whitespace/punctuation
            print "[$element]\n" if $cl_options{token_debug};

            next if $element eq '';

            if ( exists $dictionary{$element} ) {
                $dictionary{$element}++;
            }
            else {
                $glossary{$element}++;

                # NOTE(review): printed for every unknown word regardless of
                # any option; looks like leftover trace output -- confirm.
                print "Text: @elements\n";
            }
        }
    }

    close $if;
    return;
}

# printlexicon() takes a reference to a lexicon hash (word => count) and
# prints, in sorted key order, every word whose count is greater than zero
# as "<word> : <count>", followed by the number of entries printed.
#
sub printlexicon {
    my $lexicon = shift;
    my $counter = 0;

    foreach my $key ( sort keys %$lexicon ) {
        next unless $lexicon->{$key} > 0;
        print $key . " : " . $lexicon->{$key} . "\n";
        $counter++;
    }

    print "\n$counter entries total\n";
    return;
}

__END__