Anonymous Monk has asked for the wisdom of the Perl Monks concerning the following question:
#! /usr/local/bin/perl -w
#
# Compare a base text file against one or more other files.
#
# Usage: perl <this script> BASEFILE [FILE ...]
#
# BASEFILE is read first and reduced to its set of distinct words.  Every
# FILE (standard input when none is given) is then read through <>, and at
# the end of each one the script prints:
#   * every retained word, as it is read,
#   * the most frequent retained word of that file and its count,
#   * the similarity to BASEFILE, computed as
#         2 * |words(BASE) & words(FILE)| / ( |words(BASE)| + |words(FILE)| )
#
# A "word" is a lower-cased piece obtained by splitting a line on runs of
# non-word characters.  Words listed in the file 'stopwords' (one per line,
# looked up in the current directory) are ignored.

use strict;
use warnings;
use lib qw(.);

# The script has always loaded Lingua::Stem::En, but nothing below calls it.
# Load it softly so that a missing, unused module does not abort the run.
eval { require Lingua::Stem::En; 1 };

# Return the lower-cased words of $line that are not keys of %$stopwords.
sub _words_in {
    my ( $line, $stopwords ) = @_;
    return grep { !exists $stopwords->{$_} }
           map  { lc $_ }
           split /\W+/, $line;
}

my $stopfile = 'stopwords';
my $base     = shift @ARGV;
defined $base
    or die "usage: $0 basefile [file ...]\n";

open my $stop_fh, '<', $stopfile
    or die "Cannot open stopword file '$stopfile': $!\n";
chomp( my @stop = <$stop_fh> );
close $stop_fh;

# Add the empty string '' to the stopwords as well: split produces an empty
# leading field whenever a line starts with a non-word character.
my %stopwords;
@stopwords{ @stop, '' } = ();

# Read in the base file; only the keys of %D1 matter (it is used as a set).
my %D1;
open my $base_fh, '<', $base
    or die "Cannot open base file '$base': $!\n";
while ( my $line = <$base_fh> ) {
    my @words = _words_in( $line, \%stopwords );
    @D1{@words} = ();
}
close $base_fh;

# Word counts for the file currently being read.  Its keys double as the set
# of distinct words of that file for the similarity computation.
my %frequency;

while ( my $line = <> ) {
    for my $word ( _words_in( $line, \%stopwords ) ) {
        $frequency{$word}++;
        print "$word\n";
    }
}
continue {
    # eof without parentheses is true at the end of each individual input
    # file, so the report below is produced once per file.
    if (eof) {
        # Most frequent word of this file; ties go to the alphabetically
        # first word so the output is reproducible.
        my $result = '';
        my $top    = 0;
        for my $word ( sort keys %frequency ) {
            if ( $frequency{$word} > $top ) {
                $result = $word;
                $top    = $frequency{$word};
            }
        }
        print "file $ARGV testing: $result\n";
        print "number of times: $top\n";

        my $total     = scalar( keys %D1 ) + scalar( keys %frequency );
        my $intersect = grep { exists $D1{$_} } keys %frequency;

        # Two empty word sets would divide by zero; report 0 instead.
        my $similarity = $total ? 2 * $intersect / $total : 0;
        print "Similarity between $base and $ARGV = $similarity\n";

        # Start the next file with a clean slate.
        %frequency = ();
    }
}
Edited 2003-03-05 by mirod: added <code> tags
|
|---|