Most frequently occurring word    number of times it occurred
Second most frequent word         number of times it occurred
.... ....
.... ....
####
# Tally how many times each word occurs in @words (populated by the
# surrounding code), then report the words from most to least frequent.
# Iterating with `each` would yield the pairs in arbitrary hash order, so the
# keys are sorted by descending count instead; ties are broken alphabetically
# to keep the output reproducible from run to run.
%count = ();
$count{$_}++ for @words;

for my $word ( sort { $count{$b} <=> $count{$a} or $a cmp $b } keys %count ) {
    print "$word => $count{$word}\n";
}
####
the 150
it 85
we 60
are 40
####
# lexicon_generate()
#
# Builds a lexicon (the list of distinct tokens) from the corpus file.
#
# Reads 'tcorpus.txt' from the current working directory, splits every line
# into whitespace-separated tokens, deletes the characters
#     * ( ) " ' ? , . ! ; - and space
# from each token, and writes every distinct resulting token, in order of
# first appearance, to 'tcorpus_unique.txt'.  Comparison is case-sensitive
# ("The" and "the" are distinct entries).
#
# Output format: each token is written preceded by a newline, so the file
# starts with an empty line and has no trailing newline.
#
# NOTE: a token that consists solely of punctuation (e.g. a lone "-") is
# reduced to the empty string and is still written, once, as an empty entry.
#
# Takes no arguments.  Returns 1 on success.  Dies with the file name and
# system error if either file cannot be opened or the output file cannot be
# closed (buffered write errors surface at close time).
sub lexicon_generate
{
    my $corpus_file  = 'tcorpus.txt';
    my $lexicon_file = 'tcorpus_unique.txt';

    # Collect every whitespace-separated token of the corpus.
    open my $corpus_fh, '<', $corpus_file
        or die "Cannot open $corpus_file: $!";
    my @words;
    while ( my $line = <$corpus_fh> ) {
        # split ' ' discards leading/trailing whitespace, including the
        # line terminator, so no separate chomp is needed.
        push @words, split ' ', $line;
    }
    close $corpus_fh;

    # Remove punctuation marks in a single pass.  The trailing '-' is a
    # literal hyphen (a hyphen at the end of a tr list is not a range).
    tr/*()"'?,. !;-//d for @words;

    # Remove duplicate strings, keeping the first occurrence of each.
    my %seen;
    my @uniq = grep { !$seen{$_}++ } @words;

    # Write the lexicon, one entry per line.
    open my $lexicon_fh, '>', $lexicon_file
        or die "Cannot open $lexicon_file: $!";
    print {$lexicon_fh} "\n$_" for @uniq;
    close $lexicon_fh
        or die "Cannot close $lexicon_file: $!";

    return 1;
}
# Script entry point: build the lexicon file from the corpus.
lexicon_generate();