#!/usr/bin/perl
# Find most frequent signals amidst political noise.
#
# Reads text (originally a single file of State of the Union addresses),
# strips markup entities and punctuation, discards common "noise" words,
# and prints every remaining word with its occurrence count and its share
# of all counted words, most frequent first.
#
# Usage: perl THIS_SCRIPT [FILE]    (reads STDIN when FILE is omitted)
#
# NOTE(review): the original open()/while() statement was mangled in the
# copy this was recovered from (the input filename was lost), so the input
# now comes from the first command-line argument or STDIN -- confirm this
# matches how the script is meant to be invoked.
use strict;
use warnings;

$| = 1;    # unbuffered output, so results appear as they are printed

# Common English words that carry no signal; they are never counted.
my @noise = qw/
    a about after all also among an and any are as at be been before
    being between both but by can could during each every for from
    further had has have his i if in into is it its made many may more
    most must my not of on only or other our over own shall should since
    so some still such than that the their them there these they this
    those through to under upon us was we well were what when where
    which while who will with within without would you your
/;
my %stop;
@stop{@noise} = (1) x @noise;

# Only run when executed as a script, so the subs can be loaded on their own.
main(@ARGV) unless caller;

# Turn one line of text into the list of lower-cased words worth counting.
#   $line   - raw input line (trailing newline allowed)
#   returns - list of words containing only a-z, with noise words removed
sub extract_words {
    my ($line) = @_;

    # Remove character entities such as &amp; or &#8212;.  The pattern is
    # deliberately tight: a lazy "&.*?;" would also swallow ordinary words
    # between a stray ampersand and a later semicolon.
    $line =~ s/&#?\w+;//g;

    # Drop everything that is neither a letter nor whitespace, so that
    # "people's" counts as "peoples" and "well-known" as "wellknown".
    $line =~ s/[^A-Za-z\s]//g;

    # split ' ' ignores leading/trailing/repeated whitespace, so no empty
    # words are produced.
    return grep { !$stop{$_} } map { lc } split ' ', $line;
}

# Count the words read from an open filehandle.
#   $fh     - readable filehandle
#   returns - (hashref of word => count, total number of counted words)
sub tally_words {
    my ($fh) = @_;

    my %count;
    my $total = 0;
    while (my $line = <$fh>) {
        for my $word (extract_words($line)) {
            $count{$word}++;
            $total++;
        }
    }
    return (\%count, $total);
}

# Build the report lines, highest count first.
#   $count  - hashref of word => count
#   $total  - total number of counted words (sum of all counts)
#   returns - list of lines formatted as "COUNT :WORD (PERCENT %)\n"
sub format_report {
    my ($count, $total) = @_;

    # Equal counts are ordered alphabetically so the output is reproducible
    # from run to run (hash key order alone is not).
    my @ranked = sort { $count->{$b} <=> $count->{$a} || $a cmp $b }
                 keys %$count;

    return map {
        # Percentage truncated (not rounded) to two decimal places.
        my $percent = int(10000 * $count->{$_} / $total) / 100;
        "$count->{$_} :$_ ($percent \%)\n";
    } @ranked;
}

# Entry point: read FILE (or STDIN), count the words, print the report.
#   $path - optional name of the input file
# Dies if the file cannot be opened.
sub main {
    my ($path) = @_;

    my $in;
    if (defined $path) {
        open $in, '<', $path or die "Cannot open '$path': $!\n";
    }
    else {
        $in = \*STDIN;
    }

    my ($count, $total) = tally_words($in);
    print format_report($count, $total);

    close $in if defined $path;
    return;
}

####
# Sample output (beginning of the list) from the original run:
#   76 :applause (3.46 %)
#   33 :america (1.5 %)
#   19 :security (0.86 %)
#   17 :world (0.77 %)
#   15 :american (0.68 %)
#   14 :terror (0.63 %)
#   13 :good (0.59 %)
#   13 :new (0.59 %)
#   12 :people (0.54 %)
#   12 :weapons (0.54 %)
#   12 :war (0.54 %)