In particular, this doesn't sort by time domain, so it can't do a true 'burst' analysis as new words enter the conversation pool. There are likely CPAN modules that would shorten (Text::Stem, Text::Scan, Text::Document, etc.) or harden (SpamAssassin) the code; a stemming sketch follows the frequency counts below.

#!/usr/bin/perl
# Find most frequent signals amidst political noise
use strict;
use warnings;

$| = 1;

# common function words to ignore
my @noise = qw/
    the of and to in a that for be is our by it which as this with have we
    has i will are on been not their from at all an its or was but should
    they these such can upon other so them may any made must than there were
    under those who if only us his my most had into every some between during
    shall when own more would you without many also over before well what
    while through both within being your could about each where still among
    after since further
/;
my %stop = map { $_ => 1 } @noise;    # hash lookup avoids substring mismatches

# single file of State of the Union addresses
open(my $in, '<', 'soufile.txt') or die "soufile.txt: $!";

my (%count, $total);
while (<$in>) {
    chomp;
    # clean left-overs
    s/&(.*?);//g;         # drop HTML entities before stripping punctuation
    s/[^A-Za-z\s]//g;     # keep only letters and whitespace
    s/^\s+|\s+$//g;
    s/\s+/ /g;
    for my $word (map { lc } split / /) {
        next unless length $word;
        $count{$word}++;  # count every occurrence, including the first
        $total++;
    }
}
close $in;

# build "count : word (percent)" lines for everything that isn't noise
my @freq;
for my $word (sort { $count{$b} <=> $count{$a} } keys %count) {
    next if $stop{$word};
    my $percent = sprintf '%.2f', 100 * $count{$word} / $total;
    push @freq, "$count{$word} : $word ($percent) %";
}

# histogram: print the 100 most frequent non-noise words
for my $i (0 .. 99) {
    last if $i > $#freq;
    print "$freq[$i]\n";
}
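The single concatenated soufile.txt carries no dates, so the script above can't tell when a word first entered the pool. A minimal sketch of the time-domain bucketing, under the assumption (hypothetical) that each address lives in its own file named by year, e.g. sou/1790.txt; the directory layout, file names, and target decade are all illustrative:

#!/usr/bin/perl
use strict;
use warnings;

my %by_decade;     # decade => { word => count }
my %first_seen;    # word   => earliest decade it appears in

for my $file (glob 'sou/*.txt') {
    my ($year) = $file =~ /(\d{4})/ or next;
    my $decade = $year - $year % 10;
    open my $in, '<', $file or die "$file: $!";
    while (<$in>) {
        s/[^A-Za-z\s]//g;
        for my $word (map { lc } split ' ') {
            $by_decade{$decade}{$word}++;
            $first_seen{$word} = $decade
                if !exists $first_seen{$word} || $decade < $first_seen{$word};
        }
    }
    close $in;
}

# "burst" report: words that first appear in a chosen decade, ranked by count
my $target = 1930;    # hypothetical decade of interest
my @new_words = grep { $first_seen{$_} == $target } keys %first_seen;
for my $word (sort { $by_decade{$target}{$b} <=> $by_decade{$target}{$a} } @new_words) {
    print "$by_decade{$target}{$word} : $word\n";
}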
The top ten terms overall would be:

count : word (%)
6174 : government (0.39) %
5564 : states (0.35) %
4524 : congress (0.29) %
4247 : united (0.27) %
3639 : year (0.23) %
3379 : people (0.21) %
2845 : great (0.18) %
2806 : country (0.18) %
2754 : now (0.17) %
2703 : public (0.17) %
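Those counts treat inflected forms as separate words ('states' vs. 'state', 'year' vs. 'years'). One of the stemming modules mentioned earlier could fold such forms together before counting. A minimal sketch, assuming Lingua::Stem from CPAN rather than Text::Stem (Lingua::Stem is the stemmer I can vouch for; the sample word list is illustrative only):

use strict;
use warnings;
use Lingua::Stem qw(stem);    # CPAN module; stem() returns an array ref of stems

# illustrative words only -- in the script above these would be the
# cleaned, lowercased tokens from soufile.txt
my @words = qw(state states government governments year years);

my $stems = stem(@words);     # e.g. "states" and "state" both reduce to "state"

my %count;
$count{$_}++ for @$stems;
print "$count{$_} : $_\n" for sort { $count{$b} <=> $count{$a} } keys %count;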