/* -*- mode: c -*- * $Id: wordfreq.gcc,v 1.5 2001/09/18 17:25:18 doug Exp $ * http://www.bagley.org/~doug/shootout/ * Changed by Adrian Merrill 2001/08/22 */ #include #include #include #include #include #include "../../Include/simple_hash.h" #define QUICKIE_STRCMP(a, b) (*(a) != *(b) ? *(a) - *(b) : strcmp((a), (b))) typedef int (*comparator)(const void *, const void *); int cmp_hash(struct ht_node **a, struct ht_node **b) { int val = (*b)->val - (*a)->val; return((val == 0) ? QUICKIE_STRCMP((*b)->key, (*a)->key) : val); } int main() { int readbufsize = 4096; int wordbufsize=16; char *readbuf = (char *)malloc(readbufsize + 1); char *wordbuf = (char *)malloc(wordbufsize + 1); int i = 0; struct ht_ht *ht = ht_create(2048); struct ht_node **sort_array, **sort_tmp, *node; /*new code*/ int nread =0; int wordlen=0; readbuf[0]=0; while (readbuf[i] > 0||(nread = fread(readbuf, sizeof(char), readbufsize, stdin),readbuf[nread] = '\0',i=0,nread > 0) ) { if (isalpha(readbuf[i])){ wordbuf[wordlen++] = tolower(readbuf[i]); if (wordlen == wordbufsize) { wordbufsize *= 2; wordbuf = realloc(wordbuf, wordbufsize + 1); } } else{ if (wordlen > 0) { wordbuf[wordlen] = '\0'; ++(ht_find_new(ht, wordbuf)->val); wordlen = 0; } } i++; } free(readbuf); free(wordbuf); sort_array = sort_tmp = malloc(sizeof(struct ht_node *) * ht_count(ht)); for (node=ht_first(ht); (*sort_tmp++ = node) != 0; node=ht_next(ht)) ; qsort(sort_array, ht_count(ht), sizeof(struct ht_node *), (comparator)cmp_hash); for (i=0; i## // -*- mode: c++ -*- // $Id: wordfreq.g++,v 1.5 2001/07/21 23:51:05 doug Exp $ // http://www.bagley.org/~doug/shootout/ // By Tamás Benkő #include #include #include #include #include #include using namespace std; int const bufsize = 4096; int const wsize = 64; class word_reader { int ws; char buf[bufsize+1], *bptr, *word; FILE *input; bool fill(); public: word_reader(FILE *i): ws(wsize), bptr(buf), word(new char[ws+1]), input(i) {*bptr = *word = '\0';} int operator()(char const **); }; inline bool word_reader::fill() { int nread = fread(buf, sizeof(char), bufsize, input); buf[nread] = '\0'; bptr = buf; return nread > 0; } int word_reader::operator()(char const **w) { int len = 0; char c; while (*bptr || fill()) { if (isalpha(c = *bptr++)) { word[len] = tolower(c); if (++len == ws) { char *nword = new char[(ws *= 2)+1]; memcpy(nword, word, len); delete[] word; word = nword; } } else if (len > 0) break; } *w = word; word[len] = '\0'; return len; } typedef hash_map counter; typedef pair hpair; namespace std { inline bool operator<(hpair const &lhs, hpair const &rhs) { return lhs.second != rhs.second ? lhs.second > rhs.second : strcmp(lhs.first, rhs.first) > 0; } template<> struct equal_to { bool operator()(char const *s1, char const *s2) const {return strcmp(s1, s2) == 0;} }; } int main() { int len; const char *w; counter hist; word_reader wr(stdin); while ((len = wr(&w)) > 0) { counter::iterator i = hist.find(w); if (i == hist.end()) hist[strcpy(new char[len+1], w)] = 1; else ++i->second; } vector v(hist.begin(), hist.end()); sort(v.begin(), v.end()); for (size_t i = 0; i < v.size(); ++i) printf("%7d\t%s\n", v[i].second, v[i].first); return 0; } #### // $Id: wordfreq.java,v 1.3 2000/12/17 21:40:53 doug Exp $ // http://www.bagley.org/~doug/shootout/ // Collection class code is from my friend Phil Chu, Thanks Phil! import java.io.*; import java.util.*; import java.text.*; class Counter { int count = 1; } public class wordfreq { public static void main(String[] args) { wf(); } public static String padleft(String s,int n,char c) { int len = s.length(); if( len>=n ) return s; char[] buf = new char[n]; for( int i=0;i## #!/usr/local/bin/python # $Id: wordfreq.python,v 1.9 2001/05/11 17:44:00 doug Exp $ # http://www.bagley.org/~doug/shootout/ # # adapted from Bill Lear's original python word frequency counter # # Joel Rosdahl suggested using translate table to speed up # word splitting. That change alone sped this program up by # at least a factor of 3. # # with further speedups from Mark Baker import sys def main(): count = {} i_r = map(chr, range(256)) trans = [' '] * 256 o_a, o_z = ord('a'), (ord('z')+1) trans[ord('A'):(ord('Z')+1)] = i_r[o_a:o_z] trans[o_a:o_z] = i_r[o_a:o_z] trans = ''.join(trans) rl = sys.stdin.readlines lines = rl(4095) while lines: for line in lines: for word in line.translate(trans).split(): try: count[word] += 1 except KeyError: count[word] = 1 lines = rl(4095) l = zip(count.values(), count.keys()) l.sort() l.reverse() print '\n'.join(["%7s\t%s" % (count, word) for (count, word) in l]) main() #### -- $Id: wordfreq.ghc,v 1.2 2001/02/27 04:04:35 doug Exp $ -- http://www.bagley.org/~doug/shootout/ -- from Julian Assange -- compile with: -- ghc -O -package data wordfreq.hs -o wordfreq module Main where import List(sortBy) import Char(toLower,isLower) import FiniteMap(fmToList,emptyFM,addToFM_C) main = interact $ unlines . pretty . sort . fmToList . makemap . cwords . lower where pretty l = [pad 7 (show n) ++ "\t" ++ w | (w,n) <- l] where pad n s = replicate (n - length s) ' ' ++ s sort = sortBy (\(w0,n0) (w1,n1) -> case compare n1 n0 of EQ -> compare w1 w0 x -> x) makemap = addFM emptyFM where addFM fm [] = fm addFM fm (x:xs) = addFM (addToFM_C (+) fm x 1) xs cwords s = case dropWhile (not . isLower) s of "" -> [] s' -> w : (cwords s'') where (w, s'') = span isLower s' lower = map toLower #### perl -nle"y/a-zA-Z/ /cs; ++$h{$_} for split }{ print qq[$_:$h{$_}] for sort keys %h" theFile break:1 brief:1 bring:3 brought:2 buffalo:16 burden:1 but:20 by:16 call:2 called:6 came:2 campaign:1 can:36 cannot:2 capable:1 capitals:1 career:2 cart:1 case:2 ...