#!/usr/bin/perl -w
#
# Word-pair frequency counter, run as a CGI script.
#
# Reads the input file line by line, strips every character that is
# neither a word character nor whitespace, and counts how often each pair
# of adjacent words occurs (the pairing carries across line boundaries).
# The most frequent pairs are written to the output file, one per line, as
# " <word1> <word2> <count>".  The only thing sent to STDOUT is the CGI
# Content-type header.
#
# Note: markup is not parsed.  Angle brackets and other punctuation are
# simply removed, so whatever word characters they enclosed are counted as
# ordinary words.

use strict;
use warnings;
use Fcntl qw(:flock);

# Maximum number of word pairs written to the output file.
my $MAX_PAIRS = 40;

my $database_file = "/home/virtual/admin16/var/www/cgi-bin/DB_Search/Data_files/top.xml";
my $word_file     = "/home/virtual/admin16/var/www/html/news/wordburst.txt";

print "Content-type: text/html\n\n";

open(my $in, '<', $database_file)
    or die "Cannot read from $database_file: $!\n";

# A shared lock is enough for a reader: it still blocks against writers
# holding an exclusive lock, and unlike LOCK_EX it does not require the
# handle to be open for writing on platforms where flock is emulated.
flock($in, LOCK_SH)
    or die "Cannot lock $database_file: $!\n";

open(my $out, '>', $word_file)
    or die "error opening $word_file $!\n";

# Sentinel so that the very first real word also forms a pair.
my $last_word = "BEGINNING_OF_TEXT";

# Occurrence count per pair, keyed by "first,second".  The comma is a safe
# separator because every non-word character is removed before splitting.
my %pair_count;

while (my $line = <$in>) {
    # Keep only word characters and whitespace.
    $line =~ s/[^\w\s]//g;

    # split ' ' (awk mode) ignores leading whitespace, so indented lines
    # do not produce an empty "word".
    for my $word (split ' ', $line) {
        $pair_count{"$last_word,$word"}++;
        $last_word = $word;
    }
}

# Most frequent pairs first; equal counts are ordered by key so that the
# output is reproducible from run to run.
my @ranked = sort {
    $pair_count{$b} <=> $pair_count{$a}
        or $a cmp $b
} keys %pair_count;

splice(@ranked, $MAX_PAIRS) if @ranked > $MAX_PAIRS;

for my $pair (@ranked) {
    my ($first, $second) = split /,/, $pair;
    print {$out} " $first $second $pair_count{$pair}\n";
}

# Buffered write errors only surface at close, so check it.
close($out) or die "error closing $word_file $!\n";

flock($in, LOCK_UN);
close($in);