#!/usr/bin/perl -w
#
# Word-pair ("wordburst") generator.
#
# Emits a CGI header, reads the data file line by line, counts every pair of
# adjacent words, and writes the most frequent pairs to the word file, one
# pair per line in the form " WORD1 WORD2 COUNT".

use strict;
use warnings;
use Fcntl qw(:flock);

print "Content-type: text/html\n\n";

my $databasefile = "/home/virtual/admin16/var/www/cgi-bin/DB_Search/Data_files/top.xml";
my $wordfile = "/home/virtual/admin16/var/www/html/news/wordburst.txt";

# Number of word pairs written to $wordfile.
my $max_pairs = 40;

open(my $in, '<', $databasefile)
    or die "Cannot read from $databasefile: $!";

# The exclusive lock on the input file doubles as a mutex for the whole run:
# a second instance blocks here until this one has finished writing the
# output file.  A failed lock is reported but, as before, does not abort.
flock($in, LOCK_EX)
    or warn "Cannot lock $databasefile: $!\n";

# Opened (and truncated) only after the lock has been requested.
open(my $out, '>', $wordfile)
    or die "error opening $wordfile $!\n";

# Pair counts are keyed by "previous,current".  A comma is a safe separator
# because every non-word character is stripped from the text before it is
# split into words.
my %word_pair_counts;

# Sentinel so that the very first word of the file also forms a pair.
my $lastword = "BEGINNING_OF_TEXT";

while (my $line = <$in>) {
    # We only want to deal with normal words: drop everything that is
    # neither a word character nor whitespace.
    $line =~ s/[^\w\s]//g;

    # split ' ' skips leading whitespace, so indented lines do not yield an
    # empty "word" the way split /\s+/ does.
    for my $word (split ' ', $line) {
        $word_pair_counts{"$lastword,$word"}++;
        # Remember what word we saw last for the next pair.
        $lastword = $word;
    }
}

# Most frequent pairs first; ties are broken alphabetically so the output is
# stable from one run to the next.
my @ranked_pairs = sort {
    $word_pair_counts{$b} <=> $word_pair_counts{$a}
        or $a cmp $b
} keys %word_pair_counts;

# Keep only the top $max_pairs entries.
splice(@ranked_pairs, $max_pairs) if @ranked_pairs > $max_pairs;

for my $pair (@ranked_pairs) {
    my ($word1, $word2) = split /,/, $pair, 2;
    print {$out} " $word1 $word2 $word_pair_counts{$pair}\n"
        or die "error writing $wordfile $!\n";
}

# Flush and close the output while the lock is still held, so that another
# instance cannot start rewriting the file before our data has reached it.
close($out)
    or die "error closing $wordfile $!\n";

flock($in, LOCK_UN)
    or warn "Cannot unlock $databasefile: $!\n";
close($in);