This is the text of the article. Blah blah...
...
####
create table content_index (
    search_term varchar(50),
    doc_id      varchar(30),
    in_title    char(1),      -- 'Y' or 'N'
    in_body     char(1),      -- 'Y' or 'N'
    how_many    integer       -- occurrence count within the document
)
####
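
Each row of this table comes from the indexer below, which emits one comma-separated line per (term, document) pair in exactly this column order, so loading its output is a short script. Here is a minimal loader sketch, assuming DBD::SQLite and a hypothetical database file index.db (swap the DSN for your own database):

####
#!/usr/bin/perl
# Hypothetical loader sketch: bulk-insert the indexer's CSV rows
# into content_index. Assumes DBD::SQLite; index.db is illustrative.
use strict;
use warnings;
use DBI;

my $dbh = DBI->connect( "dbi:SQLite:dbname=index.db", "", "",
                        { RaiseError => 1, AutoCommit => 0 } );
my $sth = $dbh->prepare("insert into content_index values (?,?,?,?,?)");
while (<>) {                            # reading the indexer's CSV output...
    chomp;
    $sth->execute( split /,/, $_, 5 );  # 5 fields, matching the table
}
$dbh->commit;
####

Here is the indexer itself:

####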
#!/usr/bin/perl
use strict;
use warnings;

# Expected record format, one document per line, e.g. (illustrative):
#   id="doc42" title="Some Title" Body text of the document...
local $/ = "\n";
while (<>) { # reading from the corpus text file...
    my %tknhist = ();
    # get the docid and title:
    my ($id,$title) = (/id="(.*?)" title="(.*?)"/);
    next unless defined $id; # skip malformed records
    # isolate the text: everything after the title attribute
    my ($text) = ( m{title=".*?"\s+(.*)}s );
    $text //= ""; # a title-only record has no body
    # downcase, remove punctuation, tokenize, count
    my $in = "ttl";
    for ( $title, $text ) {
        tr/A-Z'".,;:!?#&%$[]()0-9/a-z/d; # downcase; everything from ' on is removed
        for my $tkn ( grep /\w{3,}/, split( /\s+/ )) {
            $tknhist{$in}{$tkn}++; # only count tokens of 3+ characters
        }
        $in = "bdy";
    }
    for my $tkn ( keys %{$tknhist{bdy}} ) {
        my $in = "N,Y"; # "not_in_title,in_body"
        if ( exists( $tknhist{ttl}{$tkn} )) {
            $in =~ s/N/Y/; # this token is in both places
            $tknhist{bdy}{$tkn} += $tknhist{ttl}{$tkn};
            delete $tknhist{ttl}{$tkn};
        }
        print join( ",", $tkn, $id, $in, $tknhist{bdy}{$tkn} ), "\n";
    }
    for my $tkn ( keys %{$tknhist{ttl}} ) { # tokens in title only (if any)
        print join( ",", $tkn, $id, "Y,N", $tknhist{ttl}{$tkn} ), "\n";
    }
}
####
cut -f1 -d, table-data | sort | uniq -c | sort -nr > word.doc-freqs
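####

Since the indexer writes one row per (term, document) pair, counting the first field gives each term's document frequency: the most frequent terms in word.doc-freqs are good candidates for a stoplist. Once the rows are loaded, the table answers searches directly. A minimal query sketch (same hypothetical index.db as above), ranking title matches first and then by occurrence count:

####
#!/usr/bin/perl
# Hypothetical query sketch against content_index: list documents
# containing a term, title matches first, then by frequency.
use strict;
use warnings;
use DBI;

die "usage: $0 term\n" unless @ARGV;
my $term = lc shift;
my $dbh  = DBI->connect( "dbi:SQLite:dbname=index.db", "", "",
                         { RaiseError => 1 } );
my $rows = $dbh->selectall_arrayref(
    "select doc_id, in_title, how_many from content_index
      where search_term = ?
      order by in_title desc, how_many desc", undef, $term );
print join( ",", @$_ ), "\n" for @$rows;
####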