This is the text of the article. Blah blah...
...
####
create table content_index (
    search_term varchar(50),
    doc_id      varchar(30),
    in_title    char(1),      -- 'Y' or 'N'
    in_body     char(1),      -- 'Y' or 'N'
    how_many    integer       -- occurrence count within the document
)
####
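
Each row of this table comes from the indexer below, which emits one comma-separated line per (term, document) pair in exactly this column order, so loading its output is a short script. Here is a minimal loader sketch, assuming DBD::SQLite and a hypothetical database file index.db (swap the DSN for your own database):

####
#!/usr/bin/perl
# Hypothetical loader sketch: bulk-insert the indexer's CSV rows
# into content_index. Assumes DBD::SQLite; index.db is illustrative.
use strict;
use warnings;
use DBI;

my $dbh = DBI->connect( "dbi:SQLite:dbname=index.db", "", "",
                        { RaiseError => 1, AutoCommit => 0 } );
my $sth = $dbh->prepare("insert into content_index values (?,?,?,?,?)");
while (<>) {                            # reading the indexer's CSV output...
    chomp;
    $sth->execute( split /,/, $_, 5 );  # 5 fields, matching the table
}
$dbh->commit;
####

Here is the indexer itself:

####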
#!/usr/bin/perl
use strict;
use warnings;

# Expected record format, one document per line, e.g. (illustrative):
#   id="doc42" title="Some Title" Body text of the document...
local $/ = "\n";
while (<>) { # reading from the corpus text file...
    my %tknhist = ();
    # get the docid and title:
    my ($id,$title) = (/id="(.*?)" title="(.*?)"/);
    next unless defined $id; # skip malformed records
    # isolate the text: everything after the title attribute
    my ($text) = ( m{title=".*?"\s+(.*)}s );
    $text //= ""; # a title-only record has no body
    # downcase, remove punctuation, tokenize, count
    my $in = "ttl";
    for ( $title, $text ) {
        tr/A-Z'".,;:!?#&%$[]()0-9/a-z/d; # downcase; everything from ' on is removed
        for my $tkn ( grep /\w{3,}/, split( /\s+/ )) {
            $tknhist{$in}{$tkn}++; # only count tokens of 3+ characters
        }
        $in = "bdy";
    }
    for my $tkn ( keys %{$tknhist{bdy}} ) {
        my $in = "N,Y"; # "not_in_title,in_body"
        if ( exists( $tknhist{ttl}{$tkn} )) {
            $in =~ s/N/Y/; # this token is in both places
            $tknhist{bdy}{$tkn} += $tknhist{ttl}{$tkn};
            delete $tknhist{ttl}{$tkn};
        }
        print join( ",", $tkn, $id, $in, $tknhist{bdy}{$tkn} ), "\n";
    }
    for my $tkn ( keys %{$tknhist{ttl}} ) { # tokens in title only (if any)
        print join( ",", $tkn, $id, "Y,N", $tknhist{ttl}{$tkn} ), "\n";
    }
}
####
cut -f1 -d, table-data | sort | uniq -c | sort -nr > word.doc-freqs
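####

Since the indexer writes one row per (term, document) pair, counting the first field gives each term's document frequency: the most frequent terms in word.doc-freqs are good candidates for a stoplist. Once the rows are loaded, the table answers searches directly. A minimal query sketch (same hypothetical index.db as above), ranking title matches first and then by occurrence count:

####
#!/usr/bin/perl
# Hypothetical query sketch against content_index: list documents
# containing a term, title matches first, then by frequency.
use strict;
use warnings;
use DBI;

die "usage: $0 term\n" unless @ARGV;
my $term = lc shift;
my $dbh  = DBI->connect( "dbi:SQLite:dbname=index.db", "", "",
                         { RaiseError => 1 } );
my $rows = $dbh->selectall_arrayref(
    "select doc_id, in_title, how_many from content_index
      where search_term = ?
      order by in_title desc, how_many desc", undef, $term );
print join( ",", @$_ ), "\n" for @$rows;
####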