Each record in the corpus file is one document wrapped in a <doc> element, with the document ID and title carried as attributes:

####
<doc id="..." title="...">
This is the text of the article. Blah blah...
...
</doc>
####

Every distinct word becomes one row in the table below, recording the document it came from, whether it appeared in the title, the body, or both, and how many times:

####
create table content_index (
    search_term varchar(50),
    doc_id      varchar(30),
    in_title    char(1),
    in_body     char(1),
    how_many    integer
)
####

The script reads the corpus one <doc> record at a time and prints one comma-separated row per distinct token:

####
#!/usr/bin/perl
local $/ = "</doc>\n";  # read one whole <doc> record at a time
while (<>) {            # reading from the corpus text file...
    my %tknhist = ();
    # get the doc id and title:
    my ($id, $title) = (/id="(.*?)" title="(.*?)"/);
    # isolate the body text between the tags:
    my ($text) = ( m{<doc.*?>\s+(.*?)</doc>}s );
    # downcase, strip punctuation and digits, tokenize, count
    my $in = "ttl";
    for ( $title, $text ) {
        tr/A-Z'".,;:!?#&%$[]()0-9/a-z/d;  # everything from ' on is removed
        for my $tkn ( grep /\w{3,}/, split( /\s+/ ) ) {
            $tknhist{$in}{$tkn}++;        # only count words of three or more letters
        }
        $in = "bdy";
    }
    for my $tkn ( keys %{ $tknhist{bdy} } ) {
        my $in = "N,Y";  # "not_in_title,in_body"
        if ( exists( $tknhist{ttl}{$tkn} ) ) {
            $in =~ s/N/Y/;  # this token is in both places
            $tknhist{bdy}{$tkn} += $tknhist{ttl}{$tkn};
            delete $tknhist{ttl}{$tkn};
        }
        print join( ",", $tkn, $id, $in, $tknhist{bdy}{$tkn} ), "\n";
    }
    for my $tkn ( keys %{ $tknhist{ttl} } ) {  # tokens in title only (if any)
        print join( ",", $tkn, $id, "Y,N", $tknhist{ttl}{$tkn} ), "\n";
    }
}
####

Because the first comma-separated field is the search term, a word/document-frequency list (how many documents each word occurs in) falls out of the standard shell tools:

####
cut -f1 -d, table-data | sort | uniq -c | sort -nr > word.doc-freqs
####
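That pipeline assumes the script's output has been captured in a file named table-data, for instance like this (mkindex.pl and corpus.txt are placeholder names, not from the article):

####
perl mkindex.pl corpus.txt > table-data
####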
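To show how the CSV output and the content_index table fit together, here is a minimal loader-and-lookup sketch using DBI; the index.db file name and the choice of DBD::SQLite are illustrative assumptions, not something the article specifies:

####
#!/usr/bin/perl
use strict;
use warnings;
use DBI;

# hypothetical SQLite file; assumes content_index was created as above
my $dbh = DBI->connect( "dbi:SQLite:dbname=index.db", "", "",
                        { RaiseError => 1, AutoCommit => 0 } );

# bulk-load the indexer's comma-separated output
my $ins = $dbh->prepare("insert into content_index values (?,?,?,?,?)");
open my $fh, "<", "table-data" or die "table-data: $!";
while (<$fh>) {
    chomp;
    $ins->execute( split /,/ );  # search_term, doc_id, in_title, in_body, how_many
}
close $fh;
$dbh->commit;

# sample lookup: documents matching a term, title hits first, then by count
die "usage: $0 term\n" unless @ARGV;
my $sel = $dbh->prepare(
    "select doc_id, in_title, how_many from content_index
      where search_term = ? order by in_title desc, how_many desc" );
$sel->execute( lc $ARGV[0] );  # terms were lowercased at index time
while ( my @row = $sel->fetchrow_array ) {
    print join( "\t", @row ), "\n";
}
$dbh->disconnect;
####

Since in_title holds 'Y' or 'N', sorting it descending is a cheap way to list title matches ahead of body-only matches; ranking beyond that is a policy choice, not something the schema dictates.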
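One design note: the article never indexes content_index itself, so each per-term lookup scans the whole table. If lookup speed matters, the conventional fix (my suggestion, not the article's) is a database index on the term column:

####
create index content_index_term on content_index ( search_term )
####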