Term Number
----------------------
term1 10
term2 1
my term3 16
####
term1
term2
my term 3
####
#!/usr/bin/perl -w
use warnings;
use strict;
use Getopt::Long;
# Script-wide state. The two path variables are filled in by GetOptions below.
my $terms = "";             # value of --terms_file
my $pathway_abstracts = ""; # value of --pathway_abstracts
my %word_counts;            # not referenced in the visible part of the script
my $count = 0;              # not referenced in the visible part of the script
my %term_score;             # search term => number of occurrences found
my $term_frequency_score = 0; # frequency of the term currently being reported
my $word_number = 0;        # number of words in the abstracts
##################### GET USER INPUT ###########################################
# GetOptions returns false when the command line could not be parsed (for
# example an unknown option); stop with a usage message instead of carrying on
# with half-configured input.
GetOptions(
    "pathway_abstracts=s" => \$pathway_abstracts,
    "terms_file=s"        => \$terms,
) or die "Usage: $0 --pathway_abstracts FILE --terms_file FILE\n";

# Both files are required. Without this check a missing option only shows up
# later as a failure to open an empty file name.
if ($pathway_abstracts eq "" || $terms eq "") {
    die "Usage: $0 --pathway_abstracts FILE --terms_file FILE\n";
}
################### STORE IN ARRAYS ############################################
# Both files are read completely into memory, one array element per line. The
# trailing newline of every line is kept; later code chomps where it needs to.

# store pathway abstract in arrays
# Three-argument open with an explicit read mode, so characters in the file
# name are never interpreted as an open mode.
open(my $abstracts_fh, '<', $pathway_abstracts)
    or die "Cannot open abstracts file '$pathway_abstracts': $!";
my @array_1 = <$abstracts_fh>; # list context: readline returns every line
close($abstracts_fh);

# store terms in array
open(my $terms_fh, '<', $terms)
    or die "Cannot open terms file '$terms': $!";
my @array_2 = <$terms_fh>;
close($terms_fh);
#################### CREATE HASHES OF TERMS ####################################
# Strip the line ending from every term in place, then give each term a
# starting score of 0 with a single hash-slice assignment.
chomp(@array_2);
@term_score{@array_2} = (0) x @array_2;
################################################################################
# Report how often each search term occurs in the abstracts, together with its
# frequency (occurrences divided by the total number of words in the abstracts).
print("Term\t| ");
print("Number\t| ");
print("Frequency\n");
print("----------------------------------------------------\n");

# Work on the abstracts as one string. The lines still carry their newlines, so
# a multi-word term that is wrapped across two lines can still be found.
my $abstract_text = join('', @array_1);

# Count the words once, before the term loop, so every term is divided by the
# same total. split(' ') skips leading whitespace and treats a run of whitespace
# as a single separator, so empty fields are not counted as words.
my @abstract_words = split(' ', $abstract_text);
$word_number = scalar(@abstract_words);

foreach my $phenotype_term (@array_2) # loop through each search term
{
    chomp($phenotype_term); # the loop variable aliases the @array_2 element
    next if $phenotype_term !~ /\S/; # ignore blank lines in the terms file

    # Match the term literally (quotemeta), but allow any run of whitespace
    # between its words so multi-word terms are found as a phrase.
    my $term_pattern = join('\s+', map { quotemeta($_) } split(' ', $phenotype_term));

    # Assigning the match list to () in scalar context yields the match count.
    my $occurrences = () = $abstract_text =~ /\b$term_pattern\b/g;
    $term_score{$phenotype_term} = $occurrences;

    # An empty abstracts file has no words; report 0 rather than divide by zero.
    $term_frequency_score = $word_number ? $occurrences / $word_number : 0;

    print($phenotype_term."\t "); # print the term
    print($term_score{$phenotype_term}."\t "); # print the number of term occurrences
    print($term_frequency_score."\n"); # print the frequency
}