Term        Number
----------------------
term1       10
term2       1
my term3    16
##

##

term1
term2
my term 3

##

##

#!/usr/bin/perl -w

use warnings;
use strict;
use Getopt::Long;

# Command-line inputs: paths of the search-terms file and the abstracts file.
my $terms = "";
my $pathway_abstracts = "";

# Working state used by the rest of the script.
my %word_counts;                 # not referenced in the visible part of the script
my $count = 0;                   # not referenced in the visible part of the script
my %term_score;                  # search term => number of occurrences found
my $term_frequency_score = 0;    # occurrences divided by the word count
my $word_number = 0;             # word count of the abstracts (frequency denominator)


##################### GET USER INPUT ###########################################

my $usage = "Usage: $0 --pathway_abstracts FILE --terms_file FILE\n";

# GetOptions returns false when the command line could not be parsed.
GetOptions( "pathway_abstracts=s" => \$pathway_abstracts,
            "terms_file=s" => \$terms
          )
  or die $usage;

# Both files are required; fail early with a usage message instead of letting
# open() fail later on an empty file name.
die $usage if $pathway_abstracts eq "" || $terms eq "";


################### STORE IN ARRAYS ############################################

# Return all lines of the file at $path, line endings intact.
# Dies with the file name and the OS error if the file cannot be opened.
sub read_lines
{
  my ($path) = @_;

  # Three-argument open with a lexical handle: the file name can never be
  # interpreted as an open mode, and the handle is scoped to this sub.
  open(my $fh, '<', $path) or die "Cannot open '$path': $!\n";
  my @lines = <$fh>;
  close($fh);

  return @lines;
}

# store pathway abstract in arrays
my @array_1 = read_lines($pathway_abstracts);

# store terms in array
my @array_2 = read_lines($terms);


#################### CREATE HASHES OF TERMS ####################################
# Strip the trailing newline from every term in place, then register each term
# in %term_score with a starting score of 0.
chomp(@array_2);
@term_score{@array_2} = (0) x @array_2;


################################################################################
# For every search term, print how many times it occurs in the abstracts and
# that count divided by the total number of words in the abstracts.

print("Term\t| ");
print("Number\t| ");
print("Frequency\n");
print("----------------------------------------------------\n");

# The total word count does not depend on the search term, so compute it once.
# Accumulating it inside the term loop made the denominator grow with every
# term and skewed every frequency after the first.
$word_number = count_words(\@array_1);

foreach my $phenotype_term (@array_2)        # loop through each search term
{
  chomp($phenotype_term);
  next if $phenotype_term !~ /\S/;           # ignore blank lines in the terms file

  # Assign (rather than increment) so a term listed twice is not double-counted.
  $term_score{$phenotype_term} = count_term_occurrences($phenotype_term, \@array_1);

  # An empty abstracts file has no words; report 0 instead of dividing by zero.
  $term_frequency_score = $word_number
                        ? $term_score{$phenotype_term} / $word_number
                        : 0;

  print($phenotype_term."\t  ");                   # print the term
  print($term_score{$phenotype_term}."\t  ");      # print the number of term occurrences
  print($term_frequency_score."\n");               # print the frequency
}


# Return the total number of whitespace-separated words in the lines referenced
# by $lines. split(' ') ignores leading whitespace and treats runs of
# whitespace as one separator, so blank fields are never counted as words.
sub count_words
{
  my ($lines) = @_;

  my $total = 0;
  foreach my $line (@$lines)
  {
    my @words = split(' ', $line);
    $total += scalar(@words);
  }

  return $total;
}


# Return how many times $term occurs in the lines referenced by $lines.
#
# The term is matched against each whole line, not against single words, so
# multi-word terms such as "my term 3" are found. Any run of whitespace inside
# the term matches any run of whitespace in the text. Matching is
# case-sensitive, and a match must not be directly preceded or followed by a
# word character, so "term1" does not match inside "term10". A term that is
# split across two lines is not counted. An empty or all-whitespace term
# yields 0.
sub count_term_occurrences
{
  my ($term, $lines) = @_;

  my @parts = split(' ', $term);
  return 0 if !@parts;

  # quotemeta keeps characters such as "+" or "." in a term literal.
  my $phrase  = join('\s+', map { quotemeta($_) } @parts);
  my $pattern = qr/(?<!\w)$phrase(?!\w)/;

  my $total = 0;
  foreach my $line (@$lines)
  {
    # A global match in list context returns every match; assigning that list
    # to a scalar through "() =" yields the number of matches.
    my $hits = () = $line =~ /$pattern/g;
    $total += $hits;
  }

  return $total;
}