#!/usr/bin/perl
#
# Report how often each search term occurs in a file of pathway abstracts.
#
# For every term listed in the terms file (one term per line) the script
# prints the number of occurrences found in the abstracts file and the term
# frequency, i.e. occurrences divided by the total number of words in the
# abstracts file.
#
# Usage:
#   perl SCRIPT --pathway_abstracts ABSTRACTS_FILE --terms_file TERMS_FILE

use strict;
use warnings;
use Getopt::Long;

# Run only when executed as a script, so the helper subs below can be loaded
# (e.g. with `require`) without triggering any file I/O.
run() unless caller;

# Entry point: parse the command line, read both input files and print the
# result table to STDOUT. Dies with a usage or I/O message on bad input.
sub run {
    my $pathway_abstracts = "";
    my $terms             = "";

    ##################### GET USER INPUT #####################################
    GetOptions(
        "pathway_abstracts=s" => \$pathway_abstracts,
        "terms_file=s"        => \$terms,
    ) or die usage();

    die usage() if $pathway_abstracts eq "" || $terms eq "";

    ################### STORE IN ARRAYS ######################################
    my @abstract_lines = read_lines($pathway_abstracts);

    # Blank lines in the terms file are not search terms; skip them.
    my @search_terms = grep { length } read_lines($terms);

    # The total word count depends only on the abstracts, so it is computed
    # once rather than being re-accumulated for every search term.
    my $total_words = count_words(\@abstract_lines);

    ################### REPORT ###############################################
    print "Term\t| Number\t| Frequency\n";
    print("----------------------------------------------------\n");

    for my $term (@search_terms) {
        my $occurrences = count_term_occurrences($term, \@abstract_lines);
        my $frequency   = term_frequency($occurrences, $total_words);

        # term, number of term occurrences, term frequency
        print "$term\t $occurrences\t $frequency\n";
    }

    return;
}

# Return the usage message shown when the command line is incomplete.
sub usage {
    return "Usage: $0 --pathway_abstracts FILE --terms_file FILE\n";
}

# Read a text file and return its lines with the line ending removed.
# Dies with the file name and OS error if the file cannot be opened.
sub read_lines {
    my ($path) = @_;

    open(my $fh, '<', $path) or die "Cannot open '$path': $!\n";
    my @lines = <$fh>;
    close($fh);

    # Strip LF as well as CRLF endings so terms from Windows-edited files
    # do not carry a trailing carriage return into the pattern.
    s/\r?\n\z// for @lines;

    return @lines;
}

# Count the whitespace-separated words in the given lines (array reference).
sub count_words {
    my ($lines) = @_;

    my $total = 0;
    for my $line (@{$lines}) {
        # split ' ' skips leading whitespace and collapses runs of
        # whitespace, so no empty "words" are counted.
        my @words = split ' ', $line;
        $total += scalar @words;
    }

    return $total;
}

# Count the occurrences of $term, delimited by word boundaries, in the given
# lines (array reference). The term is matched literally, and against the
# whole line rather than word by word, so multi-word terms can match as long
# as they do not span a line break.
sub count_term_occurrences {
    my ($term, $lines) = @_;

    my $pattern = qr/\b\Q$term\E\b/;

    my $occurrences = 0;
    for my $line (@{$lines}) {
        my @hits = $line =~ /$pattern/g;
        $occurrences += scalar @hits;
    }

    return $occurrences;
}

# Return occurrences / total words, or 0 when there are no words at all
# (avoids a division by zero on an empty abstracts file).
sub term_frequency {
    my ($occurrences, $total_words) = @_;

    return 0 if !$total_words;
    return $occurrences / $total_words;
}