use strict;
use warnings;
use Data::Dumper;

# Count, per day, how often each word occurs inside "##...##" hashtag groups
# of every document in the corpus, and dump the resulting nested hash.

# Sample corpus: document name => raw document text.
my %mycorpus = (
    text1 => "\n\nIrrelevant text that I do not need. ##a## gt##a b c## ##a b a c d A b## <97> 164 notes",
    text2 => "p> Irrelevant text that may feature the word soft, softest, or softly. ##a## ##a b a ## <97> 379 notes Irrelevant text.",
    text3 => "\n\n##C## ##A## ##b## <97> 180 notes Irrelevant text.",
);

# Bucket used for documents in which no date stamp can be found, so that the
# counts are never filed under an undefined hash key.
my $UNDATED = 'undated';

my %counts;           # $counts{$date}{$word} = occurrences of $word on $date
my %overallcounts;    # $overallcounts{$date} = total hashtag words on $date
                      # (denominator for a relative frequency per day)

foreach my $filename (sort keys %mycorpus) {
    my $text = $mycorpus{$filename};

    my $date = extract_date($text);
    $date = $UNDATED unless defined $date;

    foreach my $word (extract_hashtag_words($text)) {
        $counts{$date}{$word}++;
        $overallcounts{$date}++;
    }
}

print Dumper \%counts;

# Return the first YYYY-MM-DD date that is immediately followed by "T" in
# $text, or nothing (undef in scalar context) when there is none.
sub extract_date {
    my ($text) = @_;

    # No /g here: a scalar-context /g match would leave pos() set on the
    # string and make a later /g scan start after the date.
    if ($text =~ /(\d{4}-\d{2}-\d{2})T/) {
        return $1;
    }
    return;
}

# Return the lower-cased words found inside every "##...##" group of $text,
# in order of appearance.
sub extract_hashtag_words {
    my ($text) = @_;

    my @words;

    # Non-greedy so each group is matched on its own; a greedy ".*" would
    # run from the first "##" to the last one and pick up the text between
    # groups as well.
    while ($text =~ /##(.*?)##/g) {
        my $group = $1;    # copy before the next match overwrites $1
        push @words, map { lc } $group =~ /(\w+)/g;
    }
    return @words;
}

# Reference output kept from the original listing. It belongs to a corpus
# whose documents carry date stamps; the sample strings above contain none,
# so running this file as-is files everything under the 'undated' key.
#
# Counts per day:
#
# $VAR1 = {
#     '2017-09-04' => { 'b' => 3, 'd' => 1, 'c' => 2, 'a' => 5 },
#     '2017-09-30' => { 'b' => 2, 'c' => 1, 'a' => 4 }
# };
#
# The same counts as a percentage of each day's total (presumably the
# intended relative-frequency output; not produced by this script yet):
#
# $VAR1 = {
#     '2017-09-04' => { 'b' => 27.27, 'd' => 9.09, 'c' => 18.18, 'a' => 45.45 },
#     '2017-09-30' => { 'b' => 28.57, 'c' => 14.29, 'a' => 57.14 }
# };