use strict;
use warnings;
use Data::Dumper;

# Count hashtag words per day across a small corpus.
#
# Each corpus entry maps a document name to its raw text.  Hashtag groups are
# the runs of words wrapped in "##...##"; a document's day is taken from the
# first "YYYY-MM-DDT" timestamp found in its text.  The script fills
#   %counts        : $counts{$date}{$word} => occurrences of $word on $date
#   %overallcounts : $overallcounts{$date} => total hashtag words on $date
# and prints %counts with Data::Dumper.
my %mycorpus = (
    text1 => "\nIrrelevant text that I do not need. ##a## gt##a b c## ##a b a c d A b## <97> 164 notes",
    text2 => "p> Irrelevant text that may feature the word soft, softest, or softly. ##a## ##a b a ## <97> 379 notes Irrelevant text.",
    text3 => "\n##C##\n##A##\n##b##\n<97> 180 notes\nIrrelevant text.",
);

my %counts;
my %overallcounts;    # per-day totals, kept for relative-frequency calculations

# Key used for documents that contain no timestamp.  The empty string is what
# an undefined date stringifies to as a hash key, so the shape of %counts for
# undated documents is the same as before, minus the per-word warnings.
my $UNDATED = '';

for my $filename (sort keys %mycorpus) {
    my $text = $mycorpus{$filename};

    # Find the date.  Deliberately no /g: a successful scalar-context //g
    # match leaves pos() set on the string, which would make the hashtag loop
    # below start *after* the date and skip every hashtag that precedes it.
    my ($date) = $text =~ /(\d{4}-\d{2}-\d{2})T/;
    if (!defined $date) {
        warn "No date found in '$filename'; counting its hashtags under the '$UNDATED' key\n";
        $date = $UNDATED;
    }

    # Find only the relevant text.  The quantifier is non-greedy so that each
    # "##...##" group is matched on its own; a greedy ".*" would run from the
    # first "##" to the last "##" on the line and count the words in between.
    while ($text =~ /##(.*?)##/g) {
        my $hashtags = $1;

        # Pull out the words directly (no empty fields to filter) and fold
        # case so that "A" and "a" are tallied together.
        for my $word (map { lc $_ } $hashtags =~ /\w+/g) {
            $counts{$date}{$word}++;
            $overallcounts{$date}++;
        }
    }
}

# Sorted keys make the dump reproducible from run to run.
local $Data::Dumper::Sortkeys = 1;
print Dumper \%counts;
####
$VAR1 = {
'2017-09-04' => {
'b' => 3,
'd' => 1,
'c' => 2,
'a' => 5
},
'2017-09-30' => {
'b' => 2,
'c' => 1,
'a' => 4
}
};
####
$VAR1 = {
'2017-09-04' => {
'b' => 27.27,
'd' => 9.09,
'c' => 18.18,
'a' => 45.45
},
'2017-09-30' => {
'b' => 28.57,
'c' => 14.29,
'a' => 57.14
}
};