use strict;
use warnings;
use Data::Dumper;

# Count 5-word n-grams found in the comment fields of a small corpus,
# grouped by the date recorded in each corpus entry.
#
# Each corpus value is a string holding one "date:#YYYYMMDD#" field and one
# or more "comment:#word word ...#" fields.

# count_ngrams_by_date(\%corpus)
#
# Takes a hash reference mapping a file name to its record string and
# returns a hash reference of the form { $date => { $ngram => $count } }.
#
# N-grams are extracted from every comment on its own, so an n-gram never
# straddles the boundary between two comments of the same record.
sub count_ngrams_by_date {
    my ($corpus) = @_;

    my %count_for;

    for my $filename ( sort keys %{$corpus} ) {
        my $record = $corpus->{$filename};

        # If a record carries several date fields, the last one wins.
        my @dates = $record =~ /date:#(\d+)#/g;
        if ( !@dates ) {
            warn "No date field found in '$filename'; skipping\n";
            next;
        }
        my $date = $dates[-1];

        # Collect all comments first, then scan each one independently.
        my @comments = $record =~ /comment:#(.*?)#/g;

        for my $comment (@comments) {

            # Consume one word plus its separator, and peek at the next four
            # words with a lookahead so that successive matches overlap
            # (sliding window advancing one word at a time).
            while ( $comment
                =~ m/(\w+) \s (?= (\w+) \s (\w+) \s (\w+) \s (\w+) )/gx )
            {
                my $ngram = "$1 $2 $3 $4 $5";
                $count_for{$date}{$ngram}++;
            }
        }
    }

    return \%count_for;
}

my %mycorpus = (
    a => "date:#20180101# comment:#d1 d2 d3 d4 d5 d6#",
    b => "date:#20180101# comment:#b1 b2 b3 b4 b5 b6 b7# comment:#c1 c2 c3 c4 c5 c6#",
    c => "date:#20180101# comment:#d1 d2 d3 d4 d5 d6#",
);

my %counts = %{ count_ngrams_by_date( \%mycorpus ) };

# Sort hash keys so the dump is reproducible from run to run.
$Data::Dumper::Sortkeys = 1;
print Dumper \%counts;

####
# Expected output:
#
# $VAR1 = {
#           '20180101' => {
#                           'b1 b2 b3 b4 b5' => 1,
#                           'b2 b3 b4 b5 b6' => 1,
#                           'b3 b4 b5 b6 b7' => 1,
#                           'c1 c2 c3 c4 c5' => 1,
#                           'c2 c3 c4 c5 c6' => 1,
#                           'd1 d2 d3 d4 d5' => 2,
#                           'd2 d3 d4 d5 d6' => 2
#                         }
#         };