#!/usr/bin/perl
use strict;
use warnings;
use Mojo::DOM;

# Counts every distinct word n-gram (six-grams by default) that occurs in the
# visible text of the *.htm 10-K filings found in $path, and writes each
# n-gram together with its frequency to $report_file. The report is ordered by
# ascending frequency, then alphabetically (case-insensitive).

# Number of consecutive words per n-gram. Set to 6 to match the documented
# purpose of the program; change it here to count a different n-gram length.
my $NGRAM_SIZE  = 6;
my $path        = "U:/Perl/risk disclosures";
my $report_file = 'report4.txt';

chdir($path) or die "Cant chdir to $path $!";

# Maps each n-gram (words joined by single spaces) to its occurrence count
# across all files.
my %sequences;

for my $file (glob '*.htm') {
    open(my $fh, '<', $file) or die "Cannot open $file: $!";

    # Slurp the whole document: the HTML parser needs the complete markup in
    # one string, not one line at a time.
    my $html = do { local $/; <$fh> };
    close($fh);

    my $text = Mojo::DOM->new($html // '')->all_text();

    # The sliding window is per file so that an n-gram never straddles the
    # end of one filing and the beginning of the next.
    my @window;

    # split ' ' (unlike split /\s+/) ignores leading whitespace, so no empty
    # pseudo-word is produced when the text starts with blanks.
    for my $word (split ' ', $text) {
        push @window, $word;
        shift @window if @window > $NGRAM_SIZE;
        ++$sequences{"@window"} if @window == $NGRAM_SIZE;
    }
}

# Least frequent first; ties are ordered case-insensitively, with a final
# case-sensitive comparison so the output order is fully deterministic.
my @keys = sort {
    $sequences{$a} <=> $sequences{$b}
        or lc($a) cmp lc($b)
        or $a cmp $b
} keys %sequences;

open(my $report, '>', $report_file) or die "Cannot write $report_file: $!";
for my $ngram (@keys) {
    print {$report} "$ngram \t $sequences{$ngram}\n";
}
# Buffered write errors only surface at close, so check it.
close($report) or die "Cannot close $report_file: $!";