use strict;
use warnings;

use File::Find::Rule;
use List::MoreUtils qw(all);
use Number::Format qw(format_number);
use Regexp::Assemble;

### Overview:
###   1. Read the system word list and tally every letter trio (trigram).
###   2. Keep only the trigrams whose count is at least 1% of the word total
###      and assemble them into a single regular expression.
###   3. Walk /data and /tmp and print every file whose name passes the
###      "temporary file" patterns, contains no dictionary word, and
###      contains none of the kept trigrams.

my %ngram;        ### trigram => number of occurrences
my %dict;         ### word (4+ letters) => number of times seen
my $total = 0;    ### number of words accepted from the word list

### Print one "trigram: count (percent%)" row per entry of the given tally,
### most frequent first.
###   $count_for  - hashref of trigram => count
###   $word_total - number of words the percentages are relative to
sub print_ngram_table {
    my ($count_for, $word_total) = @_;

    my @by_frequency
        = sort { $count_for->{$b} <=> $count_for->{$a} } keys %{$count_for};

    for my $ngram (@by_frequency) {
        my $percentage
            = format_number(($count_for->{$ngram} / $word_total) * 100, 1, 1);
        printf "%3s: %4s (%4s%%)\n",
            $ngram, format_number($count_for->{$ngram}), $percentage;
    }

    return;
}

### Which dictionary?  'Tis set up for testing at home and work.
my $uname = `uname -a`;
$uname = '' unless defined $uname;
chomp $uname;

my $dict_path
    = $uname =~ /debian/i ? '/usr/share/dict/american-english'
    : $uname =~ /SunOS/i  ? '/usr/share/lib/dict/words'
    :                       undef
    ;

### Fail with a readable message instead of letting open() choke on undef.
die "No known dictionary location for this platform (uname: '$uname')\n"
    unless defined $dict_path;

### Gather ngrams.
open my $DICT, '<', $dict_path or die "Cannot open '$dict_path': $!";

while (my $word = <$DICT>) {
    chomp $word;

    ### Only allow words that begin with a lowercase letter,
    ### contain only letters (no hyphens, quotes, etc.),
    ### and have 3 or more letters.
    next unless $word =~ m/\A[a-z][A-Za-z]+\z/ && length($word) >= 3;

    print "$word\n";

    ### Gather letter trios (ngrams, or, more specifically, trigrams).
    my @ngrams = map { substr($word, $_, 3) } 0 .. length($word) - 3;

    ### Tally.
    ++$ngram{$_} for @ngrams;
    ++$total;

    ### Only add 4+ lengths to the dictionary--many temps were matching lengths of 3.
    ++$dict{$word} if length($word) >= 4;
}

close $DICT or die "Cannot close '$dict_path': $!";

print "\n";
print 'Total words: ', format_number($total), "\n";

### Show the results sorted by occurrence and remove those less than 1%.
print "All:\n";
print_ngram_table(\%ngram, $total);

### The filter uses the raw ratio, not the formatted display string: the
### string is meant for humans and is not a reliable operand for '<'.
### (keys() returns a complete list up front, so deleting here is safe.)
for my $ngram (keys %ngram) {
    delete $ngram{$ngram} if ($ngram{$ngram} / $total) * 100 < 1;
}
print "\n";

print "Keepers:\n";
print_ngram_table(\%ngram, $total);
print "\n";

### Build an RE based on the ngrams.
my $ra = Regexp::Assemble->new;
$ra->add($_) for keys %ngram;

### Fetch the assembled pattern once; it is reused for every file below.
my $ngram_re = $ra->re;
print $ngram_re, "\n";

### Files must match these to be considered temporary.
my @REs = (
    ### Lower/upper case letters not in the extension.
    qr/\A[^.]+[a-z]/,
    qr/\A[^.]+[A-Z]/,

    ### Digit.
    qr/\d/,

    ### Name only contains upper/lower case letters or digits; ext. optional.
    qr/\A[a-zA-Z\d]+(?:\.[a-zA-Z]{1,4})?\z/,
);

File::Find::Rule->file
    ->exec(
        sub {
            my $file = $_;

            ### Test for REs, words, then ngrams.
            return unless all { $file =~ $_ } @REs;

            for my $chunk ($file =~ /([A-Za-z][a-z]+|[A-Z]+)/g) {
                if (exists $dict{lc $chunk}) {
                    print "\tSkipping '$file' due to presence of '$chunk'\n";
                    return;
                }
            }

            ### Parenthesise lc(): '=~' binds tighter than a named unary
            ### operator, so "lc $file =~ ..." would match the original
            ### mixed-case name and lowercase only the match result.
            return if lc($file) =~ $ngram_re;

            print "$file\n";
            return 1;
        }
    )
    ->in(qw(/data /tmp));