use File::Find::Rule;
use List::MoreUtils qw(all);
use Number::Format qw(format_number);
use Regexp::Assemble;

### Trigram => number of times it occurs in the accepted dictionary words.
my %ngram;
### Accepted dictionary words of 4+ letters, used for whole-word lookups.
my %dict;
### Number of dictionary words accepted; starts at 0 so it is always a number.
my $total = 0;

### Which dictionary? 'Tis set up for testing at home and work.
my $uname = `uname -a`;
### Backticks yield undef when the command cannot be run at all.
$uname = '' unless defined $uname;
my $dict = $uname =~ /debian/i ? '/usr/share/dict/american-english' :
		$uname =~ /SunOS/i ? '/usr/share/lib/dict/words' :
		undef ;

### Stop on unrecognised platforms: a three-argument open() on an undef
### file name does not fail, it opens an empty anonymous temporary file,
### and the rest of the script would silently run with no words at all.
unless (defined $dict) {
	(my $platform = $uname) =~ s/\s+\z//;
	die "No dictionary configured for this platform (uname -a: '$platform')\n";
}

### Gather ngrams.
### Reads the word list named by $dict line by line, echoes each accepted
### word, counts its trigrams in %ngram, counts accepted words in $total
### and records the longer words in %dict.
open my $DICT, '<', $dict or die "Cannot open dictionary '$dict': $!";
while (my $word = <$DICT>) {
	chomp $word;
	### Only allow words that begin with a lowercase letter,
	### contain only letters (no hyphens, quotes, etc.),
	### and have 3 or more letters.
	next unless $word =~ m/\A[a-z][A-Za-z]+\z/ && length($word) >= 3;
	print "$word\n";
	### Tally every letter trio (ngram, or, more specifically, trigram):
	### one per starting offset 0 .. length - 3.
	++$ngram{ substr($word, $_, 3) } for 0 .. length($word) - 3;
	++$total;
	### Only add 4+ lengths to the dictionary--many temps were matching lengths of 3.
	### The key is lower-cased because the lookup lower-cases its candidate
	### ($dict{lc ...}); a key stored with a capital in it could never be found.
	++$dict{lc $word} if length($word) >= 4;
}
close $DICT;
print "\n";
print 'Total words: ', format_number($total), "\n";

### Print one table row for a trigram (trigram, formatted count, percentage
### with one decimal) and return the unformatted percentage, i.e. the
### trigram's count relative to the number of accepted words.
my $print_ngram_row = sub {
	my ($trigram) = @_;
	my $count = $ngram{$trigram};
	my $percent = ($count / $total) * 100;
	printf "%3s: %4s (%4s%%)\n", $trigram, format_number($count), format_number($percent, 1, 1);
	return $percent;
};

### Show the results sorted by occurrence and remove those less than 1%.
### The threshold is tested on the raw number, not on the formatted text,
### so it does not depend on display rounding or on the separators that
### format_number() inserts.
print "All:\n";
for my $trigram (sort {$ngram{$b} <=> $ngram{$a}} keys %ngram) {
	my $percent = $print_ngram_row->($trigram);
	### Deleting is safe here: the loop walks a list built before the loop started.
	delete $ngram{$trigram} if $percent < 1;
}
print "\n";

### Show what survived the cut, again most frequent first.
print "Keepers:\n";
for my $trigram (sort {$ngram{$b} <=> $ngram{$a}} keys %ngram) {
	$print_ngram_row->($trigram);
}
print "\n";

### Fold every remaining trigram into a single assembled pattern
### and print that pattern.
my $ra = Regexp::Assemble->new;
$ra->add(keys %ngram);
print $ra->re, "\n";

### A file name is only considered temporary when it matches all of these.

### A lowercase letter before the first dot, preceded by at least one character.
my $lower_before_dot = qr/\A[^.]+[a-z]/;
### An uppercase letter before the first dot, preceded by at least one character.
my $upper_before_dot = qr/\A[^.]+[A-Z]/;
### A digit anywhere in the name.
my $any_digit = qr/\d/;
### Nothing but letters and digits, optionally followed by a 1-4 letter extension.
my $alnum_with_optional_ext = qr/\A[a-zA-Z\d]+(?:\.[a-zA-Z]{1,4})?\z/;

my @REs = (
	$lower_before_dot,
	$upper_before_dot,
	$any_digit,
	$alnum_with_optional_ext,
);

### Fetch the assembled trigram pattern once rather than on every file.
my $ngram_re = $ra->re;

### Walk /data and /tmp and print every plain file whose name looks
### temporary: it matches all of @REs, contains no known dictionary word
### and contains none of the common trigrams. Names rejected because of a
### dictionary word are reported with a "Skipping" line; all other
### rejections are silent.
File::Find::Rule->file
	->exec(
		sub {
			my $file = $_;
			### Test for REs, words, then ngrams.
			return unless all { $file =~ $_ } @REs;
			### Candidate words: a letter followed by lowercase letters,
			### or a run of capitals.
			for my $word ($file =~ /([A-Za-z][a-z]+|[A-Z]+)/g) {
				if (exists $dict{lc $word}) {
					print "\tSkipping '$file' due to presence of '$word'\n";
					return;
				}
			}
			### Parentheses are required: `lc $file =~ $re` parses as
			### `lc($file =~ $re)`, which matches the name without
			### lower-casing it first.
			return if lc($file) =~ $ngram_re;
			print "$file\n";
		}
	)
	->in(qw(/data /tmp));