$ cat test_input.txt
Hello, world!
I said, "Hello, world!".
Did he say "Hello, world!"?
We're not sure.
1tab: 2tabs: 3tabs: END-TABS
# multi-spacing here - blank line next
The cat sat on the mat.
Old pronouns: thou; thee; thy; thine.
New pronouns: "u", 'ur'.
Forecastle = "fo'c'sle" or "fo'c's'le"
Forecastle = 'fo'c'sle' or 'fo'c's'le'
Don't hide the Very pistol; it could be very important.
Why exclude different but include same?
####
#!/usr/bin/env perl
use strict;
use warnings;
use autodie;
use Lingua::StopWords 'getStopWords';
use Text::CSV;
# Language code and encoding handed to getStopWords().
my ($lang, $encoding) = qw{en UTF-8};

# Per-language pattern that extracts one word from a whitespace-delimited
# token.  It lazily skips leading characters up to a word boundary, then
# captures a run of alphanumerics and apostrophes that must end in an
# alphanumeric; whatever follows in the token is discarded.  Consequently
# surrounding punctuation and quotes are stripped, inner apostrophes are
# kept ("fo'c'sle"), and only the first part of a hyphenated token is
# counted ("END-TABS" is counted as "end").
my %word_re_for = (
    en => qr{^.*?\b([\p{Alnum}']*[\p{Alnum}]+).*$},
);

my ($in_file, $out_file) = qw{test_input.txt test_output.csv};

# Stock stop-word list for $lang, adjusted by _mod_stops().
my $is_stop = _mod_stops(getStopWords($lang, $encoding));

# Maps each lower-cased, non-stop word to the number of times it was seen.
my %count_for;

# Pass 1: read the input and tally the words.
{
    open my $in_fh, '<:encoding(UTF-8)', $in_file;

    while (my $line = <$in_fh>) {
        # split ' ' discards leading whitespace and splits on runs of
        # whitespace, so tabs, repeated spaces and blank lines are harmless.
        TOKEN: for my $token (split ' ', $line) {
            next TOKEN unless $token =~ $word_re_for{$lang};
            my $word = lc $1;
            next TOKEN if $is_stop->{$word};
            ++$count_for{$word};
        }
    }

    close $in_fh;
}

# Pass 2: write one "word<TAB>count" record per word, sorted by word.
{
    open my $out_fh, '>:encoding(UTF-8)', $out_file;

    my $csv = Text::CSV::->new({sep_char => "\t", binary => 1})
        or die 'Cannot create Text::CSV object: ' . Text::CSV::->error_diag();

    for my $word (sort keys %count_for) {
        $csv->say($out_fh, [$word, $count_for{$word}])
            or die "Cannot write record for '$word': " . $csv->error_diag();
    }

    # Buffered write errors only surface when the handle is closed; an
    # explicit close lets autodie report them instead of losing them when
    # the handle is implicitly closed at scope exit.
    close $out_fh;
}
# _mod_stops($stops)
#
# Tailors a stop-word lookup table in place.
#
#   $stops - hash reference whose keys are stop words (true values).
#
# Adds a fixed set of extra words as stop words (each with the value 1),
# removes a fixed set of words so they are no longer treated as stop
# words, and returns the same hash reference that was passed in.
sub _mod_stops {
    my ($stops) = @_;

    my @extra_stops = qw{thou thee thy thine u ur};
    my @not_stops   = qw{very same};

    # Additions go first, via a hash-slice assignment.
    @{$stops}{@extra_stops} = (1) x @extra_stops;

    # Then drop the words that should be counted after all.
    for my $word (@not_stops) {
        delete $stops->{$word};
    }

    return $stops;
}
####
$ cat test_output.csv
1tab 1
2tabs 1
3tabs 1
blank 1
cat 1
different 1
end 1
exclude 1
fo'c's'le 2
fo'c'sle 2
forecastle 2
hello 3
hide 1
important 1
include 1
line 1
mat 1
multi 1
new 1
next 1
old 1
pistol 1
pronouns 2
said 1
same 1
sat 1
say 1
sure 1
very 2
world 3