#!/usr/bin/env perl

# Count how often each non-stop-word occurs in a text file and write the
# result as tab-separated "word<TAB>count" rows, sorted by word.
#
# Each whitespace-separated token contributes at most one word: the first
# run of alphanumerics/apostrophes that ends in an alphanumeric (so
# surrounding quotes and punctuation are stripped, and only the part before
# a hyphen is counted).  Words are lower-cased before the stop-word check.
#
# A sample input and the output it produces are recorded after __END__.

use strict;
use warnings;
use autodie;

use Lingua::StopWords 'getStopWords';
use Text::CSV;

my ($lang, $encoding) = qw{en UTF-8};

# Per-language pattern whose first capture group is the word in a token.
my %word_re_for = (
    en => qr{^.*?\b([\p{Alnum}']*[\p{Alnum}]+).*$},
);

my ($in_file, $out_file) = qw{test_input.txt test_output.csv};

# Fail early, with a clear message, if the language has no word pattern.
my $word_re = $word_re_for{$lang}
    or die "No word pattern defined for language '$lang'\n";

my $is_stop = _mod_stops(getStopWords($lang, $encoding));
my %count_for;

# Pass 1: tally every non-stop-word in the input file.
{
    open my $in_fh, '<:encoding(UTF-8)', $in_file;

    while (my $line = <$in_fh>) {
        TOKEN:
        for my $token (split ' ', $line) {
            # List-context match yields the capture directly, avoiding $1.
            my ($match) = $token =~ $word_re
                or next TOKEN;

            my $word = lc $match;
            next TOKEN if $is_stop->{$word};

            ++$count_for{$word};
        }
    }

    close $in_fh;
}

# Pass 2: write the tallies, one row per word, in sorted order.
{
    open my $out_fh, '>:encoding(UTF-8)', $out_file;

    my $csv = Text::CSV->new({sep_char => "\t", binary => 1})
        or die 'Cannot create Text::CSV object: ' . Text::CSV->error_diag();

    for my $word (sort keys %count_for) {
        $csv->say($out_fh, [$word, $count_for{$word}])
            or die "Cannot write row for '$word' to $out_file: "
                . $csv->error_diag();
    }

    # Explicit close so that a failed flush of buffered output is reported
    # (autodie turns a failing close into an exception).
    close $out_fh;
}

# _mod_stops($stops)
#
# Adjust a stop-word hashref (word => true) for this script:
#   * adds the archaic/informal pronouns thou, thee, thy, thine, u, ur;
#   * removes "very" and "same", so those words ARE counted.
#
# The hash is modified in place and the same reference is returned.
sub _mod_stops {
    my ($stops) = @_;

    my @adds = qw{thou thee thy thine u ur};
    my @dels = qw{very same};

    $stops->{$_} = 1 for @adds;
    delete @$stops{@dels};

    return $stops;
}

__END__

Sample run.  NOTE: the exact line breaks, tabs, repeated spaces and the
blank line of the sample input are shown approximately.

$ cat test_input.txt
Hello, world!
I said, "Hello, world!".
Did he say "Hello, world!"?
We're not sure.
1tab: 2tabs: 3tabs: END-TABS
# multi-spacing here - blank line next

The cat sat on the mat.
Old pronouns: thou; thee; thy; thine.
New pronouns: "u", 'ur'.
Forecastle = "fo'c'sle" or "fo'c's'le"
Forecastle = 'fo'c'sle' or 'fo'c's'le'
Don't hide the Very pistol; it could be very important.
Why exclude different but include same?

$ cat test_output.csv
1tab 1
2tabs 1
3tabs 1
blank 1
cat 1
different 1
end 1
exclude 1
fo'c's'le 2
fo'c'sle 2
forecastle 2
hello 3
hide 1
important 1
include 1
line 1
mat 1
multi 1
new 1
next 1
old 1
pistol 1
pronouns 2
said 1
same 1
sat 1
say 1
sure 1
very 2
world 3