Parsing 3457 files
regex: 4.126449
real 0m4.194s
user 0m13.465s
sys 0m0.414s
####
Parsing 17285 files
regex: 20.588749
real 0m20.681s
user 1m7.373s
sys 0m1.873s
##
##
#!/usr/bin/perl
use strict;
use feature qw{ say };
use warnings;
use Env;
use utf8;
use Time::HiRes qw(gettimeofday tv_interval usleep);
use open ':std', ':encoding(UTF-8)';
use threads;
use Thread::Queue;
my $benchmark = 1; # print timings for loops
my $TMP='./tmp';
my $IN;
my $OUT;
my @data = glob("data-* ??/data-*");
my $filecount = scalar(@data);
die if $filecount < 0;
say "Parsing $filecount files";
my $wordfile="data.dat";
truncate $wordfile, 0;
#$|=1;
# substitute whole words
my %whole = qw{
going go
getting get
goes go
knew know
trying try
tried try
told tell
coming come
saying say
men man
women woman
took take
lying lie
dying die
};
# substitute on prefix
my %prefix = qw{
need need
talk talk
tak take
used use
using use
};
# substitute on substring
my %substring = qw{
mean mean
work work
read read
allow allow
gave give
bought buy
want want
hear hear
came come
destr destroy
paid pay
selve self
cities city
fight fight
creat create
makin make
includ include
};
my $re1 = qr{\b(@{[ join '|', reverse sort keys %whole ]})\b}i;
my $re2 = qr{\b(@{[ join '|', reverse sort keys %prefix ]})\w*}i;
my $re3 = qr{\b\w*?(@{[ join '|', reverse sort keys %substring ]})\w*}i;
truncate $wordfile, 0;
my $threads = 8;
my $forkcount = 0;
my $infile;
my $subdir = 0;
my $subdircount = 255;
my $tempdir = "temp";
mkdir "$tempdir";
mkdir "$tempdir/$subdir" while ($subdir++ <= $subdircount);
$subdir = 0;
my $i = 0;
my $t0 = [gettimeofday];
my $elapsed;
my $queue = 'Thread::Queue'->new;
sub process_file {
while (my $task = $queue->dequeue) {
my ($infile, $subdir, $i) = @$task;
open my $IN, '<', $infile or exit(0);
open my $OUT, '>', "$tempdir/$subdir/text-$i" or exit(0);
while (<$IN>) {
tr/-!"#%&()*',.\/:;?@\[\\\]”_“{’}><^)(|/ /; # no punct "
s/^/ /;
s/\n/ \n/;
s/[[:digit:]]{1,12}//g;
s/w(as|ere)/be/gi;
s{$re2}{ $prefix{lc $1} }g; # prefix
s{$re3}{ $substring{lc $1} }g; # part
s{$re1}{ $whole{lc $1} }g; # whole
print $OUT "$_";
}
close $OUT;
close $IN;
}
}
my @workers = map threads->create(\&process_file), 1 .. $threads;
foreach $infile (@data) {
$subdir = 1 if $subdir++ > $subdircount;
$queue->enqueue([$infile, $subdir, $i++]);
}
$queue->end;
$_->join for @workers;
local @ARGV = glob("$tempdir/*/*");
open $OUT, '>', $wordfile or die "Error opening $wordfile";
print {$OUT} $_ while <>;
close $OUT;
unlink glob "$tempdir/*/*";
$elapsed = tv_interval($t0);
print "regex: $elapsed\n" if $benchmark;