#!/usr/bin/env perl # https://www.perlmonks.org/?node_id=11147200 use strict; use warnings; use MCE; use Time::HiRes 'time'; die "usage: $0 infile1.txt [ infile2.txt ... ]\n" unless @ARGV; # substitute whole word only my %W1 = qw{ going go getting get goes go knew know trying try tried try told tell coming come saying say men man women woman took take lying lie dying die made make }; # substitute on prefix my %W2 = qw{ need need talk talk tak take used use using use }; # substitute on substring my %W3 = qw{ mean mean work work read read allow allow gave give bought buy want want hear hear came come destr destroy paid pay selve self cities city fight fight creat create makin make includ include }; my $RE1 = qr{\b(@{[ join '|', reverse sort keys %W1 ]})\b}i; my $RE2 = qr{\b(@{[ join '|', reverse sort keys %W2 ]})\w*}i; my $RE3 = qr{\b\w*?(@{[ join '|', reverse sort keys %W3 ]})\w*}i; my $OUT_FH; # output file-handle used by workers # Spawn worker pool. my $mce = MCE->new( max_workers => MCE::Util::get_ncpu(), chunk_size => '64K', init_relay => 0, # specifying init_relay loads MCE::Relay use_slurpio => 1, # enable slurpio user_begin => sub { # worker begin routine per each file to be processed my ($outfile) = @{ MCE->user_args() }; open $OUT_FH, '>>', $outfile; }, user_end => sub { # worker end routine per each file to be processed close $OUT_FH if defined $OUT_FH; }, user_func => sub { # worker chunk routine my ($mce, $chunk_ref, $chunk_id) = @_; # process entire chunk versus line-by-line $$chunk_ref =~ tr/-!"#%&'()*,.\/:;?@\[\\\]_{}0123456789//d; $$chunk_ref =~ s/w(as|ere)/be/gi; $$chunk_ref =~ s/$RE1/ $W1{lc $1} /g; $$chunk_ref =~ s/$RE2/ $W2{lc $1} /g; $$chunk_ref =~ s/$RE3/ $W3{lc $1} /g; # Output orderly and serially. MCE->relay_lock; print $OUT_FH $$chunk_ref; $OUT_FH->flush; MCE->relay_unlock; } )->spawn; # Process file(s). my $status = 0; while (my $infile = shift @ARGV) { if (-d $infile) { warn "WARN: '$infile': Is a directory, skipped\n"; $status = 1; } elsif (! -f $infile) { warn "WARN: '$infile': No such file, skipped\n"; $status = 1; } else { my $outfile = $infile; $outfile =~ s/\.txt$/.dat/; if ($outfile eq $infile) { warn "WARN: '$outfile': matches input name, skipped\n"; $status = 1; next; } # truncate output file open my $fh, '>', $outfile or do { warn "WARN: '$outfile': $!, skipped\n"; $status = 1; next; }; close $fh; # process file; pass argument(s) to workers my $start = time(); $mce->process($infile, { user_args => [ $outfile ] }); my $total_seconds = time() - $start; my $minutes = int($total_seconds / 60); my $seconds = $total_seconds - ($minutes * 60); printf "file: $infile mins: $minutes secs: %0.3f\n", $seconds; } } $mce->shutdown; # reap workers exit $status;