.../ManySubstitutions>perl mceparallel.pl nightfall.txt
file: nightfall.txt mins: 0 secs: 5.763
file: nightfall.txt mins: 0 secs: 4.290
file: nightfall.txt mins: 0 secs: 4.179
file: nightfall.txt mins: 0 secs: 4.293
####
use strict;
use warnings;
use Fcntl qw(:flock);
use File::Copy 'move';
use POSIX "sys_wait_h"; #for waitpid FLAGS
use Time::HiRes 'time';
# Autoflush the currently selected output handle (STDOUT unless changed),
# so progress lines appear as soon as they are printed.
$| = 1;

# Input files: nightfall1.txt .. nightfall4.txt, one per child process.
my @in_files = map { "nightfall$_.txt" } 1 .. 4;

# High-resolution start time; safe_print() reports elapsed time against it.
my $start_epoch = time();

# Overall plan:
#  - one child process is forked for every entry in @in_files;
#  - the parent that started them then blocks until all of them are done;
#  - in this simple example the children finish at about the same time,
#    because the input files are roughly identical;
#  - a child can hand back a status code via exit($code_number);
#  - every child writes its own output file, so the only real
#    "bottleneck" is the max average throughput of the file system.
############
## Common code for all forked processes
#
# The three tables below map a matched key to its replacement word.
# They are compiled once, before forking, into $re1/$re2/$re3.

# Table 1: the key must match a whole word.
my %w1 = (
    going   => 'go',
    getting => 'get',
    goes    => 'go',
    knew    => 'know',
    trying  => 'try',
    tried   => 'try',
    told    => 'tell',
    coming  => 'come',
    saying  => 'say',
    men     => 'man',
    women   => 'woman',
    took    => 'take',
    lying   => 'lie',
    dying   => 'die',
    made    => 'make',
);

# Table 2: the key must match at the start of a word (prefix).
my %w2 = (
    need  => 'need',
    talk  => 'talk',
    tak   => 'take',
    used  => 'use',
    using => 'use',
);

# Table 3: the key may match anywhere inside a word (substring).
my %w3 = (
    mean   => 'mean',
    work   => 'work',
    read   => 'read',
    allow  => 'allow',
    gave   => 'give',
    bought => 'buy',
    want   => 'want',
    hear   => 'hear',
    came   => 'come',
    destr  => 'destroy',
    paid   => 'pay',
    selve  => 'self',
    cities => 'city',
    fight  => 'fight',
    creat  => 'create',
    makin  => 'make',
    includ => 'include',
);

# One '|'-joined alternation per table, keys in reverse lexical order so a
# longer key is tried before any key that is a prefix of it.
my ($alt1, $alt2, $alt3) =
    map { join '|', reverse sort keys %$_ } \%w1, \%w2, \%w3;

my $re1 = qr{\b($alt1)\b}i;         # whole word; $1 is the key
my $re2 = qr{\b($alt2)\w*}i;        # prefix plus the rest of the word
my $re3 = qr{\b\w*?($alt3)\w*}i;    # key anywhere inside one word
#my $re3 = qr{\w*?($alt3)\w*}i;     # half speed of \b version
#########
## Fork off the children
#
# One child is forked per input file. Parent and children all append
# progress lines to Alogfile.txt through safe_print(), which serialises
# the writes with flock().

# With CHLD set to 'IGNORE', most Unix platforms reap finished children
# automatically (see perlipc); the wait loop below is still what makes the
# parent block until every child is done.
$SIG{CHLD} = 'IGNORE';

open(my $fh_log, '>>', "Alogfile.txt") or die "unable to open Alogfile.txt $!";
#$fh_log->autoflush; #not needed this is automatic before locking or unlocking a file!

foreach my $file_name (@in_files)
{
    if (my $pid = fork)
    {   # parent: fork returned the child's pid
        safe_print($fh_log, "Spawned child pid: $pid for $file_name\n");
    }
    elsif (defined $pid)    # pid == 0
    {   # child
        # Reopen the log so this process owns a separate open file.
        # A handle inherited across fork shares the parent's open file
        # description; where flock() is implemented with flock(2), the
        # lock belongs to that shared description, so locking through the
        # inherited handle would exclude nobody and one process's LOCK_UN
        # would drop a lock taken by another.
        close $fh_log;
        open($fh_log, '>>', "Alogfile.txt") or die "unable to open Alogfile.txt $!";

        safe_print($fh_log, "This is child pid $$ for $file_name. I am alive and working!\n");
        process_file($file_name);

        # process_file() currently ends the child with exit(0) itself, so
        # the two statements below only run if that ever changes. They
        # guarantee a child never falls through into the parent's loop.
        safe_print($fh_log, "Child $$ finished work on $file_name\n");
        exit(0);
    }
    else
    {   # fork failed, pid is undefined
        die "MASSIVE ERROR - FORK FAILED with $!";
    }
}

### now wait for all children to finish, no matter who they are
# Blocking loop: wait returns -1 once no children are left.
# Do not change the test to "> 0": pids can be negative (the sample output
# at the end of this file shows negative pids).
1 while wait != -1;

safe_print($fh_log, "Parent talking...all my children are finished! Hooray!\n");
close $fh_log or warn "unable to close Alogfile.txt $!";
# safe_print($fh, @text)
#
# Writes every string in @text to the log handle $fh and again to the
# currently selected output handle, each prefixed with the seconds elapsed
# since $start_epoch. The writes are bracketed by an exclusive flock on $fh.
# Dies if the lock cannot be taken or released.
sub safe_print
{
    my ($fh, @text) = @_;

    # Elapsed time is sampled once, before the lock is requested, so all
    # strings of one call carry the same stamp.
    my $elapsed  = time() - $start_epoch;
    my $template = "%.3f secs %s";

    flock($fh, LOCK_EX) or die "flock can't get lock $!";

    # Log file first ...
    for my $message (@text)
    {
        printf {$fh} $template, $elapsed, $message;
    }

    # ... then the same lines on the default output handle.
    for my $message (@text)
    {
        printf($template, $elapsed, $message);
    }

    # Perl flushes $fh before unlocking, so the lines reach the file
    # while the lock is still held.
    flock($fh, LOCK_UN) or die "flock can't release lock $!";
}
# process_file($filename)
#
# Reads $filename line by line, strips punctuation and digits, applies the
# "was/were" rewrite and the three table-driven substitutions ($re1/%w1,
# $re2/%w2, $re3/%w3), and writes the result to the output file.
#
# The output name is $filename with a trailing ".txt" replaced by ".out".
# If the name does not end in ".txt", ".out" is appended instead, so the
# output can never be the input file itself.
#
# Does not return: it logs completion and ends the process with exit(0),
# so it must only be called from a forked child.
# Dies if a file cannot be opened, written or closed.
sub process_file
{
    my $filename = shift;
    open my $IN, '<', $filename or die "can't open input $filename $!";

    my $outfile = $filename;
    # Without this fallback a name lacking ".txt" would leave $outfile equal
    # to $filename, and opening it with '>' would truncate the input.
    $outfile .= '.out' unless $outfile =~ s/\.txt$/.out/;

    open my $OUT, '>', $outfile or die "can't open output $outfile $!";
    safe_print($fh_log, "opened $filename and $outfile\n");

    # A lexical line variable is used so the caller's $_ is left alone.
    while (my $line = <$IN>)
    {
        $line =~ tr/-!"#%&'()*,.\/:;?@\[\\\]_{}0123456789//d; #no punct no digits
        # NOTE(review): not anchored to word boundaries, so "was"/"were"
        # inside longer words is rewritten too - confirm this is intended.
        $line =~ s/w(as|ere)/be/gi;
        # The replacement is the table entry padded with one space each side.
        $line =~ s{$re1}{ $w1{lc $1} }g; #this ~2-3 sec
        $line =~ s{$re2}{ $w2{lc $1} }g; #this ~3 sec
        $line =~ s{$re3}{ $w3{lc $1} }g; #this ~6 sec
        print {$OUT} $line or die "can't write to $outfile $!";
    }
    close $IN;
    # Buffered write errors only surface at close, so check it.
    close $OUT or die "can't close output $outfile $!";

    safe_print($fh_log, "Child $$ finished working on $filename!\n");
    exit(0); #CHILD has to exit itself!
}
__END__
0.007 secs Spawned child pid: -1620 for nightfall1.txt
0.008 secs This is child pid -1620 for nightfall1.txt. I am alive and working!
0.011 secs opened nightfall1.txt and nightfall1.out
0.013 secs Spawned child pid: -5660 for nightfall2.txt
0.014 secs This is child pid -5660 for nightfall2.txt. I am alive and working!
0.017 secs opened nightfall2.txt and nightfall2.out
0.019 secs Spawned child pid: -20048 for nightfall3.txt
0.020 secs This is child pid -20048 for nightfall3.txt. I am alive and working!
0.022 secs opened nightfall3.txt and nightfall3.out
0.029 secs Spawned child pid: -4840 for nightfall4.txt
0.031 secs This is child pid -4840 for nightfall4.txt. I am alive and working!
0.034 secs opened nightfall4.txt and nightfall4.out
4.818 secs Child -4840 finished working on nightfall4.txt!
4.827 secs Child -1620 finished working on nightfall1.txt!
4.835 secs Child -5660 finished working on nightfall2.txt!
4.842 secs Child -20048 finished working on nightfall3.txt!
4.844 secs Parent talking...all my children are finished! Hooray!
####
File sizes for reference:
10/04/2022 06:44 PM 80,748,006 nightfall.txt
10/08/2022 01:30 PM 20,187,006 nightfall1.txt
10/08/2022 01:30 PM 20,187,078 nightfall2.txt
10/08/2022 01:30 PM 20,187,057 nightfall3.txt
10/08/2022 01:30 PM 20,186,865 nightfall4.txt