wget http://www.astro.sunysb.edu/fwalter/AST389/TEXTS/Nightfall.htm html2text-cpp Nightfall.htm >nightfall.txt mkdir 00; for i in `seq -w 0 3456`; do head -$((RANDOM/128)) nightfall.txt >00/data-$i; done #### #!/usr/bin/perl use strict; use warnings; use 5.36.0; use Env; use utf8; use POSIX "sys_wait_h"; #for waitpid FLAGS use Time::HiRes qw(gettimeofday tv_interval); use open ':std', ':encoding(UTF-8)'; my $benchmark = 1; # print timings for loops my $TMP='/tmp'; my $HOME = $ENV{HOME}; my $IN; my $OUT; my @data = glob("data-* ??/data-*"); my $filecount = scalar(@data); die if $filecount < 0; say "Parsing $filecount files"; my $wordfile="data.dat"; truncate $wordfile, 0; #$|=1; # substitute whole words my %whole = qw{ going go getting get goes go knew know trying try tried try told tell coming come saying say men man women woman took take lying lie dying die }; # substitute on prefix my %prefix = qw{ need need talk talk tak take used use using use }; # substitute on substring my %substring = qw{ mean mean work work read read allow allow gave give bought buy want want hear hear came come destr destroy paid pay selve self cities city fight fight creat create makin make includ include }; my $re1 = qr{\b(@{[ join '|', reverse sort keys %whole ]})\b}i; my $re2 = qr{\b(@{[ join '|', reverse sort keys %prefix ]})\w*}i; my $re3 = qr{\b\w*?(@{[ join '|', reverse sort keys %substring ]})\w*}i; truncate $wordfile, 0; my $maxforks = 64; print "maxforks: $maxforks\n"; my $forkcount = 0; my $infile; my $subdir = 0; my $subdircount = 255; my $tempdir = "temp"; mkdir "$tempdir"; mkdir "$tempdir/$subdir" while ($subdir++ <= $subdircount); $subdir = 0; my $i = 0; my $t0 = [gettimeofday]; my $elapsed; foreach $infile(@data) { $forkcount -= waitpid(-1, WNOHANG) > 0 while $forkcount >= $maxforks; # do { $elapsed=tv_interval($t0); print "elapsed: $elapsed\n"; die; } if $i++ >1000; # 1000 files test $i++; # comment out if you uncomment the above line $subdir = 1 if $subdir++ > $subdircount; if (my $pid = fork) { # $pid defined and !=0 -->parent ++$forkcount; } else { # $pid==0 -->child open my $IN, '<', $infile or exit(0); open my $OUT, '>', "$tempdir/$subdir/text-$i" or exit(0); while (<$IN>) { tr/-!"#%&()*',.\/:;?@\[\\\]”_“{’}><^)(|/ /; # no punct " s/^/ /; s/\n/ \n/; s/[[:digit:]]{1,12}//g; s/w(as|ere)/be/gi; s{$re2}{ $prefix{lc $1} }g; # prefix s{$re3}{ $substring{lc $1} }g; # part s{$re1}{ $whole{lc $1} }g; # whole print $OUT "$_"; } close $OUT; close $IN; defined $pid and exit(0); # $pid==0 -->child, must exit itself } } ### now wait for all children to finish, no matter who they are 1 while wait != -1; # avoid zombies this is a blocking operation local @ARGV = glob("$tempdir/*/*"); my @text = <>; unlink glob "$tempdir/*/*"; open $OUT, '>', $wordfile or die "Error opening $wordfile"; print $OUT @text; close $OUT; $elapsed = tv_interval($t0); print "regex: $elapsed\n" if $benchmark; #### for dir in $(seq -w 01 10); do cp -a 00 $dir; done