use strict; use warnings; # substitute whole word only my %w1 = qw{ going go getting get goes go knew know trying try tried try told tell coming come saying say men man women woman took take lying lie dying die made make }; # substitute on prefix my %w2 = qw{ need need talk talk tak take used use using use }; # substitute on substring my %w3 = qw{ mean mean work work read read allow allow gave give bought buy want want hear hear came come destr destroy paid pay selve self cities city fight fight creat create makin make includ include }; my $re1 = qr{\b(@{[ join '|', reverse sort keys %w1 ]})\b}i; my $re2 = qr{\b(@{[ join '|', reverse sort keys %w2 ]})\w*}i; my $re3 = qr{\b\w*?(@{[ join '|', reverse sort keys %w3 ]})\w*}i; #see discussion #my $re3 = qr{\w*?(@{[ join '|', reverse sort keys %w3 ]})\w*}i; #print "$re3\n"; #for debugging my $out='out-perl.dat'; open my $OUT, '>', $out or die "unable to open $out $!"; my $start = time(); my $finish; open my $IN, '<', "nightfall.txt" or die " $!"; #75 MB file while (<$IN>) { tr/-!"#%&'()*,.\/:;?@\[\\\]_{}0123456789//d; # no punct no digits # other formulations possible s/w(as|ere)/be/gi; s{$re1}{ $w1{lc $1} }g; #this ~2-3 sec s{$re2}{ $w2{lc $1} }g; #this ~3 sec s{$re3}{ $w3{lc $1} }g; #this ~6 (best) - 14 sec print $OUT "$_"; } $finish = time(); my $total_seconds = $finish-$start; my $minutes = int ($total_seconds/60); my $seconds = $total_seconds - ($minutes*60); print "minutes: $minutes seconds: $seconds\n"; __END__ Time to completion with \b added to begin of $re3 minutes: 0 seconds: 12