# Attempt 1: turn the title into one regex and search the whole document text.
for loop over records from metadata table {
    slurp $docid.txt (from pdftotext) into $contents;
    $s = $title;
    $s =~ s/\W+/.+?/g;    # gaps/punctuation between words -> wildcards
    # /g in scalar context records where the match ended, so pos() is
    # defined for the report below.
    if ($contents =~ /$s/sig) {
        print "pos = ", pos $contents, ",$s|";
        pos $contents = 0;    # rewind so a later /g match starts at the top
    }
    else {
        print "$textfile didn't contain $s\n";
    }
}
####
# Attempt 2: look for the title's words one at a time instead of one big regex.
# split() yields a leading empty field when $s starts with a non-word
# character, and an empty pattern re-runs the last successful regex instead
# of matching the empty string, so empty fields are dropped here.
my @s = grep { length } split(/\W+/, $s);
# Clear the flag first so a miss on an earlier title cannot suppress this
# title's report.
$nomatch = 0;
for my $word (@s) {
    # Scalar-context /g resumes from pos($contents), so each word is looked
    # for after the previous word's match.
    if ($contents !~ /$word/sig) {
        print "$textfile didn't contain $s\n";
        $nomatch = 1;
        last;
    }
}
print "pos = ", pos $contents, ",$s|" unless $nomatch;
####
real 0m0.150s
user 0m0.044s
sys 0m0.068s
####
time tr -d '\n' < 250234.txt | egrep 'the...pattern'
real 0m7.178s
user 0m6.592s
sys 0m0.036s
time tr -d '\n' < 250234.txt | LANG=C egrep 'the...pattern'
real 0m0.027s
user 0m0.012s
sys 0m0.008s