# Pseudo-code sketch (the loop header and the slurp step are not literal Perl):
# for each record, turn the title into one regex and search the extracted text.
for loop over records from metadata table {
    slurp $docid.txt (from pdftotext) into $contents;
    $s = $title;
    $s =~ s/\W+/.+?/g;    # gaps/punctuation between words -> wildcards
    # /g in scalar context records where the match ended in pos($contents);
    # /s lets the '.' wildcards span line breaks in the extracted text.
    if($contents =~ /$s/sig){
        print "pos = ", pos $contents, ",$s|";
        pos $contents = 0;    # rewind so a later //g match starts from the top
    }else{
        print "$textfile didn't contain $s\n";
    }
}
##

##

	# Check that every word of the title $s occurs in $contents.
	# Each //g match in scalar context resumes from pos($contents), so the
	# words must be found one after another, in title order; after a full
	# pass pos($contents) is the offset just past the last word matched.
	# Reads $s, $contents and $textfile from the enclosing code.

	# Clear the flag for this record: it is only ever set to 1 below, so a
	# stale value from an earlier record would suppress the report forever.
	$nomatch = 0;

	# Drop the empty leading field split() yields when the title starts with
	# a non-word character: an empty pattern would make Perl silently reuse
	# the last successfully matched regex instead of matching "nothing".
	my @s = grep { length } split(/\W+/, $s);
	foreach my $word (@s){
	    if($contents !~ /$word/sig){
		print "$textfile didn't contain $s\n";
		$nomatch = 1;
		last;
	    }
	}
	print "pos = ", pos $contents, ",$s|" unless $nomatch;

##

##

real	0m0.150s
user	0m0.044s
sys	0m0.068s

##

##

time tr -d '\n' < 250234.txt | egrep 'the...pattern'
real	0m7.178s
user	0m6.592s
sys	0m0.036s

time tr -d '\n' < 250234.txt | LANG=C egrep 'the...pattern'
real	0m0.027s
user	0m0.012s
sys	0m0.008s