# Pseudo-code outline (the first two lines are not literal Perl): for every
# record, check whether the extracted document text contains the record's title.
for loop over records from metadata table {
slurp $docid.txt (from pdftotext) into $contents;
# Build the search pattern from the title: every run of non-word characters
# becomes a non-greedy ".+?" gap, so the title's words must appear in order
# with at least one character between consecutive words.
$s = $title;
$s =~ s/\W+/.+?/g; # gaps/punctuation between words -> wildcards
# /s lets "." cross newlines and /i ignores case. /g in this boolean (scalar)
# context makes Perl record the end of the match in pos($contents).
if($contents =~ /$s/sig){
# Report the offset just past the match, followed by the pattern that was used.
print "pos = ", pos $contents, ",$s|";
# Clear the saved position so any later /g match on $contents starts at offset 0.
pos $contents = 0;
}else{
# NOTE(review): $textfile is not assigned in this excerpt - presumably it is
# the "$docid.txt" file name; confirm against the full script.
print "$textfile didn't contain $s\n";
}
}
####
# Look for the words of the title ($s) one at a time instead of using a single
# large "word.+?word" regex. Each match uses /g in scalar context, so every
# search resumes where the previous one ended: the words must occur in title
# order, and after the loop pos($contents) is the offset just past the last
# word found.
#
# Reads:  $s (title), $contents (document text), $textfile (name used in the
#         failure message; not set in this excerpt).
# Writes: $nomatch (1 if some word was not found, 0 otherwise), pos($contents).

# split() returns an empty leading field when the title begins with
# punctuation. An empty pattern is special-cased by Perl (it reuses the last
# successfully matched regex), so such fields are dropped. The remaining words
# consist solely of \w characters and are safe to interpolate.
my @s = grep { length } split(/\W+/, $s);

# Clear a failure left over from a previous document; without this a single
# miss would suppress the "pos =" report for every later document.
$nomatch = 0;

# Always scan from the top of the text, whatever earlier /g matches left behind.
pos($contents) = 0;

foreach $word (@s){
    if($contents !~ /$word/sig){
        print "$textfile didn't contain $s\n";
        $nomatch = 1;
        last;
    }
}
print "pos = ", pos $contents, ",$s|" unless $nomatch;
####
real 0m0.150s
user 0m0.044s
sys 0m0.068s
####
time tr -d '\n' < 250234.txt | egrep 'the...pattern'
real 0m7.178s
user 0m6.592s
sys 0m0.036s
time tr -d '\n' < 250234.txt | LANG=C egrep 'the...pattern'
real 0m0.027s
user 0m0.012s
sys 0m0.008s