# (part of much larger codebase which uses strict) my %TPL; # hash for storing objects which persist between invocations of ocr_name sub trim_word { # trim words my $ref=shift; $$ref=~s/\([^)]+\)//g; $$ref=~s/\s+$//; $$ref=~s/^\s+//; $$ref=~s/[-\s]/.*/g; # internal hyphens or spaces } sub fuzzify_word { # try to relax matching for common OCR mistakes my $ref=shift; $$ref=~s/O/[OD]/ig; # O <-> O or D $$ref=~s/G/[gc]/ig; # uc G <-> C $$ref=~s/[ij]/[ij]/g; # lc I <-> J $$ref=~s/[HN]/[hn]/ig; # uc H <-> N $$ref=~s/W/[wn]/ig; # uc W <-> N $$ref=~s/T/[it]/ig; # T or t <-> i or t $$ref=~s/C/[CL]/ig; # C or c <-> C or L } sub ocr_sort { # sorting function, put longer names first length($b) <=> length($a); } sub ocr_name { # examine a file with OCR to get a name my ($file,$hExclude,$hIgnore)=@_; # file, hash of excluded words, hash of ignored words my ($first,$last,$acct,$origfirst,$origlast,$text); unless ($TPL{ocr_match}) { # regenerate big hash & regexps my ($sth,$re,$re2,%re,%re2); $sth=sql("select lastname,firstname,acct from names order by acct desc"); # calls DBI (code not shown), desc so more recent accounts which will be matched earlier (maybe) (?) while (($last,$first,$acct)=$sth->fetchrow) { trim_word(\$first); trim_word(\$last); $origfirst=$first; $origlast=$last; fuzzify_word(\$first); fuzzify_word(\$last); $re{"$first$last"}||="(?{$acct})$first\\s*$last|(?{$acct})$last,?\\s+$first"; # use ?{} to store account if this branch matches # catch unusual last names $first=substr($origfirst,0,1); $re2{"$first$last"}||="(?{$acct})\\b$last,?\\s+$first" unless exists $hExclude->{lc $origlast} or length($last)<4; } $sth->finish; $re=join('|',sort ocr_sort values %re); $re2=join('|',sort ocr_sort values %re2); $TPL{ocr_match}=sub { my $text=shift; my @matches; die "Bad re $re" unless index($re,'|')>0; use re 'eval'; while ($text=~/$re/gis) { push @matches,$^R; } @matches; }; $TPL{ocr_match2}=sub { my $text=shift; my @matches; die "Bad re $re2" unless index($re2,'|')>0; use re 'eval'; while ($text=~/$re2/gis) { push @matches,$^R; } @matches; }; } $text=run_ocr($file); # convert file and invoke OCR, returning raw OCR'd text output $ignored=join('|',keys %$hIgnore); $text=~s/($ignore)//oig; # delete words which cause false hits map { $match{$_}++ } $TPL{ocr_match}->($text); # call precompiled proc map { $match{$_}++ } $TPL{ocr_match2}->($text); return keys %match; # array of accounts of named individuals in document } #### ...(?{474})[CL]r[ij]s[it][ij]a[hn][ij],?\s+S[it]eve|(?{487})... # that's matching fictitious account 00474, Mr. Steve Christian