in reply to Searching for best match
Hello Sosi,
In addition to Text::Fuzzy, you should look at Algorithm::Diff. Here’s some proof-of-concept code:
#! perl

# Proof of concept: for every search string, find the source string that
# shares the longest common subsequence (LCS) of characters with it, then
# discard search strings whose best LCS covers too little of the search string.

use strict;
use warnings;

use Algorithm::Diff qw( LCS );
use Data::Dump;

# Minimum ratio of (LCS length / search-string length) a search string must
# reach with its best candidate in order to be kept in the result.
use constant MIN_MATCH => 0.5;

my @source = (
    'John Ronald Reuel Tolkien',
    'John Ronald S Tolkien',
    'Trent Reznor',
    'Barack Hussein Obama II',
    'Barack Hussein II',
);

my @search = (
    'John Ronald Reuel T',
    'Trent Reznor',
    'Barack Hussein II',
    'Barack Hussein Obama II',
    'No match here',
);

# Maps each search string to [ best LCS text, source string that produced it ].
# An entry stays an empty array ref until a first candidate has been examined.
my %searches = map { $_ => [] } @search;

for my $s (@search) {
    my @search_chars = split //, $s;
    my $best         = $searches{$s};

    for my $source (@source) {
        my @source_chars = split //, $source;

        # LCS() is called in list context, so it returns the common
        # characters themselves; joining them gives the LCS as a string.
        my $diff = join '', LCS(\@search_chars, \@source_chars);

        # The comparison is strict, so when two sources yield an LCS of the
        # same length the one listed earlier in @source is kept. This is why
        # 'Barack Hussein II' is paired with 'Barack Hussein Obama II': the
        # search string is a subsequence of both candidates, and the longer
        # one comes first.
        if (!@{$best} || length($diff) > length($best->[0])) {
            @{$best} = ($diff, $source);
        }
    }
}

# keys() returns its list up front, so deleting while looping is safe.
for my $key (keys %searches) {
    my $lcs       = $searches{$key}->[0];
    my $len_key   = length $key;
    my $len_match = defined $lcs ? length($lcs) : 0;

    # An empty search string cannot match anything; testing for it first also
    # avoids a division by zero. An undefined LCS (no sources at all) counts
    # as a zero-length match.
    delete $searches{$key}
        if $len_key == 0 || ($len_match / $len_key) < MIN_MATCH;
}

dd \%searches;
Output:
23:19 >perl 1044_SoPW.pl { "Barack Hussein II" => ["Barack Hussein II", "Barack Hussein Obama II"], "Barack Hussein Obama II" => ["Barack Hussein Obama II", "Barack Hussein Obama II"], "John Ronald Reuel T" => ["John Ronald Reuel T", "John Ronald Reuel Tolkien"], "Trent Reznor" => ["Trent Reznor", "Trent Reznor"], } 23:19 >
Hope that helps,
| Athanasius <°(((>< contra mundum | Iustus alius egestas vitae, eros Piratica, |
|
|---|