in reply to Metric for confidence of complex match
The basic value would be the number of bits set. You can add weight to certain bits, or to certain fields, or to both.
Terribly under-tested code:
Output:use strict; use warnings; my %match = ( 'ABC.com' => { 'XYZ.com' => { company => 1, # reg. only contact => 0, # no match address => 3, # reg. and admin. phone => 2, # admin. only email => 2, # admin. only }, 'BBC.com' => { company => 3, contact => 3, address => 3, phone => 3, email => 3, }, }, 'FOO.com' => { 'BAR.com' => { company => 1, contact => 3, address => 3, phone => 2, email => 1, }, 'BAZ.com' => { company => 3, contact => 2, address => 2, phone => 2, email => 2, }, }, ); # I'm sure there is a faster way to count bits. sub bit_count { my ( $number ) = @_; my $bits = unpack 'b*', pack 'C', $number; my $bits_set = ( $bits =~ tr/1// ); return $bits_set; } sub confidence_by_bits { my ( $href ) = @_; my $weight = 0; foreach ( values %{ $href } ) { $weight += bit_count($_); } return $weight; } sub confidence_by_weighted_position { my ( $href ) = @_; my $weight = 0; foreach ( values %{ $href } ) { # Reg - slightly important if ( $_ | 1 ) { $weight += 1; } # Admin - very important if ( $_ | 2 ) { $weight += 5; } # Bonus for both if ( $_ == 3 ) { $weight += 20; } } return $weight; } sub confidence_by_weighted_field { my ( $href ) = @_; my $weight = 0; while ( my ( $key, $val ) = each %{ $href } ) { my $bits = bit_count($val); if ( $key eq 'company' ) { $bits *= 5; } elsif ( $key eq 'phone' ) { $bits *= 0.8; } $weight += $bits; } return $weight; } print_with_confidence( \%match, \&confidence_by_bits ); print_with_confidence( \%match, \&confidence_by_weighted_position ); print_with_confidence( \%match, \&confidence_by_weighted_field ); sub print_with_confidence { my ( $match_href, $conf_func ) = @_; while ( my ( $domain1, $v1 ) = each %match ) { my @candidates; while ( my ( $domain2, $v2 ) = each %$v1 ) { my $confidence = $conf_func->( $v2 ); push @candidates, [ $domain2, $confidence ]; } @candidates = sort { $b->[1] <=> $a->[1] || $b->[0] cmp $a->[0] } @candidates; print "$domain1\n"; foreach ( @candidates ) { my ( $domain2, $confidence ) = @{ $_ }; printf "\t%7d\t%s\n", $confidence, $domain2; } } print "\n"; }
ABC.com 10 BBC.com 5 XYZ.com FOO.com 7 BAR.com 6 BAZ.com ABC.com 130 BBC.com 50 XYZ.com FOO.com 70 BAR.com 50 BAZ.com ABC.com 17 BBC.com 8 XYZ.com FOO.com 13 BAZ.com 10 BAR.com
|
|---|