Re: calculate matching words/sentence

Something like this may be suitable for you. It will return 1 if the strings are pseudo-identical and 0 if they are completely different. It will return values between 0 and 1, with the value increasing as the similarity increases. Pseudo-identical is the appropriate word, as we don't consider word order or word frequency (where the same word appears more than once). This may or may not matter to you.

It uses just one loop and a hash table, so it should not be glacial. You can tokenize any way you like; I remove punctuation and lowercase....

use strict;
use warnings;

# Example usage: each call prints the similarity score followed by the
# output record separator ($/, a newline by default).
print compare( 'Hello', 'hello' ), $/;        # 1
print compare( 'Hello', 'HELLO WORLD' ), $/;  # 0.5
print compare( 'The quick brown fox jumped over the lazy dogs.',
               'The quick brown dogs jumped over the lazy fox.' ), $/;       # 1
print compare( 'The quick brown fox jumped over the lazy dogs.',
               'The quick brown dogs jumped over the lazy kangaroo.' ), $/;  # 0.888...



# compare( $str1, $str2 )
#
# Return a similarity score between 0 and 1 for two strings, based on the
# words they share.  Both strings are split into tokens with tokenize(); the
# score is the fraction of tokens in the longer token list that also occur
# in the shorter one.  Word order is ignored, and a word repeated in the
# longer list is counted each time it appears.
#
# Returns 0 when neither string yields any tokens, since there is nothing
# to match.
sub compare {
    my ( $str1, $str2 ) = @_;
    my $tok_str1 = tokenize($str1);
    my $tok_str2 = tokenize($str2);
    # swap unless @$tok_str1 contains the most tokens
    ( $tok_str1, $tok_str2 ) = ( $tok_str2, $tok_str1 )
        if @$tok_str2 > @$tok_str1;
    # if even the longer list is empty, the division below would die with
    # "Illegal division by zero"
    return 0 unless @$tok_str1;
    # make a lookup hash for the smaller number of tokens, now in str2
    my %h;
    @h{@$tok_str2} = ();  # hash slice: creates the keys, values stay undef
    # now scan str1 for these tokens and count
    my $found = 0;
    for my $tok ( @$tok_str1 ) {
        $found++ if exists $h{$tok};
    }
    return $found / scalar @$tok_str1;
}



# tokenize( $str )
#
# Split a string into lowercase word tokens and return them as an array
# reference.  Every character that is not an ASCII letter, a digit or
# whitespace is deleted (so "don't" becomes "dont"); the result is
# lowercased and split on runs of whitespace.  A string with no word
# characters gives a reference to an empty array.
sub tokenize {
    my ($str) = @_;
    # remove punctuation, but keep every kind of whitespace (tabs and
    # newlines, not just spaces) so the words around it are not glued together
    $str =~ s/[^A-Za-z0-9\s]+//g;
    # lowercase
    $str = lc $str;
    # split ' ' skips leading whitespace and splits on whitespace runs
    return [ split ' ', $str ];
}
[download]

cheers

tachyon

s&&rsenoyhcatreve&&&s&n.+t&"$'$`$\"$\&"&ee&&y&srve&&d&&print

Comment on Re: calculate matching words/sentence Download Code