Thanks,
tachyon.
For my application, I needed maximal runs of identical words between two strings. I'm loose about what comprises a word. For example, for my application "hi-res" is the same as "hires", and punctuation and case don't matter. In the spirit of TIMTOWTDI, here's what I scraped together:
# FRAGMENT
my $lcs = lcss(standardize($x1), standardize($x2);
sub lcss {
my ($str1, $str2 ) = @_;
my @match = ();
my @longest = ();
my $i = 0;
my $seq1 = [split(/\s+/, $str1)];
my $seq2 = [split (/\s+/, $str2)];
my $sub = sub {
@longest = map {$_} @match if (@match >= @longest);
@match = ();
};
traverse_sequences( $seq1, $seq2, {
MATCH => sub {push(@match, $seq1->[$_[0]]);},
DISCARD_A => $sub,
DISCARD_B => $sub,
});
my $lcs = join(' ', @longest);
return $lcs;
}
# lowercase and remove odd characters
sub standardize {
my ($text) = @_;
return unless $text;
$text =~ s/\[.*?\]/ /g;
$text =~ s/[.,?"':&()!-]/ /g;
$text =~ s/[^\w ]//g;
$text =~ s/^\s+//;
$text =~ s/\s+$//;
$text =~ s/\s+/ /;
$text = lc $text;
return $text;
}
rkg