use strict;
use warnings;

# See http://en.wikipedia.org/wiki/Canonicalization

# canonicalize($string)
#
# Returns a copy of $string reduced to its canonical form: every character
# other than ASCII letters, ASCII digits and the space character is deleted.
# The caller's string is not modified (the argument is copied first).
# Case is preserved unless the optional lc line below is enabled.
sub canonicalize {
    my ($string) = @_;

    # Remove everything except certain characters.
    # tr with /c complements the listed set and /d deletes what it matches.
    $string =~ tr{A-Za-z0-9 }{}cd;

    # Make case-insensitive (if you want)
    # $string = lc $string;

    return $string;
}

# match_up_canonically($lines1_aref, $lines2_aref)
#
# Compares two lists of lines by their canonical form (see canonicalize).
#
# Parameters:
#   $lines1_aref - array reference holding the first list of lines
#   $lines2_aref - array reference holding the second list of lines
#
# Returns three array references, in this order:
#   matches     - lines from the first list whose canonical form also
#                 occurs in the second list (original text, list-1 order)
#   nonmatches1 - lines from the first list with no counterpart in list 2
#   nonmatches2 - lines from the second list with no counterpart in list 1
#
# Dies with "Can't happen" if the bookkeeping hash holds an impossible state.
sub match_up_canonically {
    my ( $lines1_aref, $lines2_aref ) = @_;

    # Canonicalize each line exactly once; index $i of a key list
    # corresponds to index $i of the matching input list.
    my @keys1 = map { canonicalize($_) } @{$lines1_aref};
    my @keys2 = map { canonicalize($_) } @{$lines2_aref};

    # Bit flags per canonical key: 1 = seen in list 1, 2 = seen in list 2,
    # so 3 means the key is present in both lists.
    my %where;
    $where{$_} |= 1 for @keys1;
    $where{$_} |= 2 for @keys2;

    my ( @matches, @nonmatches1, @nonmatches2 );

    for my $i ( 0 .. $#keys1 ) {
        my $line = $lines1_aref->[$i];
        my $n    = $where{ $keys1[$i] };
        if ( $n == 3 ) {
            push @matches, $line;
        }
        elsif ( $n == 1 ) {
            push @nonmatches1, $line;
        }
        else {
            die "Can't happen";
        }
    }

    for my $i ( 0 .. $#keys2 ) {
        my $line = $lines2_aref->[$i];
        my $n    = $where{ $keys2[$i] };
        if ( $n == 3 ) {
            # Do nothing!
            # The matched lines were already collected in the list-1 loop.
            # ...or...
            # Collect the matched lines again, because they may be
            # different, just not different in a way that matters.
            # push @matches, $line;
        }
        elsif ( $n == 2 ) {
            push @nonmatches2, $line;
        }
        else {
            die "Can't happen";
        }
    }

    return ( \@matches, \@nonmatches1, \@nonmatches2 );
}

# Sample data: some lines differ only in punctuation, one differs in
# punctuation and letter case, and one differs in an actual word.
my @lines1 = (
    'able baker charlie',
    'roger, fox, dog',
    'Gomez Morticia Cousin-Itt',
    'Wednesday Pugsley Lurch',
);

my @lines2 = (
    'Gomez Morticia Cousin_ITT',
    'roger; fox; dog',
    'Wednesday Pugsley Fester',
    'able baker charlie',
);

my ( $m, $n1, $n2 ) = match_up_canonically( \@lines1, \@lines2 );

# Each section is a heading followed by its lines; the trailing "\n" list
# element leaves a blank line after every section.
print join "\n", 'Matched:',      @{ $m },  "\n";
print join "\n", 'Non-matched1:', @{ $n1 }, "\n";
print join "\n", 'Non-matched2:', @{ $n2 }, "\n";