#! /usr/bin/perl -w
use 5.018;      # feature bundle supplies say() and fc()
use strict;
use warnings;

# 1123881a.pl (cf [id=1123881])
# $file1 is used in place of OP's text file1; DATA stands in for text file 2
# perl 5, version 18, subversion 4 (v5.18.4) built for MSWin32-x86-multi-thread-64int
# ...
# Binary build 1804 [298913] provided by ActiveState http://www.ActiveState.com
# Built Mar 19 2015 17:49:00
#
# Purpose: count the distinct words of two texts, report which words of the
# first text also occur in the second, and print a few crude similarity ratios.
#
# Review notes:
#   * The second text is read with <DATA>; only the first line after __DATA__
#     is consumed, everything below it is ignored by the program.
#   * Words are compared whole and case-insensitively (fc), by hash lookup.
#     An earlier revision used an unanchored substring regex, so "run" matched
#     "runs", "dog's" was reduced to "s", and "Now" did not match "now".
#   * Only the entries of the first array are visited, so arrays of different
#     lengths no longer trigger "uninitialized value" warnings.
#   * The transcript kept after the "####" separator below was recorded with
#     the earlier revision; for the built-in data this version reports
#     26 matches and 3 mismatches instead of 27 and 2.
#   * The "Ln NN" tags inside the printed messages are historical labels and
#     do not correspond to line numbers in this file.

my $file1 = "Now is the time for the quick red fox to jump over the lazy brown dog's spooon while all good men run away with the fork and come to the aid of their country";

# An empty DATA section yields undef; fall back to '' so chomp/split stay quiet.
my $file2 = <DATA> // '';
chomp $file2;

# Individual words; split ' ' also tolerates leading/repeated whitespace.
my @F1 = split ' ', $file1;
my @F2 = split ' ', $file2;

# word => number of occurrences, one hash per text
my (%F1, %F2);
$F1{$_}++ for @F1;
$F2{$_}++ for @F2;

say "\n\t --- Ln 25 Printing keys and values for the HASHES, \%F1 and \%F2\n\t\t ...and creating ARRAYS \@F1combined and \@F2combined.";

# "word => count" strings, printed in hash order exactly as they are collected.
my @F1combined = map { "$_ => $F1{$_}" } keys %F1;
print "$_\n" for @F1combined;

say "\n\t --- \%F2, next: ---";
my @F2combined = map { "$_ => $F2{$_}" } keys %F2;
print "$_\n" for @F2combined;

my @sort_arr1 = sort { fc($a) cmp fc($b) } @F1combined;    # fc to normalize
my @sort_arr2 = sort { fc($a) cmp fc($b) } @F2combined;

# Case-folded lookup table of every word present in the second text.
my %in_file2 = map { fc($_) => 1 } keys %F2;

# Trailing " => N" part of an entry; stripped so that words with differing
# counts in the two texts still compare equal.
my $counter = qr/ => \d+\z/;

my $match_count = 0;
my $mismatch    = 0;

# Size of the longer of the two arrays; used as the divisor of the first ratio.
my $item_count = @sort_arr1 > @sort_arr2 ? scalar @sort_arr1 : scalar @sort_arr2;
say "\t\t \$item_count: $item_count";

for my $i ( 0 .. $#sort_arr1 ) {
    my $entry = $sort_arr1[$i];
    say ">> Ln 56 \$i: $i |$entry| ";    # can be used for DEBUG

    # \S+ keeps apostrophes and other punctuation inside the word ("dog's").
    next unless $entry =~ /\A(\S+)$counter/;
    my $word = $1;

    if ( $in_file2{ fc $word } ) {
        say "\t found |$word| in both arrays (files) \n";
        $match_count++;
    }
    else {
        say "\t didn't match entry, |$entry| \n";
        $mismatch++;
    }
}

say "\n\t \$match_count: $match_count";
say "\t \$mismatch: $mismatch";

my $element_total = $match_count + $mismatch;

say "\n\t SLOPPY SPEC: among other issues, does not treat cases where the number of instances of a word in one file\n"
  . "\t is different than the number of instances in the second file as a mismatch (eg. if the word is in both, even\n"
  . "\t though in differing quanties, it's treated as a match.";
say "\t Words are compared whole and case-insensitively; only the words of the first array (file) are looked up.\n";

say "\n\t Here's one measure of SIMILARITY (using matchs/total elements evaled): " . _ratio( $match_count, $item_count );
say "\n\t Another uses the total of matches and mismatches as the divisor: " . _ratio( $match_count, $element_total );
say "\n\t Magnitude of DIS-similarty (using the ratio of mismatches/matches) : " . _ratio( $mismatch, $match_count );
say "\n\t By the same sloppy spec, but using mismatch/elements_in_first_array): " . _ratio( $mismatch, scalar @sort_arr1 );

# Divide $numerator by $denominator; returns the string 'n/a' instead of dying
# with "Illegal division by zero" when the denominator is 0 (e.g. no matches).
sub _ratio {
    my ( $numerator, $denominator ) = @_;
    return $denominator ? $numerator / $denominator : 'n/a';
}

__DATA__
now is the time for all good men to come to the aid of their country while the quick red fox jumps over the lazy brown dog's back and the fork runs away with the spoon
####
D:\> 1123881a.pl

	 --- Ln 25 Printing keys and values for the HASHES, %F1 and %F2
		 ...and creating ARRAYS @F1combined and @F2combined.
brown => 1 all => 1 their => 1 country => 1 is => 1 fox => 1 aid => 1 while => 1 to => 2 time => 1 for => 1 Now => 1 spooon => 1 dog's => 1 of => 1 away => 1 and => 1 with => 1 over => 1 run => 1 the => 5 quick => 1 fork => 1 red => 1 come => 1 men => 1 good => 1 lazy => 1 jump => 1 --- %F2, next: --- brown => 1 all => 1 runs => 1 jumps => 1 their => 1 country => 1 back => 1 is => 1 fox => 1 aid => 1 time => 1 while => 1 to => 2 for => 1 spoon => 1 dog's => 1 of => 1 away => 1 now => 1 with => 1 the => 6 and => 1 over => 1 fork => 1 quick => 1 red => 1 come => 1 men => 1 lazy => 1 good => 1 $item_count: 30 # some output, similar to the following, has been deleted for the sake of brevity # (even so, brevity is not present in abundance) ... >> Ln 56 $i: 22 |spooon => 1| didn't match entry, |spooon => 1| >> Ln 56 $i: 23 |the => 5| # No allowance for variance in occurrences, 5 in file1 and 6 in DATA found |the| in both arrays (files) # Part of the loose spec; should this count for similarity or dis-similarity? >> Ln 56 $i: 24 |their => 1| found |their| in both arrays (files) >> Ln 56 $i: 25 |time => 1| found |time| in both arrays (files) >> Ln 56 $i: 26 |to => 2| found |to| in both arrays (files) ... Use of uninitialized value $entry in scalar chomp at D:\_Perl_\pl_test\1123881a.pl line 55, <DATA> line 1. Use of uninitialized value $entry in concatenation (.) or string at D:\_Perl_\pl_test\1123881a.pl line 56, <DATA> line 1. >> Ln 56 $i: 29 || Use of uninitialized value $entry in pattern match (m//) at D:\_Perl_\pl_test\1123881a.pl line 57, <DATA> line 1. $match_count: 27 $mismatch: 2 SLOPPY SPEC: (AMONG OTHER ISSUES) does not treat cases where the number of instances of a word in one file is different than the number of instances in the second file as a mismatch (eg. if the word is in both, though in differing quanties, it's treated as a match. 
No allowance made for use with arrays having different numbers of elements (variance produces 'uninitialised' warnings) Here's one measure of SIMILARITY (using matchs/total elements evaled): 0.9 Another uses the total of matches and mismatches as the divisor: 0.931034482758621 Magnitude of DIS-similarty (using the ratio of mismatches/matches) : 0.0740740740740741 By the same sloppy spec, but using mismatch/elements_in_first_array): 0.0689655172413793