use strict; use warnings; #selection criteria my @SELECTED_FIELDS=(1,2,3); my $VALUE_TO_CANONICAL_FORM = sub { return shift; }; #lines for two dummy files my @aFile1=('111,A1,B1,C1' , '121,A1,B2,C1' , '112,A1,B1,C2' , '222,A2,B2,C2' , '221,A2,B2,C1'); # , '112,A1,B1,C2' # , '113,A1,B1,C3' # , '121,A1,B2,C1' # , '122,A1,B2,C2' # , '123,A1,B2,C3' # , '131,A1,B3,C1' # , '132,A1,B3,C2' # , '133,A1,B3,C2'); my @aFile2=(@aFile1); #---------------------------------------------------------- # selection if we don't care about combinations and only # care that all possible values in selected fields are # represented. #---------------------------------------------------------- print "If we don't care about unique combos of selected fields...\n"; my %hFields = map { $_ => {} } @SELECTED_FIELDS; foreach my $aFileLines (\@aFile1, \@aFile2) { foreach my $sLine (@$aFileLines) { #should probably use Text::CVS as per jethro unless #you are *absolutely* sure that a comma never appears #as field values, and whitespace around commas is always #part of the value rather than part of the separator my @aFields = split(/,/, $sLine); #print join('|', @aFields) . "\n"; my @aDupFields; my $aLineWithNewValue=[$sLine,0]; for my $iField (@SELECTED_FIELDS) { my $sValue = &$VALUE_TO_CANONICAL_FORM($aFields[$iField]); if (exists $hFields{$iField}->{$sValue}) { push @aDupFields, $iField; } else { #keep track of number of fields with new values $aLineWithNewValue->[1]++; #save line as sample record containing the value $hFields{$iField}->{$sValue} = $aLineWithNewValue; } } #try to reduce the number of lines needed to display all values #by revisiting the values that have already been found and #replacing their sample line with the new one containing the all #ready found value and at least one new value #Note: this reduction only eliminates lines with just one new #value in favor of lines with two or more new values. It would #have to be modified significantly to handle the possibility of #lines with N new values. foreach (@aDupFields) { my $iField = $_; my $sValue = &$VALUE_TO_CANONICAL_FORM($aFields[$iField]); my $aSavedLine = $hFields{$iField}->{$sValue}; if (1 == $aSavedLine->[1]) { $aLineWithNewValue->[1]++; $hFields{$iField}->{$sValue}=$aLineWithNewValue; } } } } #print out lines my %hPrinted; while (my ($iField,$hValues) = each(%hFields)) { while (my ($sValue, $aSavedLine) = each (%$hValues)) { my $sLine = $aSavedLine->[0]; unless (exists $hPrinted{$sLine}) { print "$sLine\n"; $hPrinted{$sLine} = 1; } } }