#!/usr/bin/perl
#
# Pairwise difference ("variance") counts between patent records.
#
# Input: CSV text, one patent per line (x 8 million lines), laid out as
#
#   patent #, char1, char2, char3, ... , char480
#   1234567,1,0,1,0,1,0, ... (480 characteristics)
#
# Usage: perl SCRIPT START END [INPUT_FILE]
#
#   START, END   zero-based line indexes (inclusive). Every line in
#                START..END is compared against every line that follows it
#                in the input.
#   INPUT_FILE   optional path of the CSV input; STDIN is read when omitted.
#
# Output: variance.csv is (re)created in the current directory with one
# "patent1,patent2,variance" row per compared pair, where variance is the
# number of characteristic positions whose values differ numerically.
#
# NOTE(review): the statement that opened and slurped the input file was
# garbled in the original source, so the original hard-coded input path is
# unknown. Taking it from an optional third argument (else STDIN) is an
# assumption -- confirm against how this script is actually invoked.

use strict;
use warnings;

use constant OUTPUT_FILE => 'variance.csv';

# Read every input line into memory and strip the line ending (LF or CRLF).
# Reads from $path when given, otherwise from STDIN. Returns an array ref.
sub read_records {
    my ($path) = @_;

    my $in;
    if (defined $path) {
        open($in, '<', $path)
            or die("Could not open file $path: $!\n");
    }
    else {
        $in = \*STDIN;
    }

    my @lines = <$in>;
    s/\r?\n\z// for @lines;

    if (defined $path) {
        close($in) or die("Could not close file $path: $!\n");
    }
    return \@lines;
}

# Count the positions at which two characteristic lists differ numerically.
# Positions are taken from the first list; a position missing from the
# second list is compared as 0, which is the numeric value Perl gives undef.
sub count_differences {
    my ($first, $second) = @_;

    my $differences = 0;
    for my $k (0 .. $#{$first}) {
        my $other = $k <= $#{$second} ? $second->[$k] : 0;
        $differences++ if $first->[$k] != $other;
    }
    return $differences;
}

# Entry point: validate the arguments, load the records and write one
# variance row per pair to OUTPUT_FILE. Returns 0 on success; dies on bad
# arguments or I/O failure.
sub main {
    my ($start_at, $end_at, $input_path) = @_;

    for my $index ($start_at, $end_at) {
        die("Usage: $0 START END [INPUT_FILE]\n")
            unless defined $index && $index =~ /\A\d+\z/;
    }

    my $lines = read_records($input_path);

    # Never walk past the last line that was actually read.
    $end_at = $#{$lines} if $end_at > $#{$lines};

    # Open the output once: '>' truncates any previous file, and keeping the
    # handle open avoids an open/close cycle for every compared pair.
    open(my $out, '>', OUTPUT_FILE)
        or die("Could not open file variance.csv!\n");

    for my $i ($start_at .. $end_at) {
        my ($patno1, @record1) = split(/,/, $lines->[$i]);

        # Compare against every later line, including the last one.
        for my $j ($i + 1 .. $#{$lines}) {
            my ($patno2, @record2) = split(/,/, $lines->[$j]);
            my $variance = count_differences(\@record1, \@record2);
            print {$out} $patno1 . "," . $patno2 . "," . $variance . "\n";
        }
    }

    # Buffered write errors only surface at close, so check it.
    close($out) or die("Could not close file variance.csv: $!\n");
    return 0;
}

# Run only when executed as a script, so the subs can be loaded for testing.
exit main(@ARGV) unless caller;

1;