1: #!/usr/bin/perl
   2: 
   3: ##########################################################################
   4: #                                                                        #
   5: # How diffy works:                                                       #
   6: # - it simply compares files and prints out the names of identical files #
   7: # - if you give it 2 paths it compares the files in those paths          #
   8: # - if you give it 1 path, it compares just the files in this path       #
   9: # - if you give it no path, the current directory is taken               #
   10: # - if you give the option -h, hardlinks will be shown too               #
  11: #                                                                        #
  12: # TODO:                                                                  #
  13: #  make it faster, smaller and more robust                               #
  14: #  print usage with -h and get better options                            #
  15: #                                                                        #
  16: ##########################################################################
  17: 
  18: $hardlinks = 0;
  19: 
  20: 
  21: if ( "$ENV{'DIFFY_SEARCH_SIZE'}" ) {
  22:     $min_size = $ENV{'DIFFY_SEARCH_SIZE'};
  23: } else {
  24:     $min_size = "20k";
  25: }
  26: 
  27: $numDirs = 0;
  28: foreach (@ARGV) {
  29:     if($_ eq "?") {
  30: 	die "*** Usage: diffy [directoy_1 [directory_2]] [-h]\n\n";
  31:     }
  32:     
  33:     if($_ eq "-h") {
  34: 	$hardlinks = 1;
  35:     } elsif($numDirs < 2) {
  36: 	if($numDirs == 0) { $dir1 = $_ }
  37: 	if($numDirs == 1) { $dir2 = $_ }
  38: 		
  39: 	$numDirs++;
  40:     }
  41: }
  42: 
  43: if($numDirs == 0) {
  44:     $dir1 = ".";
  45:     
  46:     $numDirs = 1;
  47: }
  48: 
  49: if($numDirs == 1) {
  50:     $dir2 = $dir1;
  51: }
  52: 
  53: 
  54: %files1 = &getFiles($dir1);
  55: %files2 = &getFiles($dir2);
  56: 
  57: 
  58: @keys1 = sort { $files1{$a} <=> $files1{$b} } keys(%files1);
  59: @keys2 = sort { $files2{$a} <=> $files2{$b} } keys(%files2);
  60: 
  61: 
  62: $numKeys1 = scalar(@keys1) + 1 ;
  63: $numKeys2 = scalar(@keys2) + 1 ;
  64: 
  65: 
  66: $value       = $files1{$keys1[0]};
  67: $i2          = 0;
  68: $index       = 0;
  69: %savedFiles1 = %empty; # $savedFiles1{index} = filename
  70: %savedFiles2 = %empty;
  71: 
  72: for($i1 = 0; $i1 < $numKeys1; $i1++) {
  73:     if($value != $files1{$keys1[$i1]}) {
  74: 	if($index > 0) {
  75: 	    ### now delete double pairs and print the remaining pairs
  76: 	    
  77: 	    @savedKeys = keys(%savedFiles1);
  78: 	    $numSaved  = scalar(@savedKeys);
  79: 			
  80: 	    while($numSaved > 0) {
  81: 		for($iSave = 1; $iSave < $numSaved; $iSave++) {
  82: 		    if($savedFiles1{$savedKeys[0]} eq $savedFiles2{$savedKeys[$iSave]} &&
  83: 		       $savedFiles2{$savedKeys[0]} eq $savedFiles1{$savedKeys[$iSave]}) {
  84: 			
  85: 			delete $savedFiles1{$savedKeys[$iSave]};
  86: 			delete $savedFiles2{$savedKeys[$iSave]};
  87: 			last;
  88: 		    }
  89: 		}
  90: 		print "ln -f \"".$savedFiles1{$savedKeys[0]}."\""." "."\"".$savedFiles2{$savedKeys[0]}."\""."\n";
  91: 		
  92: 		delete $savedFiles1{$savedKeys[0]};
  93: 		delete $savedFiles2{$savedKeys[0]};
  94: 		
  95: 		@savedKeys = keys(%savedFiles1);
  96: 		$numSaved  = scalar(@savedKeys);
  97: 	    }
  98: 	}
  99: 	$index       = 0;
 100: 	%savedFiles1 = %empty; # just to be sure
 101: 	%savedFiles2 = %empty;
 102: 	
 103: 	$value = $files1{$keys1[$i1]};
 104:     }
 105:     
 106:     while($files2{$keys2[$i2]} < $files1{$keys1[$i1]}) {
 107: 	$i2++;
 108: 	if($i2 > $numKeys2) { exit; }
 109:     }
 110:     
 111:     if($files2{$keys2[$i2]} == $files1{$keys1[$i1]}) {
 112: 	$i2Old = $2;
 113: 	
 114: 	while($files2{$keys2[$i2]} == $files1{$keys1[$i1]}) {
 115: 	    if($keys2[$i2] ne $keys1[$i1]) {
 116: 		open(FILE, $keys1[$i1]);
 117: 		$inode1 = (stat(FILE))[1];
 118: 		close(FILE);
 119: 		
 120: 		open(FILE, $keys2[$i2]);
 121: 		$inode2 = (stat(FILE))[1];
 122: 		close(FILE);
 123: 		
 124: 		if($inode1 == $inode2) {
 125: 		    if($hardlinks == 1) {
 126: 			$index++;
 127: 			$savedFiles1{$index} = $keys1[$i1];
 128: 			$savedFiles2{$index} = $keys2[$i2];
 129: 		    }
 130: 		} else {
 131: 		    if(system("cmp -s \"$keys1[$i1]\" \"$keys2[$i2]\"") == 0) {
 132: 			$index++;
 133: 			$savedFiles1{$index} = $keys1[$i1];
 134: 			$savedFiles2{$index} = $keys2[$i2];
 135: 		    }
 136: 		}
 137: 	    }
 138: 	    $i2++;
 139: 	    if($i2 > $numKeys2) { exit; }
 140: 	}
 141: 	$i2 = $i2Old;
 142: 
 143:         # $i2 gets a reset, because every file in %files1 has to be compared with
 144:         # every file in %files2 of equal size. Otherwise some files would be skipped.	
 145:     }
 146: }
 147: 
 148: sub getFiles {
 149:     local($dir, %list, $file, $size);
 150:     
 151:     $dir = $_[0];
 152:     unless(-d $dir) { die "*** ERROR: $dir is not a directory.\n"; }
 153:     
 154:     open(DIR, "find $dir -size +$min_size -type f -printf \"%p %s\\n\" |") || 
 155: 	die "*** ERROR: Cannot access $dir.\n";
 156:     while(<DIR>) {
 157: 	chop;
 158: 	($file, $size) = split ( /\s/, $_, 2 );
 159: 	if(-f $file) {
 160: 	    $list{$file} = $size;
 161: 	}
 162:     }
 163:     close(DIR);
 164:     
 165:     unless(keys(%list)) { die "*** No files in $dir\n"; }    
 166:     %list;
 167: }

Replies are listed 'Best First'.
Re: diffy
by bikeNomad (Priest) on Jun 22, 2001 at 04:21 UTC
    Interesting script. Some comments:

    You don't have to open a file to stat() it; lines 116-122 could be done with the (faster):

    $inode1 = (stat($keys1[$i1]))[1]; $inode2 = (stat($keys2[$i2]))[1];

    Also, you might want to try the File::Compare module as well as File::Find to eliminate the calls to the external (Unix) cmp and find programs; this would go a long way toward making this script portable.

    Also, Algorithm::Diff might be useful for comparing the lists of files. See File::DiffTree here for an example of using it for that.

    update: added link to DiffTree.

Re: diffy
by CharlesClarkson (Curate) on Jun 22, 2001 at 07:19 UTC

    As it is, getFiles doesn't work on Windows. This rewrite works on my Win98 box.

    # getFiles($dir, $min_size)
    #
    # Pure-Perl variant that does not need the external find program.
    # Looks only at the entries directly inside $dir (it does not descend
    # into subdirectories) and returns a hash mapping "$dir/name" => size
    # for every regular file whose size is at least $min_size.
    #
    # Dies if $dir is not a directory, cannot be opened, or holds no
    # matching file.
    sub getFiles {
        my ($dir, $min_size) = @_;
        my %list;

        die "$dir is not a directory.\n" unless -d $dir;

        # "or" instead of "||": with "||" the die would bind to $dir and a
        # failing opendir would go unnoticed.
        opendir(my $dir_handle, $dir)
            or die "Cannot open directory $dir: $!";

        foreach my $name (readdir $dir_handle) {
            # readdir returns bare names; they must be prefixed with the
            # directory before they can be tested or opened.
            my $path = "$dir/$name";
            next unless -f $path;
            my $size = -s _;    # reuses the stat done by -f
            $list{$path} = $size if $size >= $min_size;
        }
        closedir($dir_handle);

        die "No files in $dir\n" unless keys %list;
        return %list;
    }
    Call it with:
    my %files1 = getFiles($dir1, $min_size);

    HTH,
    Charles K. Clarkson
      Except that his getFiles recurses deeply into subdirectory structures (because he uses the Unix find command) and yours doesn't (it doesn't do subdirectories of the given directory). That's why I suggested he use File::Find.