In reply to "file comparison using file open in binary mode".
Since you seem to be making progress, and are perhaps just stumbling over some hash comparison issues, I took a whack at writing the code. I incorporated some of the ideas from zwon and johngg but mostly I just made it up as I went along.
Note how I tried to break down the problem into smaller problems each handled by a subroutine.
#!/usr/bin/perl -w
use strict;
use warnings;
use diagnostics;

# Given:
#   Two arrays of filenames with full path.
#
# Goal:
#   Find identical files between the two lists.  Cannot compare by name.
#
# Strategy:
#   1. Gather size information on every file.
#   2. For any files between lists with identical sizes, gather digest
#      information.
#   3. For any files between lists with identical digests, print info.

use Digest;

# Run the comparison only when this file is executed directly.  When it is
# loaded from other code (require/do), caller() is true and only the
# subroutines are defined, which keeps them reusable and testable.
main() unless caller;

#------------------------------------------------
# main
#   Drive the three-step strategy described above and
#   print one line per cross-list pair of identical files.
#------------------------------------------------
sub main {
    # Lists of files to compare.
    my @file_list1 = populate_file_list1();    # Fill out file list somehow.
    my @file_list2 = populate_file_list2();    # Fill out file list somehow.

    # Find sizes that occur in both lists.
    my (%sizes1, %sizes2);
    find_sizes(\@file_list1, \%sizes1);
    find_sizes(\@file_list2, \%sizes2);
    my @duplicate_sizes = find_duplicate_keys(\%sizes1, \%sizes2);

    # Only files whose size occurs in both lists are worth digesting.
    my (@sized_file_list1, @sized_file_list2);
    foreach my $size (@duplicate_sizes) {
        push @sized_file_list1, @{ $sizes1{$size} };
        push @sized_file_list2, @{ $sizes2{$size} };
    }

    # Find digests that occur in both lists.
    my (%digests1, %digests2);
    find_digests(\@sized_file_list1, \%digests1);
    find_digests(\@sized_file_list2, \%digests2);
    my @duplicate_digests = find_duplicate_keys(\%digests1, \%digests2);

    # Print cross-list digest duplicates.
    foreach my $digest (@duplicate_digests) {
        foreach my $file1 (@{ $digests1{$digest} }) {
            foreach my $file2 (@{ $digests2{$digest} }) {
                print "Duplicate found: $file1 => $file2\n";
            }
        }
    }

    return;
}

#------------------------------------------------
# find_sizes
#   Given references to filelist and hash,
#   store an array of all regular files with the same
#   size in the hash, keyed on the file size.
#   Entries that cannot be lstat'ed or are not regular
#   files (directories, symlinks, ...) are ignored.
#------------------------------------------------
sub find_sizes {
    # Pass reference to file list array and sizes hash.
    my ($filelist, $sizes) = @_;

    foreach my $file (@$filelist) {
        my @stats = lstat $file;    # lstat, so a symlink is not followed
        next if !-f _;              # "_" reuses the lstat result above
        push @{ $sizes->{ $stats[7] } }, $file;    # $stats[7] is the size
    }

    return;
}

#------------------------------------------------
# find_digests
#   Given references to filelist and hash,
#   store an array of all files with the same digest
#   in the hash, keyed on the file digest.
#   Files that cannot be opened are skipped.
#------------------------------------------------
sub find_digests {
    # Pass reference to sized file list array and digests hash.
    my ($filelist, $digests) = @_;

    foreach my $file (@$filelist) {
        my $dval = calc_digest($file);
        next if !defined $dval;     # Skip unreadable files.
        push @{ $digests->{$dval} }, $file;
    }

    return;
}

#------------------------------------------------
# find_duplicate_keys
#   Given references to two hashes,
#   return the keys that occur in both hashes, sorted
#   as strings so that the result order is repeatable.
#   In scalar context the number of common keys is returned.
#------------------------------------------------
sub find_duplicate_keys {
    my ($hash1, $hash2) = @_;

    my @common = sort grep { exists $hash2->{$_} } keys %$hash1;

    return @common;
}

#------------------------------------------------
# calc_digest
#   Given a filename,
#   return the hex MD5 digest of the file content.
#   If the file cannot be opened, warn and return
#   undef (an empty list in list context).
#------------------------------------------------
sub calc_digest {
    my ($file) = @_;

    my $fh;
    if (!open($fh, "<", $file)) {
        warn("Cannot open $file: $!");
        return;
    }

    # Digest the raw bytes.  Without binmode, platforms that translate
    # line endings or honour ^Z in text mode would digest altered content.
    binmode($fh);

    my $ctx = Digest->new("MD5");    # Choose digest algorithm
    $ctx->addfile($fh);
    close $fh;

    return $ctx->hexdigest;
}

#------------------------------------------------
# populate_file_list1
# populate_file_list2
#
#   Fill file list.
#
#   The code below is for testing only.
#------------------------------------------------
sub populate_file_list1 {
    return glob("find.test.dir/dirA/*");
}

sub populate_file_list2 {
    return glob("find.test.dir/dirB/*");
}

1;
|
|---|