Re: file comparison using file open in binary mode.

Since you seem to be making progress, and are perhaps just stumbling over some hash comparison issues, I took a whack at writing the code. I incorporated some of the ideas from zwon and johngg but mostly I just made it up as I went along.

Note how I tried to break down the problem into smaller problems each handled by a subroutine.

#!/usr/bin/perl -w
use strict;
use warnings;
use diagnostics;

# Given:
# Two arrays of filenames with full path.
#
# Goal:
# Find identical files between the two lists.  Cannot compare by name.
#
# Strategy:
# 1. Gather size information on every file.
# 2. For any files between lists with identical sizes, gather digest
#    information.
# 3. For any files between lists with identical digests, print info.

use Digest;

# Candidate files from each side of the comparison.
my @FileList1 = populate_file_list1();  # Fill out file list somehow.
my @FileList2 = populate_file_list2();  # Fill out file list somehow.

# Bucket every regular file by size, then keep only the sizes that occur
# on both sides: files of different sizes cannot be identical.
my (%Sizes1, %Sizes2);
find_sizes(\@FileList1, \%Sizes1);
find_sizes(\@FileList2, \%Sizes2);
my @duplicate_sizes = find_duplicate_keys(\%Sizes1, \%Sizes2);

# Flatten the per-size buckets into the lists of files that actually need
# a digest calculated.
my @SizedFileList1 = map { @{ $Sizes1{$_} } } @duplicate_sizes;
my @SizedFileList2 = map { @{ $Sizes2{$_} } } @duplicate_sizes;

# Bucket the remaining candidates by digest and find the digests that
# occur on both sides.
my (%Digests1, %Digests2);
find_digests(\@SizedFileList1, \%Digests1);
find_digests(\@SizedFileList2, \%Digests2);
my @duplicate_digests = find_duplicate_keys(\%Digests1, \%Digests2);

# Report every cross-list pairing of files that share a digest.
for my $digest (@duplicate_digests)
{
    my @matches1 = @{ $Digests1{$digest} };
    my @matches2 = @{ $Digests2{$digest} };

    for my $file1 (@matches1)
    {
        for my $file2 (@matches2)
        {
            print "Duplicate found: $file1 => $file2\n";
        }
    }
}

exit 0;

#------------------------------------------------
# find_sizes
# Given references to filelist and hash,
# Store size of each regular file in the hash.
# Store array of all regular files with same size
# in the hash, keyed on the file size.
#------------------------------------------------
sub find_sizes
{
    # Arguments:
    #   $filelist - reference to an array of filenames (FileListN)
    #   $sizes    - reference to a hash (SizesN) that is filled in place;
    #               each key is a size in bytes and each value is an array
    #               reference of the filenames having that size.
    my ($filelist, $sizes) = @_;

    foreach my $file (@$filelist)
    {
        # lstat rather than stat, so a symbolic link is examined itself
        # instead of the file it points to.
        my @stats = lstat $file;

        # Nothing to record when the lstat itself failed (e.g. missing file).
        next if !@stats;

        # Ignore if not a regular file.  "_" reuses the lstat result above.
        next if !-f _;

        # Element 7 of the stat list is the size in bytes.
        # Save filename with others of same size.
        push @{ $sizes->{ $stats[7] } }, $file;
    }

    return;
}

#------------------------------------------------
# find_digests
# Given references to filelist and hash,
# Store digest of each regular file in the hash.
# Store array of all regular files with same digest
# in the hash, keyed on the file digest.
#------------------------------------------------
sub find_digests
{
    # Arguments:
    #   $filelist - reference to an array of filenames (SizedFileListN)
    #   $digests  - reference to a hash (DigestsN) that is filled in place;
    #               each key is a digest string and each value is an array
    #               reference of the filenames producing that digest.
    my ($filelist, $digests) = @_;

    foreach my $file (@$filelist)
    {
        # Calculate digest on file.
        my $dval = calc_digest($file);

        # Skip unreadable files: calc_digest returns undef when the file
        # cannot be opened.
        next if !defined $dval;

        # Save filename with others of same digest.
        push @{ $digests->{$dval} }, $file;
    }

    return;
}

#------------------------------------------------
# find_duplicate_keys
# Given references to two hashes,
# Return array of keys that occur in both hashes.
#------------------------------------------------
sub find_duplicate_keys
{
    # Both arguments are hash references; only their keys are examined.
    my ($hash1, $hash2) = @_;

    # Count in how many of the two hashes each key appears.  Keys are
    # unique within a single hash, so a count above one means the key is
    # present in both.
    my %tally;
    for my $href ($hash1, $hash2)
    {
        $tally{$_} += 1 for keys %$href;
    }

    # The order of the returned keys is whatever order the hash yields.
    my @shared = grep { $tally{$_} > 1 } keys %tally;
    return @shared;
}

#------------------------------------------------
# calc_digest
# Given a filename,
# Calculate a digest algorithm on the content.
#------------------------------------------------
sub calc_digest
{
    # Argument: $file - name of the file to read.
    # Returns the MD5 digest of the file's bytes as a hex string, or undef
    # (after a warning) when the file cannot be opened.
    my ($file) = @_;
    my $fh;
    if (!open($fh, "<", $file))
    {
        warn ("Cannot open $file: $!");
        return undef;
    }

    # Read raw bytes.  Without this, platforms that translate line endings
    # or treat ^Z as end-of-file in text mode would hash something other
    # than the file's real content, so identical files could appear to
    # differ (or truncated reads could appear identical).
    binmode($fh);

    my $ctx = Digest->new("MD5");   # Choose digest algorithm
    $ctx->addfile($fh);             # Reads the handle through to end-of-file
    close $fh;
    return $ctx->hexdigest;
}

#------------------------------------------------
# populate_file_list1
# populate_file_list2
#
# Fill file list.
#
# The code below is for testing only.
#------------------------------------------------
sub populate_file_list1
{
    # Testing stub: expand the wildcard for the dirA fixture directory,
    # resolved relative to the current working directory.
    my $pattern = "find.test.dir/dirA/*";
    return glob($pattern);
}
sub populate_file_list2
{
    # Testing stub: expand the wildcard for the dirB fixture directory,
    # resolved relative to the current working directory.
    my $pattern = "find.test.dir/dirB/*";
    return glob($pattern);
}
[download]

Comment on Re: file comparison using file open in binary mode. Download Code