Since you seem to be making progress, and are perhaps just stumbling over some hash comparison issues, I took a whack at writing the code. I incorporated some of the ideas from zwon and johngg, but mostly I just made it up as I went along. Note how I tried to break the problem down into smaller problems, each handled by a subroutine.
#!/usr/bin/perl -w
use strict;
use warnings;
use diagnostics;
# Given:
# Two arrays of filenames with full path.
#
# Goal:
# Find identical files between the two lists. Cannot compare by name.
#
# Strategy:
# 1. Gather size information on every file.
# 2. For any files between lists with identical sizes, gather digest information.
# 3. For any files between lists with identical digests, print info.
use Digest;
# Lists of files to compare.
my @FileList1 = populate_file_list1(); # Fill out file list somehow.
my @FileList2 = populate_file_list2(); # Fill out file list somehow.
# Find duplicate sizes.
my (%Sizes1, %Sizes2);
find_sizes(\@FileList1, \%Sizes1);
find_sizes(\@FileList2, \%Sizes2);
my @duplicate_sizes = find_duplicate_keys(\%Sizes1, \%Sizes2);
# Create list of files to calculate the digest on.
my (@SizedFileList1, @SizedFileList2);
foreach my $size (@duplicate_sizes)
{
    push @SizedFileList1, @{ $Sizes1{$size} };
    push @SizedFileList2, @{ $Sizes2{$size} };
}
# Find duplicate digests.
my (%Digests1, %Digests2);
find_digests(\@SizedFileList1, \%Digests1);
find_digests(\@SizedFileList2, \%Digests2);
my @duplicate_digests = find_duplicate_keys(\%Digests1, \%Digests2);
# Print cross-directory digest duplicates.
foreach my $digest (@duplicate_digests)
{
    foreach my $file1 (@{ $Digests1{$digest} })
    {
        foreach my $file2 (@{ $Digests2{$digest} })
        {
            print "Duplicate found: $file1 => $file2\n";
        }
    }
}
exit 0;
#------------------------------------------------
# find_sizes
# Given references to filelist and hash,
# Store size of each regular file in the hash.
# Store array of all regular files with same size
# in the hash, keyed on the file size.
#------------------------------------------------
sub find_sizes
{
    # Pass reference to FileListN array and SizesN hash
    my ($filelist, $sizes) = @_;
    foreach my $file (@$filelist)
    {
        my @stats = lstat $file;               # lstat the file
        next if ! -f _;                        # Ignore if not regular file
        push @{ $$sizes{$stats[7]} }, $file;   # Save filename with others of same size.
    }
}
#------------------------------------------------
# find_digests
# Given references to filelist and hash,
# Store digest of each regular file in the hash.
# Store array of all regular files with same digest
# in the hash, keyed on the file digest.
#------------------------------------------------
sub find_digests
{
    # Pass reference to SizedFileListN array and DigestsN hash
    my ($filelist, $digests) = @_;
    foreach my $file (@$filelist)
    {
        my $dval = calc_digest($file);         # Calculate digest on file.
        next if !defined $dval;                # Skip unreadable files.
        push @{ $$digests{$dval} }, $file;     # Save filename with others of same digest.
    }
}
#------------------------------------------------
# find_duplicate_keys
# Given references to two hashs,
# Return array of keys that occur in both hashs.
#------------------------------------------------
sub find_duplicate_keys
{
    my ($hash1, $hash2) = @_;
    my %seen;
    $seen{$_}++ foreach keys %$hash1;
    $seen{$_}++ foreach keys %$hash2;
    return grep { $seen{$_} >= 2 } keys %seen;
}
#------------------------------------------------
# calc_digest
# Given a filename,
# Calculate a digest algorithm on the content.
#------------------------------------------------
sub calc_digest
{
    my ($file) = @_;
    my $fh;
    if (!open($fh, "<", $file))
    {
        warn "Cannot open $file: $!";
        return undef;
    }
    binmode $fh;                      # Digest the raw bytes, not translated text.
    my $ctx = Digest->new("MD5");     # Choose digest algorithm
    $ctx->addfile($fh);
    close $fh;
    return $ctx->hexdigest;
}
#------------------------------------------------
# populate_file_list1
# populate_file_list2
#
# Fill file list.
#
# The code below is for testing only.
#------------------------------------------------
sub populate_file_list1
{
    return glob("find.test.dir/dirA/*");
}

sub populate_file_list2
{
    return glob("find.test.dir/dirB/*");
}