in reply to Find duplicate files with exact same files noted

Nice. Instead of File::Compare, I use Digest::MD5 or similar, and I don't care much about the file name, as I also want to find duplicate binary files with different names, such as MP3 or JPG files. To find duplicate files:

$ cp tshirt.jpg duplicate.image
$ dups.pl
I've MD5'd 191 files to 191 checksums
./3.jpg	./image00111.jpg
./4.jpg	./image00222.jpg
./duplicate.image	./tshirt.jpg
$ dups.pl -q image
./duplicate.image
./image00111.jpg
./image00222.jpg
./image00222surf.jpg
./image00333.jpg
./image00554.jpg
./image00665.jpg
./image00776.jpg
./image00887.jpg
./image009.jpg
./image00998.jpg
./image010109.jpg
./image011.jpg
./image0121210.jpg
$
#!/pro/bin/perl

# dups.pl — find duplicate files under the current directory by MD5
# checksum, caching path => digest pairs in a DB_File database
# ("dups.md5") so that repeated runs only hash files not seen before.
#
# Usage:
#   dups.pl          scan, update the cache, report duplicate sets
#   dups.pl -q PAT   query cached paths matching pattern(s) PAT

use strict;
use warnings;

use Digest::MD5 qw( md5_hex );
use DB_File;
use File::Find;
use Getopt::Long qw( :config bundling nopermute );

my $opt_q = 0;    # Query the database instead of scanning
GetOptions (
    "q"     => \$opt_q,
    ) or die "usage: dups.pl [-q]\n";

my %sum;    # digest => [ paths seen this run with that digest ]
tie my %md5, "DB_File", "dups.md5";    # persistent path => digest cache

if ($opt_q) {
    # Query mode: print every cached path matching any pattern given
    my @db = sort keys %md5;
    untie %md5;
    foreach my $pat (@ARGV) {
        print "$_\n" for grep m/$pat/i => @db;
        }
    exit;
    }

my $nfile = 0;
find (sub {
    # Fold checksums from nested dups.md5 databases into ours, so
    # sub-directory scans done earlier are reused, never recomputed
    if (-d and -f "$_/dups.md5") {
        tie my %d5, "DB_File", "$_/dups.md5";
        foreach my $f (keys %d5) {
            $md5{"$File::Find::name/$f"} //= $d5{$f};
            }
        untie %d5;
        }
    -f or return;
    (my $f = $File::Find::name) =~ s{^_new/}{};
    printf STDERR " %6d %-70.70s\r", ++$nfile, $f;
    if (exists $md5{$f}) {
        # Cached — record the path without re-reading the file
        push @{$sum{$md5{$f}}}, $f;
        return;
        }
    local $/;    # slurp mode
    # 3-arg open: file names must never be parsed as an open mode
    open my $p, "<", $_ or die "$f: $!\n";
    binmode $p;    # digest raw bytes — files may be binary (MP3/JPG)
    my $sum = md5_hex (<$p>);
    push @{$sum{$md5{$f} = $sum}}, $f;
    }, sort glob "*");

print STDERR "I've MD5'd $nfile files to ", scalar keys %md5, " checksums\n";

# Pipe the report through sort (3-arg pipe-open, no shell) so the
# duplicate sets come out in a stable order
open STDOUT, "|-", "sort" or die "cannot fork sort: $!\n";
foreach my $r (values %sum) {
    my @p = @$r;
    @p > 1 or next;    # only sets with more than one path are duplicates
    # Paths that live in numbered directories sort numerically, not
    # lexically; pad the key list so short matches still compare
    $p[0] =~ m{(?:^|/)\d+/} and @p =
        map  { $_->[0] }
        sort { $a->[1] <=> $b->[1] or
               $a->[2] <=> $b->[2] or
               $a->[0] cmp $b->[0] }
        map  { [ $_, (m/(\d+)\b/g), 0, 0, 0 ] } @p;
    print join "\t", @p;
    print "\n";
    }
close STDOUT;

Enjoy, Have FUN! H.Merijn

Replies are listed 'Best First'.
Re^2: Find duplicate files with exact same files noted
by smahesh (Pilgrim) on Sep 01, 2010 at 04:04 UTC

    Hi, I wrote a similar script using the MD5 hash for detecting duplicates. It is not cleaned up or optimized - but does the intended job. It is now in my tool collection.

    #!/usr/bin/perl
#
# Find duplicate files in specified directories using MD5 digests to
# identify duplicates.
#
# (C) 2009 S.M.Mahesh

use strict;
use warnings;

use File::Find;
use Digest::MD5;

my $version = 0.1;
my %md5sums;                   # digest => [ paths with that digest ]
my $md5 = Digest::MD5->new();  # reused; addfile/hexdigest resets it

# Print usage information and exit with a non-zero status.
sub Usage {
    print <<USAGEDOC;
$0 v$version - FindDuplicate script

USAGE:
    $0 <DIR1> [DIR2...DIRn]

    where,
        DIR1..DIRn  Specifies the directories to search

EXAMPLE:
    $0 /home/user/downloads /home/user/documents
USAGEDOC
    exit 1;
}

# File::Find callback: digest each plain file and record its path
# under that digest in %md5sums.
sub wanted {
    return unless -f $File::Find::name;  # plain files only
    return if -l $File::Find::name;      # skip symlinks (no double counting)

    # 3-arg open with a lexical handle; binmode so the digest covers
    # the raw bytes of binary files too
    if (open(my $fh, '<', $File::Find::name)) {
        binmode($fh);
        my $sum = $md5->addfile($fh)->hexdigest();
        close($fh);
        # push autovivifies the array ref on first sight of a digest
        push @{ $md5sums{$sum} }, $File::Find::name;
    }
    else {
        print "ERROR: Could not open '$File::Find::name' for reading\n";
    }
    return;
}

Usage() unless @ARGV;

foreach my $dir (@ARGV) {
    print "$dir \n";
    unless (-d $dir) {
        print "ERROR: '$dir' is not a valid directory\n";
        next;
    }
    find(\&wanted, $dir);
}

print "\n", '-' x 25, "\n";
print "Printing duplicate files (if any)\n";
print '-' x 25, "\n\n";

# Report every digest shared by two or more paths
foreach my $sum (sort keys %md5sums) {
    my $list = $md5sums{$sum};
    if (@$list > 1) {
        print "$sum :\n";
        foreach my $file (@$list) {
            print "\t $file\n";
        }
        print "\n";
    }
}
print '-' x 25, "\n";

    Mahesh