in reply to Find duplicate files with exact same files noted

Nice. Instead of File::Compare, I use Digest::MD5 or similar, and I don't care much about the file name, as I also want to find duplicate binary files with different names, such as MP3 or JPG files. To find duplicate files:

$ cp tshirt.jpg duplicate.image
$ dups.pl
I've MD5'd 191 files to 191 checksums
./3.jpg	./image00111.jpg
./4.jpg	./image00222.jpg
./duplicate.image	./tshirt.jpg
$ dups.pl -q image
./duplicate.image
./image00111.jpg
./image00222.jpg
./image00222surf.jpg
./image00333.jpg
./image00554.jpg
./image00665.jpg
./image00776.jpg
./image00887.jpg
./image009.jpg
./image00998.jpg
./image010109.jpg
./image011.jpg
./image0121210.jpg
$
#!/pro/bin/perl

# dups.pl — find duplicate files under the current directory by MD5
# checksum, caching path => digest pairs in a DB_File database
# ("dups.md5") so that repeated runs only hash files not seen before.
#
# Usage:
#   dups.pl          scan, update the cache, report duplicate sets
#   dups.pl -q PAT   query cached paths matching pattern(s) PAT

use strict;
use warnings;

use Digest::MD5 qw( md5_hex );
use DB_File;
use File::Find;
use Getopt::Long qw( :config bundling nopermute );

my $opt_q = 0;    # Query the database instead of scanning
GetOptions (
    "q"     => \$opt_q,
    ) or die "usage: dups.pl [-q]\n";

my %sum;    # digest => [ paths seen this run with that digest ]
tie my %md5, "DB_File", "dups.md5";    # persistent path => digest cache

if ($opt_q) {
    # Query mode: print every cached path matching any pattern given
    my @db = sort keys %md5;
    untie %md5;
    foreach my $pat (@ARGV) {
        print "$_\n" for grep m/$pat/i => @db;
        }
    exit;
    }

my $nfile = 0;
find (sub {
    # Fold checksums from nested dups.md5 databases into ours, so
    # sub-directory scans done earlier are reused, never recomputed
    if (-d and -f "$_/dups.md5") {
        tie my %d5, "DB_File", "$_/dups.md5";
        foreach my $f (keys %d5) {
            $md5{"$File::Find::name/$f"} //= $d5{$f};
            }
        untie %d5;
        }
    -f or return;
    (my $f = $File::Find::name) =~ s{^_new/}{};
    printf STDERR " %6d %-70.70s\r", ++$nfile, $f;
    if (exists $md5{$f}) {
        # Cached — record the path without re-reading the file
        push @{$sum{$md5{$f}}}, $f;
        return;
        }
    local $/;    # slurp mode
    # 3-arg open: file names must never be parsed as an open mode
    open my $p, "<", $_ or die "$f: $!\n";
    binmode $p;    # digest raw bytes — files may be binary (MP3/JPG)
    my $sum = md5_hex (<$p>);
    push @{$sum{$md5{$f} = $sum}}, $f;
    }, sort glob "*");

print STDERR "I've MD5'd $nfile files to ", scalar keys %md5, " checksums\n";

# Pipe the report through sort (3-arg pipe-open, no shell) so the
# duplicate sets come out in a stable order
open STDOUT, "|-", "sort" or die "cannot fork sort: $!\n";
foreach my $r (values %sum) {
    my @p = @$r;
    @p > 1 or next;    # only sets with more than one path are duplicates
    # Paths that live in numbered directories sort numerically, not
    # lexically; pad the key list so short matches still compare
    $p[0] =~ m{(?:^|/)\d+/} and @p =
        map  { $_->[0] }
        sort { $a->[1] <=> $b->[1] or
               $a->[2] <=> $b->[2] or
               $a->[0] cmp $b->[0] }
        map  { [ $_, (m/(\d+)\b/g), 0, 0, 0 ] } @p;
    print join "\t", @p;
    print "\n";
    }
close STDOUT;

Enjoy, Have FUN! H.Merijn

Replies are listed 'Best First'.
Re^2: Find duplicate files with exact same files noted
by smahesh (Pilgrim) on Sep 01, 2010 at 04:04 UTC

    Hi, I wrote a similar script using the MD5 hash for detecting duplicates. It is not cleaned up or optimized - but does the intended job. It is now in my tool collection.

    #!/usr/bin/perl
#
# Find duplicate files in specified directories using MD5 digests to
# identify duplicates.
#
# (C) 2009 S.M.Mahesh

use strict;
use warnings;

use File::Find;
use Digest::MD5;

my $version = 0.1;
my %md5sums;                   # digest => [ paths with that digest ]
my $md5 = Digest::MD5->new();  # reused; addfile/hexdigest resets it

# Print usage information and exit with a non-zero status.
sub Usage {
    print <<USAGEDOC;
$0 v$version - FindDuplicate script

USAGE:
    $0 <DIR1> [DIR2...DIRn]

    where,
        DIR1..DIRn  Specifies the directories to search

EXAMPLE:
    $0 /home/user/downloads /home/user/documents
USAGEDOC
    exit 1;
}

# File::Find callback: digest each plain file and record its path
# under that digest in %md5sums.
sub wanted {
    return unless -f $File::Find::name;  # plain files only
    return if -l $File::Find::name;      # skip symlinks (no double counting)

    # 3-arg open with a lexical handle; binmode so the digest covers
    # the raw bytes of binary files too
    if (open(my $fh, '<', $File::Find::name)) {
        binmode($fh);
        my $sum = $md5->addfile($fh)->hexdigest();
        close($fh);
        # push autovivifies the array ref on first sight of a digest
        push @{ $md5sums{$sum} }, $File::Find::name;
    }
    else {
        print "ERROR: Could not open '$File::Find::name' for reading\n";
    }
    return;
}

Usage() unless @ARGV;

foreach my $dir (@ARGV) {
    print "$dir \n";
    unless (-d $dir) {
        print "ERROR: '$dir' is not a valid directory\n";
        next;
    }
    find(\&wanted, $dir);
}

print "\n", '-' x 25, "\n";
print "Printing duplicate files (if any)\n";
print '-' x 25, "\n\n";

# Report every digest shared by two or more paths
foreach my $sum (sort keys %md5sums) {
    my $list = $md5sums{$sum};
    if (@$list > 1) {
        print "$sum :\n";
        foreach my $file (@$list) {
            print "\t $file\n";
        }
        print "\n";
    }
}
print '-' x 25, "\n";

    Mahesh