Duplicate file bounty hunter

by djw (Vicar)
on Apr 24, 2002 at 00:42 UTC ( [id://161492] )
Category: Utility Scripts
Author/Contact Info djw - djw@perldev.org
Description: This recursively searches a directory for duplicate files above a minimum size (set by a constant in the script; default: 100 MB), comparing file contents by MD5 digest. It logs everything and makes you dinner.

Enjoy, djw

*update* 04.24.2002
After some suggestions by fellow monks, I have changed this to use Digest::MD5. Thanks everyone++.

djw
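For reference, the fingerprint idiom at the heart of this update is only a few lines; a minimal sketch (the filename here is illustrative, not part of the script below):

use Digest::MD5;

# Files whose hex digests match are treated as having identical content.
open(FH, '<', 'some.file') || die "Can't open: ($!)\n";
binmode(FH);
print Digest::MD5->new->addfile(*FH)->hexdigest, "\n";
close(FH);

The full script: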
#!/usr/bin/perl -w
use strict;

use File::stat;
use Digest::MD5;
use File::Find  qw(finddepth);
use Time::HiRes qw(gettimeofday);

# -----------
#   1 MB = 1024 x 1024 = 1048576 bytes
# 100 MB = 104857600 bytes

use constant MINFILESIZE => 104857600; # 100 MB
use constant ONEMEGABYTE => 1048576;
use vars qw(%fileInfo $number $totalSpace);

my ($totalFiles, $duplicateFiles) = (0, 0);
my $dir = shift || &usage;

print STDOUT "\nRunning.  This could take a few minutes....";

# -----------    
# turn off buffering for errorlog et al.

$| = 1;

# -----------    
# Redirecting standard error output to 'error.log' - can
# get large if there are permission issues during the 
# recursive search.

open(OLDERR, ">&STDERR");
open(STDERR, '>', "error.log") || die "Can't redirect STDERR: ($!)\n";
select(STDERR);

# -----------    
# I wanted to see how long it would take for this
# to search through large volumes.
#
# 89.2 minutes to search through a mounted drive 
# (130 GB of data) over a 100mbit switched network.
# Found 4 duplicates that were over 100 MB in size.
#
# 812.5 MB of total duplicated space.

my $beginRun = gettimeofday;

finddepth \&search, $dir;

my $endRun = gettimeofday;
my $runTime = $endRun - $beginRun;

# -----------    
# translate seconds into an appropriate unit for display
# later. precise? nah... (check hours before minutes so
# the larger unit wins)

if ($runTime > 3600) {
    $runTime = sprintf("%.2f hours", $runTime / 3600);
} elsif ($runTime > 60) {
    $runTime = sprintf("%.2f minutes", $runTime / 60);
} else {
    $runTime = sprintf("%.2f seconds", $runTime);
}

print STDOUT "Complete.\n";

# -----------    
# This writes file info to our 'duplicates.log' file.
# [filename], [size], [quantity] (greater than 1)

write_log();

close(STDERR);
close(OLDERR);

sub search {
    # -----------    
    # The Meat (tm).
    #
    # Using File::Find this recursively searches
    # through each directory from the directory
    # given at runtime.  It checks to see if each
    # file is of the size we are curious about.
    #
    # If it is, we get the MD5 digest info for the
    # file to see if we already have it in our
    # hash.  If it exists, we increment the
    # counter, if not, a new key gets created
    # (using the MD5 digest).

    if (-f) {
        $totalFiles++;
        my $fsize = stat($_)->size;
        if ($fsize > MINFILESIZE) {

            unless (open(MD5FILE, '<', $_)) {
                warn "Can't open file ($_): ($!)\n";
                return;
            }
            binmode(MD5FILE);
            my $md5hash = Digest::MD5->new->addfile(*MD5FILE)->hexdigest;
            close(MD5FILE);

            if (exists($fileInfo{$md5hash})) {
                $fileInfo{$md5hash}[2]{count} += 1;
            } else {
                # first time we have seen this digest
                $fileInfo{$md5hash}[0]{filename} = $_;
                $fileInfo{$md5hash}[1]{size}     = $fsize;
                $fileInfo{$md5hash}[2]{count}    = 1;
            }
        }
    }
}


sub write_log {
    # Drop entries seen only once - anything left is a duplicate.
    foreach (keys %fileInfo) {
        if ($fileInfo{$_}[2]{count} < 2) {
            delete $fileInfo{$_};
        }
    }

    if (%fileInfo) {
        open(LOG, '>', "duplicates.log") || die "Can't create logfile: ($!)\n";
        foreach (keys %fileInfo) {
            $duplicateFiles++;
            $number = sprintf("%.1f", $fileInfo{$_}[1]{size} / ONEMEGABYTE);

            # count only the space used by the extra copies
            my $duplicateSpace = $number * ($fileInfo{$_}[2]{count} - 1);
            $totalSpace += $duplicateSpace;
            write(LOG);    # the built-in write(), using 'format LOG' below
        }
        close(LOG);
        print STDOUT "\nFound $duplicateFiles/$totalFiles duplicate files.\n";
        print STDOUT "Runtime: $runTime.\n";
        print STDOUT "Duplicated Space: $totalSpace MB\n";
    } else {
        print STDOUT "\nNo duplicates found - 0/$totalFiles files.\n";
        print STDOUT "Runtime: $runTime.\n";
    }
}

sub usage {
    print "Usage: ./duplicates.pl [dirname]\n";
    print "\n";
    print "BAD MR. KITTY!\n\nMake sure you supply a directory to searc
+h through!\n";
    print "Example: ./duplicates.pl /home/foo/\n";

    exit;
}

format LOG_TOP =
                              FILENAME  SIZE       QTY
-----------------------------------------------------------
.

format LOG =
@>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>  @>>>> MB   (@>)
$fileInfo{$_}[0]{filename}, $number, $fileInfo{$_}[2]{count}
.
Re: Duplicate file bounty hunter
by belg4mit (Prior) on Apr 24, 2002 at 01:02 UTC
    One word: MD5.
    PS> I have a utility which does this and much more at snafu. I've been meaning to freshen it up and submit here. It might meet your needs.

    --
    perl -pew "s/\b;([mnst])/'$1/g"

      A previous discussion on this subject lies here: Find duplicate files. Much the same conclusion: use Digest::MD5.


      print@_{sort keys %_},$/if%_=split//,'= & *a?b:e\f/h^h!j+n,o@o;r$s-t%t#u'
      MD5 - never thought of using that, thanks for the tip. I downloaded your utility but will have to take a look at it a bit later. Thanks belg4mit++.

      djw
Re: Duplicate file bounty hunter
by rob_au (Abbot) on Apr 24, 2002 at 02:27 UTC
    I must agree wholeheartedly with the recommendation of belg4mit to explore the use of Digest::MD5 for the comparison of files. The following is a small script that I wrote previously, based upon a node by demerphq here, which may be of use for comparative purposes.

    #!/usr/bin/perl -wT

    use Digest::MD5;
    use File::Find;
    use IO::File;
    use strict;

    $| = 1;
    $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';

    my $ctx = Digest::MD5->new;
    my %digest;
    my $path = $ARGV[0] || '.';

    find({
        'wanted' => sub {
            if (-f $_) {
                lstat;
                # readable, and not a symlink (so a link and its
                # target are not reported as duplicates)
                if ((-r _) && (!-l _)) {
                    $ctx->reset;
                    my $fh = IO::File->new($_, 'r') or return;
                    $ctx->addfile($fh);
                    my $md5 = $ctx->hexdigest;
                    if (exists $digest{$md5}) {
                        push @{$digest{$md5}->{'dupes'}}, $_;
                    }
                    else {
                        $digest{$md5} = { 'file' => $_, 'dupes' => [] };
                    }
                }
            }
            else {
                print "Searching $_\n";
            }
        },
        'no_chdir' => 1
    }, $path);

    my $dupes = 0;
    $dupes += scalar @{$digest{$_}->{'dupes'}} for keys %digest;
    print "There are $dupes duplicate files.\n";

    exit 0;
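    Two details worth noting in the above: 'no_chdir' stops File::Find from chdir()-ing into each directory, so $_ always holds the full path, and a single Digest::MD5 object is reset and reused for each file rather than constructed anew.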

     
