| Category: | Utility Scripts |
| Author/Contact Info | djw - djw@perldev.org |
| Description: | This will recursively search a directory for duplicate files above a specified minimum size (default: 100 MB). It logs everything and makes you dinner.
Enjoy, djw *update* 04.24.2002 After some suggestions by fellow monks, I have changed this to use Digest::MD5. Thanks everyone++. djw |
#!/usr/bin/perl -w
use strict;
use File::stat;
use Digest::MD5;
use File::Find qw(finddepth);
use Time::HiRes qw(gettimeofday);
# -----------
# 1024 x 1024 = 1048576
# kbytes x 1024 = 1 MB
use constant MINFILESIZE => 104857600; # 100 MB
use constant ONEMEGABYTE => 1048576;
# %fileInfo and $number must stay package globals: the LOG format at
# the bottom of the file refers to them by name at write() time.
use vars qw(%fileInfo $number $totalSpace);
# BUG FIX: the original "my ($totalFiles, $duplicateFiles) = 0;" only
# initialized $totalFiles and left $duplicateFiles undef; initialize
# both counters explicitly.
my ($totalFiles, $duplicateFiles) = (0, 0);
my $dir = shift || &usage;
# -----------
# Autoflush STDOUT so the progress message below appears immediately.
$| = 1;
print STDOUT "\nRunning. This could take a few minutes....";
# -----------
# Redirect standard error to 'error.log' - it can get large if there
# are permission issues during the recursive search. Keep a dup of
# the original STDERR so it can be closed cleanly at the end.
open(OLDERR, '>&', \*STDERR) || die "Can't dup STDERR: ($!)\n";
open(STDERR, '>', "error.log") || die "Can't redirect STDERR: ($!)\n";
select(STDERR);
# BUG FIX: the original set $| = 1 *before* select(STDERR), so it
# unbuffered STDOUT rather than the error log as its comment intended.
$| = 1;
# -----------
# Time the scan. (Author's benchmark: 89.2 minutes through 130 GB on
# a mounted drive over a 100mbit switched network; 4 duplicates over
# 100 MB found, 812.5 MB of duplicated space.)
my $beginRun = gettimeofday;
finddepth \&search, $dir;
my $endRun = gettimeofday;
my $runTime = $endRun - $beginRun;
# -----------
# Translate seconds into a human-readable duration for the report.
# BUG FIX: the original tested "> 60" before "> 3600", which made the
# hours branch unreachable; test the largest unit first.
if ($runTime > 3600) {
    $runTime = sprintf("%.2f hours", $runTime / 3600);
} elsif ($runTime > 60) {
    $runTime = sprintf("%.2f minutes", $runTime / 60);
} else {
    $runTime = sprintf("%.2f seconds", $runTime);
}
print STDOUT "Complete.\n";
# -----------
# Write duplicate info to 'duplicates.log':
# [filename], [size], [quantity] (greater than 1)
&write;
close(STDERR);
close(OLDERR);
sub search {
    # -----------
    # The Meat (tm) - File::Find callback.
    #
    # File::Find invokes this for every entry under the start
    # directory, with $_ set to the entry's basename and the cwd
    # changed to its directory.
    #
    # For each plain file larger than MINFILESIZE, compute the MD5
    # digest of its contents and record it in %fileInfo keyed by
    # digest: identical contents hash identically, so seeing a digest
    # a second time means a duplicate, and we bump the count.
    if (-f) {
        my $fsize = stat($_)->size;
        if ($fsize > MINFILESIZE) {
            # Lexical handle, 3-arg open. BUG FIX: the original only
            # warned on a failed open and then ran the digest on the
            # unopened handle anyway; skip the file instead.
            if (open(my $md5file, '<', $_)) {
                binmode($md5file);
                my $md5hash =
                    Digest::MD5->new->addfile($md5file)->hexdigest;
                close($md5file);
                if (exists($fileInfo{$md5hash})) {
                    $fileInfo{$md5hash}[2]{count} += 1;
                } else {
                    # First sighting: remember name, size, and a
                    # count of 1 (structure mirrored by the LOG
                    # format at the bottom of the file).
                    $fileInfo{$md5hash}[0]{name}  = $_;
                    $fileInfo{$md5hash}[1]{size}  = $fsize;
                    $fileInfo{$md5hash}[2]{count} = 1;
                }
            } else {
                warn "Can't open file ($_): ($!)\n";
            }
        }
        $totalFiles++;
    }
}
sub write {
    # -----------
    # Report phase. Prunes %fileInfo down to digests seen more than
    # once, writes one formatted line per duplicate set to
    # 'duplicates.log' (via the named LOG/LOG_TOP formats at the
    # bottom of the file), and prints a summary to STDOUT.
    #
    # NOTE: this sub shadows the write() builtin in name only; the
    # write(LOG) call below still reaches the format-output builtin,
    # since user subs do not override builtins without an import.
    foreach my $digest (keys %fileInfo) {
        delete $fileInfo{$digest} if $fileInfo{$digest}[2]{count} < 2;
    }
    if (%fileInfo) {
        # Plain '>' (truncate for writing); the original '+>' opened
        # the log read-write, which this sub never needs.
        # LOG stays a bareword handle deliberately: the LOG format is
        # bound to the handle of the same name.
        open(LOG, '>', 'duplicates.log')
            || die "Can't create logfile: ($!)\n";
        foreach (keys %fileInfo) {
            # (No count-<-2 guard needed here: those entries were
            # deleted by the prune loop above.)
            $duplicateFiles++;
            # $number is a package global because the LOG format
            # refers to it by name.
            $number = sprintf("%.1f",
                              $fileInfo{$_}[1]{size} / ONEMEGABYTE);
            # Wasted space is (count - 1) copies of the file's size.
            my $duplicateSpace = $number * ($fileInfo{$_}[2]{count} - 1);
            $totalSpace += $duplicateSpace;
            write(LOG);
        }
        close(LOG);
        print STDOUT "\nFound $duplicateFiles/$totalFiles duplicate files.\n";
        print STDOUT "Runtime: $runTime.\n";
        print STDOUT "Duplicated Space: $totalSpace MB\n";
    } else {
        print STDOUT "\nNo duplicates found - 0/$totalFiles files.\n";
        print STDOUT "Runtime: $runTime.\n";
    }
}
sub usage {
    # Print invocation help and terminate. Reached when no directory
    # argument is supplied, before STDERR is redirected to the log.
    # FIX: send the message to STDERR and exit nonzero (the original
    # printed usage to STDOUT and exited with success status).
    print STDERR "Usage: ./duplicates.pl [dirname]\n";
    print STDERR "\n";
    print STDERR "BAD MR. KITTY!\n\nMake sure you supply a directory to search through!\n";
    print STDERR "Example: ./duplicates.pl /home/foo/\n";
    exit 1;
}
# -----------
# Report formats for the LOG filehandle (opened in sub write).
# LOG_TOP prints the column header once per page; LOG prints one
# line per duplicate set: filename, size in MB, and copy count.
format LOG_TOP =
FILENAME SIZE QTY
-----------------------------------------------------------
.
format LOG =
@>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @>>>> MB (@>)
$fileInfo{$_}[0]{name}, $number, $fileInfo{$_}[2]{count}
.
|
|
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Duplicate file bounty hunter
by belg4mit (Prior) on Apr 24, 2002 at 01:02 UTC | |
by grinder (Bishop) on Apr 24, 2002 at 14:58 UTC | |
by djw (Vicar) on Apr 24, 2002 at 01:08 UTC | |
|
Re: Duplicate file bounty hunter
by rob_au (Abbot) on Apr 24, 2002 at 02:27 UTC |