Category: Utility Scripts
Author/Contact Info: djw - djw@perldev.org
Description: This recursively searches a directory for duplicate files over a specified minimum size (default: 100 MB). It logs everything and makes you dinner.

Enjoy, djw

*update* 04.24.2002: After some suggestions from fellow monks, I have changed this to use Digest::MD5. Thanks everyone++. djw
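In case the Digest::MD5 idiom is new to you, this is the core of what the script now does per file, pulled out as a minimal standalone sketch (the usage line and variable names here are just for illustration):

    #!/usr/bin/perl -w
    use strict;
    use Digest::MD5;

    # Hash a file's contents; identical contents give identical
    # digests, which is what lets the main script bucket
    # duplicates by hash key.
    my $file = shift || die "Usage: $0 <file>\n";
    open(FH, $file) || die "Can't open $file: ($!)\n";
    binmode(FH);    # don't let line-ending translation skew the digest
    my $digest = Digest::MD5->new->addfile(*FH)->hexdigest;
    close(FH);
    print "$file: $digest\n";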
#!/usr/bin/perl -w
use strict;
use File::stat;
use Digest::MD5;
use File::Find qw(finddepth);
use Time::HiRes qw(gettimeofday);

# -----------
# 1024 x 1024 = 1048576
# kbytes x 1024 = 1 MB
use constant MINFILESIZE => 104857600;    # 100 MB
use constant ONEMEGABYTE => 1048576;

use vars qw(%fileInfo $number $totalSpace);
my ($totalFiles, $duplicateFiles) = (0, 0);
$totalSpace = 0;
my $dir = shift || &usage;

print STDOUT "\nRunning. This could take a few minutes....";

# -----------
# turn off buffering so the progress message above shows up
# immediately.
$| = 1;

# -----------
# Redirecting standard error output to 'error.log' - it can
# get large if there are permission issues during the
# recursive search.
open(OLDERR, ">&STDERR");
open(STDERR, '>', "error.log") || die "Can't redirect STDERR: ($!)\n";
select(STDERR);

# -----------
# I wanted to see how long it would take for this
# to search through large volumes.
#
# 89.2 minutes to search through a mounted drive
# (130 GB of data) over a 100mbit switched network.
# Found 4 duplicates that were over 100 MB in size.
#
# 812.5 MB of total duplicated space.
my $beginRun = gettimeofday;
finddepth \&search, $dir;
my $endRun = gettimeofday;
my $runTime = $endRun - $beginRun;

# -----------
# translate seconds into an appropriate unit for display
# later. precise? nah... (hours must be tested before
# minutes, or the hours branch can never be reached)
if ($runTime > 3600) {
    $runTime = sprintf("%.2f hours", $runTime / 3600);
}
elsif ($runTime > 60) {
    $runTime = sprintf("%.2f minutes", $runTime / 60);
}
else {
    $runTime = sprintf("%.2f seconds", $runTime);
}

print STDOUT "Complete.\n";

# -----------
# This writes file info to our 'duplicates.log' file.
# [filename], [size], [quantity] (greater than 1)
&report;

close(STDERR);
close(OLDERR);

sub search {
    # -----------
    # The Meat (tm).
    #
    # Using File::Find this recursively searches
    # through each directory from the directory
    # given at runtime. It checks to see if each
    # file is of the size we are curious about.
    #
    # If it is, we get the MD5 digest info for the
    # file to see if we already have it in our
    # hash. If it exists, we increment the
    # counter; if not, a new key gets created
    # (using the MD5 digest).
    if (-f) {
        $totalFiles++;
        my $fsize = stat($_)->size;
        if ($fsize > MINFILESIZE) {
            open(MD5FILE, $_) || do {
                warn "Can't open file ($_): ($!)\n";
                return;
            };
            binmode(MD5FILE);
            my $md5hash = Digest::MD5->new->addfile(*MD5FILE)->hexdigest;
            close(MD5FILE);
            if (exists($fileInfo{$md5hash})) {
                $fileInfo{$md5hash}[2]{count} += 1;
            }
            else {
                $fileInfo{$md5hash}[0]{filename} = $_;
                $fileInfo{$md5hash}[1]{size}     = $fsize;
                $fileInfo{$md5hash}[2]{count}    = 1;
            }
        }
    }
}

sub report {
    # throw away anything we only saw once
    foreach (keys %fileInfo) {
        delete $fileInfo{$_} if ($fileInfo{$_}[2]{count} < 2);
    }
    if (%fileInfo) {
        open(LOG, "+>duplicates.log") || die "Can't create logfile: ($!)\n";
        foreach (keys %fileInfo) {
            $duplicateFiles++;
            $number = sprintf("%.1f", $fileInfo{$_}[1]{size} / ONEMEGABYTE);
            my $duplicateSpace = $number * ($fileInfo{$_}[2]{count} - 1);
            $totalSpace += $duplicateSpace;
            write(LOG);
        }
        close(LOG);
        print STDOUT "\nFound $duplicateFiles/$totalFiles duplicate files.\n";
        print STDOUT "Runtime: $runTime.\n";
        print STDOUT "Duplicated Space: $totalSpace MB\n";
    }
    else {
        print STDOUT "\nNo duplicates found - 0/$totalFiles files.\n";
        print STDOUT "Runtime: $runTime.\n";
    }
}

sub usage {
    print "Usage: ./duplicates.pl [dirname]\n";
    print "\n";
    print "BAD MR. KITTY!\n\nMake sure you supply a directory to search through!\n";
    print "Example: ./duplicates.pl /home/foo/\n";
    exit;
}

format LOG_TOP =
FILENAME                                 SIZE     QTY
-----------------------------------------------------------
.

format LOG =
@>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @>>>> MB (@>)
$fileInfo{$_}[0]{filename}, $number, $fileInfo{$_}[2]{count}
.
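To sanity-check it, you can seed a directory with two identical oversized files and point the script at it. Something like this (a hypothetical session - paths, timings, and counts are purely illustrative):

    $ mkdir -p /tmp/dupes/sub
    $ perl -e 'print "x" x (150 * 1024 * 1024)' > /tmp/dupes/a.bin
    $ cp /tmp/dupes/a.bin /tmp/dupes/sub/b.bin
    $ ./duplicates.pl /tmp/dupes

    Running. This could take a few minutes....Complete.

    Found 1/2 duplicate files.
    Runtime: 0.84 seconds.
    Duplicated Space: 150 MB

The matching set also lands in duplicates.log using the format defined at the bottom of the script, and any permission errors end up in error.log.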