#!/usr/bin/perl -w #------------------------------------------------------------ # dupfinder: find duplicate files #------------------------------------------------------------ # Copyright Antonio Bellezza 2003 # mail: antonio@beautylabs.net #------------------------------------------------------------ # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License # as published by the Free Software Foundation; # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. #------------------------------------------------------------ use strict; # Show program usage if no arguments are given if ($#ARGV == -1) { usage(); exit 0; } my $finder = Finder -> new( @ARGV ); my @group = $finder -> findDuplicates(); # Result printout and elaboration for my $group ( @group ) { print "Possible duplicates of size $group->[0]{size}:\n"; for (1..$#$group) { print "[$_] $group->[$_]\n"; } print "\nACTION: [] continue [1-$#$group] keep corresponding file and remove the rest\n"; my $input = ; chomp $input; next if $input eq ''; if ($input =~ m|([0-9]+)| && $1 > 0 && $1 <= $#$group) { for (0..($1-1), ($1+1)..$#$group) { my $delendum = $group -> [$_]; print "Unlinking $delendum: "; unlink $delendum; print "done\n"; } } print "\n"; } #---------------------------------------- # usage() # Show usage #---------------------------------------- sub usage { print "Usage: dupfinder dir ...\n", "Find and interactively remove duplicate files\n", } package Finder; #------------------------------------------------------------ # A finder is implemented as a hash # $finder -> {groups} is the array of groups of possibly # equal files # Each group is an array whose first element is a hash # with the various key attributes. # Subsequent elements are the filenames in the group # Example: # [ # [ { size=>0 }, 'empty.txt', 'null.dat', 'nothing_here' ], # [ { size=>1321, hash12=>'xyz' }, 'myfile.a', 'myfile.b' ], # [ { size=>1321, hash12=>'wtt' }, 'myfile.c', 'myfile.d' ] # ] #------------------------------------------------------------ use strict; use IO::File; use File::Find; use Digest::MD5 qw( md5 ); use constant MINREADSIZE => 1024; use constant MAXREADSIZE => 1024 * 1024; use constant BLOCK => 4096; our $VERSION = '1.0'; our $handles = {}; #---------------------------------------- # new ( dir, ... ) # Create new finder #---------------------------------------- sub new { my $class = shift; my $self = { dirs => [ @_ ], groups => [], terminal => [] }; return bless $self, $class; } #---------------------------------------- # readDirs () # Find all files and setup finder #---------------------------------------- sub readDirs { my $self = shift; my $group = [ {} ]; find( sub { -f && ( push @$group, $File::Find::name ) }, @{$self->{dirs}} ); $self -> {groups} = [ $group ]; } #---------------------------------------- # findDuplicates() # Return list of terminal groups #---------------------------------------- sub findDuplicates { my $self = shift; my $hasher; $self -> readDirs(); # print $self -> status; $hasher = { process => \&size, name => 'size', terminal => sub { shift==0 } }; $self -> {groups} = [ $self -> partition( $self -> {groups} [0], $hasher ) ]; # print $self -> status; $self -> prune(); # print $self -> status; for ( @{$self -> {groups}} ) { my @processList = ( $_ ); my $size = $_ -> [0] {size}; my $iterator = Finder::Looper -> new( $size ); while ( @processList and my ( $start, $length ) = $iterator -> next() ) { $hasher = { process => \&sample, args => [ $start, $length ] }; my @newList = (); for (@processList) { my @subgroup = ( $self -> partition( $_, $hasher ) ); $self -> prune( \@subgroup ); push @newList, @subgroup; } @processList = @newList; } closeHandles( @processList ); $self -> addTerminal( @processList ); } return @{ $self -> {terminal} }; } #---------------------------------------- # prune () # prune ( \@group ) # Remove groups only containing one file # If argument is omitted, remove from $self -> {groups} # Add to terminal groups with terminal key # Return number of remaining groups #---------------------------------------- sub prune { my $self = shift; my $src = $_[0] || $self -> {groups}; my $counter = 0; for ( my $i = $#$src; $i>=0; $i--) { my $group = $src -> [$i]; if ( $group -> [0] {terminal} ) { # Remove and add to terminal groups $self -> addTerminal( $group ); closeHandles( $group ); splice @$src, $i, 1; } elsif ( $#$group > 1 ) { # Keep in place $counter ++; } else { # Drop group only containing one file closeHandles( $group ); splice @$src, $i, 1; } } return $counter; } #---------------------------------------- # partition( $group, hasher [, hasher par, ... ] ) # Execute a discriminatory step and create subgroups # Return list of groups # A hasher is a hash ref of type # { # process => sub { taking fileName as first arg, key as second argument }, # name => hash-key name / undef if not added, # terminal => sub { shift is a terminal key or not }, # args => [ extra arguments to pass ] # } #---------------------------------------- sub partition { my $self = shift; my ($group, $hasher, @hasherPar) = @_; my $key = shift @{$group}; my %bucket = (); for (@{$group}) { my $hash = $hasher -> {process} -> ($_, $key, @{ $hasher -> {args} || [] }, @hasherPar); push @{ $bucket {$hash} ||= [] }, $_; } my @result = (); for (keys %bucket) { # Create a clone of the key my $newKey = { %$key }; $newKey -> { $hasher -> {name} } = $_ if $hasher -> {name}; $newKey -> {terminal} = 1 if ( $hasher -> {terminal} && $hasher -> {terminal} -> ($_) ); push @result, [ $newKey, @{$bucket {$_}} ]; } return @result; } #---------------------------------------- # status() # Return string showing finder status #---------------------------------------- sub status { my $self = shift; my $res = 'Groups:'; for (grep {$_ > 0} map {$#$_} @{$self -> {groups}}) { $res .= " $_"; } $res .= "\nTerminal:"; for (grep {$_ > 0} map {$#$_} @{$self -> {terminal}}) { $res .= " $_"; } $res .= "\n"; return $res; } #---------------------------------------- # function #---------------------------------------- # fileHandle( filename ) # return fileHandle or undef #---------------------------------------- sub fileHandle { my ($fileName) = @_; unless ($handles -> {$fileName}) { my $handle = IO::File -> new(); $handle -> open("<$fileName") || return undef; $handles -> {$fileName} = $handle; } return $handles -> {$fileName}; } #---------------------------------------- # function #---------------------------------------- # closeHandle( filename ) # close handle #---------------------------------------- sub closeHandle { my ($fname) = @_; delete $handles -> {$fname}; } #---------------------------------------- # function #---------------------------------------- # closeHandles( $group, ... ) # Close handles of filenames contained in group #---------------------------------------- sub closeHandles { for my $group (@_) { for (1..$#$group) { closeHandle( $group -> [$_] ); } } } #---------------------------------------- # addTerminal( \@file, ... ) # Add to terminal sets arrays of files with given size #---------------------------------------- sub addTerminal { my $self = shift; push @{ $self -> {terminal} }, @_; } { my $error = 0; #---------------------------------------- # hash ( filename, key [, start [, length ]] ) # Apply MD5 to file segment #---------------------------------------- sub hash { my ($fname, $key, $start, $length) = @_; $start ||= 0; my $res; # Return a consecutive error code if unable to open file my $handle = openHandle( $fname ) || return "Error " . $error++; $handle -> seek( $start, 0 ); if ($length) { $handle -> read( $res, $length ); $res = md5( $res ); } else { my $md5 = Digest::MD5->new(); while ( $handle -> read( $res, BLOCK ) ) { $md5->add( $res ); } $res = $md5->digest(); } return $res; } #---------------------------------------- # sample ( filename, key [, start [, length ] ] ) #---------------------------------------- sub sample { my ($fname, $key, $start, $length) = @_; $start ||= 0; my $res; # Return a consecutive error code if unable to open file my $handle = fileHandle( $fname ) || return "Error " . $error++; $handle -> seek( $start, 0 ); if ($length) { $handle -> read( $res, $length ); } else { $res = ''; my $buffer; while ( $handle -> read( $buffer, BLOCK ) ) { $res .= $buffer; } } return $res; } } #---------------------------------------- # size ( filename ) # Find file size #---------------------------------------- sub size { my $fname = shift; return (stat ($_))[7]; } #------------------------------------------------------------ # Finder::Looper #------------------------------------------------------------ # Iterator providing starting points and lengths # for interlaced reads #------------------------------------------------------------ package Finder::Looper; use constant MINREADSIZE => Finder::MINREADSIZE; use constant MAXREADSIZE => Finder::MAXREADSIZE; use constant BLOCK => Finder::BLOCK; #---------------------------------------- # new( size [, minsize [, maxsize ]] ) #---------------------------------------- sub new { my $class = shift; my ( $size, $minsize, $maxsize ) = @_; $minsize ||= MINREADSIZE; $maxsize ||= MAXREADSIZE; bless { size => $size, minsize => $minsize || MINREADSIZE, maxsize => $maxsize || MAXREADSIZE, readsize => $minsize || MINREADSIZE, oldsize => 0, i => 0, gap => 1 << nextLog2( $size ) }, $class; } #---------------------------------------- # next() # return ( start, length ) # return () if the iteration is over #---------------------------------------- sub next { my $self = shift; # Return EOL if the gap has become smaller than the size # unless it's the first iteration ( oldsize = 0 ) if ( $self -> {readsize} > $self -> {gap} && $self -> {oldsize} > 0 ) { return (); } if ( $self -> {i} * $self -> {gap} >= $self -> {size} ) { $self -> {i} = 0; $self -> {oldsize} = $self -> {readsize}; $self -> {gap} >>= 1; $self -> {readsize} <<= 1 if ( $self -> {readsize} < $self -> {gap} && $self -> {readsize} < $self -> {maxsize} ); } my $offset = ( $self -> {i} % 2 ) ? 0 : $self -> {oldsize}; my $start = $self -> {i} * $self -> {gap} + $offset; my $length = $self -> {readsize} - $offset; $length = $self -> {size} - $start if $start + $length > $self -> {size}; $self -> {i} ++; if ( $length <= 0 ) { return $self -> next(); } else { return ( $start, $length ); } } #---------------------------------------- # function #---------------------------------------- # nextLog2( positive integer ) # return exponent of nearest power of 2 # not less than integer # Warning: returns at most the biggest power of # two expressed by an integer #---------------------------------------- sub nextLog2 { my $i = shift; my $pow = 1; my $exp = 0; while ( $pow < $i && $pow > 0 ) { $pow <<= 1; $exp++; } return $exp; } 1;