comment on

The rationale behind this program is: verifying whether two files differ doesn't require reading them completely. The program first groups the files by size, then iteratively subdivides the groups according to some samples of the files. Sample size and distribution increase and identical files are read completely (the only way to prove they are really identical).

Update: It is safe to remove use of Digest::MD5 and sub hash, which could be used to experiment with different hashing methods but are not used by the program in its present form.

Update: this has evolved to a more complex program.

#!/usr/bin/perl -w

#------------------------------------------------------------
# dupfinder: find duplicate files
#------------------------------------------------------------
# Copyright Antonio Bellezza 2003
# mail: antonio@beautylabs.net
#------------------------------------------------------------
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation;
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#------------------------------------------------------------

use strict;

# Show program usage if no arguments are given
if ($#ARGV == -1) {
    usage();
    exit 0;
}

my $finder = Finder -> new( @ARGV );

my @group = $finder -> findDuplicates();

# Result printout and elaboration
for my $group ( @group ) {
    print "Possible duplicates of size $group->[0]{size}:\n";
    for (1..$#$group) {
    print "[$_] $group->[$_]\n";
    }
    print "\nACTION: [] continue  [1-$#$group] keep corresponding file
+ and remove the rest\n";
    my $input = <STDIN>;
    chomp $input;
    next if $input eq '';
    if ($input =~ m|([0-9]+)| && $1 > 0 && $1 <= $#$group) {

    for (0..($1-1), ($1+1)..$#$group) {
        my $delendum = $group -> [$_];
        print "Unlinking $delendum: ";
        unlink $delendum;
        print "done\n";
    }
    }
    print "\n";
}




#----------------------------------------
# usage()
# Show usage
#----------------------------------------
sub usage {
    print "Usage: dupfinder dir ...\n",
    "Find and interactively remove duplicate files\n",
}



package Finder;

#------------------------------------------------------------
# A finder is implemented as a hash
# $finder -> {groups} is the array of groups of possibly
# equal files
# Each group is an array whose first element is a hash
# with the various key attributes.
# Subsequent elements are the filenames in the group
# Example:
# [
#   [ { size=>0 }, 'empty.txt', 'null.dat', 'nothing_here' ],
#   [ { size=>1321, hash12=>'xyz' }, 'myfile.a', 'myfile.b' ],
#   [ { size=>1321, hash12=>'wtt' }, 'myfile.c', 'myfile.d' ]
# ]
#------------------------------------------------------------

use strict;
use IO::File;
use File::Find;
use Digest::MD5 qw( md5 );

use constant MINREADSIZE => 1024;
use constant MAXREADSIZE => 1024 * 1024;
use constant BLOCK   => 4096;

our $VERSION = '1.0';

our $handles = {};


#----------------------------------------
# new ( dir, ... )
# Create new finder
#----------------------------------------
sub new {
    my $class = shift;
    my $self = {
    dirs     => [ @_ ],
    groups   => [],
    terminal => []
    };
    return bless $self, $class;
}

#----------------------------------------
# readDirs ()
# Find all files and setup finder
#----------------------------------------
sub readDirs {
    my $self = shift;
    my $group = [ {} ];
    find( sub { -f && ( push @$group, $File::Find::name ) },
      @{$self->{dirs}} );
    $self -> {groups} = [ $group ];
}




#----------------------------------------
# findDuplicates()
# Return list of terminal groups
#----------------------------------------
sub findDuplicates {
    my $self = shift;
    my $hasher;
    $self -> readDirs();

#    print $self -> status;

    $hasher = { process  => \&size,
        name     => 'size',
        terminal => sub { shift==0 } };

    $self -> {groups} = [ $self -> partition( $self -> {groups} [0], $
+hasher ) ];

#    print $self -> status;

    $self -> prune();

#    print $self -> status;

    
    for ( @{$self -> {groups}} ) {
    my @processList = ( $_ );
    my $size = $_ -> [0] {size};
    my $iterator = Finder::Looper -> new( $size );

    while ( @processList and my ( $start, $length ) = $iterator -> nex
+t() ) {
        $hasher = { process => \&sample,
            args    => [ $start, $length ] };
        my @newList = ();
        for (@processList) {
        my @subgroup = ( $self -> partition( $_, $hasher ) );
        $self -> prune( \@subgroup );
        push @newList, @subgroup;
        }
        @processList = @newList;
    }
    closeHandles( @processList );
    $self -> addTerminal( @processList );
    }

    return @{ $self -> {terminal} };
}



#----------------------------------------
# prune ()
# prune ( \@group )
# Remove groups only containing one file
# If argument is omitted, remove from $self -> {groups}
# Add to terminal groups with terminal key
# Return number of remaining groups
#----------------------------------------
sub prune {
    my $self = shift;
    my $src = $_[0] || $self -> {groups};
    my $counter = 0;
    for ( my $i = $#$src; $i>=0; $i--) {
    my $group = $src -> [$i];
    if ( $group -> [0] {terminal} ) {
        # Remove and add to terminal groups
        $self -> addTerminal( $group );
        closeHandles( $group );
        splice @$src, $i, 1;
    } elsif ( $#$group > 1 ) {
        # Keep in place
        $counter ++;
    } else {
        # Drop group only containing one file
        closeHandles( $group );
        splice @$src, $i, 1;
    }
    }
    return $counter;
}


#----------------------------------------
# partition( $group, hasher [, hasher par, ... ] )
# Execute a discriminatory step and create subgroups
# Return list of groups
# A hasher is a hash ref of type
# {
#    process  => sub { taking fileName as first arg, key as second arg
+ument },
#    name     => hash-key name / undef if not added,
#    terminal => sub { shift is a terminal key or not },
#    args     => [ extra arguments to pass ]
# }
#----------------------------------------
sub partition {
    my $self = shift;
    my ($group, $hasher, @hasherPar) = @_;

    my $key = shift @{$group};
    my %bucket = ();

    for (@{$group}) {
    my $hash = $hasher -> {process} -> ($_, $key,
                        @{ $hasher -> {args} || [] },
                        @hasherPar);
    push @{ $bucket {$hash} ||= [] }, $_;
    }

    my @result = ();

    for (keys %bucket) {
    # Create a clone of the key
    my $newKey = { %$key };
    $newKey -> { $hasher -> {name} } = $_ if $hasher -> {name};
    $newKey -> {terminal} = 1
        if ( $hasher -> {terminal} && $hasher -> {terminal} -> ($_) );
    push @result, [ $newKey, @{$bucket {$_}} ];
    }

    return @result;
}


#----------------------------------------
# status()
# Return string showing finder status
#----------------------------------------
sub status {
    my $self = shift;
    my $res = 'Groups:';
    for (grep {$_ > 0} map {$#$_} @{$self -> {groups}}) {

    $res .= " $_";
    }

    $res .= "\nTerminal:";
    for (grep {$_ > 0} map {$#$_} @{$self -> {terminal}}) {
    $res .= " $_";
    }
    $res .= "\n";
    return $res;
}


#----------------------------------------
# function
#----------------------------------------
# fileHandle( filename )
# return fileHandle or undef
#----------------------------------------
sub fileHandle {
    my ($fileName) = @_;
    unless ($handles -> {$fileName}) {
    my $handle = IO::File -> new();
    $handle -> open("<$fileName") || return undef;
    $handles -> {$fileName} = $handle;
    }
    return $handles -> {$fileName};
}

#----------------------------------------
# function
#----------------------------------------
# closeHandle( filename )
# close handle
#----------------------------------------
sub closeHandle {
    my ($fname) = @_;
    delete $handles -> {$fname};
}

#----------------------------------------
# function
#----------------------------------------
# closeHandles( $group, ... )
# Close handles of filenames contained in group
#----------------------------------------
sub closeHandles {
    for my $group (@_) {
    for (1..$#$group) {
        closeHandle( $group -> [$_] );
    }
    }
}

#----------------------------------------
# addTerminal( \@file, ... )
# Add to terminal sets arrays of files with given size
#----------------------------------------
sub addTerminal {
    my $self = shift;
    push @{ $self -> {terminal} }, @_;
}

{
    my $error = 0;
#----------------------------------------
# hash ( filename, key [, start [, length ]] )
# Apply MD5 to file segment
#----------------------------------------
sub hash {
    my ($fname, $key, $start, $length) = @_;
    $start ||= 0;

    my $res;

    # Return a consecutive error code if unable to open file
    my $handle = openHandle( $fname ) || return "Error " . $error++;
    $handle -> seek( $start, 0 );

    if ($length) {
    $handle -> read( $res, $length );
    $res = md5( $res );
    } else {
    my $md5 = Digest::MD5->new();
    while ( $handle -> read( $res, BLOCK ) ) {
        $md5->add( $res );
    }
    $res = $md5->digest();
    }

    return $res;
}


#----------------------------------------
# sample ( filename, key [, start [, length ] ] )
#----------------------------------------
sub sample {
    my ($fname, $key, $start, $length) = @_;
    $start ||= 0;

    my $res;

    # Return a consecutive error code if unable to open file
    my $handle = fileHandle( $fname ) || return "Error " . $error++;
    $handle -> seek( $start, 0 );

    if ($length) {
    $handle -> read( $res, $length );
    } else {
    $res = '';
    my $buffer;
    while ( $handle -> read( $buffer, BLOCK ) ) {
        $res .= $buffer;
    }
    }

    return $res;
}
}


#----------------------------------------
# size ( filename )
# Find file size
#----------------------------------------
sub size {
    my $fname = shift;
    return (stat ($_))[7];
}






#------------------------------------------------------------
# Finder::Looper
#------------------------------------------------------------
# Iterator providing starting points and lengths
# for interlaced reads
#------------------------------------------------------------
package Finder::Looper;

use constant MINREADSIZE => Finder::MINREADSIZE;
use constant MAXREADSIZE => Finder::MAXREADSIZE;
use constant BLOCK       => Finder::BLOCK;


#----------------------------------------
# new( size [, minsize [, maxsize ]] )
#----------------------------------------
sub new {
    my $class = shift;
    my ( $size, $minsize, $maxsize ) = @_;
    $minsize ||= MINREADSIZE;
    $maxsize ||= MAXREADSIZE;
    bless {
    size     => $size,
    minsize  => $minsize || MINREADSIZE,
    maxsize  => $maxsize || MAXREADSIZE,
    readsize => $minsize || MINREADSIZE,
    oldsize  => 0,
    i        => 0,
    gap      => 1 << nextLog2( $size )
    }, $class;
}

#----------------------------------------
# next()
# return ( start, length )
# return () if the iteration is over
#----------------------------------------

sub next {
    my $self = shift;

    # Return EOL if the gap has become smaller than the size
    # unless it's the first iteration ( oldsize = 0 )
    if ( $self -> {readsize} > $self -> {gap} && $self -> {oldsize} > 
+0 ) {
    return ();
    }
    
    if ( $self -> {i} * $self -> {gap} >= $self -> {size} ) {
    $self -> {i} = 0;
    $self -> {oldsize} = $self -> {readsize};
    $self -> {gap} >>= 1;
    $self -> {readsize} <<= 1
        if ( $self -> {readsize} < $self -> {gap}
         && $self -> {readsize} < $self -> {maxsize} );
    }

    my $offset = ( $self -> {i} % 2 ) ? 0 : $self -> {oldsize};
    
    my $start  = $self -> {i} * $self -> {gap} + $offset;
    my $length = $self -> {readsize} - $offset;
    $length    = $self -> {size} - $start if $start + $length > $self 
+-> {size};

    $self -> {i} ++;

    if ( $length <= 0 ) {
    return $self -> next();
    } else {
    return ( $start, $length );
    } 
}

#----------------------------------------
# function
#----------------------------------------
# nextLog2( positive integer )
# return exponent of nearest power of 2
# not less than integer
# Warning: returns at most the biggest power of
# two expressed by an integer
#----------------------------------------
sub nextLog2 {
    my $i = shift;
    my $pow = 1;
    my $exp = 0;
    while ( $pow < $i && $pow > 0 ) {
    $pow <<= 1;
    $exp++;
    }
    return $exp;
}



1;
[download]

In reply to Interlaced duplicate file finder by abell

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


No such thing as a small change
	PerlMonks