#!/usr/bin/perl -w

#------------------------------------------------------------
# dupfinder: find duplicate files
#------------------------------------------------------------
# Copyright Antonio Bellezza 2003
# mail: antonio@beautylabs.net
#------------------------------------------------------------
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation;
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#------------------------------------------------------------

use strict;

# Show program usage if no arguments are given
if ($#ARGV == -1) {
    usage();
    exit 0;
}

my $finder = Finder -> new( @ARGV );

my @group = $finder -> findDuplicates();

# Result printout and elaboration
for my $group ( @group ) {
    print "Possible duplicates of size $group->[0]{size}:\n";
    for (1..$#$group) {
	print "[$_] $group->[$_]\n";
    }
    print "\nACTION: [] continue  [1-$#$group] keep corresponding file and remove the rest\n";
    my $input = <STDIN>;
    chomp $input;
    next if $input eq '';
    if ($input =~ m|([0-9]+)| && $1 > 0 && $1 <= $#$group) {

	for (0..($1-1), ($1+1)..$#$group) {
	    my $delendum = $group -> [$_];
	    print "Unlinking $delendum: ";
	    unlink $delendum;
	    print "done\n";
	}
    }
    print "\n";
}




#----------------------------------------
# usage()
# Show usage
#----------------------------------------
sub usage {
    print "Usage: dupfinder dir ...\n",
    "Find and interactively remove duplicate files\n",
}



package Finder;

#------------------------------------------------------------
# A finder is implemented as a hash
# $finder -> {groups} is the array of groups of possibly
# equal files
# Each group is an array whose first element is a hash
# with the various key attributes.
# Subsequent elements are the filenames in the group
# Example:
# [
#   [ { size=>0 }, 'empty.txt', 'null.dat', 'nothing_here' ],
#   [ { size=>1321, hash12=>'xyz' }, 'myfile.a', 'myfile.b' ],
#   [ { size=>1321, hash12=>'wtt' }, 'myfile.c', 'myfile.d' ]
# ]
#------------------------------------------------------------

use strict;
use IO::File;
use File::Find;
use Digest::MD5 qw( md5 );

use constant MINREADSIZE => 1024;
use constant MAXREADSIZE => 1024 * 1024;
use constant BLOCK   => 4096;

our $VERSION = '1.0';

our $handles = {};


#----------------------------------------
# new ( dir, ... )
# Create new finder
#----------------------------------------
sub new {
    my $class = shift;
    my $self = {
	dirs     => [ @_ ],
	groups   => [],
	terminal => []
    };
    return bless $self, $class;
}

#----------------------------------------
# readDirs ()
# Find all files and setup finder
#----------------------------------------
sub readDirs {
    my $self = shift;
    my $group = [ {} ];
    find( sub { -f && ( push @$group, $File::Find::name ) },
	  @{$self->{dirs}} );
    $self -> {groups} = [ $group ];
}




#----------------------------------------
# findDuplicates()
# Return list of terminal groups
#----------------------------------------
sub findDuplicates {
    my $self = shift;
    my $hasher;
    $self -> readDirs();

#    print $self -> status;

    $hasher = { process  => \&size,
		name     => 'size',
		terminal => sub { shift==0 } };

    $self -> {groups} = [ $self -> partition( $self -> {groups} [0], $hasher ) ];

#    print $self -> status;

    $self -> prune();

#    print $self -> status;

    
    for ( @{$self -> {groups}} ) {
	my @processList = ( $_ );
	my $size = $_ -> [0] {size};
	my $iterator = Finder::Looper -> new( $size );

	while ( @processList and my ( $start, $length ) = $iterator -> next() ) {
	    $hasher = { process => \&sample,
			args    => [ $start, $length ] };
	    my @newList = ();
	    for (@processList) {
		my @subgroup = ( $self -> partition( $_, $hasher ) );
		$self -> prune( \@subgroup );
		push @newList, @subgroup;
	    }
	    @processList = @newList;
	}
	closeHandles( @processList );
	$self -> addTerminal( @processList );
    }

    return @{ $self -> {terminal} };
}



#----------------------------------------
# prune ()
# prune ( \@group )
# Remove groups only containing one file
# If argument is omitted, remove from $self -> {groups}
# Add to terminal groups with terminal key
# Return number of remaining groups
#----------------------------------------
sub prune {
    my $self = shift;
    my $src = $_[0] || $self -> {groups};
    my $counter = 0;
    for ( my $i = $#$src; $i>=0; $i--) {
	my $group = $src -> [$i];
	if ( $group -> [0] {terminal} ) {
	    # Remove and add to terminal groups
	    $self -> addTerminal( $group );
	    closeHandles( $group );
	    splice @$src, $i, 1;
	} elsif ( $#$group > 1 ) {
	    # Keep in place
	    $counter ++;
	} else {
	    # Drop group only containing one file
	    closeHandles( $group );
	    splice @$src, $i, 1;
	}
    }
    return $counter;
}


#----------------------------------------
# partition( $group, hasher [, hasher par, ... ] )
# Execute a discriminatory step and create subgroups
# Return list of groups
# A hasher is a hash ref of type
# {
#    process  => sub { taking fileName as first arg, key as second argument },
#    name     => hash-key name / undef if not added,
#    terminal => sub { shift is a terminal key or not },
#    args     => [ extra arguments to pass ]
# }
#----------------------------------------
sub partition {
    my $self = shift;
    my ($group, $hasher, @hasherPar) = @_;

    my $key = shift @{$group};
    my %bucket = ();

    for (@{$group}) {
	my $hash = $hasher -> {process} -> ($_, $key,
					    @{ $hasher -> {args} || [] },
					    @hasherPar);
	push @{ $bucket {$hash} ||= [] }, $_;
    }

    my @result = ();

    for (keys %bucket) {
	# Create a clone of the key
	my $newKey = { %$key };
	$newKey -> { $hasher -> {name} } = $_ if $hasher -> {name};
	$newKey -> {terminal} = 1
	    if ( $hasher -> {terminal} && $hasher -> {terminal} -> ($_) );
	push @result, [ $newKey, @{$bucket {$_}} ];
    }

    return @result;
}


#----------------------------------------
# status()
# Return string showing finder status
#----------------------------------------
sub status {
    my $self = shift;
    my $res = 'Groups:';
    for (grep {$_ > 0} map {$#$_} @{$self -> {groups}}) {

	$res .= " $_";
    }

    $res .= "\nTerminal:";
    for (grep {$_ > 0} map {$#$_} @{$self -> {terminal}}) {
	$res .= " $_";
    }
    $res .= "\n";
    return $res;
}


#----------------------------------------
# function
#----------------------------------------
# fileHandle( filename )
# return fileHandle or undef
#----------------------------------------
sub fileHandle {
    my ($fileName) = @_;
    unless ($handles -> {$fileName}) {
	my $handle = IO::File -> new();
	$handle -> open("<$fileName") || return undef;
	$handles -> {$fileName} = $handle;
    }
    return $handles -> {$fileName};
}

#----------------------------------------
# function
#----------------------------------------
# closeHandle( filename )
# close handle
#----------------------------------------
sub closeHandle {
    my ($fname) = @_;
    delete $handles -> {$fname};
}

#----------------------------------------
# function
#----------------------------------------
# closeHandles( $group, ... )
# Close handles of filenames contained in group
#----------------------------------------
sub closeHandles {
    for my $group (@_) {
	for (1..$#$group) {
	    closeHandle( $group -> [$_] );
	}
    }
}

#----------------------------------------
# addTerminal( \@file, ... )
# Add to terminal sets arrays of files with given size
#----------------------------------------
sub addTerminal {
    my $self = shift;
    push @{ $self -> {terminal} }, @_;
}

{
    my $error = 0;
#----------------------------------------
# hash ( filename, key [, start [, length ]] )
# Apply MD5 to file segment
#----------------------------------------
sub hash {
    my ($fname, $key, $start, $length) = @_;
    $start ||= 0;

    my $res;

    # Return a consecutive error code if unable to open file
    my $handle = openHandle( $fname ) || return "Error " . $error++;
    $handle -> seek( $start, 0 );

    if ($length) {
	$handle -> read( $res, $length );
	$res = md5( $res );
    } else {
	my $md5 = Digest::MD5->new();
	while ( $handle -> read( $res, BLOCK ) ) {
	    $md5->add( $res );
	}
	$res = $md5->digest();
    }

    return $res;
}


#----------------------------------------
# sample ( filename, key [, start [, length ] ] )
#----------------------------------------
sub sample {
    my ($fname, $key, $start, $length) = @_;
    $start ||= 0;

    my $res;

    # Return a consecutive error code if unable to open file
    my $handle = fileHandle( $fname ) || return "Error " . $error++;
    $handle -> seek( $start, 0 );

    if ($length) {
	$handle -> read( $res, $length );
    } else {
	$res = '';
	my $buffer;
	while ( $handle -> read( $buffer, BLOCK ) ) {
	    $res .= $buffer;
	}
    }

    return $res;
}
}


#----------------------------------------
# size ( filename )
# Find file size
#----------------------------------------
sub size {
    my $fname = shift;
    return (stat ($_))[7];
}






#------------------------------------------------------------
# Finder::Looper
#------------------------------------------------------------
# Iterator providing starting points and lengths
# for interlaced reads
#------------------------------------------------------------
package Finder::Looper;

use constant MINREADSIZE => Finder::MINREADSIZE;
use constant MAXREADSIZE => Finder::MAXREADSIZE;
use constant BLOCK       => Finder::BLOCK;


#----------------------------------------
# new( size [, minsize [, maxsize ]] )
#----------------------------------------
sub new {
    my $class = shift;
    my ( $size, $minsize, $maxsize ) = @_;
    $minsize ||= MINREADSIZE;
    $maxsize ||= MAXREADSIZE;
    bless {
	size     => $size,
	minsize  => $minsize || MINREADSIZE,
	maxsize  => $maxsize || MAXREADSIZE,
	readsize => $minsize || MINREADSIZE,
	oldsize  => 0,
	i        => 0,
	gap      => 1 << nextLog2( $size )
    }, $class;
}

#----------------------------------------
# next()
# return ( start, length )
# return () if the iteration is over
#----------------------------------------

sub next {
    my $self = shift;

    # Return EOL if the gap has become smaller than the size
    # unless it's the first iteration ( oldsize = 0 )
    if ( $self -> {readsize} > $self -> {gap} && $self -> {oldsize} > 0 ) {
	return ();
    }
	
    if ( $self -> {i} * $self -> {gap} >= $self -> {size} ) {
	$self -> {i} = 0;
	$self -> {oldsize} = $self -> {readsize};
	$self -> {gap} >>= 1;
	$self -> {readsize} <<= 1
	    if ( $self -> {readsize} < $self -> {gap}
		 && $self -> {readsize} < $self -> {maxsize} );
    }

    my $offset = ( $self -> {i} % 2 ) ? 0 : $self -> {oldsize};
    
    my $start  = $self -> {i} * $self -> {gap} + $offset;
    my $length = $self -> {readsize} - $offset;
    $length    = $self -> {size} - $start if $start + $length > $self -> {size};

    $self -> {i} ++;

    if ( $length <= 0 ) {
	return $self -> next();
    } else {
	return ( $start, $length );
    } 
}

#----------------------------------------
# function
#----------------------------------------
# nextLog2( positive integer )
# return exponent of nearest power of 2
# not less than integer
# Warning: returns at most the biggest power of
# two expressed by an integer
#----------------------------------------
sub nextLog2 {
    my $i = shift;
    my $pow = 1;
    my $exp = 0;
    while ( $pow < $i && $pow > 0 ) {
	$pow <<= 1;
	$exp++;
    }
    return $exp;
}



1;