http://qs1969.pair.com?node_id=1099194


in reply to Re: Re: Re: Find duplicate files.
in thread Find duplicate files.

When there are many, many duplicates and you don't care to comb through the list, this version keeps the first one and happily delete all others on confirmation. It also skips symlinks, because the original will happily delete the original and keep the link!
#!/usr/bin/perl # Usage: ./fdupes.pl <start directory> use strict; use warnings; use Term::ReadKey; use File::Find; # testing - 0 for interactive mode, 1 to skip all deletion etc my $testing = 0; # skip files smaller than 100 bytes. Set to zero if you like... my $minsize = 100; my $filecount = my $bytecount = my $fileschecked = my $wasted = 0; my %files = (); &usage unless (@ARGV); my $searchdir = $ARGV[0]; my $autodelete; if ( $ARGV[0] eq '-d' ) { $searchdir = $ARGV[1]; $autodelete = "yes"; } sub wanted { return unless -f; return if -l; my $filesize = ( stat($_) )[7]; $bytecount += $filesize; return unless $filesize > $minsize; # skip small files $filecount++; push @{ $files{$filesize} }, $File::Find::name; } find( \&wanted, $searchdir || "." ); # update progress display 1000 times maximum my $update_period = int( $filecount / 1000 ) + 1; if ( $fileschecked % $update_period == 0 ) { print "Progress: $fileschecked/$filecount\r"; # note \r does carriage return, but NO LINE FEED # for progress display } my @dupesets; # list of lists - @{$dupesets[0]} = (file1, file2) # where file1 and file2 are dupes foreach my $size ( keys %files ) { my @entries = @{ $files{$size} }; my $samesizecount = scalar @entries; if ( @{ $files{$size} } == 1 ) { # unique size $fileschecked++; next; } # duplicates by file size.. Check if files are the same while ( my $base = shift @entries ) { # get first entry in list under filesize my @dupes = (); my $count = 0; while ( $count <= $#entries ) { # go through all @entries my $compare = $entries[$count]; if ( &same( $base, $compare ) ) { # remove "compare" from list so it can't be used # on next run splice( @entries, $count, 1 ); # removed "compare" from list - update progress if ( ++$fileschecked % $update_period == 0 ) { print "Progress: $fileschecked/$filecount\r"; } if (@dupes) { # already have some dupes - just add duplicate # #n to list push @dupes, $compare; $wasted += $size; } else { # no dupes yet - include base file and duplicate # #1 in list push @dupes, ( $base, $compare ); $wasted += $size; } } else { $count++; # only increase counter if not a dupe - note splice # will break $array[$position] loop otherwise } } if (@dupes) { push @dupesets, \@dupes; } # "base" file removed from list of files to check - update # progress meter if ( ++$fileschecked % $update_period == 0 ) { print "Progress: $fileschecked/$filecount\r"; } } } if (@dupesets) { my @deletelist = (); # at least one set of duplicates exists # number of sets of duplicates my $dupesetcount = scalar(@dupesets); my $dupesetcounter = 0; if ($autodelete) { foreach my $setref (@dupesets) { my $firstdupe=shift @$setref; push @deletelist, @$setref ; } } else { foreach my $setref (@dupesets) { if ($testing) { print @$setref, "\n"; next; } $dupesetcounter++; my @dupes = @$setref; print "Duplicates found ($dupesetcounter / $dupesetcount)" +, "... Should I keep...\n"; my $count = 0; # print up list of options of which file to keep while ( $count <= $#dupes ) { # go through all @entries my $entry = $dupes[$count]; print $count + 1, " : $entry\n"; $count++; } # alternative options - keep all files, skip to end print "0: All\n"; print "A: Skip all remaining duplicates\n"; # use ReadKey to get user input ReadMode 4; # Turn off controls keys my $key = ''; while ( not defined( $key = ReadKey(-1) ) ) { # No key yet } ReadMode 0; # Reset tty mode before e +xiting if ( $key eq 'A' ) { # skip any remaining dupes and get to deletion bit last; } # not a number or 'A' - default to zero (ie keep all files +) $key = '0' unless ( $key =~ /^\d+$/ ); if ( $key == 0 ) { # ALL - don't delete anyt +hing #print "you chose: ALL\n" +; } elsif ( defined $dupes[ $key - 1 ] ) { print "you chose: ", $dupes[ $key - 1 ], "\n"; my @list_to_delete = @dupes; # remove file to keep from list splice( @list_to_delete, $key - 1, 1 ); # add rest to deletelist push @deletelist, @list_to_delete; } else { #print "you chose: invalid number... (nothing will", # " be deleted)\n"; } print "\n"; } } # confirm deletion if any files are needing deleting if (@deletelist) { print "\n------------------------\n"; print "list of files to delete:\n"; foreach (@deletelist) { print "$_\n"; } print "\nAre you *sure* you want to delete all these files?", " (Y/N)\n"; ReadMode 4; # Turn off controls keys my $key = ''; while ( not defined( $key = ReadKey(-1) ) ) { # No key yet } ReadMode 0; # Reset tty mode before exiting if ( lc($key) eq 'y' ) { print "deleting\n"; unlink @deletelist; } else { print "wussing out\n"; } } 1 while $wasted =~ s/^([-+]?\d+)(\d{3})/$1,$2/; print "$wasted bytes in duplicated files\n"; } # routine to check equivalence in files. pass 1 checks first # "line" of file (up to \n char), rest of file checked if 1st # line matches sub same { local ( $a, $b ) = @_; open( A, $a ) || die; open( B, $b ) || die; if ( <A> ne <B> ) { # FIRST LINE is not the same return 0; # not duplicates } else { # try WHOLE FILE local $/ = undef; return <A> eq <B>; } } sub usage { print "Usage: $0 [-d] <start directory>\n"; print " -d : autodelete, keeps first\n"; exit; }