#!/usr/bin/perl -w
use strict;

use File::Find::Duplicates;
use File::Temp ();
use File::Spec;
use File::Stat qw/:stat/;
use Getopt::Std;

my ( $space_saved, $space_wasted ) = ( 0, 0 );
local $" = "\n";
my %opts;
$Getopt::Std::STANDARD_HELP_VERSION = 1;

###### Get command line options
getopts( 'd:lsr', \%opts );

# Default to the current directory if none was given; check for existence
my $directory = $opts{d} || ".";
die "$directory doesn't exist or is not a directory!\n" unless -d $directory;

##### Main
print {*STDERR} "Running...\n";

# Find duplicates recursively in the directory
my @dupes = find_duplicate_files($directory);

# For each set of duplicate files, perform the appropriate action and
# update the space counters
foreach my $set (@dupes) {

    # Choose the template file and filter out files that
    # are already links
    my ( $template_file, @real_dups ) = filter_out_symlinks( $set->files );

    if (@real_dups) {
        print $set->size, " bytes each:\n$template_file\n@real_dups\n";
    }
    else { next }

    my $number_of_files =
          defined( $opts{s} ) ? softlink( $template_file, \@real_dups )
        : defined( $opts{l} ) ? hardlink( $template_file, \@real_dups )
        : defined( $opts{r} ) ? remove( $template_file, \@real_dups )
        :                       0;

    $space_wasted += ( scalar @real_dups ) * $set->size;
    $space_saved  += $number_of_files * $set->size;
}

# Report the stats
print "Space wasted: $space_wasted bytes\n";
print "Space saved: $space_saved bytes\n";

##### Subroutines

sub hardlink {

    # Replace duplicates with hard links and return the number
    # of links created.
    my $original     = shift;
    my $duplicates   = shift;
    my $files_linked = 0;

    foreach my $duplicate (@$duplicates) {

        # Step 1: link the original to a temporary name in the
        # same directory (and therefore the same filesystem)
        my $tempfile = File::Temp::tempnam( $directory, 'X' x 6 );
        link $original, $tempfile or next;

        # Step 2: move the temporary link over the duplicate; on
        # failure, remove the temporary link so it isn't left behind
        unless ( rename $tempfile, $duplicate ) {
            unlink $tempfile;
            next;
        }
        ++$files_linked;
    }
    return $files_linked;
}

sub softlink {

    # Replace duplicates with soft links and return the number
    # of links created.
    my $original     = shift;
    my $duplicates   = shift;
    my $files_linked = 0;

    foreach my $duplicate (@$duplicates) {

        # Step 1: symlink the original (as an absolute path, so the
        # link stays valid wherever the duplicate lives) to a
        # temporary name
        my $tempfile = File::Temp::tempnam( $directory, 'X' x 6 );
        symlink File::Spec->rel2abs($original), $tempfile or next;

        # Step 2: move the temporary link over the duplicate; on
        # failure, remove the temporary link so it isn't left behind
        unless ( rename $tempfile, $duplicate ) {
            unlink $tempfile;
            next;
        }
        ++$files_linked;
    }
    return $files_linked;
}

sub remove {

    # Remove the duplicate files and return the number of files
    # deleted.
    my $original      = shift;
    my $duplicates    = shift;
    my $files_deleted = 0;

    foreach my $duplicate (@$duplicates) {

        # Delete the duplicate
        unless ( unlink $duplicate ) {
            warn "$duplicate couldn't be removed: $!\n";
            next;
        }
        ++$files_deleted;
    }
    return $files_deleted;
}

sub filter_out_symlinks {

    # Given a list of duplicate files, which may or may not share an
    # inode, return a filename belonging to the inode with the most
    # links, plus a list of the duplicate filenames that belong to
    # inodes other than that one. Phew.
    my $putative_duplicates = shift;

    # Group the duplicate files by inode
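    # (Files that already share an inode are hard links to the same
    # data, so they occupy no extra space; grouping by inode lets us
    # treat each on-disk copy once instead of once per name.)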
    my %real_dups;
    foreach my $filename (@$putative_duplicates) {
        push @{ $real_dups{ stat($filename)->inode } }, $filename;
    }

    # Sort the inodes by decreasing number of links
    my @sorted_inodes =
        sort { scalar @{ $real_dups{$b} } <=> scalar @{ $real_dups{$a} } }
        keys %real_dups;

    # Get the inode with the maximum number of links
    my $top_inode = shift @sorted_inodes;

    # Pick any non-symlink file from the top inode as the
    # template file
    my $template_file;
    foreach my $file ( @{ $real_dups{$top_inode} } ) {
        next if -l $file;
        $template_file = $file;
        last;
    }

    # List all the files belonging to the rest of the inodes
    my @real_dups = map @{ $real_dups{$_} }, @sorted_inodes;

    return ( $template_file, @real_dups );
}

__END__

=head1 NAME

dup2link - yet another duplicate file finder v. 0.1.6

=head1 SYNOPSIS

dup2link [-lsr] -d directory

=head2 Options

Decide what to do with the duplicates found:

=over

=item * l: create a hard link

=item * s: create a symlink

=item * r: remove the file

=back

If no switch is given, nothing is done and a report is printed.

=head1 DESCRIPTION

dup2link searches for duplicate files recursively from any arbitrary
position. It uses an MD5 checksum criterion to decide whether two
files are identical. It can then delete the duplicates, or replace
them with symlinks or hard links, according to the user's choice. In
any case, a report is printed to standard output detailing the path
and size of every duplicate file, along with the total amount of disk
space wasted and saved.

=head1 IMPORTANT

This script was only tested on a Linux (Ubuntu 8.04) platform. Please
proceed with extreme caution before applying it to your personal
files. It is strongly suggested that you first run it without any
switches to see the files that would be affected. It is also advised
to back up any important information before attempting to run it in
write mode (-l, -s or -r).

Please consider using more mature, feature-rich and thoroughly tested
applications, such as fdupes, fdf or fslint (among others).

=head1 REQUIRES

This script requires the following modules, all freely available from
CPAN:

=over

=item * File::Find::Duplicates

=item * File::Temp

=item * File::Spec

=item * File::Stat

=item * Getopt::Std

=back

=head1 SEE ALSO

See the documentation for the File::Find::Duplicates module for
details on the algorithm used to determine duplicate files.

=head1 AUTHOR

bruno E<lt>brunovecchi@yahoo.com.arE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2008 by bruno

This script is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.

=cut
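A side note on the two-step replacement used in hardlink() and
softlink() above: creating the new link under a temporary name in the
same filesystem and then rename()-ing it over the duplicate means the
duplicate's name always points at either the old data or the new link,
since rename() within one filesystem is atomic on POSIX systems. Here
is a minimal sketch of that step in isolation; the replace_with_link
name and the example paths are illustrative only, not part of the
script's interface:

#!/usr/bin/perl -w
use strict;
use File::Temp ();

# Atomically replace $duplicate with a hard link to $original.
# Returns true on success, false otherwise.
sub replace_with_link {
    my ( $original, $duplicate, $dir ) = @_;

    # Create the link under a temporary name in $dir, which must be
    # on the same filesystem as $duplicate for rename() to be atomic
    my $tempfile = File::Temp::tempnam( $dir, 'X' x 6 );
    link $original, $tempfile or return 0;

    unless ( rename $tempfile, $duplicate ) {
        unlink $tempfile;    # don't leave the temporary link behind
        return 0;
    }
    return 1;
}

# Hypothetical usage:
# replace_with_link( 'photos/a.jpg', 'photos/copy_of_a.jpg', 'photos' )
#     or warn "replacement failed: $!\n";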