In reply to "Comparing two files"
But you can still use the same strategy with MD5 hashes. Indeed, here is some (partially tested) sample code for this problem:
#!/usr/bin/perl
# Find and delete duplicate files among those named on the command line,
# treating two files as identical when they differ only in whitespace.
#
# Strategy: bucket files by the MD5 digest of their whitespace-normalized
# contents (cheap, keeps memory bounded), then confirm real duplicates
# within each bucket by comparing the full normalized text directly.

use strict;
use warnings;

use Digest::MD5 qw(md5);

my %file_hash;    # md5 digest => anonymous array of files sharing that digest

foreach my $file (@ARGV) {
    my $key = md5( normalize_text( slurp_file($file) ) );
    push @{ $file_hash{$key} }, $file;
}

foreach my $files ( values %file_hash ) {
    # A bucket holding a single file cannot contain duplicates.
    next if @$files < 2;

    # $files is an anonymous array of files which are *probably* all
    # duplicates of each other (MD5 collisions are possible, so we
    # re-check with the full normalized text).  Were it not for memory
    # limits, *this* would be the whole script!
    my %file_of;    # normalized text => first file seen with that text
    foreach my $file (@$files) {
        my $text = normalize_text( slurp_file($file) );
        if ( exists $file_of{$text} ) {
            print "$file_of{$text} and $file are dups\n";
            unlink($file) or die "Cannot delete $file: $!";
        }
        else {
            $file_of{$text} = $file;
        }
    }
}

# Takes text, collapses each whitespace run to a single space, strips
# leading and trailing whitespace, and returns the result.
sub normalize_text {
    my $text = shift;
    $text =~ s/\s+/ /g;
    $text =~ s/\A //;
    $text =~ s/ \z//;
    return $text;
}

# Takes a filename, returns the entire file contents as one string.
# Uses a three-argument open with a lexical filehandle instead of the
# original magic-<> / local-@ARGV idiom, which performed an unsafe
# two-argument open (a filename such as "cmd |" would have executed a
# shell command).  Dies instead of silently warning and returning undef
# when a file cannot be read.
sub slurp_file {
    my $filename = shift;
    open my $fh, '<', $filename
        or die "Cannot open $filename: $!";
    local $/;    # slurp mode: read the whole file in one go
    return scalar <$fh>;
}