This program, "sum.pl" (for Windows), generates checksums matching those produced by the "md5sum" program on Linux. I wrote it because I often need to check whether two files on different computers are the same.
Run "sum.pl" without arguments for a syntax message. Files and directories (i.e. "folders") are both accepted as arguments. With the -R switch, subdirectories are searched recursively. The -s <key> and -r switches control how the output is sorted. The -d switch gives a final report of any duplicate checksums found.
#!/usr/bin/perl -w
###############
## Libraries ##
###############
use strict;
use warnings;
use Data::Dumper;
use Digest::MD5 qw{ md5 md5_hex };
use File::Basename;
use Getopt::Long;
use IO::File;
#############
## Globals ##
#############
# Autoflush the selected output handle so each result line appears as soon
# as it has been computed.
$| = 1;

# Script name, used in the usage text and in fatal() messages.
my $iam = basename $0;

# Option flags, filled in by GetOptions().
my $b_recurse = 0;    # -R : descend into subdirectories
my $b_reverse = 0;    # -r : reverse the sort order
my $b_dups = 0;       # -d : collect and report duplicate checksums
my $b_compat = 0;     # -c : print lines in "md5sum" binary-mode format

# 0 until the first checksum is recorded with -d; after that a hash ref
# mapping checksum => array ref of file names (see generate_md5sum).
my $h_sums = 0;

# -s <key> : sort key; the empty string means "print in directory order".
my $sortkey = "";

# Usage text. Note that recursion is -R (upper case); -r reverses the sort.
my $syntax = qq{
syntax: $iam [switches] <file> [file ...]
Generates the MD5 checksum for one or more <files>, and displays
the file size (in bytes), the checksum and the filename for each.
If a given <file> refers to a directory the checksum is generated
for all files within it (use the -R switch to recurse through its
subdirs as well).
Switches
-c .......... Compatible output with "md5sum" in binary mode
-R .......... Recurse subdirs when <file> is a directory
-s <key> .... Sort files in subdirs by the given <key>, where
<key> is one of: "name" (default), "size", "sum"
-r .......... Reverse the order of the sort
-d .......... Find and report files with duplicate sums
};
##################
## Command-line ##
##################
# Parse the switches. "bundling" lets single-letter switches be combined
# (e.g. -Rd) and makes them case-sensitive, which -R / -r depend on.
Getopt::Long::Configure("bundling");
my $go = GetOptions(
    "c"   => \$b_compat,
    "R"   => \$b_recurse,
    "r"   => \$b_reverse,
    "s=s" => \$sortkey,
    "d"   => \$b_dups,
);

# A bad switch or a missing <file> argument prints the usage text and exits.
$go or die $syntax;
(@ARGV > 0) or die $syntax;

# show_sorted() treats any unrecognised key as "name". Keep that fallback,
# but make it visible so a mistyped key does not go unnoticed.
if (length($sortkey) and $sortkey !~ /\A(?:name|size|sum)\z/) {
    warn "$iam: unknown sort key '$sortkey' "
       . "(expected \"name\", \"size\" or \"sum\"); sorting by name\n";
    $sortkey = "name";
}
##################
## Main program ##
##################
# Checksum every command-line argument in the order given.
foreach my $arg (@ARGV) {
    md5sum_file($arg);
}

# $h_sums is only populated when -d was given and at least one file was
# summed; in that case finish with the duplicate report.
if ($h_sums) {
    show_duplicates($h_sums);
}
#################
## Subroutines ##
#################
# Abort the program with a message tagged with the script name and the
# line number of the call site.
#   $err - text describing what went wrong
# Never returns. The message ends in a newline, so Perl does not append
# its own "at FILE line N" suffix.
sub fatal {
    my ($err) = @_;
    my (undef, undef, $line) = caller;
    die sprintf("%s[%s] FATAL: %s\n", $iam, $line, $err);
}
# Process one command-line argument.
#   $fname - path given by the user
# A regular file is summed and printed via show_md5sum(); a directory is
# handed to md5sum_dir(). Anything else (a missing path, a device, ...) is
# skipped with a warning on STDERR instead of being ignored silently, so a
# mistyped name does not look like a successful run.
# Returns whatever the called routine returns, or nothing when skipped.
sub md5sum_file {
    my ($fname) = @_;

    return show_md5sum($fname) if -f $fname;
    return md5sum_dir($fname)  if -d $fname;

    my $why = (-e $fname)
            ? "not a regular file or directory"
            : "no such file or directory";
    warn "WARNING: skipping '$fname' ($why)\n";
    return;
}
# Compute the MD5 checksum of a file.
#   $fname - path of the file to read
# Returns the checksum as a hex string. Dies via fatal() when the file
# cannot be opened. When -d is active the file name is also recorded under
# its checksum in $h_sums for the duplicate report.
sub generate_md5sum {
    my ($fname) = @_;

    open(my $in, "<", $fname) or fatal("Failed to open '$fname' ($!)");
    binmode($in);    # read raw bytes, no newline translation

    my $digest = Digest::MD5->new;
    $digest->addfile($in);
    my $sum = $digest->hexdigest();
    close $in;

    if ($b_dups) {
        # $h_sums starts out as 0 and becomes a hash ref on first use.
        $h_sums ||= { };
        $h_sums->{$sum} ||= [ ];
        push @{ $h_sums->{$sum} }, $fname;
    }

    return $sum;
}
# Print one result line for a file.
#   $fname - path of the file; backslashes are shown as forward slashes
#   $a_sum - optional array ref whose first two elements are the size and
#            the checksum; when omitted both are computed here
# With -c the line is "<sum> *<name>", otherwise "<size> <sum> <name>".
sub show_md5sum {
    my ($fname, $a_sum) = @_;
    $fname =~ s:\\:/:g;

    unless ($a_sum) {
        my $bytes = -s $fname;
        my $digest = generate_md5sum($fname);
        $a_sum = [ $bytes, $digest ];
    }

    my $size = $a_sum->[0];
    my $sum = $a_sum->[1];

    return printf "%s *%s\n", $sum, $fname if $b_compat;
    return printf " %10d %s %s\n", $size, $sum, $fname;
}
# Checksum the regular files directly inside a directory and, with -R,
# repeat for each of its subdirectories afterwards.
#   $dir - path of the directory to list
# Symbolic links are skipped. Without a sort key each file is printed as
# soon as it is summed; with one, results are collected first and printed
# by show_sorted(). Dies via fatal() when the directory cannot be read.
sub md5sum_dir {
    my ($dir) = @_;

    # A blank line precedes each directory's listing.
    print "\n";

    opendir(my $dh, $dir) or fatal("Can't read dir '$dir' ($!)");
    my @entries = grep { $_ ne '.' and $_ ne '..' } readdir($dh);
    closedir $dh;

    my %pending;    # path => [ size, checksum, lower-cased path ]
    my @subdirs;    # subdirectories to visit once this directory is done

    foreach my $entry (@entries) {
        my $path = "$dir/$entry";
        next if -l $path;

        if (-d $path) {
            push @subdirs, $path if $b_recurse;
            next;
        }
        next unless -f $path;

        if ($sortkey) {
            my $size = (-s $path);
            my $sum = generate_md5sum($path);
            $pending{$path} = [ $size, $sum, lc $path ];
        } else {
            show_md5sum($path);
        }
    }

    show_sorted(\%pending) if $sortkey;

    foreach my $subdir (@subdirs) {
        md5sum_dir($subdir);
    }
}
# Print the collected results of one directory in sorted order.
#   $h - hash ref: path => [ size, checksum, lower-cased path ]
# The sort key "size" orders numerically by size and "sum" orders by
# checksum string; any other key orders by the lower-cased path.
# -r reverses the resulting order.
sub show_sorted {
    my ($h) = @_;

    my %compare_by = (
        size => sub { $h->{$a}->[0] <=> $h->{$b}->[0] },
        sum  => sub { $h->{$a}->[1] cmp $h->{$b}->[1] },
    );
    my $by_name = sub { $h->{$a}->[2] cmp $h->{$b}->[2] };
    my $compare = $compare_by{$sortkey} || $by_name;

    my @order = sort $compare keys %$h;
    @order = reverse @order if $b_reverse;

    show_md5sum($_, $h->{$_}) foreach @order;
}
# Report every checksum that was produced by more than one file.
#   $h - hash ref: checksum => array ref of the file names that produced it
# Groups are printed smallest first; groups of equal size are ordered by
# checksum so the report is the same on every run. Within a group the
# files are numbered from 1 in the order they were recorded.
# Only the $h argument is consulted, never the global table.
sub show_duplicates {
    my ($h) = @_;

    my @dups = grep { @{ $h->{$_} } > 1 } keys %$h;
    my @sorted = sort {
        scalar(@{ $h->{$a} }) <=> scalar(@{ $h->{$b} }) or $a cmp $b
    } @dups;

    foreach my $dup (@sorted) {
        print "\n [Duplicate Sum '$dup']\n";
        my $n = 0;
        foreach my $fname (@{ $h->{$dup} }) {
            printf " %3d. %s\n", ++$n, $fname;
        }
    }
    return;
}