1: #!/usr/bin/perl -w
   2: 
   3: use strict;
   4: 
   5: BEGIN {
   6: 
   7:     use vars qw(@POD_HOOKS $be_random);
   8:     no strict 'refs';
   9: 
  10: =head1 NAME
  11: 
  12: unify-dirs - Make identical files in two directories the same inode
  13: 
  14: =cut
  15: 
  16:     push @POD_HOOKS, NAME => sub {
  17: 	my @m;
  18: 	( @m = m/(\S+) - (.*)/ ) &&
  19: 	    do { *{PROGNAME}   = sub { $m[0] };
  20: 		 *{SHORT_DESC} = sub { $m[1] }; }
  21: 	};
  22: 
  23: =head1 SYNOPSIS
  24: 
  25: unify-dirs [options] dir1 dir2 [...]
  26: 
  27: =cut
  28: 
  29:     push @POD_HOOKS, SYNOPSIS => sub {
  30: 	my $a = $_;
  31: 	*{SYNOPSIS} = sub { $a }
  32:     };
  33: 
  34: =head1 DESCRIPTION
  35: 
  36: unify-dirs will traverse the given passed directories, and if it finds
  37: files that are identical (in name, contents, permission and mode), it
  38: will hard-link them and set the "immutable" and "immutable linkage
  39: invert" flags (if selected).
  40: 
  41: =cut
  42: 
  43:     push @POD_HOOKS, DESCRIPTION => sub {
  44: 	my $a = $_;
  45: 	*{DESCRIPTION} = sub { $a }
  46:     };
  47: 
  48: =head1 COMMAND LINE OPTIONS
  49: 
  50: The following command line options are available:
  51: 
  52: =cut
  53: 
  54:     # Extract the command line options for the "usage" screen from the
  55:     # POD ;-)
  56:     use vars qw(@options);
  57: 
  58:     push @POD_HOOKS, 'COMMAND LINE OPTIONS' => sub {
  59: 	# This hook is deleted below under RELEASE
  60: 	&Pod::Constants::add_hook
  61: 		(#-debug => 1,
  62: 		 '*item' => sub {
  63: 		     my ($switches, $description) =
  64: 			 m/^(.*?)\n\n(.*)/s;
  65: 		     my (@switches, $longest);
  66: 		     $longest = "";
  67: 		     for my $switch
  68: 			 ($switches =~ m/\G
  69: 					 ((?:-\w|--\w+))
  70: 					 (?:,\s*)?
  71: 					 /gx) {
  72: 			     push @switches, $switch;
  73: 			     if ( length $switch > length $longest) {
  74: 				 $longest = $switch;
  75: 			     }
  76: 			 }
  77: 		     $longest =~ s/^-*//;
  78: 		     push @options,
  79: 			 $longest, {
  80: 				    options => \@switches,
  81: 				    description => $description,
  82: 				   };
  83: 		 }
  84: 		);
  85:     };
  86: 
  87: =over 4
  88: 
  89: =item -h, --help
  90: 
  91: Display program usage
  92: 
  93: =item -v, --verbose
  94: 
  95: Verbose program execution
  96: 
  97: =item -d, --debug
  98: 
  99: Even more verbose program execution
 100: 
 101: =item -V, --version
 102: 
 103: Print the program version
 104: 
 105: =item -i, --immutable
 106: 
 107: Sets the "immutable" inode attribute.
 108: 
 109: =item -l, --linkage
 110: 
 111: Sets the "immutable linkage invert" inode attribute.
 112: 
 113: =item -r, --random
 114: 
 115: Turns on randomising of directory scanning and tree traversal.  This
 116: option tries to prevent against racing symlink attacks.  A better
 117: solution is planned.
 118: 
 119: =back
 120: 
 121: =head1 INODE ATTRIBUTES AND IMMUTABILITY
 122: 
 123: Hard linking identical files between directories has a drawback: if
 124: one is changed, then the other one changes too.
 125: 
 126: To counter this, you can set the "immutable" inode attribute on
 127: combined files (see L<chattr>).
 128: 
 129: Setting inode attribute requires root privileges, C<CAP_SYS_ATTR>, and
 130: a filesystem that supports it.  Currently this includes default ext2
 131: and ext3 in any recent kernel, or reiserfs with the "inode attributes"
 132: patch applied (available from
 133: C<ftp://ftp.namesys.com/pub/reiserfs-for-2.4/2.4.18.pending/>).
 134: 
 135: The problem with setting "immutable" is that then the file can not be
 136: unlinked or renamed.  In the case where you have a user without
 137: CAP_SYS_ATTR, but otherwise with write permission to a file, they
 138: cannot then change it.
 139: 
 140: In comes the "immutable linkage invert" flag.  This flag will toggle
 141: immutability of the file E<name>, but leave the file E<contents>,
 142: E<permissions>, etc protected.  This means that you can unlink the
 143: file, and hence replace it, edit it with most editors, etc.
 144: 
 145: This option requires a kernel patch - it is included in the s_context
 146: patch for the vserver project, which is at
 147: L<http://www.solucorp.qc.ca/miscprj/s_context.hc>).  It is also
 148: available on its own from L<http://sam.vilain.net/immutable/>.
 149: 
 150: This works well with ext2 and ext3, but is a little trickier to get
 151: working with reiserfs, as inode attributes are not a standard reiserfs
 152: feature.  See the above link for more information.
 153: 
 154: =head1 RELEASE
 155: 
 156: This is unify-dirs version 0.1.
 157: 
 158: =cut
 159: 
 160:     push @POD_HOOKS, RELEASE => sub {
 161: 	Pod::Constants::delete_hook("*item");
 162: 	my $v;
 163: 	(($v) = m/(\d+\.\d+)/) && (*{VERSION} = sub {$v});
 164:     };
 165: 
 166: 
 167: };
 168: 
 169: no strict 'subs';
 170: 
 171: sub abort {
 172:     print STDERR &PROGNAME.": aborting: @_\n", short_usage();
 173:     exit(1);
 174: }
 175: 
 176: sub barf { print STDERR &PROGNAME.": @_\n"; exit(1) }
 177: sub moan { print STDERR &PROGNAME.": WARNING: @_\n"; }
 178: sub say { print &PROGNAME.": @_\n"; }
 179: sub mutter { }
 180: sub whisper { }
 181: 
 182: use strict 'subs';
 183: 
 184: #=====================================================================
 185: #  MAIN SECTION STARTS HERE
 186: #=====================================================================
 187: my ($action, @dirs, $immutable, $linkage, $mode);
 188: 
 189: {
 190:     use Getopt::Long;
 191: 
 192:     no strict "refs", 'vars';
 193:     local ($^W) = 0;
 194: 
 195:     $action = "unify";
 196:     $mode = "";
 197: 
 198:     Getopt::Long::config("bundling");
 199:     #Getopt::Long::config("pass_through");
 200:     Getopt::Long::GetOptions
 201: 	    (
 202: 	     'help|h' => sub { $action = "show_help" },
 203: 	     'version|V' => sub { $action = "show_version" },
 204: 	     'verbose|v' => sub { *{"mutter"} = \&say },
 205: 	     'debug|d' => sub { *{"whisper"} = *{"mutter"} = \&say },
 206: 	     'immutable|i' => \$immutable,
 207: 	     'linkage|l' => \$linkage,
 208: 	     'random|r' => sub { $be_random = 1 },
 209: 	    );
 210: 
 211:     $mode .= "i" if $immutable;
 212:     $mode .= "I" if $linkage;
 213: 
 214:     if ( ! -t STDOUT ) {
 215: 	if ( -t STDERR ) {
 216: 	    eval "sub say { print STDERR \"\@_\n\" }";
 217: 	} else {
 218: 	    eval "sub say { }";
 219: 	}
 220:     }
 221: 
 222:     if ($action eq "show_help") {
 223: 	print usage();
 224: 	exit(0);
 225:     } elsif ($action eq "show_version") {
 226: 	print version(), "\n";
 227: 	exit(0);
 228:     }
 229: 
 230: }
 231: 
 232: while ( my $dir = shift @ARGV ) {
 233:     -d $dir or abort "Invalid directory $dir";
 234:     push @dirs, $dir;
 235: }
 236: 
 237: (@dirs > 1) or abort "Need at least two paths to unify";
 238: 
 239: say "Unifying: @dirs";
 240: 
 241: unify_dirs($mode, @dirs);
 242: 
 243: say "done";
 244: exit(0);
 245: 
 246: =head1 INTERNAL FUNCTIONS
 247: 
 248: Documented for prosperity
 249: 
 250: =head2 digest_file($filename)
 251: 
 252: returns a HASH ref containing stat into and a checksum of the file
 253: named by the given filename:
 254: 
 255:     { sha1 => $sha1_checksum,
 256:       stat => [ stat_info ]   }
 257: 
 258: "sha1" will be missing from the hash if the file is not regular (ie, a
 259: block special device, etc).
 260: 
 261: =cut
 262: 
 263: use Digest::SHA1;
 264: use IO::File;
 265: 
 266: sub digest_file {
 267:     my $filename = shift;
 268: 
 269:     my @stat = lstat $filename;
 270: 
 271:     my %returnval = ( stat => \@stat );
 272: 
 273:     if ( -f _ ) {
 274: 
 275: 	whisper "digesting $filename";
 276: 
 277: 	sysopen FILE, $filename, O_RDONLY
 278: 	    or die "could not open $filename for reading; $!";
 279: 
 280: 	my $block_size = $stat[11] || 2**16;
 281: 	my ($buffer, $bytes_read);
 282: 	my $sha1 = Digest::SHA1->new;
 283: 
 284: 	$sha1->add($buffer)
 285: 	    while ( $bytes_read = sysread FILE, $buffer, $block_size );
 286: 
 287: 	die "Error digesting $filename; $!" unless defined $bytes_read;
 288: 
 289: 	close FILE;
 290: 	$returnval{sha1} = $sha1->hexdigest;
 291: 	whisper "disgest is $returnval{sha1}";
 292: 
 293:     } else {
 294: 	whisper "not digesting $filename - not a regular file";
 295:     }
 296: 
 297:     return \%returnval;
 298: }
 299: 
 300: =head2 chattr ($filename, $attr_string)
 301: 
 302: Calls the C<EXT2_IOC_SETFLAGS> IOCTL on $filename, setting flags as
 303: per $attr_string.
 304: 
 305: =over
 306: 
 307: =item Permitted chattr attributes
 308: 
 309: The following table lists the allowed contents of $attr_string, and
 310: the corresponding bitmask to the IOCTL.
 311: 
 312:   i => 0x00000010    ("immutable")
 313:   I => 0x00008000    ("immutable linkage invert")
 314: 
 315: =back
 316: 
 317: =cut
 318: 
 319: BEGIN { use vars qw(%attr);
 320: 	push @POD_HOOKS,
 321: 	    "Permitted chattr attributes" => sub {
 322: 		%attr = map {
 323: 		    if ( m/(\w) => (0x\w+)/) {
 324: 			$1 => pack("L", hex($2));
 325: 		    } else { () }
 326: 		} split /\r?\n/, $_;
 327: 	    }; }
 328: 
 329: sub chattr {
 330:     my( $file, $attr_str ) = @_;
 331:     whisper ("chattr($file, $attr_str)");
 332:     my $EXT2_IOC_SETFLAGS = 0x40046602;
 333:     my $flags = pack("L", 0);
 334:     while( my $flag = chop($attr_str) ){
 335: 	$flags |= $attr{$flag};
 336:     }
 337:     open( F, $file ) or die "Can't open $file : $!";
 338:     ioctl( F, $EXT2_IOC_SETFLAGS, $flags ) or die "Can't set attr($flags) on $file: $!";
 339:     close F;
 340: }
 341: 
 342: =head2 unify_dirs($mode, @dirs)
 343: 
 344: This is the function that does the main work; it takes an arbitrary
 345: list of directories and combines them.
 346: 
 347: This function calls itself recursively for sub-directories.  It also
 348: uses the same trick "find" does to avoid performing unnecessary
 349: C<stat()>'s (see L<perlfunc>) where possible.
 350: 
 351: It is probably quite succeptible to missing possible unification in
 352: race condition situations; however it should never perform an
 353: incorrect unification.
 354: 
 355: If $mode is set, it specifies arguments to the C<chattr> function,
 356: above.
 357: 
 358: FIXME - should probably put a secure chdir() in here later, otherwise
 359: we may be vulnerable to racing symlink attacks.  To counter this, I've
 360: added some entropy throughout the function, to make it really hard to
 361: predict what order the program is going to do things.  Enable it with
 362: `C<-r>'
 363: 
 364: =cut
 365: 
 366: use ReadDir qw(&readdir_inode);
 367: 
 368: sub unify_dirs {
 369:     my $mode = shift;
 370: 
 371:     # get rid of the invalid directories
 372:     my @dirs = map { -d $_ ? $_ : () } @_;
 373:     return if @dirs == 1;
 374:     (@dirs = sort { $be_random *= -1 } @dirs) if ( $be_random );
 375: 
 376:     # %files is a (filename => inode) hash for this directory
 377:     my %files;
 378: 
 379:     # This holds per-directory-to-be-unified information on the files;
 380:     # $dir{$dir}->{$filename} = { sha1 => $sha1_checksum,
 381:     #                             stat => [ stat_info ]   }
 382:     # or just the inode number if it hasn't been stat'd yet
 383:     my %dir;
 384: 
 385:     # this contains a list of sub-directories of the current directory
 386:     my @subdirs;
 387: 
 388:     for my $dir ( @dirs ) {
 389: 	whisper "Processing $dir...";
 390: 
 391: 	$dir{$dir} = { };
 392: 	my @dirents = readdir_inode $dir;
 393: 	whisper "Readdir OK";
 394: 	(@dirents = sort { $be_random *= -1 } @dirents)
 395: 	    if ( $be_random );
 396: 
 397: 	# So we don't stat every file in a directory if we don't need
 398: 	# to.  This is the same optimisation that "find" uses.
 399: 	my $num_dirs = ((lstat $dir)[3]) - 2;
 400: 
 401: 	# for each directory entry, see if any other dir has the same
 402: 	# filename
 403: 	while ( my $ref = shift @dirents ) {
 404: 	    my ($filename, $inode) = (@$ref);
 405: 	    next if ($filename =~ m/^\.\.?$/);
 406: 
 407: 	    # we only have to stat entries where there are
 408: 	    # subdirectories remaining.
 409: 	    if ( $num_dirs ) {
 410: 
 411: 		# there are still sub-directories left in here.
 412: 		# look for them.
 413: 		$dir{$dir}->{$filename} =
 414: 		    { stat => [ lstat "$dir/$filename" ] };
 415: 
 416: 		if ( -d _ ) {
 417: 		    # found one!
 418: 		    $num_dirs--;
 419: 		    push @subdirs, $filename;
 420: 		    next;
 421: 		}
 422: 	    } else {
 423: 		# non-directory; remember the inode number
 424: 		$dir{$dir}->{$filename} = $inode;
 425: 	    }
 426: 
 427: 	    if ( exists $files{$filename}
 428: 		 and $files{$filename} != $inode ) {
 429: 
 430: 		# some other mirror has this same filename under a
 431: 		# different inode.  Ler's see if we can unify them.
 432: 		my $my_files_h = $dir{$dir};
 433: 		while ( my ($other_dir, $files_h) = each %dir ) {
 434: 		    next if $other_dir eq $dir;
 435: 
 436: 		    # does this other directory have this file too?
 437: 		    next unless exists $files_h->{$filename};
 438: 
 439: 		    whisper "COMPARE: $dir/$filename vs $other_dir/$filename";
 440: 
 441: 		    # stat both files if we haven't already
 442: 		    $my_files_h->{$filename} =
 443: 			{ stat => [ lstat "$dir/$filename" ] }
 444: 			    unless ref $my_files_h->{$filename} eq "HASH";
 445: 		    $files_h->{$filename} =
 446: 			{ stat => [ lstat "$other_dir/$filename" ] }
 447: 			    unless ref $files_h->{$filename} eq "HASH";
 448: 
 449: 		    my ($mine, $other)
 450: 			= ($my_files_h->{$filename}, $files_h->{$filename});
 451: 
 452: 		    # compare file sizes, skip if different
 453: 		    whisper "file stat() info different", next
 454: 			if ( $mine->{stat}->[7] != $other->{stat}->[7] or
 455: 			     $mine->{stat}->[2] != $other->{stat}->[2] or
 456: 			     $mine->{stat}->[4] != $other->{stat}->[4] or
 457: 			     $mine->{stat}->[5] != $other->{stat}->[5] or
 458: 			     $mine->{stat}->[1] == $other->{stat}->[1] );
 459: 
 460: 		    # gather SHA1 checksums
 461: 		    $my_files_h->{$filename} = digest_file "$dir/$filename"
 462: 			unless exists $my_files_h->{$filename}->{sha1};
 463: 		    $files_h->{$filename} = digest_file "$other_dir/$filename"
 464: 			unless exists $files_h->{$filename}->{sha1};
 465: 
 466: 		    ($mine, $other)
 467: 			= ($my_files_h->{$filename}, $files_h->{$filename});
 468: 
 469: 		    # skip if different ("sha1" won't be set if
 470: 		    # this isn't a regular file)
 471: 		    whisper "not suitable for unification", next
 472: 			unless ( $mine->{sha1} and $other->{sha1} );
 473: 
 474: 		    whisper "sha1: $mine->{sha1} vs $other->{sha1}";
 475: 		    next unless ( $mine->{sha1} eq $other->{sha1} );
 476: 
 477: 		    my $diff = ($mine->{stat}->[3] - $other->{stat}->[3]);
 478: 
 479: 		    my ($src, $dest)
 480: 			= ("$dir/$filename", "$other_dir/$filename");
 481: 
 482: 		    if ( $diff < 0 or (!$diff and rand(1) < 0.5)) {
 483: 			($src, $dest) = ($dest, $src);
 484: 			$my_files_h->{$filename} =
 485: 			    $files_h->{$filename};
 486: 		    } else {
 487: 			$files_h->{$filename} =
 488: 			    $my_files_h->{$filename};
 489: 		    }
 490: 
 491: 		    mutter "Linking $src on top of $dest";
 492: 
 493: 		    chattr($src, $mode) if ( $mode and $mode eq "iI" );
 494: 		    chattr($src, "") if ( $mode and $mode eq "i" );
 495: 
 496: 		    link $src, $dest."unify$$" or do { 
 497: 			die "link ($src,${dest}unify$$) failed; $!";
 498: 			next;
 499: 		    };
 500: 
 501: 		    rename $dest."unify$$", $dest or do {
 502: 			moan "rename (${dest}unify$$, $dest) failed; $!";
 503: 			unlink $dest."unify$$";
 504: 			exit(1);
 505: 			next;
 506: 		    };
 507: 
 508: 		    chattr($src, $mode) if ( $mode and $mode ne "iI" );
 509: 
 510: 		    @{ $my_files_h->{$filename}->{stat} } = lstat $src;
 511: 
 512: 		}
 513: 	    } else {
 514: 		# first directory to have this file, just remember the
 515: 		# inode number.
 516: 		$files{$filename} = $inode;
 517: 	    }
 518: 	}
 519:     }
 520: 
 521:     %files=%dir=();
 522: 
 523:     # sponsored by the Friends of Recursion Society of Great Britain.
 524:     for my $subdir ( @subdirs ) {
 525: 	unify_dirs ($mode, map { "$_/$subdir" } @dirs);
 526:     }
 527: }
 528: 
 529: 
 530: BEGIN {
 531: 
 532:     eval "use Pod::Constants -trim => 1, \@POD_HOOKS";
 533:     die $@ if $@;
 534: 
 535: }
 536: 
 537: #---------------------------------------------------------------------
 538: #  Usage functions
 539: #---------------------------------------------------------------------
 540: 
 541: sub short_usage {
 542:     return ("Usage: ${\(SYNOPSIS)}\n"
 543: 	    ."Try `${\(PROGNAME)} --help' for a summary of options."
 544: 	    ."\n");
 545: }
 546: 
 547: use Text::Wrap qw(wrap fill);
 548: use Term::ReadKey;
 549: 
 550: =head2 usage
 551: 
 552: Prints the program usage (extracted from the POD).
 553: 
 554: =cut
 555: 
 556: sub usage {
 557: 
 558:     # alright, I'm admit this function is silly.
 559: 
 560:     my $options_string;
 561:     my $OPTIONS_INDENT = 2;
 562:     my $OPTIONS_WIDTH = 20;
 563:     my $OPTIONS_GAP = 2;
 564:     my $TOTAL_WIDTH = (GetTerminalSize())[0] - 10 || 70;
 565: 
 566:     my $DESCRIPTION_WIDTH = ($TOTAL_WIDTH - $OPTIONS_GAP -
 567: 			     $OPTIONS_INDENT - $OPTIONS_WIDTH);
 568: 
 569:     # go through each option, and format it for the screen
 570: 
 571:     for ( my $i = 0; $i < (@options>>1); $i ++ ) {
 572: 	my $option = $options[$i*2 + 1];
 573: 
 574: 	$Text::Wrap::huge = "overflow";
 575: 	$Text::Wrap::columns = $OPTIONS_WIDTH;
 576: 	my @lhs = map { split /\n/ }
 577: 	    wrap("","",join ", ",
 578: 		 sort { length $a <=> length $b }
 579: 		 @{$option->{options}});
 580: 
 581: 	$Text::Wrap::huge = "wrap";
 582: 	$Text::Wrap::columns = $DESCRIPTION_WIDTH;
 583: 	my @rhs = map { split /\n/ }
 584: 	    fill("","",$option->{description});
 585: 
 586: 	while ( @lhs or @rhs ) {
 587: 	    my $left = shift @lhs;
 588: 	    my $right = shift @rhs;
 589: 	    chomp($left);
 590: 	    $options_string .= join
 591: 		("",
 592: 		 " " x $OPTIONS_INDENT,
 593: 		 $left . (" " x ($OPTIONS_WIDTH - length $left)),
 594: 		 " " x $OPTIONS_GAP,
 595: 		 $right,
 596: 		 "\n");
 597: 	}
 598:     }
 599: 
 600:     $Text::Wrap::huge = "overflow";
 601:     $Text::Wrap::columns = $TOTAL_WIDTH;
 602: 
 603: 
 604:     return (fill("","",PROGNAME . " - " . SHORT_DESC)
 605: 	    ."\n\n"
 606: 	    ."Usage: ".SYNOPSIS."\n\n"
 607: 	    .fill("  ","",DESCRIPTION)."\n\n"
 608: 	    .fill("  ","","Command line options:")
 609: 	    ."\n\n"
 610: 	    .$options_string."\n"
 611: 	    ."See `perldoc $0' for more information.\n");
 612: }
 613: 
 614: __END__
 615: 
 616: =head1 AUTHOR
 617: 
 618: Sam Vilain, <sam@vilain.net>
 619: 
 620: =cut
 621: 

Replies are listed 'Best First'.
Re: Unify two directory structures (make identical files hard links)
by Matts (Deacon) on Mar 11, 2002 at 10:19 UTC
    I didn't read it in detail, but you could potentially have saved yourself quite a bit of code by using File::Same.