comment on

0: #!/usr/bin/perl -w
1: 
2: use strict;
3: 
4: BEGIN {
5: 
6:     use vars qw(@POD_HOOKS $be_random);
7:     no strict 'refs';
8: 
9: =head1 NAME
10: 
11: unify-dirs - Make identical files in two directories the same inode
12: 
13: =cut
14: 
15:     push @POD_HOOKS, NAME => sub {
16: 	my @m;
17: 	( @m = m/(\S+) - (.*)/ ) &&
18: 	    do { *{PROGNAME}   = sub { $m[0] };
19: 		 *{SHORT_DESC} = sub { $m[1] }; }
20: 	};
21: 
22: =head1 SYNOPSIS
23: 
24: unify-dirs [options] dir1 dir2 [...]
25: 
26: =cut
27: 
28:     push @POD_HOOKS, SYNOPSIS => sub {
29: 	my $a = $_;
30: 	*{SYNOPSIS} = sub { $a }
31:     };
32: 
33: =head1 DESCRIPTION
34: 
35: unify-dirs will traverse the given passed directories, and if it finds
36: files that are identical (in name, contents, permission and mode), it
37: will hard-link them and set the "immutable" and "immutable linkage
38: invert" flags (if selected).
39: 
40: =cut
41: 
42:     push @POD_HOOKS, DESCRIPTION => sub {
43: 	my $a = $_;
44: 	*{DESCRIPTION} = sub { $a }
45:     };
46: 
47: =head1 COMMAND LINE OPTIONS
48: 
49: The following command line options are available:
50: 
51: =cut
52: 
53:     # Extract the command line options for the "usage" screen from the
54:     # POD ;-)
55:     use vars qw(@options);
56: 
57:     push @POD_HOOKS, 'COMMAND LINE OPTIONS' => sub {
58: 	# This hook is deleted below under RELEASE
59: 	&Pod::Constants::add_hook
60: 		(#-debug => 1,
61: 		 '*item' => sub {
62: 		     my ($switches, $description) =
63: 			 m/^(.*?)\n\n(.*)/s;
64: 		     my (@switches, $longest);
65: 		     $longest = "";
66: 		     for my $switch
67: 			 ($switches =~ m/\G
68: 					 ((?:-\w|--\w+))
69: 					 (?:,\s*)?
70: 					 /gx) {
71: 			     push @switches, $switch;
72: 			     if ( length $switch > length $longest) {
73: 				 $longest = $switch;
74: 			     }
75: 			 }
76: 		     $longest =~ s/^-*//;
77: 		     push @options,
78: 			 $longest, {
79: 				    options => \@switches,
80: 				    description => $description,
81: 				   };
82: 		 }
83: 		);
84:     };
85: 
86: =over 4
87: 
88: =item -h, --help
89: 
90: Display program usage
91: 
92: =item -v, --verbose
93: 
94: Verbose program execution
95: 
96: =item -d, --debug
97: 
98: Even more verbose program execution
99: 
100: =item -V, --version
101: 
102: Print the program version
103: 
104: =item -i, --immutable
105: 
106: Sets the "immutable" inode attribute.
107: 
108: =item -l, --linkage
109: 
110: Sets the "immutable linkage invert" inode attribute.
111: 
112: =item -r, --random
113: 
114: Turns on randomising of directory scanning and tree traversal.  This
115: option tries to prevent against racing symlink attacks.  A better
116: solution is planned.
117: 
118: =back
119: 
120: =head1 INODE ATTRIBUTES AND IMMUTABILITY
121: 
122: Hard linking identical files between directories has a drawback: if
123: one is changed, then the other one changes too.
124: 
125: To counter this, you can set the "immutable" inode attribute on
126: combined files (see L<chattr>).
127: 
128: Setting inode attribute requires root privileges, C<CAP_SYS_ATTR>, and
129: a filesystem that supports it.  Currently this includes default ext2
130: and ext3 in any recent kernel, or reiserfs with the "inode attributes"
131: patch applied (available from
132: C<ftp://ftp.namesys.com/pub/reiserfs-for-2.4/2.4.18.pending/>).
133: 
134: The problem with setting "immutable" is that then the file can not be
135: unlinked or renamed.  In the case where you have a user without
136: CAP_SYS_ATTR, but otherwise with write permission to a file, they
137: cannot then change it.
138: 
139: In comes the "immutable linkage invert" flag.  This flag will toggle
140: immutability of the file E<name>, but leave the file E<contents>,
141: E<permissions>, etc protected.  This means that you can unlink the
142: file, and hence replace it, edit it with most editors, etc.
143: 
144: This option requires a kernel patch - it is included in the s_context
145: patch for the vserver project, which is at
146: L<http://www.solucorp.qc.ca/miscprj/s_context.hc>).  It is also
147: available on its own from L<http://sam.vilain.net/immutable/>.
148: 
149: This works well with ext2 and ext3, but is a little trickier to get
150: working with reiserfs, as inode attributes are not a standard reiserfs
151: feature.  See the above link for more information.
152: 
153: =head1 RELEASE
154: 
155: This is unify-dirs version 0.1.
156: 
157: =cut
158: 
159:     push @POD_HOOKS, RELEASE => sub {
160: 	Pod::Constants::delete_hook("*item");
161: 	my $v;
162: 	(($v) = m/(\d+\.\d+)/) && (*{VERSION} = sub {$v});
163:     };
164: 
165: 
166: };
167: 
168: no strict 'subs';
169: 
170: sub abort {
171:     print STDERR &PROGNAME.": aborting: @_\n", short_usage();
172:     exit(1);
173: }
174: 
175: sub barf { print STDERR &PROGNAME.": @_\n"; exit(1) }
176: sub moan { print STDERR &PROGNAME.": WARNING: @_\n"; }
177: sub say { print &PROGNAME.": @_\n"; }
178: sub mutter { }
179: sub whisper { }
180: 
181: use strict 'subs';
182: 
183: #=====================================================================
184: #  MAIN SECTION STARTS HERE
185: #=====================================================================
186: my ($action, @dirs, $immutable, $linkage, $mode);
187: 
188: {
189:     use Getopt::Long;
190: 
191:     no strict "refs", 'vars';
192:     local ($^W) = 0;
193: 
194:     $action = "unify";
195:     $mode = "";
196: 
197:     Getopt::Long::config("bundling");
198:     #Getopt::Long::config("pass_through");
199:     Getopt::Long::GetOptions
200: 	    (
201: 	     'help|h' => sub { $action = "show_help" },
202: 	     'version|V' => sub { $action = "show_version" },
203: 	     'verbose|v' => sub { *{"mutter"} = \&say },
204: 	     'debug|d' => sub { *{"whisper"} = *{"mutter"} = \&say },
205: 	     'immutable|i' => \$immutable,
206: 	     'linkage|l' => \$linkage,
207: 	     'random|r' => sub { $be_random = 1 },
208: 	    );
209: 
210:     $mode .= "i" if $immutable;
211:     $mode .= "I" if $linkage;
212: 
213:     if ( ! -t STDOUT ) {
214: 	if ( -t STDERR ) {
215: 	    eval "sub say { print STDERR \"\@_\n\" }";
216: 	} else {
217: 	    eval "sub say { }";
218: 	}
219:     }
220: 
221:     if ($action eq "show_help") {
222: 	print usage();
223: 	exit(0);
224:     } elsif ($action eq "show_version") {
225: 	print version(), "\n";
226: 	exit(0);
227:     }
228: 
229: }
230: 
231: while ( my $dir = shift @ARGV ) {
232:     -d $dir or abort "Invalid directory $dir";
233:     push @dirs, $dir;
234: }
235: 
236: (@dirs > 1) or abort "Need at least two paths to unify";
237: 
238: say "Unifying: @dirs";
239: 
240: unify_dirs($mode, @dirs);
241: 
242: say "done";
243: exit(0);
244: 
245: =head1 INTERNAL FUNCTIONS
246: 
247: Documented for prosperity
248: 
249: =head2 digest_file($filename)
250: 
251: returns a HASH ref containing stat into and a checksum of the file
252: named by the given filename:
253: 
254:     { sha1 => $sha1_checksum,
255:       stat => [ stat_info ]   }
256: 
257: "sha1" will be missing from the hash if the file is not regular (ie, a
258: block special device, etc).
259: 
260: =cut
261: 
262: use Digest::SHA1;
263: use IO::File;
264: 
265: sub digest_file {
266:     my $filename = shift;
267: 
268:     my @stat = lstat $filename;
269: 
270:     my %returnval = ( stat => \@stat );
271: 
272:     if ( -f _ ) {
273: 
274: 	whisper "digesting $filename";
275: 
276: 	sysopen FILE, $filename, O_RDONLY
277: 	    or die "could not open $filename for reading; $!";
278: 
279: 	my $block_size = $stat[11] || 2**16;
280: 	my ($buffer, $bytes_read);
281: 	my $sha1 = Digest::SHA1->new;
282: 
283: 	$sha1->add($buffer)
284: 	    while ( $bytes_read = sysread FILE, $buffer, $block_size );
285: 
286: 	die "Error digesting $filename; $!" unless defined $bytes_read;
287: 
288: 	close FILE;
289: 	$returnval{sha1} = $sha1->hexdigest;
290: 	whisper "disgest is $returnval{sha1}";
291: 
292:     } else {
293: 	whisper "not digesting $filename - not a regular file";
294:     }
295: 
296:     return \%returnval;
297: }
298: 
299: =head2 chattr ($filename, $attr_string)
300: 
301: Calls the C<EXT2_IOC_SETFLAGS> IOCTL on $filename, setting flags as
302: per $attr_string.
303: 
304: =over
305: 
306: =item Permitted chattr attributes
307: 
308: The following table lists the allowed contents of $attr_string, and
309: the corresponding bitmask to the IOCTL.
310: 
311:   i => 0x00000010    ("immutable")
312:   I => 0x00008000    ("immutable linkage invert")
313: 
314: =back
315: 
316: =cut
317: 
318: BEGIN { use vars qw(%attr);
319: 	push @POD_HOOKS,
320: 	    "Permitted chattr attributes" => sub {
321: 		%attr = map {
322: 		    if ( m/(\w) => (0x\w+)/) {
323: 			$1 => pack("L", hex($2));
324: 		    } else { () }
325: 		} split /\r?\n/, $_;
326: 	    }; }
327: 
328: sub chattr {
329:     my( $file, $attr_str ) = @_;
330:     whisper ("chattr($file, $attr_str)");
331:     my $EXT2_IOC_SETFLAGS = 0x40046602;
332:     my $flags = pack("L", 0);
333:     while( my $flag = chop($attr_str) ){
334: 	$flags |= $attr{$flag};
335:     }
336:     open( F, $file ) or die "Can't open $file : $!";
337:     ioctl( F, $EXT2_IOC_SETFLAGS, $flags ) or die "Can't set attr($flags) on $file: $!";
338:     close F;
339: }
340: 
341: =head2 unify_dirs($mode, @dirs)
342: 
343: This is the function that does the main work; it takes an arbitrary
344: list of directories and combines them.
345: 
346: This function calls itself recursively for sub-directories.  It also
347: uses the same trick "find" does to avoid performing unnecessary
348: C<stat()>'s (see L<perlfunc>) where possible.
349: 
350: It is probably quite succeptible to missing possible unification in
351: race condition situations; however it should never perform an
352: incorrect unification.
353: 
354: If $mode is set, it specifies arguments to the C<chattr> function,
355: above.
356: 
357: FIXME - should probably put a secure chdir() in here later, otherwise
358: we may be vulnerable to racing symlink attacks.  To counter this, I've
359: added some entropy throughout the function, to make it really hard to
360: predict what order the program is going to do things.  Enable it with
361: `C<-r>'
362: 
363: =cut
364: 
365: use ReadDir qw(&readdir_inode);
366: 
367: sub unify_dirs {
368:     my $mode = shift;
369: 
370:     # get rid of the invalid directories
371:     my @dirs = map { -d $_ ? $_ : () } @_;
372:     return if @dirs == 1;
373:     (@dirs = sort { $be_random *= -1 } @dirs) if ( $be_random );
374: 
375:     # %files is a (filename => inode) hash for this directory
376:     my %files;
377: 
378:     # This holds per-directory-to-be-unified information on the files;
379:     # $dir{$dir}->{$filename} = { sha1 => $sha1_checksum,
380:     #                             stat => [ stat_info ]   }
381:     # or just the inode number if it hasn't been stat'd yet
382:     my %dir;
383: 
384:     # this contains a list of sub-directories of the current directory
385:     my @subdirs;
386: 
387:     for my $dir ( @dirs ) {
388: 	whisper "Processing $dir...";
389: 
390: 	$dir{$dir} = { };
391: 	my @dirents = readdir_inode $dir;
392: 	whisper "Readdir OK";
393: 	(@dirents = sort { $be_random *= -1 } @dirents)
394: 	    if ( $be_random );
395: 
396: 	# So we don't stat every file in a directory if we don't need
397: 	# to.  This is the same optimisation that "find" uses.
398: 	my $num_dirs = ((lstat $dir)[3]) - 2;
399: 
400: 	# for each directory entry, see if any other dir has the same
401: 	# filename
402: 	while ( my $ref = shift @dirents ) {
403: 	    my ($filename, $inode) = (@$ref);
404: 	    next if ($filename =~ m/^\.\.?$/);
405: 
406: 	    # we only have to stat entries where there are
407: 	    # subdirectories remaining.
408: 	    if ( $num_dirs ) {
409: 
410: 		# there are still sub-directories left in here.
411: 		# look for them.
412: 		$dir{$dir}->{$filename} =
413: 		    { stat => [ lstat "$dir/$filename" ] };
414: 
415: 		if ( -d _ ) {
416: 		    # found one!
417: 		    $num_dirs--;
418: 		    push @subdirs, $filename;
419: 		    next;
420: 		}
421: 	    } else {
422: 		# non-directory; remember the inode number
423: 		$dir{$dir}->{$filename} = $inode;
424: 	    }
425: 
426: 	    if ( exists $files{$filename}
427: 		 and $files{$filename} != $inode ) {
428: 
429: 		# some other mirror has this same filename under a
430: 		# different inode.  Ler's see if we can unify them.
431: 		my $my_files_h = $dir{$dir};
432: 		while ( my ($other_dir, $files_h) = each %dir ) {
433: 		    next if $other_dir eq $dir;
434: 
435: 		    # does this other directory have this file too?
436: 		    next unless exists $files_h->{$filename};
437: 
438: 		    whisper "COMPARE: $dir/$filename vs $other_dir/$filename";
439: 
440: 		    # stat both files if we haven't already
441: 		    $my_files_h->{$filename} =
442: 			{ stat => [ lstat "$dir/$filename" ] }
443: 			    unless ref $my_files_h->{$filename} eq "HASH";
444: 		    $files_h->{$filename} =
445: 			{ stat => [ lstat "$other_dir/$filename" ] }
446: 			    unless ref $files_h->{$filename} eq "HASH";
447: 
448: 		    my ($mine, $other)
449: 			= ($my_files_h->{$filename}, $files_h->{$filename});
450: 
451: 		    # compare file sizes, skip if different
452: 		    whisper "file stat() info different", next
453: 			if ( $mine->{stat}->[7] != $other->{stat}->[7] or
454: 			     $mine->{stat}->[2] != $other->{stat}->[2] or
455: 			     $mine->{stat}->[4] != $other->{stat}->[4] or
456: 			     $mine->{stat}->[5] != $other->{stat}->[5] or
457: 			     $mine->{stat}->[1] == $other->{stat}->[1] );
458: 
459: 		    # gather SHA1 checksums
460: 		    $my_files_h->{$filename} = digest_file "$dir/$filename"
461: 			unless exists $my_files_h->{$filename}->{sha1};
462: 		    $files_h->{$filename} = digest_file "$other_dir/$filename"
463: 			unless exists $files_h->{$filename}->{sha1};
464: 
465: 		    ($mine, $other)
466: 			= ($my_files_h->{$filename}, $files_h->{$filename});
467: 
468: 		    # skip if different ("sha1" won't be set if
469: 		    # this isn't a regular file)
470: 		    whisper "not suitable for unification", next
471: 			unless ( $mine->{sha1} and $other->{sha1} );
472: 
473: 		    whisper "sha1: $mine->{sha1} vs $other->{sha1}";
474: 		    next unless ( $mine->{sha1} eq $other->{sha1} );
475: 
476: 		    my $diff = ($mine->{stat}->[3] - $other->{stat}->[3]);
477: 
478: 		    my ($src, $dest)
479: 			= ("$dir/$filename", "$other_dir/$filename");
480: 
481: 		    if ( $diff < 0 or (!$diff and rand(1) < 0.5)) {
482: 			($src, $dest) = ($dest, $src);
483: 			$my_files_h->{$filename} =
484: 			    $files_h->{$filename};
485: 		    } else {
486: 			$files_h->{$filename} =
487: 			    $my_files_h->{$filename};
488: 		    }
489: 
490: 		    mutter "Linking $src on top of $dest";
491: 
492: 		    chattr($src, $mode) if ( $mode and $mode eq "iI" );
493: 		    chattr($src, "") if ( $mode and $mode eq "i" );
494: 
495: 		    link $src, $dest."unify$$" or do { 
496: 			die "link ($src,${dest}unify$$) failed; $!";
497: 			next;
498: 		    };
499: 
500: 		    rename $dest."unify$$", $dest or do {
501: 			moan "rename (${dest}unify$$, $dest) failed; $!";
502: 			unlink $dest."unify$$";
503: 			exit(1);
504: 			next;
505: 		    };
506: 
507: 		    chattr($src, $mode) if ( $mode and $mode ne "iI" );
508: 
509: 		    @{ $my_files_h->{$filename}->{stat} } = lstat $src;
510: 
511: 		}
512: 	    } else {
513: 		# first directory to have this file, just remember the
514: 		# inode number.
515: 		$files{$filename} = $inode;
516: 	    }
517: 	}
518:     }
519: 
520:     %files=%dir=();
521: 
522:     # sponsored by the Friends of Recursion Society of Great Britain.
523:     for my $subdir ( @subdirs ) {
524: 	unify_dirs ($mode, map { "$_/$subdir" } @dirs);
525:     }
526: }
527: 
528: 
529: BEGIN {
530: 
531:     eval "use Pod::Constants -trim => 1, \@POD_HOOKS";
532:     die $@ if $@;
533: 
534: }
535: 
536: #---------------------------------------------------------------------
537: #  Usage functions
538: #---------------------------------------------------------------------
539: 
540: sub short_usage {
541:     return ("Usage: ${\(SYNOPSIS)}\n"
542: 	    ."Try `${\(PROGNAME)} --help' for a summary of options."
543: 	    ."\n");
544: }
545: 
546: use Text::Wrap qw(wrap fill);
547: use Term::ReadKey;
548: 
549: =head2 usage
550: 
551: Prints the program usage (extracted from the POD).
552: 
553: =cut
554: 
555: sub usage {
556: 
557:     # alright, I'm admit this function is silly.
558: 
559:     my $options_string;
560:     my $OPTIONS_INDENT = 2;
561:     my $OPTIONS_WIDTH = 20;
562:     my $OPTIONS_GAP = 2;
563:     my $TOTAL_WIDTH = (GetTerminalSize())[0] - 10 || 70;
564: 
565:     my $DESCRIPTION_WIDTH = ($TOTAL_WIDTH - $OPTIONS_GAP -
566: 			     $OPTIONS_INDENT - $OPTIONS_WIDTH);
567: 
568:     # go through each option, and format it for the screen
569: 
570:     for ( my $i = 0; $i < (@options>>1); $i ++ ) {
571: 	my $option = $options[$i*2 + 1];
572: 
573: 	$Text::Wrap::huge = "overflow";
574: 	$Text::Wrap::columns = $OPTIONS_WIDTH;
575: 	my @lhs = map { split /\n/ }
576: 	    wrap("","",join ", ",
577: 		 sort { length $a <=> length $b }
578: 		 @{$option->{options}});
579: 
580: 	$Text::Wrap::huge = "wrap";
581: 	$Text::Wrap::columns = $DESCRIPTION_WIDTH;
582: 	my @rhs = map { split /\n/ }
583: 	    fill("","",$option->{description});
584: 
585: 	while ( @lhs or @rhs ) {
586: 	    my $left = shift @lhs;
587: 	    my $right = shift @rhs;
588: 	    chomp($left);
589: 	    $options_string .= join
590: 		("",
591: 		 " " x $OPTIONS_INDENT,
592: 		 $left . (" " x ($OPTIONS_WIDTH - length $left)),
593: 		 " " x $OPTIONS_GAP,
594: 		 $right,
595: 		 "\n");
596: 	}
597:     }
598: 
599:     $Text::Wrap::huge = "overflow";
600:     $Text::Wrap::columns = $TOTAL_WIDTH;
601: 
602: 
603:     return (fill("","",PROGNAME . " - " . SHORT_DESC)
604: 	    ."\n\n"
605: 	    ."Usage: ".SYNOPSIS."\n\n"
606: 	    .fill("  ","",DESCRIPTION)."\n\n"
607: 	    .fill("  ","","Command line options:")
608: 	    ."\n\n"
609: 	    .$options_string."\n"
610: 	    ."See `perldoc $0' for more information.\n");
611: }
612: 
613: __END__
614: 
615: =head1 AUTHOR
616: 
617: Sam Vilain, <sam@vilain.net>
618: 
619: =cut
620:
In reply to Unify two directory structures (make identical files hard links) by mugwumpjism
Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
Read Where should I post X? if you're not absolutely sure you're posting in the right place.
Please read these before you post! —
Posts may use any of the Perl Monks Approved HTML tags:
a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`
Link using PerlMonks shortcuts! What shortcuts can I use for linking?
See Writeup Formatting Tips and other pages linked from there for more info.