1: #!/usr/bin/perl -w
2:
3: use strict;
4:
5: BEGIN {
6:
7: use vars qw(@POD_HOOKS $be_random);
8: no strict 'refs';
9:
10: =head1 NAME
11:
12: unify-dirs - Make identical files in two directories the same inode
13:
14: =cut
15:
16: push @POD_HOOKS, NAME => sub {
17: my @m;
18: ( @m = m/(\S+) - (.*)/ ) &&
19: do { *{PROGNAME} = sub { $m[0] };
20: *{SHORT_DESC} = sub { $m[1] }; }
21: };
22:
23: =head1 SYNOPSIS
24:
25: unify-dirs [options] dir1 dir2 [...]
26:
27: =cut
28:
29: push @POD_HOOKS, SYNOPSIS => sub {
30: my $a = $_;
31: *{SYNOPSIS} = sub { $a }
32: };
33:
34: =head1 DESCRIPTION
35:
36: unify-dirs will traverse the given passed directories, and if it finds
37: files that are identical (in name, contents, permission and mode), it
38: will hard-link them and set the "immutable" and "immutable linkage
39: invert" flags (if selected).
40:
41: =cut
42:
43: push @POD_HOOKS, DESCRIPTION => sub {
44: my $a = $_;
45: *{DESCRIPTION} = sub { $a }
46: };
47:
48: =head1 COMMAND LINE OPTIONS
49:
50: The following command line options are available:
51:
52: =cut
53:
54: # Extract the command line options for the "usage" screen from the
55: # POD ;-)
56: use vars qw(@options);
57:
58: push @POD_HOOKS, 'COMMAND LINE OPTIONS' => sub {
59: # This hook is deleted below under RELEASE
60: &Pod::Constants::add_hook
61: (#-debug => 1,
62: '*item' => sub {
63: my ($switches, $description) =
64: m/^(.*?)\n\n(.*)/s;
65: my (@switches, $longest);
66: $longest = "";
67: for my $switch
68: ($switches =~ m/\G
69: ((?:-\w|--\w+))
70: (?:,\s*)?
71: /gx) {
72: push @switches, $switch;
73: if ( length $switch > length $longest) {
74: $longest = $switch;
75: }
76: }
77: $longest =~ s/^-*//;
78: push @options,
79: $longest, {
80: options => \@switches,
81: description => $description,
82: };
83: }
84: );
85: };
86:
87: =over 4
88:
89: =item -h, --help
90:
91: Display program usage
92:
93: =item -v, --verbose
94:
95: Verbose program execution
96:
97: =item -d, --debug
98:
99: Even more verbose program execution
100:
101: =item -V, --version
102:
103: Print the program version
104:
105: =item -i, --immutable
106:
107: Sets the "immutable" inode attribute.
108:
109: =item -l, --linkage
110:
111: Sets the "immutable linkage invert" inode attribute.
112:
113: =item -r, --random
114:
115: Turns on randomising of directory scanning and tree traversal. This
116: option tries to prevent against racing symlink attacks. A better
117: solution is planned.
118:
119: =back
120:
121: =head1 INODE ATTRIBUTES AND IMMUTABILITY
122:
123: Hard linking identical files between directories has a drawback: if
124: one is changed, then the other one changes too.
125:
126: To counter this, you can set the "immutable" inode attribute on
127: combined files (see L<chattr>).
128:
129: Setting inode attribute requires root privileges, C<CAP_SYS_ATTR>, and
130: a filesystem that supports it. Currently this includes default ext2
131: and ext3 in any recent kernel, or reiserfs with the "inode attributes"
132: patch applied (available from
133: C<ftp://ftp.namesys.com/pub/reiserfs-for-2.4/2.4.18.pending/>).
134:
135: The problem with setting "immutable" is that then the file can not be
136: unlinked or renamed. In the case where you have a user without
137: CAP_SYS_ATTR, but otherwise with write permission to a file, they
138: cannot then change it.
139:
140: In comes the "immutable linkage invert" flag. This flag will toggle
141: immutability of the file E<name>, but leave the file E<contents>,
142: E<permissions>, etc protected. This means that you can unlink the
143: file, and hence replace it, edit it with most editors, etc.
144:
145: This option requires a kernel patch - it is included in the s_context
146: patch for the vserver project, which is at
147: L<http://www.solucorp.qc.ca/miscprj/s_context.hc>). It is also
148: available on its own from L<http://sam.vilain.net/immutable/>.
149:
150: This works well with ext2 and ext3, but is a little trickier to get
151: working with reiserfs, as inode attributes are not a standard reiserfs
152: feature. See the above link for more information.
153:
154: =head1 RELEASE
155:
156: This is unify-dirs version 0.1.
157:
158: =cut
159:
160: push @POD_HOOKS, RELEASE => sub {
161: Pod::Constants::delete_hook("*item");
162: my $v;
163: (($v) = m/(\d+\.\d+)/) && (*{VERSION} = sub {$v});
164: };
165:
166:
167: };
168:
169: no strict 'subs';
170:
171: sub abort {
172: print STDERR &PROGNAME.": aborting: @_\n", short_usage();
173: exit(1);
174: }
175:
176: sub barf { print STDERR &PROGNAME.": @_\n"; exit(1) }
177: sub moan { print STDERR &PROGNAME.": WARNING: @_\n"; }
178: sub say { print &PROGNAME.": @_\n"; }
179: sub mutter { }
180: sub whisper { }
181:
182: use strict 'subs';
183:
184: #=====================================================================
185: # MAIN SECTION STARTS HERE
186: #=====================================================================
187: my ($action, @dirs, $immutable, $linkage, $mode);
188:
189: {
190: use Getopt::Long;
191:
192: no strict "refs", 'vars';
193: local ($^W) = 0;
194:
195: $action = "unify";
196: $mode = "";
197:
198: Getopt::Long::config("bundling");
199: #Getopt::Long::config("pass_through");
200: Getopt::Long::GetOptions
201: (
202: 'help|h' => sub { $action = "show_help" },
203: 'version|V' => sub { $action = "show_version" },
204: 'verbose|v' => sub { *{"mutter"} = \&say },
205: 'debug|d' => sub { *{"whisper"} = *{"mutter"} = \&say },
206: 'immutable|i' => \$immutable,
207: 'linkage|l' => \$linkage,
208: 'random|r' => sub { $be_random = 1 },
209: );
210:
211: $mode .= "i" if $immutable;
212: $mode .= "I" if $linkage;
213:
214: if ( ! -t STDOUT ) {
215: if ( -t STDERR ) {
216: eval "sub say { print STDERR \"\@_\n\" }";
217: } else {
218: eval "sub say { }";
219: }
220: }
221:
222: if ($action eq "show_help") {
223: print usage();
224: exit(0);
225: } elsif ($action eq "show_version") {
226: print version(), "\n";
227: exit(0);
228: }
229:
230: }
231:
232: while ( my $dir = shift @ARGV ) {
233: -d $dir or abort "Invalid directory $dir";
234: push @dirs, $dir;
235: }
236:
237: (@dirs > 1) or abort "Need at least two paths to unify";
238:
239: say "Unifying: @dirs";
240:
241: unify_dirs($mode, @dirs);
242:
243: say "done";
244: exit(0);
245:
246: =head1 INTERNAL FUNCTIONS
247:
248: Documented for prosperity
249:
250: =head2 digest_file($filename)
251:
252: returns a HASH ref containing stat into and a checksum of the file
253: named by the given filename:
254:
255: { sha1 => $sha1_checksum,
256: stat => [ stat_info ] }
257:
258: "sha1" will be missing from the hash if the file is not regular (ie, a
259: block special device, etc).
260:
261: =cut
262:
263: use Digest::SHA1;
264: use IO::File;
265:
266: sub digest_file {
267: my $filename = shift;
268:
269: my @stat = lstat $filename;
270:
271: my %returnval = ( stat => \@stat );
272:
273: if ( -f _ ) {
274:
275: whisper "digesting $filename";
276:
277: sysopen FILE, $filename, O_RDONLY
278: or die "could not open $filename for reading; $!";
279:
280: my $block_size = $stat[11] || 2**16;
281: my ($buffer, $bytes_read);
282: my $sha1 = Digest::SHA1->new;
283:
284: $sha1->add($buffer)
285: while ( $bytes_read = sysread FILE, $buffer, $block_size );
286:
287: die "Error digesting $filename; $!" unless defined $bytes_read;
288:
289: close FILE;
290: $returnval{sha1} = $sha1->hexdigest;
291: whisper "disgest is $returnval{sha1}";
292:
293: } else {
294: whisper "not digesting $filename - not a regular file";
295: }
296:
297: return \%returnval;
298: }
299:
300: =head2 chattr ($filename, $attr_string)
301:
302: Calls the C<EXT2_IOC_SETFLAGS> IOCTL on $filename, setting flags as
303: per $attr_string.
304:
305: =over
306:
307: =item Permitted chattr attributes
308:
309: The following table lists the allowed contents of $attr_string, and
310: the corresponding bitmask to the IOCTL.
311:
312: i => 0x00000010 ("immutable")
313: I => 0x00008000 ("immutable linkage invert")
314:
315: =back
316:
317: =cut
318:
319: BEGIN { use vars qw(%attr);
320: push @POD_HOOKS,
321: "Permitted chattr attributes" => sub {
322: %attr = map {
323: if ( m/(\w) => (0x\w+)/) {
324: $1 => pack("L", hex($2));
325: } else { () }
326: } split /\r?\n/, $_;
327: }; }
328:
329: sub chattr {
330: my( $file, $attr_str ) = @_;
331: whisper ("chattr($file, $attr_str)");
332: my $EXT2_IOC_SETFLAGS = 0x40046602;
333: my $flags = pack("L", 0);
334: while( my $flag = chop($attr_str) ){
335: $flags |= $attr{$flag};
336: }
337: open( F, $file ) or die "Can't open $file : $!";
338: ioctl( F, $EXT2_IOC_SETFLAGS, $flags ) or die "Can't set attr($flags) on $file: $!";
339: close F;
340: }
341:
342: =head2 unify_dirs($mode, @dirs)
343:
344: This is the function that does the main work; it takes an arbitrary
345: list of directories and combines them.
346:
347: This function calls itself recursively for sub-directories. It also
348: uses the same trick "find" does to avoid performing unnecessary
349: C<stat()>'s (see L<perlfunc>) where possible.
350:
351: It is probably quite succeptible to missing possible unification in
352: race condition situations; however it should never perform an
353: incorrect unification.
354:
355: If $mode is set, it specifies arguments to the C<chattr> function,
356: above.
357:
358: FIXME - should probably put a secure chdir() in here later, otherwise
359: we may be vulnerable to racing symlink attacks. To counter this, I've
360: added some entropy throughout the function, to make it really hard to
361: predict what order the program is going to do things. Enable it with
362: `C<-r>'
363:
364: =cut
365:
366: use ReadDir qw(&readdir_inode);
367:
368: sub unify_dirs {
369: my $mode = shift;
370:
371: # get rid of the invalid directories
372: my @dirs = map { -d $_ ? $_ : () } @_;
373: return if @dirs == 1;
374: (@dirs = sort { $be_random *= -1 } @dirs) if ( $be_random );
375:
376: # %files is a (filename => inode) hash for this directory
377: my %files;
378:
379: # This holds per-directory-to-be-unified information on the files;
380: # $dir{$dir}->{$filename} = { sha1 => $sha1_checksum,
381: # stat => [ stat_info ] }
382: # or just the inode number if it hasn't been stat'd yet
383: my %dir;
384:
385: # this contains a list of sub-directories of the current directory
386: my @subdirs;
387:
388: for my $dir ( @dirs ) {
389: whisper "Processing $dir...";
390:
391: $dir{$dir} = { };
392: my @dirents = readdir_inode $dir;
393: whisper "Readdir OK";
394: (@dirents = sort { $be_random *= -1 } @dirents)
395: if ( $be_random );
396:
397: # So we don't stat every file in a directory if we don't need
398: # to. This is the same optimisation that "find" uses.
399: my $num_dirs = ((lstat $dir)[3]) - 2;
400:
401: # for each directory entry, see if any other dir has the same
402: # filename
403: while ( my $ref = shift @dirents ) {
404: my ($filename, $inode) = (@$ref);
405: next if ($filename =~ m/^\.\.?$/);
406:
407: # we only have to stat entries where there are
408: # subdirectories remaining.
409: if ( $num_dirs ) {
410:
411: # there are still sub-directories left in here.
412: # look for them.
413: $dir{$dir}->{$filename} =
414: { stat => [ lstat "$dir/$filename" ] };
415:
416: if ( -d _ ) {
417: # found one!
418: $num_dirs--;
419: push @subdirs, $filename;
420: next;
421: }
422: } else {
423: # non-directory; remember the inode number
424: $dir{$dir}->{$filename} = $inode;
425: }
426:
427: if ( exists $files{$filename}
428: and $files{$filename} != $inode ) {
429:
430: # some other mirror has this same filename under a
431: # different inode. Ler's see if we can unify them.
432: my $my_files_h = $dir{$dir};
433: while ( my ($other_dir, $files_h) = each %dir ) {
434: next if $other_dir eq $dir;
435:
436: # does this other directory have this file too?
437: next unless exists $files_h->{$filename};
438:
439: whisper "COMPARE: $dir/$filename vs $other_dir/$filename";
440:
441: # stat both files if we haven't already
442: $my_files_h->{$filename} =
443: { stat => [ lstat "$dir/$filename" ] }
444: unless ref $my_files_h->{$filename} eq "HASH";
445: $files_h->{$filename} =
446: { stat => [ lstat "$other_dir/$filename" ] }
447: unless ref $files_h->{$filename} eq "HASH";
448:
449: my ($mine, $other)
450: = ($my_files_h->{$filename}, $files_h->{$filename});
451:
452: # compare file sizes, skip if different
453: whisper "file stat() info different", next
454: if ( $mine->{stat}->[7] != $other->{stat}->[7] or
455: $mine->{stat}->[2] != $other->{stat}->[2] or
456: $mine->{stat}->[4] != $other->{stat}->[4] or
457: $mine->{stat}->[5] != $other->{stat}->[5] or
458: $mine->{stat}->[1] == $other->{stat}->[1] );
459:
460: # gather SHA1 checksums
461: $my_files_h->{$filename} = digest_file "$dir/$filename"
462: unless exists $my_files_h->{$filename}->{sha1};
463: $files_h->{$filename} = digest_file "$other_dir/$filename"
464: unless exists $files_h->{$filename}->{sha1};
465:
466: ($mine, $other)
467: = ($my_files_h->{$filename}, $files_h->{$filename});
468:
469: # skip if different ("sha1" won't be set if
470: # this isn't a regular file)
471: whisper "not suitable for unification", next
472: unless ( $mine->{sha1} and $other->{sha1} );
473:
474: whisper "sha1: $mine->{sha1} vs $other->{sha1}";
475: next unless ( $mine->{sha1} eq $other->{sha1} );
476:
477: my $diff = ($mine->{stat}->[3] - $other->{stat}->[3]);
478:
479: my ($src, $dest)
480: = ("$dir/$filename", "$other_dir/$filename");
481:
482: if ( $diff < 0 or (!$diff and rand(1) < 0.5)) {
483: ($src, $dest) = ($dest, $src);
484: $my_files_h->{$filename} =
485: $files_h->{$filename};
486: } else {
487: $files_h->{$filename} =
488: $my_files_h->{$filename};
489: }
490:
491: mutter "Linking $src on top of $dest";
492:
493: chattr($src, $mode) if ( $mode and $mode eq "iI" );
494: chattr($src, "") if ( $mode and $mode eq "i" );
495:
496: link $src, $dest."unify$$" or do {
497: die "link ($src,${dest}unify$$) failed; $!";
498: next;
499: };
500:
501: rename $dest."unify$$", $dest or do {
502: moan "rename (${dest}unify$$, $dest) failed; $!";
503: unlink $dest."unify$$";
504: exit(1);
505: next;
506: };
507:
508: chattr($src, $mode) if ( $mode and $mode ne "iI" );
509:
510: @{ $my_files_h->{$filename}->{stat} } = lstat $src;
511:
512: }
513: } else {
514: # first directory to have this file, just remember the
515: # inode number.
516: $files{$filename} = $inode;
517: }
518: }
519: }
520:
521: %files=%dir=();
522:
523: # sponsored by the Friends of Recursion Society of Great Britain.
524: for my $subdir ( @subdirs ) {
525: unify_dirs ($mode, map { "$_/$subdir" } @dirs);
526: }
527: }
528:
529:
530: BEGIN {
531:
532: eval "use Pod::Constants -trim => 1, \@POD_HOOKS";
533: die $@ if $@;
534:
535: }
536:
537: #---------------------------------------------------------------------
538: # Usage functions
539: #---------------------------------------------------------------------
540:
541: sub short_usage {
542: return ("Usage: ${\(SYNOPSIS)}\n"
543: ."Try `${\(PROGNAME)} --help' for a summary of options."
544: ."\n");
545: }
546:
547: use Text::Wrap qw(wrap fill);
548: use Term::ReadKey;
549:
550: =head2 usage
551:
552: Prints the program usage (extracted from the POD).
553:
554: =cut
555:
556: sub usage {
557:
558: # alright, I'm admit this function is silly.
559:
560: my $options_string;
561: my $OPTIONS_INDENT = 2;
562: my $OPTIONS_WIDTH = 20;
563: my $OPTIONS_GAP = 2;
564: my $TOTAL_WIDTH = (GetTerminalSize())[0] - 10 || 70;
565:
566: my $DESCRIPTION_WIDTH = ($TOTAL_WIDTH - $OPTIONS_GAP -
567: $OPTIONS_INDENT - $OPTIONS_WIDTH);
568:
569: # go through each option, and format it for the screen
570:
571: for ( my $i = 0; $i < (@options>>1); $i ++ ) {
572: my $option = $options[$i*2 + 1];
573:
574: $Text::Wrap::huge = "overflow";
575: $Text::Wrap::columns = $OPTIONS_WIDTH;
576: my @lhs = map { split /\n/ }
577: wrap("","",join ", ",
578: sort { length $a <=> length $b }
579: @{$option->{options}});
580:
581: $Text::Wrap::huge = "wrap";
582: $Text::Wrap::columns = $DESCRIPTION_WIDTH;
583: my @rhs = map { split /\n/ }
584: fill("","",$option->{description});
585:
586: while ( @lhs or @rhs ) {
587: my $left = shift @lhs;
588: my $right = shift @rhs;
589: chomp($left);
590: $options_string .= join
591: ("",
592: " " x $OPTIONS_INDENT,
593: $left . (" " x ($OPTIONS_WIDTH - length $left)),
594: " " x $OPTIONS_GAP,
595: $right,
596: "\n");
597: }
598: }
599:
600: $Text::Wrap::huge = "overflow";
601: $Text::Wrap::columns = $TOTAL_WIDTH;
602:
603:
604: return (fill("","",PROGNAME . " - " . SHORT_DESC)
605: ."\n\n"
606: ."Usage: ".SYNOPSIS."\n\n"
607: .fill(" ","",DESCRIPTION)."\n\n"
608: .fill(" ","","Command line options:")
609: ."\n\n"
610: .$options_string."\n"
611: ."See `perldoc $0' for more information.\n");
612: }
613:
614: __END__
615:
616: =head1 AUTHOR
617:
618: Sam Vilain, <sam@vilain.net>
619:
620: =cut
621: