0: #!/usr/bin/perl -w
1:
2: use strict;
3:
4: BEGIN {
5:
6: use vars qw(@POD_HOOKS $be_random);
7: no strict 'refs';
8:
9: =head1 NAME
10:
11: unify-dirs - Make identical files in two directories the same inode
12:
13: =cut
14:
15: push @POD_HOOKS, NAME => sub {
16: my @m;
17: ( @m = m/(\S+) - (.*)/ ) &&
18: do { *{PROGNAME} = sub { $m[0] };
19: *{SHORT_DESC} = sub { $m[1] }; }
20: };
21:
22: =head1 SYNOPSIS
23:
24: unify-dirs [options] dir1 dir2 [...]
25:
26: =cut
27:
28: push @POD_HOOKS, SYNOPSIS => sub {
29: my $a = $_;
30: *{SYNOPSIS} = sub { $a }
31: };
32:
33: =head1 DESCRIPTION
34:
35: unify-dirs will traverse the given passed directories, and if it finds
36: files that are identical (in name, contents, permission and mode), it
37: will hard-link them and set the "immutable" and "immutable linkage
38: invert" flags (if selected).
39:
40: =cut
41:
42: push @POD_HOOKS, DESCRIPTION => sub {
43: my $a = $_;
44: *{DESCRIPTION} = sub { $a }
45: };
46:
47: =head1 COMMAND LINE OPTIONS
48:
49: The following command line options are available:
50:
51: =cut
52:
53: # Extract the command line options for the "usage" screen from the
54: # POD ;-)
55: use vars qw(@options);
56:
57: push @POD_HOOKS, 'COMMAND LINE OPTIONS' => sub {
58: # This hook is deleted below under RELEASE
59: &Pod::Constants::add_hook
60: (#-debug => 1,
61: '*item' => sub {
62: my ($switches, $description) =
63: m/^(.*?)\n\n(.*)/s;
64: my (@switches, $longest);
65: $longest = "";
66: for my $switch
67: ($switches =~ m/\G
68: ((?:-\w|--\w+))
69: (?:,\s*)?
70: /gx) {
71: push @switches, $switch;
72: if ( length $switch > length $longest) {
73: $longest = $switch;
74: }
75: }
76: $longest =~ s/^-*//;
77: push @options,
78: $longest, {
79: options => \@switches,
80: description => $description,
81: };
82: }
83: );
84: };
85:
86: =over 4
87:
88: =item -h, --help
89:
90: Display program usage
91:
92: =item -v, --verbose
93:
94: Verbose program execution
95:
96: =item -d, --debug
97:
98: Even more verbose program execution
99:
100: =item -V, --version
101:
102: Print the program version
103:
104: =item -i, --immutable
105:
106: Sets the "immutable" inode attribute.
107:
108: =item -l, --linkage
109:
110: Sets the "immutable linkage invert" inode attribute.
111:
112: =item -r, --random
113:
114: Turns on randomising of directory scanning and tree traversal. This
115: option tries to prevent against racing symlink attacks. A better
116: solution is planned.
117:
118: =back
119:
120: =head1 INODE ATTRIBUTES AND IMMUTABILITY
121:
122: Hard linking identical files between directories has a drawback: if
123: one is changed, then the other one changes too.
124:
125: To counter this, you can set the "immutable" inode attribute on
126: combined files (see L<chattr>).
127:
128: Setting inode attribute requires root privileges, C<CAP_SYS_ATTR>, and
129: a filesystem that supports it. Currently this includes default ext2
130: and ext3 in any recent kernel, or reiserfs with the "inode attributes"
131: patch applied (available from
132: C<ftp://ftp.namesys.com/pub/reiserfs-for-2.4/2.4.18.pending/>).
133:
134: The problem with setting "immutable" is that then the file can not be
135: unlinked or renamed. In the case where you have a user without
136: CAP_SYS_ATTR, but otherwise with write permission to a file, they
137: cannot then change it.
138:
139: In comes the "immutable linkage invert" flag. This flag will toggle
140: immutability of the file E<name>, but leave the file E<contents>,
141: E<permissions>, etc protected. This means that you can unlink the
142: file, and hence replace it, edit it with most editors, etc.
143:
144: This option requires a kernel patch - it is included in the s_context
145: patch for the vserver project, which is at
146: L<http://www.solucorp.qc.ca/miscprj/s_context.hc>). It is also
147: available on its own from L<http://sam.vilain.net/immutable/>.
148:
149: This works well with ext2 and ext3, but is a little trickier to get
150: working with reiserfs, as inode attributes are not a standard reiserfs
151: feature. See the above link for more information.
152:
153: =head1 RELEASE
154:
155: This is unify-dirs version 0.1.
156:
157: =cut
158:
159: push @POD_HOOKS, RELEASE => sub {
160: Pod::Constants::delete_hook("*item");
161: my $v;
162: (($v) = m/(\d+\.\d+)/) && (*{VERSION} = sub {$v});
163: };
164:
165:
166: };
167:
168: no strict 'subs';
169:
170: sub abort {
171: print STDERR &PROGNAME.": aborting: @_\n", short_usage();
172: exit(1);
173: }
174:
175: sub barf { print STDERR &PROGNAME.": @_\n"; exit(1) }
176: sub moan { print STDERR &PROGNAME.": WARNING: @_\n"; }
177: sub say { print &PROGNAME.": @_\n"; }
178: sub mutter { }
179: sub whisper { }
180:
181: use strict 'subs';
182:
183: #=====================================================================
184: # MAIN SECTION STARTS HERE
185: #=====================================================================
186: my ($action, @dirs, $immutable, $linkage, $mode);
187:
188: {
189: use Getopt::Long;
190:
191: no strict "refs", 'vars';
192: local ($^W) = 0;
193:
194: $action = "unify";
195: $mode = "";
196:
197: Getopt::Long::config("bundling");
198: #Getopt::Long::config("pass_through");
199: Getopt::Long::GetOptions
200: (
201: 'help|h' => sub { $action = "show_help" },
202: 'version|V' => sub { $action = "show_version" },
203: 'verbose|v' => sub { *{"mutter"} = \&say },
204: 'debug|d' => sub { *{"whisper"} = *{"mutter"} = \&say },
205: 'immutable|i' => \$immutable,
206: 'linkage|l' => \$linkage,
207: 'random|r' => sub { $be_random = 1 },
208: );
209:
210: $mode .= "i" if $immutable;
211: $mode .= "I" if $linkage;
212:
213: if ( ! -t STDOUT ) {
214: if ( -t STDERR ) {
215: eval "sub say { print STDERR \"\@_\n\" }";
216: } else {
217: eval "sub say { }";
218: }
219: }
220:
221: if ($action eq "show_help") {
222: print usage();
223: exit(0);
224: } elsif ($action eq "show_version") {
225: print version(), "\n";
226: exit(0);
227: }
228:
229: }
230:
231: while ( my $dir = shift @ARGV ) {
232: -d $dir or abort "Invalid directory $dir";
233: push @dirs, $dir;
234: }
235:
236: (@dirs > 1) or abort "Need at least two paths to unify";
237:
238: say "Unifying: @dirs";
239:
240: unify_dirs($mode, @dirs);
241:
242: say "done";
243: exit(0);
244:
245: =head1 INTERNAL FUNCTIONS
246:
247: Documented for prosperity
248:
249: =head2 digest_file($filename)
250:
251: returns a HASH ref containing stat into and a checksum of the file
252: named by the given filename:
253:
254: { sha1 => $sha1_checksum,
255: stat => [ stat_info ] }
256:
257: "sha1" will be missing from the hash if the file is not regular (ie, a
258: block special device, etc).
259:
260: =cut
261:
262: use Digest::SHA1;
263: use IO::File;
264:
265: sub digest_file {
266: my $filename = shift;
267:
268: my @stat = lstat $filename;
269:
270: my %returnval = ( stat => \@stat );
271:
272: if ( -f _ ) {
273:
274: whisper "digesting $filename";
275:
276: sysopen FILE, $filename, O_RDONLY
277: or die "could not open $filename for reading; $!";
278:
279: my $block_size = $stat[11] || 2**16;
280: my ($buffer, $bytes_read);
281: my $sha1 = Digest::SHA1->new;
282:
283: $sha1->add($buffer)
284: while ( $bytes_read = sysread FILE, $buffer, $block_size );
285:
286: die "Error digesting $filename; $!" unless defined $bytes_read;
287:
288: close FILE;
289: $returnval{sha1} = $sha1->hexdigest;
290: whisper "disgest is $returnval{sha1}";
291:
292: } else {
293: whisper "not digesting $filename - not a regular file";
294: }
295:
296: return \%returnval;
297: }
298:
299: =head2 chattr ($filename, $attr_string)
300:
301: Calls the C<EXT2_IOC_SETFLAGS> IOCTL on $filename, setting flags as
302: per $attr_string.
303:
304: =over
305:
306: =item Permitted chattr attributes
307:
308: The following table lists the allowed contents of $attr_string, and
309: the corresponding bitmask to the IOCTL.
310:
311: i => 0x00000010 ("immutable")
312: I => 0x00008000 ("immutable linkage invert")
313:
314: =back
315:
316: =cut
317:
318: BEGIN { use vars qw(%attr);
319: push @POD_HOOKS,
320: "Permitted chattr attributes" => sub {
321: %attr = map {
322: if ( m/(\w) => (0x\w+)/) {
323: $1 => pack("L", hex($2));
324: } else { () }
325: } split /\r?\n/, $_;
326: }; }
327:
328: sub chattr {
329: my( $file, $attr_str ) = @_;
330: whisper ("chattr($file, $attr_str)");
331: my $EXT2_IOC_SETFLAGS = 0x40046602;
332: my $flags = pack("L", 0);
333: while( my $flag = chop($attr_str) ){
334: $flags |= $attr{$flag};
335: }
336: open( F, $file ) or die "Can't open $file : $!";
337: ioctl( F, $EXT2_IOC_SETFLAGS, $flags ) or die "Can't set attr($flags) on $file: $!";
338: close F;
339: }
340:
341: =head2 unify_dirs($mode, @dirs)
342:
343: This is the function that does the main work; it takes an arbitrary
344: list of directories and combines them.
345:
346: This function calls itself recursively for sub-directories. It also
347: uses the same trick "find" does to avoid performing unnecessary
348: C<stat()>'s (see L<perlfunc>) where possible.
349:
350: It is probably quite succeptible to missing possible unification in
351: race condition situations; however it should never perform an
352: incorrect unification.
353:
354: If $mode is set, it specifies arguments to the C<chattr> function,
355: above.
356:
357: FIXME - should probably put a secure chdir() in here later, otherwise
358: we may be vulnerable to racing symlink attacks. To counter this, I've
359: added some entropy throughout the function, to make it really hard to
360: predict what order the program is going to do things. Enable it with
361: `C<-r>'
362:
363: =cut
364:
365: use ReadDir qw(&readdir_inode);
366:
367: sub unify_dirs {
368: my $mode = shift;
369:
370: # get rid of the invalid directories
371: my @dirs = map { -d $_ ? $_ : () } @_;
372: return if @dirs == 1;
373: (@dirs = sort { $be_random *= -1 } @dirs) if ( $be_random );
374:
375: # %files is a (filename => inode) hash for this directory
376: my %files;
377:
378: # This holds per-directory-to-be-unified information on the files;
379: # $dir{$dir}->{$filename} = { sha1 => $sha1_checksum,
380: # stat => [ stat_info ] }
381: # or just the inode number if it hasn't been stat'd yet
382: my %dir;
383:
384: # this contains a list of sub-directories of the current directory
385: my @subdirs;
386:
387: for my $dir ( @dirs ) {
388: whisper "Processing $dir...";
389:
390: $dir{$dir} = { };
391: my @dirents = readdir_inode $dir;
392: whisper "Readdir OK";
393: (@dirents = sort { $be_random *= -1 } @dirents)
394: if ( $be_random );
395:
396: # So we don't stat every file in a directory if we don't need
397: # to. This is the same optimisation that "find" uses.
398: my $num_dirs = ((lstat $dir)[3]) - 2;
399:
400: # for each directory entry, see if any other dir has the same
401: # filename
402: while ( my $ref = shift @dirents ) {
403: my ($filename, $inode) = (@$ref);
404: next if ($filename =~ m/^\.\.?$/);
405:
406: # we only have to stat entries where there are
407: # subdirectories remaining.
408: if ( $num_dirs ) {
409:
410: # there are still sub-directories left in here.
411: # look for them.
412: $dir{$dir}->{$filename} =
413: { stat => [ lstat "$dir/$filename" ] };
414:
415: if ( -d _ ) {
416: # found one!
417: $num_dirs--;
418: push @subdirs, $filename;
419: next;
420: }
421: } else {
422: # non-directory; remember the inode number
423: $dir{$dir}->{$filename} = $inode;
424: }
425:
426: if ( exists $files{$filename}
427: and $files{$filename} != $inode ) {
428:
429: # some other mirror has this same filename under a
430: # different inode. Ler's see if we can unify them.
431: my $my_files_h = $dir{$dir};
432: while ( my ($other_dir, $files_h) = each %dir ) {
433: next if $other_dir eq $dir;
434:
435: # does this other directory have this file too?
436: next unless exists $files_h->{$filename};
437:
438: whisper "COMPARE: $dir/$filename vs $other_dir/$filename";
439:
440: # stat both files if we haven't already
441: $my_files_h->{$filename} =
442: { stat => [ lstat "$dir/$filename" ] }
443: unless ref $my_files_h->{$filename} eq "HASH";
444: $files_h->{$filename} =
445: { stat => [ lstat "$other_dir/$filename" ] }
446: unless ref $files_h->{$filename} eq "HASH";
447:
448: my ($mine, $other)
449: = ($my_files_h->{$filename}, $files_h->{$filename});
450:
451: # compare file sizes, skip if different
452: whisper "file stat() info different", next
453: if ( $mine->{stat}->[7] != $other->{stat}->[7] or
454: $mine->{stat}->[2] != $other->{stat}->[2] or
455: $mine->{stat}->[4] != $other->{stat}->[4] or
456: $mine->{stat}->[5] != $other->{stat}->[5] or
457: $mine->{stat}->[1] == $other->{stat}->[1] );
458:
459: # gather SHA1 checksums
460: $my_files_h->{$filename} = digest_file "$dir/$filename"
461: unless exists $my_files_h->{$filename}->{sha1};
462: $files_h->{$filename} = digest_file "$other_dir/$filename"
463: unless exists $files_h->{$filename}->{sha1};
464:
465: ($mine, $other)
466: = ($my_files_h->{$filename}, $files_h->{$filename});
467:
468: # skip if different ("sha1" won't be set if
469: # this isn't a regular file)
470: whisper "not suitable for unification", next
471: unless ( $mine->{sha1} and $other->{sha1} );
472:
473: whisper "sha1: $mine->{sha1} vs $other->{sha1}";
474: next unless ( $mine->{sha1} eq $other->{sha1} );
475:
476: my $diff = ($mine->{stat}->[3] - $other->{stat}->[3]);
477:
478: my ($src, $dest)
479: = ("$dir/$filename", "$other_dir/$filename");
480:
481: if ( $diff < 0 or (!$diff and rand(1) < 0.5)) {
482: ($src, $dest) = ($dest, $src);
483: $my_files_h->{$filename} =
484: $files_h->{$filename};
485: } else {
486: $files_h->{$filename} =
487: $my_files_h->{$filename};
488: }
489:
490: mutter "Linking $src on top of $dest";
491:
492: chattr($src, $mode) if ( $mode and $mode eq "iI" );
493: chattr($src, "") if ( $mode and $mode eq "i" );
494:
495: link $src, $dest."unify$$" or do {
496: die "link ($src,${dest}unify$$) failed; $!";
497: next;
498: };
499:
500: rename $dest."unify$$", $dest or do {
501: moan "rename (${dest}unify$$, $dest) failed; $!";
502: unlink $dest."unify$$";
503: exit(1);
504: next;
505: };
506:
507: chattr($src, $mode) if ( $mode and $mode ne "iI" );
508:
509: @{ $my_files_h->{$filename}->{stat} } = lstat $src;
510:
511: }
512: } else {
513: # first directory to have this file, just remember the
514: # inode number.
515: $files{$filename} = $inode;
516: }
517: }
518: }
519:
520: %files=%dir=();
521:
522: # sponsored by the Friends of Recursion Society of Great Britain.
523: for my $subdir ( @subdirs ) {
524: unify_dirs ($mode, map { "$_/$subdir" } @dirs);
525: }
526: }
527:
528:
529: BEGIN {
530:
531: eval "use Pod::Constants -trim => 1, \@POD_HOOKS";
532: die $@ if $@;
533:
534: }
535:
536: #---------------------------------------------------------------------
537: # Usage functions
538: #---------------------------------------------------------------------
539:
540: sub short_usage {
541: return ("Usage: ${\(SYNOPSIS)}\n"
542: ."Try `${\(PROGNAME)} --help' for a summary of options."
543: ."\n");
544: }
545:
546: use Text::Wrap qw(wrap fill);
547: use Term::ReadKey;
548:
549: =head2 usage
550:
551: Prints the program usage (extracted from the POD).
552:
553: =cut
554:
555: sub usage {
556:
557: # alright, I'm admit this function is silly.
558:
559: my $options_string;
560: my $OPTIONS_INDENT = 2;
561: my $OPTIONS_WIDTH = 20;
562: my $OPTIONS_GAP = 2;
563: my $TOTAL_WIDTH = (GetTerminalSize())[0] - 10 || 70;
564:
565: my $DESCRIPTION_WIDTH = ($TOTAL_WIDTH - $OPTIONS_GAP -
566: $OPTIONS_INDENT - $OPTIONS_WIDTH);
567:
568: # go through each option, and format it for the screen
569:
570: for ( my $i = 0; $i < (@options>>1); $i ++ ) {
571: my $option = $options[$i*2 + 1];
572:
573: $Text::Wrap::huge = "overflow";
574: $Text::Wrap::columns = $OPTIONS_WIDTH;
575: my @lhs = map { split /\n/ }
576: wrap("","",join ", ",
577: sort { length $a <=> length $b }
578: @{$option->{options}});
579:
580: $Text::Wrap::huge = "wrap";
581: $Text::Wrap::columns = $DESCRIPTION_WIDTH;
582: my @rhs = map { split /\n/ }
583: fill("","",$option->{description});
584:
585: while ( @lhs or @rhs ) {
586: my $left = shift @lhs;
587: my $right = shift @rhs;
588: chomp($left);
589: $options_string .= join
590: ("",
591: " " x $OPTIONS_INDENT,
592: $left . (" " x ($OPTIONS_WIDTH - length $left)),
593: " " x $OPTIONS_GAP,
594: $right,
595: "\n");
596: }
597: }
598:
599: $Text::Wrap::huge = "overflow";
600: $Text::Wrap::columns = $TOTAL_WIDTH;
601:
602:
603: return (fill("","",PROGNAME . " - " . SHORT_DESC)
604: ."\n\n"
605: ."Usage: ".SYNOPSIS."\n\n"
606: .fill(" ","",DESCRIPTION)."\n\n"
607: .fill(" ","","Command line options:")
608: ."\n\n"
609: .$options_string."\n"
610: ."See `perldoc $0' for more information.\n");
611: }
612:
613: __END__
614:
615: =head1 AUTHOR
616:
617: Sam Vilain, <sam@vilain.net>
618:
619: =cut
620:
Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
Read Where should I post X? if you're not absolutely sure you're posting in the right place.
Please read these before you post! —
Posts may use any of the Perl Monks Approved HTML tags:
- a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
| |
For: |
|
Use: |
| & | | & |
| < | | < |
| > | | > |
| [ | | [ |
| ] | | ] |
Link using PerlMonks shortcuts! What shortcuts can I use for linking?
See Writeup Formatting Tips and other pages linked from there for more info.