#!/usr/bin/perl

# Simple program to remove duplicate email messages from an mbox file.
#
# Usage: PROGRAM from to
#
# Two messages are considered duplicates when their bodies (everything
# after the first blank line) are identical; headers are ignored because
# mail clients such as Evolution can write different headers for the same
# message. The first copy of each message is kept, and the surviving
# messages are written in their original order.
#
# There is no file locking, so use this program on a backup of your mbox
# file.
# Enjoy.

use strict;
use warnings;
use Digest::MD5 qw(md5_hex);

# Print a progress indicator on STDOUT: a dot every 50 messages and the
# running total every 1000 messages.
sub _report_progress {
    my ($count) = @_;
    print '.'         if $count % 50 == 0;
    print " $count\n" if $count % 1000 == 0;
    return;
}

# Return the text used to decide whether a message is a duplicate.
# Normally this is the body only. A message with no body at all falls
# back to the whole message, so that body-less emails with different
# headers are not collapsed into a single message.
sub _dedupe_key {
    my ($msg) = @_;
    # LIMIT of 2: split at the FIRST blank line only, so the body keeps
    # all of its paragraphs instead of just the first one.
    my (undef, $body) = split /\n\n/, $msg, 2;
    return $msg unless defined $body && $body =~ /\S/;
    return $body;
}

# Read an mbox from the filehandle $fh and return the list of unique
# messages, first occurrence first, in their original order.
#
#   $fh          - readable filehandle positioned at the start of the mbox
#   $on_message  - optional code ref, called with the running message
#                  count after each message has been read
sub _unique_messages {
    my ($fh, $on_message) = @_;

    my %seen;          # md5 of dedupe key => 1 once a message was kept
    my @unique;
    my $msg   = '';    # message currently being accumulated
    my $count = 0;

    my $flush = sub {
        return if $msg eq '';
        $count++;
        $on_message->($count) if $on_message;
        push @unique, $msg unless $seen{ md5_hex(_dedupe_key($msg)) }++;
        return;
    };

    while (my $line = <$fh>) {
        # Emails in mbox files always begin with a "From " line: when one
        # is seen, finish the previous message and start a new one.
        if ($line =~ /^From /) {
            $flush->();
            $msg = $line;
        } else {
            $msg .= $line;
        }
    }
    # The last message has no following "From " line to trigger a flush.
    $flush->();

    return @unique;
}

# Entry point: de-duplicate the mbox file $from into the file $to.
# Dies with a usage message when either file name is missing, and on any
# I/O failure.
sub main {
    my ($from, $to) = @_;
    die "usage: $0 from to" unless (defined $from && defined $to);

    local $| = 1;    # unbuffered STDOUT so progress dots appear promptly

    open(my $in, '<', $from) or die "cannot open $from: $!";
    my @unique = _unique_messages($in, \&_report_progress);
    close($in) or die "cannot close $from: $!";

    # The output is opened only after the input has been read completely,
    # so using the same path for both arguments still works.
    open(my $out, '>', $to) or die "cannot open $to: $!";
    print {$out} @unique or die "cannot write to $to: $!";
    close($out) or die "cannot close $to: $!";

    return;
}

# Run only when executed as a script, not when loaded with require/do.
main(@ARGV) unless caller;
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Remove Duplicates from a mbox file
by Anonymous Monk on Sep 23, 2003 at 22:36 UTC | |
by Anonymous Monk on Mar 24, 2010 at 02:44 UTC | |
by coolmichael (Deacon) on Sep 24, 2003 at 05:41 UTC | |
by Anonymous Monk on Oct 11, 2007 at 03:20 UTC | |
by Anonymous Monk on Oct 21, 2009 at 13:40 UTC |