#!/usr/local/bin/perl $VERSION = '0.05'; use warnings; use strict; # undupe-mail-body - Print only the non duplicate messages based on b +ody. # MODIFIED: Nov 17 2005 use Digest::MD5; use Mail::Mbox::MessageParser; use Pod::Usage; pod2usage( { '-exitval' => 0 , '-verbose' => 2} ) if scalar map { $_ =~ m/^ -+ (?: \? | h(?:elp)? ) $/ix ? 1 : () } @ARGV; pod2usage( { '-exitval' => 1 , '-verbose' => 0} ) unless scalar @ARGV; Mail::Mbox::MessageParser::SETUP_CACHE ( { 'file_name' => '/tmp/cache.mbox-parser' } ); my @mboxes = @ARGV; foreach my $mb ( @mboxes ) { my $parser = Mail::Mbox::MessageParser->new ( { 'debug' => 1 , 'enable_cache' => 1 , 'enable_grep' => 1 , 'file_name' => $mb } ); do { warn $parser; next; } unless ref $parser; # Save for later to print. my $prolog = $parser->prologue(); my $ordered = 1; my $msgs = find_unique( $parser , $ordered); print_mail ( $ordered ? order($msgs) : [ map $_->[0] , @$msgs ] , $prolog ); } exit; # Return an array reference of array references containing a message +& its # (optional) order, given Mail::Mbox::MessageParser object & an indic +ator for # ordering request. sub find_unique { my ($parser , $ordered) = @_; my %seen; my $count = 0; while( ! $parser->end_of_file() ) { my $mail = $parser->read_next_email; my $digest = get_body_digest($mail); $seen{$digest} = [ $mail , $ordered ? $count++ : () ] unless exists $seen{$digest} ; } return [values %seen]; } # Return array reference of ordered messages (as scalar references) g +iven in # an array reference containing a message & order in another array re +ference. sub order { my ($messages) = @_; return [ map $_->[0] , sort { $a->[1] <=> $b->[1] } @$messages ]; } # Print messages given as an array reference of scalar references, w/ # optional prologue. sub print_mail { my ($mails , $prolog) = @_; print $prolog if defined $prolog; print $$_ , $/ for @$mails; } { my ( $md5, $start_body ); # Return digest of body given a email message. sub get_body_digest { my ($text) = @_; $md5 = Digest::MD5->new; # Extract body from a message. $start_body = undef; while ( $$text =~ m/^(.*)$/mg ) { $start_body = 1 if $1 =~ m/^$/; $md5->add( $1 || '' ) if $start_body; } # For Debugging. #printf STDERR "==>> %s \%s\n" , $md5->clone->hexdigest , $text; #printf STDERR "==>> \%s\n" , $text; return $md5->hexdigest; } } __END__ =pod =head1 NAME undupe-mail-body - Print only the non duplicate messages based on body. =head1 SYNOPSIS undupe-mail-body -help undupe-mail-body <mbox> [mbox2 , [mbox3 , ... ]] undupe-mail-body mbox-with-body-dups > mbox-without-body-dups =head1 DESCRIPTION I found at least two ways -- L<procmail(1)> and L<mutt(1)> -- which can delete duplicate messages based on the C<Message-ID> header. Failed i to find anything which would delete messages based on duplicate BODIES. Given I<mbox>-format mailboxes, this program prints, on I<standard out>, only those messages which are unique based on only the body. Original mailbox is accessed only for reading. Only the first encountered instance (of multiplicates) is retained. =head2 Incorrect start of email found For some of the messages in a mailbox, which otherwise load up fine in L<mutt(1)>, L<Mail::Mbox::MessageParser> indicates C<Incorrect start of email found>. Turning off C<enable_cache> (and C<enable_grep>) on the first run, or a rerun with C<enable_*> options turned on does not cause C<Incorrect start of email> to be printed. So, please do not be alarmed (like i did) if the above happens. =head1 OPTIONS =over 2 =item B<help> Show help message. =item B<ordered> Keep the order of output same as input, minus any duplicates. B<Currently, it is a hard coded value.> =back =head1 TO DO Allow I<ordered> option to be set on command line. I would like this program to be a filter such that it gathers the input on I<standard in> in addition. This can be achieved by giving C<*STDIN> to C<Mail::Mbox::MessageParser-E<gt>new()>. =head1 BUGS =over 2 =item After building a cache for the first time for a mailbox, same SCALAR reference is printed (via C<printf STDERR ...> in get_body_digest()) for all the messages. Any subsequent runs produce the expected output. =back =head1 AUTHOR, LICENSE, DISTRIBUTION, ETC. Parv, parv_@yahoo.com Modified: Nov 17 2005 This software is free to be used in any form only if proper credit is given. I am not responsible for any kind of damage or loss. Use it at your own risk. =cut
In reply to Remove messages w/ duplicate bodies from mbox(es) by parv
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |