| Category: | Text Processing |
| Author/Contact Info | vpolyakov@katrillion.com |
| Description: | This is a small script that parses an mbox file and extracts email addresses from it (I use it at work to parse a bounce file and collect the bounced email addresses for various purposes). Feel free to modify it, use it, whatever. (Credit info: this was actually not written by me, but by the previous network admin.) |
#!/usr/bin/perl -w
use strict;

# Usage: <script> <mbox-file>
#
# Reads the mbox named on the command line, appends every bounce address it
# can extract to "addr.list", and appends every message from which nothing
# could be extracted to "nomatch.mbox" (both in the current directory).
my $file = shift;
defined($file)
    or die "Usage: $0 <mbox-file>\n";

# Parser state shared by print_addr() and the main read loop.
my $msg = undef;        # text of the message currently being accumulated
my $count = 0;          # messages seen (one per "From " separator line)
my $count_match = 0;    # addresses written to ADDR_LOG
my $blank = 1;          # true if the previous line was empty; starts true so
                        # the very first "From " line is recognised
my $matched = 0;        # true once an address was logged for this message
my $addr_flag = undef;  # when defined, names the pattern used to pull the
                        # address out of the line after a bounce header

# Three-argument open keeps characters in $file from being read as an open
# mode, and $! (not "$\" followed by "n") reports the actual OS error.
open(MBOX, '<', $file)
    or die "Couldn't open mbox '$file': $!\n";
open(ADDR_LOG, '>>', 'addr.list')
    or die "Couldn't open list: $!\n";
open(NOMATCH, '>>', 'nomatch.mbox')
    or die "Couldn't open nomatch: $!\n";
# print_addr($addr)
#
# Record one extracted bounce address.
#
#   $addr - the address string, or undef when the caller's pattern failed
#           to capture anything.
#
# On a defined address: appends it to ADDR_LOG, clears $addr_flag, marks the
# current message as matched and bumps $count_match.
# On undef: only clears $addr_flag, so the message is neither logged as an
# empty line nor counted as a match.
sub print_addr
{
    my $addr = shift;

    if (!defined($addr))
    {
        $addr_flag = undef;
        return;
    }

    # Diagnostic: an address of exactly '1' is presumably the result of a
    # list-context match on a pattern without a capture group, so show which
    # extraction mode (if any) was active.
    if ($addr eq '1')
    {
        my $mode = defined($addr_flag) ? $addr_flag : '';
        print "$mode\n";
    }

    print ADDR_LOG "$addr\n";
    $addr_flag = undef;
    $matched = 1;
    $count_match++;
    return;
}
# Main pass over the mbox.
#
# Each message starts at a "From " separator line that follows an empty line.
# Within a message, every non-empty line is checked against a list of known
# bounce-report phrases until one address has been logged for that message.
# A phrase either yields the address from the same line, or sets $addr_flag
# so that the address is taken from the next non-empty line.
#
# NOTE: patterns that had been split across two lines with a leading "+"
# wrap marker are rejoined here. Where the split fell between two words
# ("given up", "following address", "Failed Recipients") a single space was
# restored -- NOTE(review): confirm against real bounce samples.
while (<MBOX>)
{
    if ($blank && /\AFrom .*\d{4}/)
    {
        # Message boundary: flush the previous message if nothing was
        # extracted from it, then start accumulating the new one.
        $count++;
        print NOMATCH $msg if (!$matched && defined($msg));
        $msg = $_;
        $blank = 0;
        $matched = 0;
        # Do not let a pending "address is on the next line" state leak
        # from the end of one message into the next.
        $addr_flag = undef;
        next;
    }

    $msg .= $_;
    $blank = m#\A\Z# ? 1 : 0;
    next if ($blank || $matched);

    if (!defined($addr_flag))
    {
        # No extraction pending: look for a known bounce phrase.
        # NOTE(review): /^\s-+ .../ accepts exactly one whitespace character
        # before the dashes -- confirm that matches the real reports.
        if (/^\s-+ The following addresses had permanent fatal errors -+$/)
        {
            $addr_flag = "std";
        } elsif (/not accepting mail with attachments or embedded images:?$/) {
            my ($addr) = /Your mail to (.*) could not/;
            print_addr("$addr\@aol.com") if (defined($addr));
        } elsif (/permanent error; I've given up\. Sorry it didn't work out\.$/) {
            $addr_flag = "std";
        } elsif (/undeliverable to the following:$/) {
            $addr_flag = "postfix";
        } elsif (/Final-Recipient:/) {
            my ($addr) = /822;(.*)/;
            if (defined($addr))
            {
                $addr =~ s/^\s+//;
                # Prefer the part inside <...> when the value has one.
                if ($addr =~ /<(.*)>/)
                {
                    $addr = $1;
                }
                print_addr($addr);
            }
        } elsif (/Receiver not found:/) {
            # Same phrase as the condition above, so the capture is defined.
            my ($addr) = /Receiver not found:(.*)/;
            $addr =~ s/^\s+//;
            print_addr("$addr\@compuserve.com");
        } elsif (/delete existing messages and then empty their trash/) {
            $addr_flag = "std";
        } elsif (/^was not delivered to:$/) {
            $addr_flag = "space";
        } elsif (/^Your message$/) {
            $addr_flag = "to";
        } elsif (/^recipients\. The following address\(es\) failed:$/) {
            $addr_flag = "space";
        } elsif (/^Delivery to the following recipients failed\.$/) {
            $addr_flag = "space";
        } elsif (/Here is your List of Failed Recipients/) {
            $addr_flag = "std";
        } elsif (/The user\(s\) account is temporarily over quota/) {
            $addr_flag = "std";
        } elsif (/-+Transcript of session follows -+/) {
            $addr_flag = "space";
        } elsif (/Reason: Not in authentication system/) {
            my ($addr) = /to '(.*)'/;
            print_addr($addr) if (defined($addr));
        } elsif (/Reason: User .* is not found in the cc:Mail Directory/) {
            my ($addr) = /User "(.*)"/;
            print_addr($addr) if (defined($addr));
        } elsif (/^User unknown: /) {
            my ($addr) = /^User unknown: (.*)/;
            print_addr($addr) if (defined($addr));
        } elsif (/User mailbox exceeds allowed size/) {
            my ($addr) = /allowed size: (.*)/;
            print_addr($addr) if (defined($addr));
        }
        next;
    }

    # The "wrap-*" modes skip one line and then fall back to the plain mode.
    # Nothing in this script sets them; they are kept for compatibility.
    if ($addr_flag eq "wrap-std")
    {
        $addr_flag = "std";
        next;
    } elsif ($addr_flag eq "wrap-to") {
        $addr_flag = "to";
        next;
    } elsif ($addr_flag eq "wrap-space") {
        $addr_flag = "space";
        next;
    }

    # An extraction is pending: pull the address out of this line using the
    # pattern that belongs to the active mode.
    my $addr;
    if ($addr_flag eq "std")
    {
        ($addr) = /<(.*)>/;
    } elsif ($addr_flag eq "to") {
        ($addr) = /\sTo:\s*(.*)/;
    } elsif ($addr_flag eq "postfix") {
        ($addr) = /\s(.*) \(user not found\)/;
    } elsif ($addr_flag eq "space") {
        # Non-greedy capture so the optional trailing ':' is really dropped;
        # with a greedy (.*) the colon always ended up inside the address.
        ($addr) = /^\s*(.*?):?\s*$/;
    }

    if (defined($addr))
    {
        print_addr($addr);
    } else {
        # The expected address was not on this line; give up on this header
        # and go back to scanning for bounce phrases.
        $addr_flag = undef;
    }
}

# The loop only flushes a message when the next one begins, so the final
# message has to be flushed here.
print NOMATCH $msg if (!$matched && defined($msg));

print "Total: $count\n";
print "Match: $count_match\n";
print "Miss : " . ($count - $count_match) . "\n";

# Buffered write errors surface at close, so report them for the outputs.
close(ADDR_LOG) or warn "Couldn't close list: $!\n";
close(MBOX);
close(NOMATCH) or warn "Couldn't close nomatch: $!\n";
Edit by dws to add <code> tags |
|
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: parsembox
by ehdonhon (Curate) on Jul 29, 2002 at 00:14 UTC | |
|
Re: parsembox
by vxp (Pilgrim) on Jul 28, 2002 at 21:55 UTC | |
by Nightblade (Beadle) on Jul 28, 2002 at 22:00 UTC |