mhearse has asked for the wisdom of the Perl Monks concerning the following question:
The utf-8 subjects show up like this in the mysql table: UTF8?B?5LuO5Y2a5a6i5paH56ug5Lit5p+l5om+5oKo5oSf5YW06Laj55qE5Li7?==?UTF-8?B?6aKY?=
I am able to manually insert a utf-8 encoded string into the subject column. So I've got the column encoding set correctly.
My program is able to store the utf-8 message body correctly.
Also, the Mime::Parser module doesn't seem to be able to handle gb2312, big5, or gbk character encodings.
Can someone offer guidance?
MAIN PROGRAM #!/usr/bin/perl use strict; use DBI; use File::Type; use Date::Parse; use MIME::Parser; use Mail::POP3Client; use Getopt::Std; use Data::Dumper; use POSIX qw(strftime); use Compress::Zlib qw(compress); binmode(STDOUT, ":utf8"); $|++; my %opts; getopts('d', \%opts); my $dbh = DBI->connect( 'dbi:mysql:mail_archive', 'username', 'password', { AutoCommit => 1 } ) or die $DBI::errstr; $dbh->{mysql_enable_utf8} = 1; $dbh->do("set character set utf8"); $dbh->do("set names utf8"); my %query; $query{max_pack} = $dbh->prepare(<<EOQ); set max_allowed_packet=16776192 EOQ $query{last_id} = $dbh->prepare(<<EOQ); select last_insert_id() EOQ $query{body_md5_match} = $dbh->prepare(<<EOQ); select body_id from MA_body where check_sum = md5(?) and body = ? EOQ $query{attach_md5_match} = $dbh->prepare(<<EOQ); select attach_id from MA_attach where check_sum = md5(?) and attach = +? EOQ $query{ins_header} = $dbh->prepare(<<EOQ); insert into MA_hdr ( subject, msgid, body_id, full_header ) values (?,?,?,?) EOQ $query{ins_addr} = $dbh->prepare(<<EOQ); insert into MA_addr ( email_header_id, hdr_id, header_type, email_id ) values (?,?,?,?) EOQ $query{lkup_email} = $dbh->prepare(<<EOQ); select email_id from MA_email where email_addr = ? EOQ $query{ins_email} = $dbh->prepare(<<EOQ); insert ignore into MA_email (email_addr) values (?) EOQ $query{ins_body} = $dbh->prepare(<<EOQ); insert into MA_body ( body, check_sum ) values (?, md5(?) ) EOQ $query{ins_attach} = $dbh->prepare(<<EOQ); insert into MA_attach ( attach_name, mime_type, check_sum, attach ) values (?, ?, md5(?), ?) EOQ $query{ins_attach_addr} = $dbh->prepare(<<EOQ); insert into MA_attach_addr ( hdr_id, attach_id ) values (?,?) EOQ $query{max_pack}->execute(); my $ft = File::Type->new(); POP3_CONNECTION: my $pop = new Mail::POP3Client(HOST => "mail.tradetech.net"); $pop->User("mail_archive_mirror"); $pop->Pass("PTWH7EJU"); ### Loop forever. while (1) { if (! $pop->Connect()) { sleep 2; goto POP3_CONNECTION; } ### Now we iterate over each message present on the server. for (my $num = 1; $num <= $pop->Count(); $num++) { my $message = $pop->Retrieve($num); ### Cut message into parts. my $parts = parse_message($message); ### Store parts to sql tables. create_record($parts); ### Delete the message. $pop->Delete($num); } sleep 1; } ################################################################# sub debug { ################################################################# print @_, "\n" if $opts{d}; } ################################################################# sub create_record { ################################################################# my $parts = shift(); my $header_id; my $body_id; my $header_cntr = 1; ### Check to see if we have an existing body record. $query{body_md5_match}->execute($parts->{body}, $parts->{body}); if ($query{body_md5_match}->rows() == 1) { ($body_id) = $query{body_md5_match}->fetchrow_array(); } if (!$body_id) { ### We need to add a body record, becuase one doesn't exist. $query{ins_body}->execute( $parts->{body}, $parts->{body}, ); $query{last_id}->execute(); ($body_id) = $query{last_id}->fetchrow_array(); } ### Insert the header record for the message. $query{ins_header}->execute( $parts->{subject}, $parts->{'Message-ID'}, $body_id, $parts->{full_header}, ); $query{last_id}->execute(); ($header_id) = $query{last_id}->fetchrow_array(); ### Insert addr records for the header parts. for my $addr qw(from to cc) { for my $email (@{$parts->{"distinct_$addr"}}) { $query{ins_email}->execute($email); $query{lkup_email}->execute($email); my ($email_id) = $query{lkup_email}->fetchrow_array(); $query{ins_addr}->execute( $header_cntr++, $header_id, $addr, $email_id, ); } } ### Add attachments records. if (-d $parts->{output_dir}) { opendir DIR, $parts->{output_dir}; my @attachments = grep { ! /^\./ && ! /^msg/ && ! /\.txt$/ } r +eaddir DIR; closedir DIR; chdir $parts->{output_dir}; ATTACH_LOOP: for my $attachment (@attachments) { open FILE, $attachment; my $contents = do { local $/; <FILE> }; close FILE; my $mt = $ft->mime_type($contents); my $contents_gz = compress($contents, 9); ### We skip large attachments. $query{attach_md5_match}->execute($contents_gz, $contents_ +gz) or next ATTACH_LOOP; my ($attach_id) = $query{attach_md5_match}->fetchrow_array +(); debug("# attach_id attach_md5_match: $attach_id"); if (!$attach_id) { $query{ins_attach}->execute( $attachment, $mt, $contents_gz, $contents_gz, ); $query{last_id}->execute(); ($attach_id) = $query{last_id}->fetchrow_array(); debug("# new attachment inserted into MA_attach: $atta +ch_id"); } ### Insert attach_addr record. $query{ins_attach_addr}->execute( $header_id, $attach_id, ); } } } ################################################################# sub parse_message { ################################################################# my $message = shift; my $parser = MIME::Parser->new() or return 0; $parser->ignore_errors(1) or return 0; $parser->extract_uuencode(1) or return 0; $parser->output_under('/tmp'); my $entity = $parser->parse_data($message) or die $!; my $header = $entity->head() or die $!; my $parts = { map { my $val = $header->get($_); chomp $val if $val; $_ => $val; } qw(subject date to from cc Message-ID) }; $parts->{full_header} = $header->as_string(); ### Remove Trash from mail address fields. for my $var (qw(from to cc)) { $parts->{$var} = lc $parts->{$var}; # my @matches = $parts->{$var} =~ /[a-z0-9\._]*@[a-z0-9\._]*\. +[a-z0-9]*/g; my @matches = $parts->{$var} =~ /[a-z0-9\._-]*@[a-z0-9\._-]*\. +[a-z0-9\._-]*/g; my %seen; ### Get a uniqe list. for my $elmt (@matches) { $seen{$elmt}++; } @matches = keys %seen; ### Separate values with comma if (@matches) { $parts->{$var} = join ',', @matches; # $parts->{$var} =~ s/.$//; if ($var eq "from" || $var eq "to" || $var eq "cc") { $parts->{"distinct_$var"} = \@matches; } } } ### Gives the name of the output directory tree. $parts->{output_dir} = $parser->output_dir(); ### Put body together. if (-d $parts->{output_dir}) { $parts->{body} = ""; opendir DIR, $parts->{output_dir}; my @body_parts = grep { /^msg/ } readdir DIR; sort @body_parts; closedir DIR; chdir $parts->{output_dir}; for my $part (@body_parts) { open FILE, $part; my $contents = do { local $/; <FILE> }; close FILE; $parts->{body} .= $contents; } } ### $parts->{body} = join "", @{$entity->body()}; debug("# Email parts"); debug(Dumper($parts)); return $parts; } __END__
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Mime::Parser utf-8 issue
by blahblahblah (Priest) on Jun 12, 2009 at 03:30 UTC | |
by mhearse (Chaplain) on Jun 12, 2009 at 19:02 UTC | |
by blahblahblah (Priest) on Jun 17, 2009 at 00:42 UTC | |
|
Re: Mime::Parser utf-8 issue
by runrig (Abbot) on Jan 22, 2013 at 22:51 UTC |