sub new {
my $class = shift;
bless {
text_formatter => Text::Markdown->new(),
@_,
} => $class;
};
sub from_imap_client {
my ($package, $conn, $uid, %info) = @_;
#weaken $conn;
my $dt = $conn->date( $uid );
my $timestamp;
if( $dt ) {
$dt =~ s!\s*\(([A-Z]+.*?)\)\s*$!!; # strip clear name of timezone at end
$dt =~ s!\s*$!!; # Strip whitespace at end
$dt =~ s!\bUTC$!GMT!; # "UTC" -> "GMT", ah, well...
# Let's be lenient about the time format:
my $format = '%d %b %Y %H:%M:%S';
if( $dt =~ /,/ ) {
# As good citizens, they added the weekday name:
$format = "%a, $format";
};
if( $dt =~ /[A-Z]+$/ ) {
# Instead of an offset, we have the name :-/
$format = $format . " %Z";
} else {
# The default, for well-formed servers, an TZ offset
$format .= " %z";
};
if( ! eval { $timestamp = Time::Piece->strptime( $dt, $format ); 1 }) {
die "$@:\n$format - [$dt]\n";
};
} else {
$dt ||= '';
warn "No timestamp for $uid?! [$dt]";
};
my $self = $package->new(
%info,
uid => $uid,
date => $timestamp,
imap => $conn,
body => $conn->get_bodystructure($uid),
envelope => $conn->get_envelope($uid),
categories => [],
);
$self
};
sub imap() {
$_[0]->{imap}
};
=head2 C<< ->body( $pref ) >>
Finds the preferred body type or one of HTML, PLAIN
(in that order)
=cut
# XXX Check for inline images
# XXX Check for other linked media and inline it
sub best_alternative {
my( $self, $body ) = @_;
my $part;
($part)= grep { warn $_ . " (". $_->bodysubtype . ")";
'HTML' eq uc $_->bodysubtype
}
grep { 'HEAD' ne $_->bodytype }
$body->bodystructure;
if(! $part) {
($part) = grep { warn $_ . " (". $_->bodysubtype . ")";
'PLAIN' eq $_->bodysubtype
}
grep { 'HEAD' ne $_->bodytype }
$body->bodystructure;
};
return $part
}
sub body {
my $self = shift;
# We should query:
# If multipart/alternative:
# Choose text/html
# Choose text/plain
# else
# use whatever we got
# And then look for attached images
#warn Dumper [ $self->{body}->parts ];
#warn Dumper $self->{body}->bodystructure;
#warn $self->{body}->bodytype . "/" . $self->{body}->bodysubtype;
#for ($self->{body}->parts) {
# warn "Part: $_";
# warn Dumper $self->{imap}->bodypart_string( $self->{uid}, $_ );
#};
# XXX Recursively enumerate the body parts
my %elements= (
text => [],
images => [],
);
my $body = $self->{body};
my @toplevel = grep { $_->id =~ /^\d+$/ } $body->bodystructure;
if( ! @toplevel ) {
@toplevel = $body;
};
for my $part (@toplevel) {
# This walks the whole mail structure
#warn sprintf "Part %s", $part->id;
my $mime= sprintf "%s/%s", lc $part->bodytype, lc $part->bodysubtype;
#warn $mime;
if( 'multipart/alternative' eq $mime ) {
# -> subroutine: get_best_alternative()
warn "Have multipart bodytype, finding best alternative";
# find suitable part
# Assume it's about text anyway
# Cascade from HTML -> plaintext
$part = $self->best_alternative( $part );
push @{ $elements{ text } }, {
content => $self->imap()->bodypart_string( $self->{uid}, $part->id ),
mime => $mime,
part => $part
};
#warn Dumper $elements{ text }->[-1];
} elsif( 'multipart/mixed' eq $mime ) {
# Find the different types (image, text, sound?) and use
# one from each, resp concatenate
# -> subroutine: get_parts()
} else {
#warn sprintf "Have only one body type (%s)", $part->bodytype;
# take what we got
if( 'image' eq $part->bodytype ) {
my $ref = $part->bodyparms;
next if( ! ref $part->bodyparms );
my $name = $part->bodyparms->{name};
# Download the image so we can later serve it up again under its name
#push @{ $elements{ images } }, $part->textlines;
$body = sprintf '
',
$self->permabase,
$name;
#warn "Image detected, serving up as '$body'";
@{ $elements{ text } } = { content => $body, mime => "text/html", part => undef };
} elsif( 'text/plain' eq $mime ) {
push @{ $elements{ text } }, {
content => $self->imap()->bodypart_string( $self->{uid}, $part->id ),
mime => $mime,
part => $part
};
# we should handle text/html more gracefully!
} else {
#my $body= $part->textlines;
#warn "Unknown / unhandled part $mime, hope it's text.";
#warn Dumper $part;
push @{ $elements{ text } }, {
content => $self->imap()->bodypart_string( $self->{uid}, $part->id ),
mime => $mime,
part => $part
};
};
};
};
if( ! @{$elements{ text }}) {
return 'No mail body';
};
$body= $elements{ text }->[0]->{content};
my $type= $elements{ text }->[0]->{mime};
my $part= $elements{ text }->[0]->{part};
# Decode the transport encoding
if( $part and 'base64' eq $part->bodyenc ) {
$body = decode_base64( $body );
} elsif( $part and 'quoted-printable' eq $part->bodyenc ) {
$body = decode_qp( $body );
};
# Decode to appropriate charset
if( $part and ref $part and my $enc = $part->bodyparms->{charset}) {
$body = decode( $enc, $body );
};
# Find out whether we have HTML or plaintext
if( 'text/plain' eq $type ) {
$part= $body;
# XXX Maybe this should go into a separate sub/module
# Strip mail footer
$body=~ s!\n--\s*\n.*!!sm;
# Strip obvious reply:
$body=~ s!----- Reply message -----.*!!sm;
# Strip quoted part in case we have a reply
# A reply counts as
# a line ending with ":" as the "$foo wrote on $date:"
# followed by more than three (consecutive!) quoted lines
$body=~ s!^[A-Z].*:(?:\s*^>.*$){3,}!!mg;
#or warn "No quoted part found";
# XXX Fix the hardcoded ->markdown method
# XXX Maybe this should go into a separate sub/module
$body= $self->{text_formatter}->markdown( $body );
} elsif( 'text/html' eq $type ) {
# XXX Strip JS and other stuff, resp. only allow good stuff
# Most likely Clinton's HTML::StripScripts is the module to use
# XXX Maybe this should go into a separate sub/module
# XXX Also, we should use an HTML parser
$body=~ s!
\s*--\s*
.*!!ms; # Strip mail footer
};
$body
};