There's loads of tools to download yahoo mail, but I couldn't get any to work the way I wanted. So I wrote this one, and I hated to just throw it away, so I put it here. Maybe someone can find some use for it. It's only 150 lines and doesn't need anything besides LWP, Digest::MD5 or (optionally) ssleay.
#!/usr/bin/perl
# This software is released free under the terms of the GPL
# This program downloads all messages (read and unread) from
# a folder in your Yahoo mail account.
# It probably doesn't work Internationally, but I'm sure
# you could fix it without much fuss.
# This program stores message ids in the message file to
# avoid downloading the same message twice.
# This works as of 1/25/2005. Yahoo can change it any time they want.
+It'll be busted then.
# Yahoo lets you get about 200 hits in quick succession then locks you
+r account for about
# 15 minutes. Sometimes SSL works when regular HTTP doesn't, so I try
+both.
# I read somewhere that loging into other countries might work too, bu
+t I didn't try that.
# Anyway, you can use the delay option if you want it to download all
+night, but it takes
# an average of 11 secs per message. That's worked fine for me when I'
+m patient.
# Need MD5 for non-ssl logins. If you have ssleay and don't have md5 t
+hen skip it.
# (If you have ssleay and don't have md5 then you're weird.)
use Digest::MD5 qw(md5_hex);
# Set up lwp
use LWP;
my $browser = LWP::UserAgent->new;
# Pretend to be IE -- eeeeww
$whitelie='Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR
+ 1.1.4322)';
# we need to handle redirection manually -- lwp isn't entirely compati
+ble
@{$browser->requests_redirectable}=undef;
$browser->cookie_jar({});
# generalized http get and post functions
sub get{ my $r=$browser->get(@_,'User-Agent' => $whitelie); er($r) }
# post requires a %f global. Tacky. Sorry. Fix it if you care. I don't
+. I resent it that you do.
sub post { my $r=$browser->post(shift,[%f],$whitelie); er($r) }
# if get or post go wrong use this -- handles redirects
sub er {
my $r=shift;
if($r->is_success) { return $r->content }
my $h=$r->as_string;
if($h=~/\nLocation: (http.*)/i) {
my $d=$1;
$d=~s/\.com,\+.*/.com/;
#print STDERR "redirect: $d\n";
return get($d);
}
print STDERR "get or post HTTP error: ".$r->as_string."\n\n";
return undef;
}
die "usage: $0 username password {folder(s)} {usedelay}
-- folders can be separated by commas
-- Defaults to Inbox if not specified
-- enter yes or 1 after the folder to wait between messages to avoid
+yahoo blocking you.
" unless $#ARGV > 0;
($uname,$passwd,$folders,$delay)=@ARGV;
$folder='Inbox' unless defined($folder);
print STDERR "Logging in...";
#login();
loginssl() unless $url;
$p=get($url) if $url;
print STDERR "\n";
($host,$yy)=$p=~/\/\/(.*?)\/ym.*[\?\&]yy=([0-9A-Fa-f]*)/i;
if(! $yy) {
open(E,">er.txt"); print E $p; close E;
die "Couldn't log in. Either something's wrong or you have to wait 15
+ minutes to try again.\n";
}
for $folder (split(',',$folders)) {
$folder=~s/^ *(.*?) *$/$1/;
$out="$uname-$folder.txt";
print STDERR "Freshening $out\n";
$folder=~y/ /+/; # urlencode -- should do more, but I'm lazy.
# scan the file for already downloaded messages.
open(F,"<$out");
while(<F>) {
$have{$1}=1 if /^mid=(.*)/;
}
$url="http://$host/ym/ShowFolder?rb=$folder&reset=1&YY=$yy";
open(F,">>$out");
while($url) {
$p=get($url);
($totmes)=$p=~/Messages \S* of (\d+)/i;
($page)=$url=~/&Npos=(\d+)/i;
printf STDERR "$folder page %d of %d \r",$page+1,int($to
+tmes/25)+1;
while($p=~/href="(\/ym\/ShowLetter\?MsgId=[^"]*)"/gi) {
$m=$1;
($mid)=$m=~/MsgID=(.*?)&/i;
next if $have{$mid};
$have{$mid}=1;
($idx)=$m=~/Idx=(\d+)/i;
print STDERR "$folder message $idx of $totmes \r";
$b=get("http://$host$m&Nhead=f");
if(! $b=~/START TOC/) {
open(E,">er.txt");
print E $b;
die "It stopped working -- check er.txt";
}
next unless $b=~/START TOC/;
$b=~s/^.*START TOC.*?\n(.*)<!-- END TOC .*/$1/is;
$b=~s/<[^>]*>//g; $b=~s/ +\n/\n/g; $b=~s/\n\n+/\n\n/g;
$b=~s/ ?/ /gi;
$b=~s/>?/>/gi;
$b=~s/<?/</gi;
$b=~s/&#(\d+);?/chr($1)/ge;
print F "$b\nmid=$mid\n-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n";
sleep(1+rand()*20) if $delay;
}
($url)=$p=~/="([^"]*)">\s*Next/i;
$url="http://$host$url" if $url;
}
if( $p=~/ \| Next \| Last/i) {
print STDERR "\n$folder Complete.\n"
} else {
print STDERR "\n$folder Download not complete -- wait 15 minutes an
+d try again.\n";
}
}
# global login functions set $url on success
sub loginssl {
local %f;
my $p=get('https://mail.yahoo.com');
print STDERR ".";
my ($f)=$p=~/(<form.*?type="password".*?<\/form)/is;
($url) = $f=~/action="(https.*?)"/i;
%f=(
login => $uname,
passwd => $passwd,
'.persistent'=>'',
'.save'=>'Sign In',
);
while($f=~/hidden"?\s*name\s*=\s*"([^"]*)"\s*value\s*=\s*"([^"]*)"/i
+gs) {
$f{$1}=$2;
}
$p=post($url);
print STDERR ".";
($url)=$p=~/window\.location\.replace\("(.*?)"/;
}
sub login {
my ($p,$f,%f);
$p=get('http://mail.yahoo.com');
print STDERR ".";
($f)=$p=~/(<form.*?type="password".*?<\/form)/is;
$url = "http://login.yahoo.com/config/login?";
while($f=~/hidden"?\s*name\s*=\s*"([^"]*)"\s*value\s*=\s*"([^"]*)"/i
+gs) {
$f{$1}=$2;
$url.="$1=$2&";
}
$url.="login=$uname&passwd=".md5_hex(md5_hex($passwd).$f{'.challenge
+'}).
"&.persistent=&.save=Sign+In&.hash=1&.js=1&.md5=1";
}