Beefy Boxes and Bandwidth Generously Provided by pair Networks
"be consistent"
 
PerlMonks  

Download yahoo mail

by jhanna (Scribe)
on Jan 25, 2005 at 18:41 UTC ( [id://424988]=CUFP: print w/replies, xml ) Need Help??

There's loads of tools to download yahoo mail, but I couldn't get any to work the way I wanted. So I wrote this one, and I hated to just throw it away, so I put it here. Maybe someone can find some use for it. It's only 150 lines and doesn't need anything besides LWP, Digest::MD5 or (optionally) ssleay.
#!/usr/bin/perl # This software is released free under the terms of the GPL # This program downloads all messages (read and unread) from # a folder in your Yahoo mail account. # It probably doesn't work Internationally, but I'm sure # you could fix it without much fuss. # This program stores message ids in the message file to # avoid downloading the same message twice. # This works as of 1/25/2005. Yahoo can change it any time they want. +It'll be busted then. # Yahoo lets you get about 200 hits in quick succession then locks you +r account for about # 15 minutes. Sometimes SSL works when regular HTTP doesn't, so I try +both. # I read somewhere that loging into other countries might work too, bu +t I didn't try that. # Anyway, you can use the delay option if you want it to download all +night, but it takes # an average of 11 secs per message. That's worked fine for me when I' +m patient. # Need MD5 for non-ssl logins. If you have ssleay and don't have md5 t +hen skip it. # (If you have ssleay and don't have md5 then you're weird.) use Digest::MD5 qw(md5_hex); # Set up lwp use LWP; my $browser = LWP::UserAgent->new; # Pretend to be IE -- eeeeww $whitelie='Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR + 1.1.4322)'; # we need to handle redirection manually -- lwp isn't entirely compati +ble @{$browser->requests_redirectable}=undef; $browser->cookie_jar({}); # generalized http get and post functions sub get{ my $r=$browser->get(@_,'User-Agent' => $whitelie); er($r) } # post requires a %f global. Tacky. Sorry. Fix it if you care. I don't +. I resent it that you do. sub post { my $r=$browser->post(shift,[%f],$whitelie); er($r) } # if get or post go wrong use this -- handles redirects sub er { my $r=shift; if($r->is_success) { return $r->content } my $h=$r->as_string; if($h=~/\nLocation: (http.*)/i) { my $d=$1; $d=~s/\.com,\+.*/.com/; #print STDERR "redirect: $d\n"; return get($d); } print STDERR "get or post HTTP error: ".$r->as_string."\n\n"; return undef; } die "usage: $0 username password {folder(s)} {usedelay} -- folders can be separated by commas -- Defaults to Inbox if not specified -- enter yes or 1 after the folder to wait between messages to avoid +yahoo blocking you. " unless $#ARGV > 0; ($uname,$passwd,$folders,$delay)=@ARGV; $folder='Inbox' unless defined($folder); print STDERR "Logging in..."; #login(); loginssl() unless $url; $p=get($url) if $url; print STDERR "\n"; ($host,$yy)=$p=~/\/\/(.*?)\/ym.*[\?\&]yy=([0-9A-Fa-f]*)/i; if(! $yy) { open(E,">er.txt"); print E $p; close E; die "Couldn't log in. Either something's wrong or you have to wait 15 + minutes to try again.\n"; } for $folder (split(',',$folders)) { $folder=~s/^ *(.*?) *$/$1/; $out="$uname-$folder.txt"; print STDERR "Freshening $out\n"; $folder=~y/ /+/; # urlencode -- should do more, but I'm lazy. # scan the file for already downloaded messages. open(F,"<$out"); while(<F>) { $have{$1}=1 if /^mid=(.*)/; } $url="http://$host/ym/ShowFolder?rb=$folder&reset=1&YY=$yy"; open(F,">>$out"); while($url) { $p=get($url); ($totmes)=$p=~/Messages \S* of (\d+)/i; ($page)=$url=~/&Npos=(\d+)/i; printf STDERR "$folder page %d of %d \r",$page+1,int($to +tmes/25)+1; while($p=~/href="(\/ym\/ShowLetter\?MsgId=[^"]*)"/gi) { $m=$1; ($mid)=$m=~/MsgID=(.*?)&/i; next if $have{$mid}; $have{$mid}=1; ($idx)=$m=~/Idx=(\d+)/i; print STDERR "$folder message $idx of $totmes \r"; $b=get("http://$host$m&Nhead=f"); if(! $b=~/START TOC/) { open(E,">er.txt"); print E $b; die "It stopped working -- check er.txt"; } next unless $b=~/START TOC/; $b=~s/^.*START TOC.*?\n(.*)<!-- END TOC .*/$1/is; $b=~s/<[^>]*>//g; $b=~s/ +\n/\n/g; $b=~s/\n\n+/\n\n/g; $b=~s/&nbsp;?/ /gi; $b=~s/&gt;?/>/gi; $b=~s/&lt;?/</gi; $b=~s/&#(\d+);?/chr($1)/ge; print F "$b\nmid=$mid\n-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n\n"; sleep(1+rand()*20) if $delay; } ($url)=$p=~/="([^"]*)">\s*Next/i; $url="http://$host$url" if $url; } if( $p=~/ \| Next \| Last/i) { print STDERR "\n$folder Complete.\n" } else { print STDERR "\n$folder Download not complete -- wait 15 minutes an +d try again.\n"; } } # global login functions set $url on success sub loginssl { local %f; my $p=get('https://mail.yahoo.com'); print STDERR "."; my ($f)=$p=~/(<form.*?type="password".*?<\/form)/is; ($url) = $f=~/action="(https.*?)"/i; %f=( login => $uname, passwd => $passwd, '.persistent'=>'', '.save'=>'Sign In', ); while($f=~/hidden"?\s*name\s*=\s*"([^"]*)"\s*value\s*=\s*"([^"]*)"/i +gs) { $f{$1}=$2; } $p=post($url); print STDERR "."; ($url)=$p=~/window\.location\.replace\("(.*?)"/; } sub login { my ($p,$f,%f); $p=get('http://mail.yahoo.com'); print STDERR "."; ($f)=$p=~/(<form.*?type="password".*?<\/form)/is; $url = "http://login.yahoo.com/config/login?"; while($f=~/hidden"?\s*name\s*=\s*"([^"]*)"\s*value\s*=\s*"([^"]*)"/i +gs) { $f{$1}=$2; $url.="$1=$2&"; } $url.="login=$uname&passwd=".md5_hex(md5_hex($passwd).$f{'.challenge +'}). "&.persistent=&.save=Sign+In&.hash=1&.js=1&.md5=1"; }

Replies are listed 'Best First'.
A reply falls below the community's threshold of quality. You may see it by logging in.

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: CUFP [id://424988]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others browsing the Monastery: (3)
As of 2024-04-19 00:49 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found