#!/usr/bin/perl -W 
###############################################################################
# Name: FetchYahoo
# Purpose: retrieves messages from Yahoo! Mail, saving them to a local spool
# Description:  FetchYahoo is a Perl script that downloads mail from a Yahoo! 
#               webmail account to a local mail spool. It is meant to replace
#               fetchmail for people using Yahoo! mail since Yahoo!'s POP service
#               is no longer free. It downloads messages to a local mail spool, 
#               stripping all HTML tags and attachments . It then deletes messages
#               without multiple parts and attachments. Messages with multiple
#               parts or attachments are left on the server (a future release will
#               include them).
# Author:  Ravi Ramkissoon
# Author's E-mail: ravi_ramkissoon@yahoo.com
# License: Gnu Public License
# Created: 04.12.02
# Modified: 04.17.03
# Version: .0.6
# Credit: Vadim Zeitlin <vadim@wxwindows.org>, author of yahoo2mobx.pl
# For: Structure and ideas (and doing the hard work for me :)
# Credit: David Symonds <ds@seul.org>, author of NAMG for inspiration.
###############################################################################

# Installation instructions
#  1. download this fetchyahoo.pl (you probably already have)
#  2. chmod a+rx fetchyahoo.pl
#  3. edit fetchyahoo.pl, using your username and password
#  4. edit fetchyahoo.pl, putting in your mail spool or an mbox file
#       (usually /var/spool/mail/username )
#  5. run ./fetchyahoo.pl  (--newonly to download new msgs only)
#  6. use your mail client to get mail from your spool
#  7. e-mail ravi_ramkissoon@yahoo.com with bugs and feature requests

# Procmail instructions
#  1. use "/path/to/procmail" as your mail spool name
#  2. edit the $spool declaration line to read
#       my $spool = "|" . $spoolname ;
 
# TODO/Changelog:
#  1. Get login working - DONE
#  2. Download summary page - DONE
#  3. Parse summary page for number of messages - DONE
#  4. Code main loop to parse each msgid from yahoo - DONE
#  5. Parse message page for raw message text - DONE
#  6. Send message to spool - DONE
#  7. Delete successfully grabbed msgs - DONE
#  8. Get options working, option to d/load new messages only - DONE
#  9. Remove use of the specific us.f149 webmail server - DONE
# 10. Include directions for using procmail - DONE
# 11. Make sure (internal and header) From_ lines are RFC 822 compliant - DONE
# 12. Add a --nodelete option which specifies no deleting of messages - DONE
# 13. Make parsing of the From_ line more rigorous (make one if necessary) - DONE
# 14. Better error messages for wrong username/password - DONE
# 14. Get attachments working (download or include in message) - TODO
#     14.1 Cycle through bodyParts grabbing each one in turn
#     14.2 Make a mail up from pieces and then spool that (MIME::Entity)
# 15. Additional features. Forward to another address ? - TODO

use strict;

use Getopt::Long ();
use HTML::Entities ();
use HTML::HeadParser ();
use HTML::TokeParser ();
use HTTP::Request::Common qw(GET POST);
use HTTP::Cookies ();
use LWP::UserAgent ();
use LWP::Simple ();

# MUST configure these
my $username = "yahoousername";
my $password = "yahoopassword";
my $spoolname = "/var/spool/mail/localusername";

# May need to edit these (possibly in the future)
my $homesuff = "/ym/ShowFolder?box=Inbox";
my $msgsuff = "/ym/ShowLetter?box=Inbox&PRINT=1&Nhead=f&MsgId=";
my $spool = ">>" . $spoolname ;
my $versionString = "FetchYahoo Version .0.6\n";

# turn this on for tons of debugging messages
# use LWP::Debug qw(+);

my $usage = <<EOF
$0 [options] 

Retrieves messages from the inbox of a Yahoo user using the web interface
and stores them in the specified local spool/mbox file. Deletes the messages
with no attachments.

--version       print the version and exit
--help          give the usage message showing the program options
--newonly       only download new (i.e. unread) messages
--nodelete      do not delete messages after downloading them

NOTE: Username, password and path to mail spool need to be set
by editing fetchyahoo.pl
EOF
;


###############################################################################
# Subroutines
###############################################################################

# extract the text of the message itself as plain text from the HTML page for
# the message

# return the URL we're redirected to
sub GetRedirectUrl($) {
    my $response = $_[0];

    my $url = $response->header('Location') || return undef;

    # the Location URL is sometimes non-absolute which is not allowed, fix it
    local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1;
    my $base = $response->base;
    $url = $HTTP::URI_CLASS->new($url, $base)->abs($base);

    return $url;
}

# START

# flag indicating whether to download all or just new messages
my $newOnly = 0;

# flag indicating whether to delete or not
my $noDelete = 0;

# flag for help and version
my $help = 0;
my $version = 0;

# parse input options
Getopt::Long::GetOptions
(
 'newOnly'     => \$newOnly,
 'help'        => \$help,
 'version'     => \$version,
 'noDelete'    => \$noDelete);

# unbuffer STDOUT
select((select(STDOUT), $| = 1)[0]);

if ($help) {
    print $versionString . "\n" . $usage;
    exit;
}

if ($version) {
    print $versionString;
    exit;
}

# grab login cookies
my $ua = LWP::UserAgent->new;
$ua->agent("fetchyahoo/0.003");
my $cookie_jar = HTTP::Cookies->new();
$ua->cookie_jar($cookie_jar);
my $request = POST 'http://login.yahoo.com/config/login',
    [
     ".tries" => '1',
     #".done"  => 'URL to go to later',
     ".src"   => 'ym',
     ".intl"  => 'us',
     "login"  => $username,
     "passwd" => $password,
     ];

$request->content_type('application/x-www-form-urlencoded');
$request->header('Accept' => '*/*');
$request->header('Allowed' => 'GET HEAD PUT');
my $response = $ua->simple_request($request);
my $url="NULL";
while ( $response->is_redirect ) {
    $cookie_jar->extract_cookies($response);
    $url = GetRedirectUrl($response);
    # go to the new page
    $request = GET $url;
    $response = $ua->simple_request($request);
}

if ( !$response->is_success ) { print "Failed!\n"; die "Couldn't log in\n"; }

if ( ($response->content) =~ /Invalid Password/ ) {
    print "Failed!\n"; die "Wrong Password entered\n"; }

if ( ($response->content) =~ /ID does not exist/ ) {
    print "Failed!\n"; die "Yahoo user does not exist\n"; }


print "Successfully Logged in!\n"; 

$url =~ /(http:\/\/.*?)\// ;
my $baseurl = $1;
my $homeurl = $baseurl . $homesuff ;
my $msgurl = $baseurl . $msgsuff ; 

# get all message IDs
my $msgcount = 0;
my $pagecount = 0;
my $numMsgs ;
my $startMsg ;
my $endMsg ;
my @msgids ;
my $crumb ;

if ( $newOnly ) {
    print "Only retrieving new messages\n";
    $homeurl = $homeurl . "\&Nview=u";
}

do {

    my $tmpurl = $homeurl . "\&Npos=$pagecount" ; 
    $request = GET  $tmpurl ;
    $response = $ua->simple_request($request);
    while ( $response->is_redirect ) {
	$cookie_jar->extract_cookies($response); # manually extract cookies
	my $url = GetRedirectUrl($response);     # get new page
	$request = GET $url;                     # go to the new page 
	$response = $ua->simple_request($request);
    }
    
    if ( !$response->is_success ) { print "Failed!\n";
				    die "Couldn't get Inbox listing.\n"; }
    
    #parse for number of messages
    my $mainPage = $response->content;
    if ($mainPage =~ /showing (\d+)-(\d+) of (\d+)/ ) { 
	$startMsg = $1 ;
	$endMsg = $2 ;
	$numMsgs = $3;  }
    elsif ($mainPage =~ /Folder\s*Inbox\s*has\s*no\s+/) {
	print "There are no messages to retrieve.\n"; exit; }
    else { print "Failed!\n"; die "Can't retrieve number of messages.\n"; }
    
    $mainPage =~ /name=\".crumb\" value=\"(.*?)\"/ ;
    $crumb = $1;
    print "Getting Message IDs for messages $startMsg - $endMsg.\n";
    
    # parse mainPage for message IDs
    foreach my $word (split ' ', $mainPage) {
	if ($word =~ /ShowLetter\?MsgId=([0-9_]+)/ ) { 
	    $msgcount = $msgcount + 1;
	    $msgids[$msgcount-1] = $1 ;
	}
    }
    $pagecount = $pagecount+1 ;
} until $numMsgs == $endMsg ; 
  
print "Got $msgcount Message IDs\n";

my $delurl = $homeurl . "\&DEL=Delete";
my $left = 0;

# loop over all Messag IDs
foreach my $msgid (@msgids) { 
    my $tmpurl = $msgurl . $msgid ; 
    my $request = GET  $tmpurl ;
    my $response = $ua->simple_request($request) ||
	die "Failed to get body of message $msgid.\n";
    my $msgText = $response->content ;

    if ($msgText =~ /Printable\&nbsp;View/ ) {  # sometimes we get a 
	$tmpurl = $msgurl . $msgid ;            # non-printable view
	$request = GET  $tmpurl ;
	$response = $ua->simple_request($request) ||
	    die "Failed to get body of message $msgid.\n";
	$msgText = $response->content ;
    }

    # message mangling goes here :)

    $msgText =~ s/<b>/\n/ ;       # insert a break after the first <b>. dunno y
    $msgText =~ s/<style>.*?<\/style>//gsi ;    # strip <style>....</style>
    $msgText =~ s/<xml>.*?<\/xml>//gsi ;        # strip <xml>....</xml>
    $msgText =~ s/<.*?>//gs ;     # strip all raw html tags
    $msgText =~ s/\&\#34;/\"/g ;  # convert html character codes 
    $msgText =~ s/\&\#39;/\'/g ;
    $msgText =~ s/\&\#147;/\"/g ;
    $msgText =~ s/\&\#148;/\"/g ;
    $msgText =~ s/\&\#183;/./g ;
    $msgText =~ s/\&\#8217;/\'/g ;
    $msgText =~ s/\&\#8220;/\"/g ;
    $msgText =~ s/\&\#8221;/\"/g ;
    $msgText =~ s/\&\#8230;/.../g ;
    $msgText =~ s/\&nbsp;/ /gi ;
    $msgText =~ s/\&lt;/</gi ; 
    $msgText =~ s/\&gt;/>/gi ; 
    $msgText =~ s/\&amp;/\&/gi ;   
    $msgText =~ s/\&quot;/\"/gi ;   
    $msgText =~ s/<[ \t\n\r\fa-zA-Z_0-9;.,\-]*?>//gs ;
    # ^^ strip all in-message html tags, careful not to strip e-mail addies!
    # or html links (ie exclude @ and :)

    # check if this message should be deleted or left
    if ( !( $msgText =~ /\nContent-Type: multipart\//si )  &&
	 !( $msgText =~ /Scan With Norton Antivirus\s*Download File/) ) 
    { $delurl = $delurl . "\&Mid=$msgid"; }
    else { $left = $left+1 ; }

    $msgText =~ s/\nContent-Type: multipart\/mixed;.*?boundary=.*?\n/\n/si ;
    $msgText =~ s/\nContent-Type: multipart\/alternative;.*?boundary=.*?\n/\n/si;
    $msgText =~ s/\nContent-Type: multipart\/related;.*?boundary=.*?\n/\n/si ;
                                   # strip attachment boundary
    $msgText =~ s/\nContent-Length:\s*[0-9]+\s*?\n/\n/si ; 
                                   # strip Content-Length line
    $msgText =~ s/^\s*//s ;        # strip leading whitespace
    $msgText =~ s/\s*$//s ;        # strip trailing whitespace
    $msgText =~ s/\n\s*\n\s*\n*/\n\n/gs ; # kill excessive line breaks
    $msgText = $msgText . "\n\n" ;   # put a break at the end

    # Yahoo's from line is not RFC 822 compliant. This fixes it.

    # Remove the From_ line temporarily
    $msgText =~ s/^From (.*?)\s*((Mon|Tue|Wed|Thu|Fri|Sat|Sun).*?\n)//m;
    my $fromName; my $fromRest;    

    # If we don't find a From_ line we need to generate one
    if ( !  defined ($1) ) {
	$msgText =~ /^From:\s*(.*?)\n/m ;
	if ( !  defined ($1) ) {   # all is lost, no From: field either
	    print "\nNo From_ line found in this e-mail and none could be ";
	    print "generated. Terminating (no messages deleted).\n";
	    die "Error finding/creating a From_ line" ; }
	$fromName = $1 ; 
	$fromRest = scalar localtime ;
    }
    else { $fromName = $1 ; $fromRest = $2 ; }
 
    $fromName =~ s/ /_/g;
    $msgText =~ s/\n>From /\n>>From /g ; # slightly extended RFC 822
    $msgText =~ s/\nFrom /\n>From /g ; 
    $msgText = "From " . $fromName . " " . $fromRest 
	. $msgText ;  # add new From_ line

    # send message text to the specified spool/file    
    open SPOOL, "$spool" or
	die "Can't open output file: $spoolname";
    print SPOOL $msgText;          
    close SPOOL;
}

print "Finished downloading $msgcount messages.\n";

if ( ! $noDelete) {

    if ($left>0) { 
	print "$left message(s) with attachments or multiple parts will not be "
	    . "deleted.\n" } ;
    if ($left<$msgcount) { 
        # delete messages without attachments/multiple parts
	$delurl = $delurl . "\&.crumb=$crumb";
	$request = GET  $delurl ;
	$response = $ua->simple_request($request) ||
	    die "Failed to delete messages.\n";
	print $numMsgs-$left . " message(s) have been deleted.\n";
    } else { print "No messages to delete.\n"; }
}
