recoll/src/internfile/mh_mail.cpp

#ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.3 2005-04-04 13:18:46 dockes Exp $ (C) 2005 J.F.Dockes";
#endif

#include <stdio.h>
#include <fcntl.h>
#include <errno.h>

#include <map>
#include <sstream>
using std::stringstream;
using std::map;

#include "mimehandler.h"
#include "debuglog.h"
#include "csguess.h"
#include "readfile.h"
#include "transcode.h"
#include "mimeparse.h"
#include "indextext.h"
#include "mail.h"
#include "debuglog.h"
#include "smallut.h"
#include "mimeparse.h"
#include "html.h"

// binc imap mime definitions
#include "mime.h"

static void
walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc, int depth);

using namespace std;

MimeHandlerMail::~MimeHandlerMail()
{
    if (vfp) {
	fclose((FILE *)vfp);
	vfp = 0;
    }
}

// We are called for two different file types: mbox-type folders
// holding multiple messages, and maildir-type files with one message
MimeHandler::Status
MimeHandlerMail::worker(RclConfig *cnf, const string &fn,
			const string &mtype, Rcl::Doc &docout, string& ipath)
{
    LOGDEB2(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str()));
    conf = cnf;

    if (!stringlowercmp("message/rfc822", mtype)) {
	ipath = "";
	int fd;
	if ((fd = open(fn.c_str(), 0)) < 0) {
	    LOGERR(("MimeHandlerMail::worker: open(%s) errno %d\n",
		    fn.c_str(), errno));
	    return MimeHandler::MHError;
	}
	Binc::MimeDocument doc;
	doc.parseFull(fd);
	MimeHandler::Status ret = processone(fn, doc, docout);
	close(fd);
	return ret;
    } else  if (!stringlowercmp("text/x-mail", mtype)) {
	return processmbox(fn, docout, ipath);
    } else // hu ho
	return MimeHandler::MHError;
}

MimeHandler::Status
MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
{
    int mtarg = 0;
    if (ipath != "") {
	sscanf(ipath.c_str(), "%d", &mtarg);
    }
    LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(),
	    mtarg));

    FILE *fp;
    if (vfp) {
	fp = (FILE *)vfp;
    } else {
	fp = fopen(fn.c_str(), "r");
	if (fp == 0) {
	    LOGERR(("MimeHandlerMail::processmbox: error opening %s\n",
		    fn.c_str()));
	    return MimeHandler::MHError;
	}
	vfp = fp;
    }
    if (mtarg > 0) {
	fseek(fp, 0, SEEK_SET);
	msgnum = 0;
    }

    off_t start, end;
    bool iseof = false;
    do  {
	// Look for next 'From ' Line, start of message. Set start to
	// line after this
	char line[301];
	for (;;) {
	    if (!fgets(line, 300, fp)) {
		// Eof hit while looking for 'From ' -> file done. We'd need
		// another return code here
		return MimeHandler::MHError;
	    }

	    if (!strncmp("From ", line, 5)) {
		start = ftello(fp);
		break;
	    }
	}

	// Look for next 'From ' line or eof, end of message (we let a
	// spurious empty line in)
	for (;;) {
	    end = ftello(fp);
	    if (!fgets(line, 300, fp) || !strncmp("From ", line, 5)) {
		if (ferror(fp) || feof(fp))
		    iseof = true;
		break;
	    }
	}
	msgnum++;
	fseek(fp, end, SEEK_SET);
    } while (mtarg > 0 && msgnum < mtarg);


    size_t size = end - start;
    fseek(fp, start, SEEK_SET);
    char *cp = (char *)malloc(size);
    if (cp == 0) {
	LOGERR(("MimeHandlerMail::processmbox: malloc(%d) failed\n", size));
	return MimeHandler::MHError;
    }
    if (fread(cp, 1, size, fp) != size) {
	LOGERR(("MimeHandlerMail::processmbox: fread failed (errno %d)\n",
		errno));
	free(cp);
	return MimeHandler::MHError;
    }
    string msgbuf(cp, size);
    free(cp);
    stringstream s(msgbuf);
    Binc::MimeDocument doc;
    doc.parseFull(s);
    MimeHandler::Status ret = processone(fn, doc, docout);
    if (ret == MimeHandler::MHError)
	return ret;
    char buf[20];
    sprintf(buf, "%d", msgnum);
    ipath = buf;
    return iseof ? MimeHandler::MHDone :
	(mtarg > 0) ? MimeHandler::MHDone : MimeHandler::MHAgain;
}


// Transform a single message into a document. The subject becomes the
// title, and any simple body part with a content-type of text or html
// and content-disposition inline gets concatenated as text.
MimeHandler::Status
MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc,
			    Rcl::Doc &docout)
{
    if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
	LOGERR(("MimeHandlerMail::processone: mime parse error for %s\n",
		fn.c_str()));
	return MimeHandler::MHError;
    }

    // Handle some headers. We should process rfc2047 encoding here
    // Also there should be no 8bit chars, but there sometimes are. So
    // we transcode as if from iso-8859-1, which is better than
    // getting utf8 conversion errors later on
    Binc::HeaderItem hi;
    string transcoded;
    if (doc.h.getFirstHeader("Subject", hi)) {
	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
	docout.title = transcoded;
    }
    if (doc.h.getFirstHeader("From", hi)) {
	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
	docout.text += string("From: ") + transcoded + string("\n");
    }
    if (doc.h.getFirstHeader("To", hi)) {
	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
	docout.text += string("To: ") + transcoded + string("\n");
    }
    if (doc.h.getFirstHeader("Date", hi)) {
	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
	docout.text += string("Date: ") + transcoded + string("\n");
    }
    if (doc.h.getFirstHeader("Subject", hi)) {
	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
	docout.text += string("Subject: ") + transcoded + string("\n");
    }

    LOGDEB2(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n",
	    doc.isMultipart(), doc.getSubType().c_str()));
    walkmime(conf, docout.text, doc, 0);

    //LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str()));
    return MimeHandler::MHDone;
}

// Recursively walk the message mime parts and concatenate all the
// inline html or text that we find anywhere.
static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc,
		     int depth)
{
    if (depth > 5) {
	LOGINFO(("walkmime: max depth exceeded\n"));
	return;
    }

    if (doc.isMultipart()) {
	LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n",
		doc.isMultipart(), doc.getSubType().c_str()));
	// We only handle alternative, related and mixed for now. For
	// alternative, we look for a text/plain part, else html and
	// process it For mixed and related, we process each part.
	std::vector<Binc::MimePart>::iterator it;
	if (!stringicmp("mixed", doc.getSubType()) ||
	    !stringicmp("related", doc.getSubType())) {
	    for (it = doc.members.begin(); it != doc.members.end();it++) {
		walkmime(cnf, out, *it, depth+1);
	    }
	} else if (!stringicmp("alternative", doc.getSubType())) {
	    std::vector<Binc::MimePart>::iterator ittxt, ithtml;
	    ittxt = ithtml = doc.members.end();
	    for (it = doc.members.begin(); it != doc.members.end();it++) {
		// Get and parse content-type header
		Binc::HeaderItem hi;
		if (!doc.h.getFirstHeader("Content-Type", hi))
		    continue;
		MimeHeaderValue content_type;
		parseMimeHeaderValue(hi.getValue(), content_type);
		if (!stringlowercmp("text/plain", content_type.value))
		    ittxt = it;
		else if (!stringlowercmp("text/html", content_type.value))
		    ithtml = it;
	    }
	    if (ittxt != doc.members.end()) {
		walkmime(cnf, out, *ittxt, depth+1);
	    } else if (ithtml != doc.members.end()) {
		walkmime(cnf, out, *ithtml, depth+1);
	    }
	}
    } else {
	// If content-type is text or html and content-disposition is inline,
	// decode and add to text.

	// Get and parse content-type header.
	Binc::HeaderItem hi;
	string ctt = "text/plain";
	if (doc.h.getFirstHeader("Content-Type", hi)) {
	    ctt = hi.getValue();
	}
	LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
	MimeHeaderValue content_type;
	parseMimeHeaderValue(ctt, content_type);
	if (stringlowercmp("text/plain", content_type.value) &&
	    stringlowercmp("text/html", content_type.value)) {
	    return;
	}

	// Normally the default charset is us-ascii. But it happens that
	// 8 bit chars exist in a message that is stated as us-ascii. Ie the
	// mailer used by yahoo support ('KANA') does this. We could convert
	// to iso-8859 only if the transfer-encoding is 8 bit, or test for
	// actual 8 bit chars, but what the heck, le'ts use 8859-1 as default
	string charset = "iso-8859-1";
	map<string,string>::const_iterator it;
	it = content_type.params.find(string("charset"));
	if (it != content_type.params.end())
	    charset = it->second;
	if (charset.empty() ||
	    !stringlowercmp("us-ascii", charset) ||
	    !stringlowercmp("default", charset) ||
	    !stringlowercmp("x-user-defined", charset) ||
	    !stringlowercmp("x-unknown", charset) ||
	    !stringlowercmp("unknown", charset) ) {
	    charset = "iso-8859-1";
	}

	// Content disposition
	string ctd = "inline";
	if (doc.h.getFirstHeader("Content-Disposition", hi)) {
	    ctd = hi.getValue();
	}
	MimeHeaderValue content_disposition;
	parseMimeHeaderValue(ctd, content_disposition);
	if (stringlowercmp("inline", content_disposition.value)) {
	    return;
	}

	// Content transfer encoding
	string cte = "7bit";
	if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
	    cte = hi.getValue();
	}

	LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
		 doc.getBodyStartOffset(), doc.getBodyLength()));
	string body;
	doc.getBody(body, 0, doc.bodylength);

	// Decode content transfer encoding
	if (!stringlowercmp("quoted-printable", cte)) {
	    string decoded;
	    qp_decode(body, decoded);
	    body = decoded;
	} else if (!stringlowercmp("base64", cte)) {
	    string decoded;
	    base64_decode(body, decoded);
	    body = decoded;
	}


	string transcoded;
	if (!stringlowercmp("text/html", content_type.value)) {
	    MimeHandlerHtml mh;
	    Rcl::Doc hdoc;
	    mh.charsethint = charset;
	    mh.worker1(cnf, "", body, content_type.value,  hdoc);
	    transcoded = hdoc.text;
	} else {
	    // Transcode to utf-8
	    if (!transcode(body, transcoded, charset, "UTF-8")) {
		LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
			charset.c_str()));
		transcoded = body;
	    }
	}

	out += string("\r\n") + transcoded;
    }
}