walk the full mime tree instead of staying at level 1

2006-09-19 14:30:39 +00:00 · 2006-09-19 14:30:39 +00:00 · 3e2bccd259
commit 3e2bccd259
parent 6424efca57
2 changed files with 203 additions and 152 deletions
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.17 2006-09-15 16:50:44 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.18 2006-09-19 14:30:39 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -77,7 +77,12 @@ MimeHandlerMail::mkDoc(RclConfig *cnf, const string &fn,
 	}
 	Binc::MimeDocument doc;
 	doc.parseFull(fd);
-	MimeHandler::Status ret = processone(fn, doc, docout);
+	if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
 	    LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n",
 		    fn.c_str()));
 	    return MimeHandler::MHError;
 	}
 	MimeHandler::Status ret = processMsg(docout, doc, 0);
 	close(fd);
 	return ret;
    } else  if (!stringlowercmp("text/x-mail", mtype)) {
@ -175,7 +180,12 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
    stringstream s(msgbuf);
    Binc::MimeDocument doc;
    doc.parseFull(s);
-    MimeHandler::Status ret = processone(fn, doc, docout);
+    if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
 	LOGERR(("MimeHandlerMail::processMbox: mime parse error for %s\n",
 		fn.c_str()));
 	return MimeHandler::MHError;
    }
    MimeHandler::Status ret = processMsg(docout, doc, 0);
    if (ret == MimeHandler::MHError)
 	return ret;
    char buf[20];
@ -189,23 +199,23 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
 // Transform a single message into a document. The subject becomes the
 // title, and any simple body part with a content-type of text or html
 // and content-disposition inline gets concatenated as text.
 // 
 // If depth is not zero, we're called recursively for a
 // message/rfc822 part and we must not touch the doc fields except the
 // text
 MimeHandler::Status 
-MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc, 
+MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc, 
-			    Rcl::Doc &docout)
+			    int depth)
 {
-    if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
+    if (depth >= 5) {
-	LOGERR(("MimeHandlerMail::processone: mime parse error for %s\n", 
+	// Have to stop somewhere
-		fn.c_str()));
+	LOGDEB(("MimeHandlerMail::processMsg: stopping at depth 5\n"));
-	return MimeHandler::MHError;
+	return MimeHandler::MHDone;
    }
-
+	
    // Handle some headers. 
    Binc::HeaderItem hi;
    string transcoded;
    if (doc.h.getFirstHeader("Subject", hi)) {
 	rfc2047_decode(hi.getValue(), transcoded);
 	docout.title = transcoded;
    }
    if (doc.h.getFirstHeader("From", hi)) {
 	rfc2047_decode(hi.getValue(), transcoded);
 	docout.text += string("From: ") + transcoded + string("\n");
@ -216,37 +226,50 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc,
    }
    if (doc.h.getFirstHeader("Date", hi)) {
 	rfc2047_decode(hi.getValue(), transcoded);
-	time_t t = rfc2822DateToUxTime(transcoded);
+	if (depth == 0) {
-	if (t != (time_t)-1) {
+	    time_t t = rfc2822DateToUxTime(transcoded);
-	    char ascuxtime[100];
+	    if (t != (time_t)-1) {
-	    sprintf(ascuxtime, "%ld", (long)t);
+		char ascuxtime[100];
-	    docout.dmtime = ascuxtime;
+		sprintf(ascuxtime, "%ld", (long)t);
-	} else {
+		docout.dmtime = ascuxtime;
-	    // Leave mtime field alone, ftime will be used instead.
+	    } else {
-	    LOGDEB(("rfc2822Date...: failed for [%s]\n", transcoded.c_str()));
+		// Leave mtime field alone, ftime will be used instead.
 		LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
 	    }
 	}
 	docout.text += string("Date: ") + transcoded + string("\n");
    }
    if (doc.h.getFirstHeader("Subject", hi)) {
 	rfc2047_decode(hi.getValue(), transcoded);
 	if (depth == 0)
 	    docout.title = transcoded;
 	docout.text += string("Subject: ") + transcoded + string("\n");
    }
-    LOGDEB2(("MimeHandlerMail::processone:ismultipart %d mime subtype '%s'\n",
+    LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
 	    doc.isMultipart(), doc.getSubType().c_str()));
-    walkmime(docout.text, doc, 0);
+    walkmime(docout, doc, depth);
-    LOGDEB2(("MimeHandlerMail::processone:text:[%s]\n", docout.text.c_str()));
+    LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", docout.text.c_str()));
    return MimeHandler::MHDone;
 }
 // Recursively walk the message mime parts and concatenate all the
-// inline html or text that we find anywhere.
+// inline html or text that we find anywhere.  
-void MimeHandlerMail::walkmime(string &out, Binc::MimePart& doc, int depth)
+//
 // RFC2046 reminder: 
 // Top level media types: 
 //      Simple:    text, image, audio, video, application, 
 //      Composite: multipart, message.
 // 
 // multipart can be mixed, alternative, parallel, digest.
 // message/rfc822 may also be of interest.
 void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc, int depth)
 {
    string &out = docout.text;
    if (depth > 5) {
-	LOGINFO(("walkmime: max depth exceeded\n"));
+	LOGINFO(("walkmime: max depth (5) exceeded\n"));
 	return;
    }
@ -255,12 +278,12 @@ void MimeHandlerMail::walkmime(string &out, Binc::MimePart& doc, int depth)
 		doc.isMultipart(), doc.getSubType().c_str()));
 	// We only handle alternative, related and mixed for now. For
 	// alternative, we look for a text/plain part, else html and
-	// process it For mixed and related, we process each part.
+	// process it. For mixed and related, we process each part.
 	std::vector<Binc::MimePart>::iterator it;
 	if (!stringicmp("mixed", doc.getSubType()) || 
 	    !stringicmp("related", doc.getSubType())) {
 	    for (it = doc.members.begin(); it != doc.members.end();it++) {
-		walkmime(out, *it, depth+1);
+		walkmime(docout, *it, depth+1);
 	    }
 	} else if (!stringicmp("alternative", doc.getSubType())) {
 	    std::vector<Binc::MimePart>::iterator ittxt, ithtml;
@ -283,137 +306,165 @@ void MimeHandlerMail::walkmime(string &out, Binc::MimePart& doc, int depth)
 	    }
 	    if (ittxt != doc.members.end()) {
 		LOGDEB2(("walkmime: alternative: chose text/plain part\n"))
-		walkmime(out, *ittxt, depth+1);
+		walkmime(docout, *ittxt, depth+1);
 	    } else if (ithtml != doc.members.end()) {
 		LOGDEB2(("walkmime: alternative: chose text/html part\n"))
-		walkmime(out, *ithtml, depth+1);
+		walkmime(docout, *ithtml, depth+1);
 	    }
 	}
-    } else {
+	return;
-	// "Simple" part. See what it is:
+    } 
    // Part is not multipart: it must be either simple or message. Take
    // a look at interesting headers and a possible filename parameter
-	// Get and parse content-type header.
+    // Get and parse content-type header.
-	Binc::HeaderItem hi;
+    Binc::HeaderItem hi;
-	string ctt = "text/plain";
+    string ctt = "text/plain";
-	if (doc.h.getFirstHeader("Content-Type", hi)) {
+    if (doc.h.getFirstHeader("Content-Type", hi)) {
-	    ctt = hi.getValue();
+	ctt = hi.getValue();
-	}
+    }
-	LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
+    LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
-	MimeHeaderValue content_type;
+    MimeHeaderValue content_type;
-	parseMimeHeaderValue(ctt, content_type);
+    parseMimeHeaderValue(ctt, content_type);
-	// Get and parse Content-Disposition header
+    // Get and parse Content-Disposition header
-	string ctd = "inline";
+    string ctd = "inline";
-	if (doc.h.getFirstHeader("Content-Disposition", hi)) {
+    if (doc.h.getFirstHeader("Content-Disposition", hi)) {
-	    ctd = hi.getValue();
+	ctd = hi.getValue();
    }
    MimeHeaderValue content_disposition;
    parseMimeHeaderValue(ctd, content_disposition);
    LOGDEB2(("Content_disposition:[%s]\n", content_disposition.value.c_str()));
    string dispindic;
    if (stringlowercmp("inline", content_disposition.value))
 	dispindic = "Attachment";
    else 
 	dispindic = "Inline";
    // See if we have a filename.
    string filename;
    map<string,string>::const_iterator it;
    it = content_disposition.params.find(string("filename"));
    if (it != content_disposition.params.end())
 	filename = it->second;
    if (doc.isMessageRFC822()) {
 	LOGDEB2(("walkmime: message/RFC822 part\n"));
 	// The first part is the already parsed message.
 	// Call processMsg instead of walkmime so that mail headers get 
 	// printed. The depth will tell it what to do
 	if (doc.members.empty()) {
 	    //??
 	    return;
 	}
-	MimeHeaderValue content_disposition;
+	out += "\n";
-	parseMimeHeaderValue(ctd, content_disposition);
+	if (m_forPreview)
 	    out += "[" + dispindic + " " + content_type.value + ": ";
 	out += filename;
 	if (m_forPreview)
 	    out += "]";
 	out += "\n\n";
 	processMsg(docout, doc.members[0], depth+1);
 	return;
    }
-	LOGDEB2(("Content_disposition:[%s]\n", 
+    // "Simple" part. 
-		content_disposition.value.c_str()));
+    LOGDEB2(("walkmime: simple  part\n"));
-	// If this is an attachment, we index the file name if any and, when
+    // If the Content-Disposition is not inline, we treat it as
-	// previewing, at least show that it was there.
+    // attachment, as per rfc2183. We don't process attachments
-	if (!stringlowercmp("attachment", content_disposition.value)) {
+    // for now, except for indexing/displaying the file name
-	    string afn;
+    // If it is inline but not text or html, same thing.
-	    map<string,string>::const_iterator it;
+    if (stringlowercmp("inline", content_disposition.value) ||
-	    it = content_disposition.params.find(string("filename"));
+	(stringlowercmp("text/plain", content_type.value) && 
-	    if (it != content_disposition.params.end())
+	 stringlowercmp("text/html", content_type.value)) ) {
-		afn = it->second;
+	if (!filename.empty()) {
 	    out += "\n";
 	    if (m_forPreview)
-		out += "[Attachment: ";
+		out += "[" + dispindic + " " + content_type.value + ": ";
-	    out += afn;
+	    out += filename;
 	    if (m_forPreview)
 		out += "]";
 	    out += "\n\n";
 	    // Attachment: we're done with this part
 	    return;
 	}
-
+	// We're done with this part
-	// The only other disposition that interests us is "inline", and then
+	return;
 	// this has to be plain text or html
 	if (stringlowercmp("inline", content_disposition.value)) {
 	    return;
 	}
 	if (stringlowercmp("text/plain", content_type.value) && 
 	    stringlowercmp("text/html", content_type.value)) {
 	    return;
 	}
 	// Normally the default charset is us-ascii. But it happens that
 	// 8 bit chars exist in a message that is stated as us-ascii. Ie the 
 	// mailer used by yahoo support ('KANA') does this. We could convert 
 	// to iso-8859 only if the transfer-encoding is 8 bit, or test for
 	// actual 8 bit chars, but what the heck, let's use 8859-1 as default
 	string charset = "iso-8859-1";
 	map<string,string>::const_iterator it;
 	it = content_type.params.find(string("charset"));
 	if (it != content_type.params.end())
 	    charset = it->second;
 	if (charset.empty() || 
 	    !stringlowercmp("us-ascii", charset) || 
 	    !stringlowercmp("default", charset) || 
 	    !stringlowercmp("x-user-defined", charset) || 
 	    !stringlowercmp("x-unknown", charset) || 
 	    !stringlowercmp("unknown", charset) ) {
 	    charset = "iso-8859-1";
 	}
 	// Content transfer encoding
 	string cte = "7bit";
 	if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
 	    cte = hi.getValue();
 	} 
 	LOGDEB2(("walkmime: final: body start offset %d, length %d\n", 
 		 doc.getBodyStartOffset(), doc.getBodyLength()));
 	string body;
 	doc.getBody(body, 0, doc.bodylength);
 	// Decode according to content transfer encoding
 	if (!stringlowercmp("quoted-printable", cte)) {
 	    string decoded;
 	    if (!qp_decode(body, decoded)) {
 		LOGERR(("walkmime: quoted-printable decoding failed !\n"));
 		return;
 	    }
 	    body = decoded;
 	} else if (!stringlowercmp("base64", cte)) {
 	    string decoded;
 	    if (!base64_decode(body, decoded)) {
 		LOGERR(("walkmime: base64 decoding failed !\n"));
 #if 0
 		FILE *fp = fopen("/tmp/recoll_decodefail", "w");
 		if (fp) {
 		    fprintf(fp, "%s", body.c_str());
 		    fclose(fp);
 		}
 #endif
 		return;
 	    }
 	    body = decoded;
 	}
 	// Handle html stripping and transcoding to utf8
 	string utf8;
 	if (!stringlowercmp("text/html", content_type.value)) {
 	    MimeHandlerHtml mh;
 	    Rcl::Doc hdoc;
 	    mh.charsethint = charset;
 	    mh.mkDoc(m_conf, "", body, content_type.value,  hdoc);
 	    utf8 = hdoc.text;
 	} else {
 	    // Transcode to utf-8 
 	    if (!transcode(body, utf8, charset, "UTF-8")) {
 		LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
 			charset.c_str()));
 		utf8 = body;
 	    }
 	}
 	out += string("\r\n") + utf8;
 	LOGDEB2(("walkmime: out now: [%s]\n", out.c_str()));
    }
    // We are dealing with an inline part of text/plain or text/html type
    // Normally the default charset is us-ascii. But it happens that
    // 8 bit chars exist in a message that is stated as us-ascii. Ie the 
    // mailer used by yahoo support ('KANA') does this. We could convert 
    // to iso-8859 only if the transfer-encoding is 8 bit, or test for
    // actual 8 bit chars, but what the heck, let's use 8859-1 as default
    string charset = "iso-8859-1";
    it = content_type.params.find(string("charset"));
    if (it != content_type.params.end())
 	charset = it->second;
    if (charset.empty() || 
 	!stringlowercmp("us-ascii", charset) || 
 	!stringlowercmp("default", charset) || 
 	!stringlowercmp("x-user-defined", charset) || 
 	!stringlowercmp("x-unknown", charset) || 
 	!stringlowercmp("unknown", charset) ) {
 	charset = "iso-8859-1";
    }
    // Content transfer encoding
    string cte = "7bit";
    if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
 	cte = hi.getValue();
    } 
    LOGDEB2(("walkmime: final: body start offset %d, length %d\n", 
 	     doc.getBodyStartOffset(), doc.getBodyLength()));
    string body;
    doc.getBody(body, 0, doc.bodylength);
    // Decode according to content transfer encoding
    if (!stringlowercmp("quoted-printable", cte)) {
 	string decoded;
 	if (!qp_decode(body, decoded)) {
 	    LOGERR(("walkmime: quoted-printable decoding failed !\n"));
 	    return;
 	}
 	body = decoded;
    } else if (!stringlowercmp("base64", cte)) {
 	string decoded;
 	if (!base64_decode(body, decoded)) {
 	    LOGERR(("walkmime: base64 decoding failed !\n"));
 #if 0
 	    FILE *fp = fopen("/tmp/recoll_decodefail", "w");
 	    if (fp) {
 		fprintf(fp, "%s", body.c_str());
 		fclose(fp);
 	    }
 #endif
 	    return;
 	}
 	body = decoded;
    }
    // Handle html stripping and transcoding to utf8
    string utf8;
    if (!stringlowercmp("text/html", content_type.value)) {
 	MimeHandlerHtml mh;
 	Rcl::Doc hdoc;
 	mh.charsethint = charset;
 	mh.mkDoc(m_conf, "", body, content_type.value,  hdoc);
 	utf8 = hdoc.text;
    } else {
 	// Transcode to utf-8 
 	if (!transcode(body, utf8, charset, "UTF-8")) {
 	    LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
 		    charset.c_str()));
 	    utf8 = body;
 	}
    }
    out += string("\r\n") + utf8;
    LOGDEB2(("walkmime: out now: [%s]\n", out.c_str()));
 }
--- a/src/internfile/mh_mail.h
+++ b/src/internfile/mh_mail.h
@ -16,7 +16,7 @@
 */
 #ifndef _MAIL_H_INCLUDED_
 #define _MAIL_H_INCLUDED_
-/* @(#$Id: mh_mail.h,v 1.7 2006-09-05 08:05:02 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: mh_mail.h,v 1.8 2006-09-19 14:30:39 dockes Exp $  (C) 2004 J.F.Dockes */
 #include "mimehandler.h"
@ -45,11 +45,11 @@ class MimeHandlerMail : public MimeHandler {
    int        m_msgnum; // Current message number in folder. Starts at 1
    RclConfig *m_conf;   // Keep pointer to rclconfig around
    MimeHandler::Status processone(const string &fn, Binc::MimeDocument& doc,
 				   Rcl::Doc &docout);
    MimeHandler::Status processmbox(const string &fn, Rcl::Doc &docout, 
 				   string &ipath);
-    void walkmime(string &out, Binc::MimePart& doc, int depth);
+    MimeHandler::Status processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
 				   int depth);
    void walkmime(Rcl::Doc &docout, Binc::MimePart& doc, int depth);
 };
 #endif /* _MAIL_H_INCLUDED_ */