walk the full mime tree instead of staying at level 1

This commit is contained in:
dockes 2006-09-19 14:30:39 +00:00
parent 6424efca57
commit 3e2bccd259
2 changed files with 203 additions and 152 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.17 2006-09-15 16:50:44 dockes Exp $ (C) 2005 J.F.Dockes"; static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.18 2006-09-19 14:30:39 dockes Exp $ (C) 2005 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -77,7 +77,12 @@ MimeHandlerMail::mkDoc(RclConfig *cnf, const string &fn,
} }
Binc::MimeDocument doc; Binc::MimeDocument doc;
doc.parseFull(fd); doc.parseFull(fd);
MimeHandler::Status ret = processone(fn, doc, docout); if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n",
fn.c_str()));
return MimeHandler::MHError;
}
MimeHandler::Status ret = processMsg(docout, doc, 0);
close(fd); close(fd);
return ret; return ret;
} else if (!stringlowercmp("text/x-mail", mtype)) { } else if (!stringlowercmp("text/x-mail", mtype)) {
@ -175,7 +180,12 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
stringstream s(msgbuf); stringstream s(msgbuf);
Binc::MimeDocument doc; Binc::MimeDocument doc;
doc.parseFull(s); doc.parseFull(s);
MimeHandler::Status ret = processone(fn, doc, docout); if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
LOGERR(("MimeHandlerMail::processMbox: mime parse error for %s\n",
fn.c_str()));
return MimeHandler::MHError;
}
MimeHandler::Status ret = processMsg(docout, doc, 0);
if (ret == MimeHandler::MHError) if (ret == MimeHandler::MHError)
return ret; return ret;
char buf[20]; char buf[20];
@ -189,23 +199,23 @@ MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
// Transform a single message into a document. The subject becomes the // Transform a single message into a document. The subject becomes the
// title, and any simple body part with a content-type of text or html // title, and any simple body part with a content-type of text or html
// and content-disposition inline gets concatenated as text. // and content-disposition inline gets concatenated as text.
//
// If depth is not zero, we're called recursively for a
// message/rfc822 part and we must not touch the doc fields except the
// text
MimeHandler::Status MimeHandler::Status
MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc, MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
Rcl::Doc &docout) int depth)
{ {
if (!doc.isHeaderParsed() && !doc.isAllParsed()) { if (depth >= 5) {
LOGERR(("MimeHandlerMail::processone: mime parse error for %s\n", // Have to stop somewhere
fn.c_str())); LOGDEB(("MimeHandlerMail::processMsg: stopping at depth 5\n"));
return MimeHandler::MHError; return MimeHandler::MHDone;
} }
// Handle some headers. // Handle some headers.
Binc::HeaderItem hi; Binc::HeaderItem hi;
string transcoded; string transcoded;
if (doc.h.getFirstHeader("Subject", hi)) {
rfc2047_decode(hi.getValue(), transcoded);
docout.title = transcoded;
}
if (doc.h.getFirstHeader("From", hi)) { if (doc.h.getFirstHeader("From", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), transcoded);
docout.text += string("From: ") + transcoded + string("\n"); docout.text += string("From: ") + transcoded + string("\n");
@ -216,37 +226,50 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc,
} }
if (doc.h.getFirstHeader("Date", hi)) { if (doc.h.getFirstHeader("Date", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), transcoded);
time_t t = rfc2822DateToUxTime(transcoded); if (depth == 0) {
if (t != (time_t)-1) { time_t t = rfc2822DateToUxTime(transcoded);
char ascuxtime[100]; if (t != (time_t)-1) {
sprintf(ascuxtime, "%ld", (long)t); char ascuxtime[100];
docout.dmtime = ascuxtime; sprintf(ascuxtime, "%ld", (long)t);
} else { docout.dmtime = ascuxtime;
// Leave mtime field alone, ftime will be used instead. } else {
LOGDEB(("rfc2822Date...: failed for [%s]\n", transcoded.c_str())); // Leave mtime field alone, ftime will be used instead.
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
}
} }
docout.text += string("Date: ") + transcoded + string("\n"); docout.text += string("Date: ") + transcoded + string("\n");
} }
if (doc.h.getFirstHeader("Subject", hi)) { if (doc.h.getFirstHeader("Subject", hi)) {
rfc2047_decode(hi.getValue(), transcoded); rfc2047_decode(hi.getValue(), transcoded);
if (depth == 0)
docout.title = transcoded;
docout.text += string("Subject: ") + transcoded + string("\n"); docout.text += string("Subject: ") + transcoded + string("\n");
} }
LOGDEB2(("MimeHandlerMail::processone:ismultipart %d mime subtype '%s'\n", LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
doc.isMultipart(), doc.getSubType().c_str())); doc.isMultipart(), doc.getSubType().c_str()));
walkmime(docout.text, doc, 0); walkmime(docout, doc, depth);
LOGDEB2(("MimeHandlerMail::processone:text:[%s]\n", docout.text.c_str())); LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", docout.text.c_str()));
return MimeHandler::MHDone; return MimeHandler::MHDone;
} }
// Recursively walk the message mime parts and concatenate all the // Recursively walk the message mime parts and concatenate all the
// inline html or text that we find anywhere. // inline html or text that we find anywhere.
void MimeHandlerMail::walkmime(string &out, Binc::MimePart& doc, int depth) //
// RFC2046 reminder:
// Top level media types:
// Simple: text, image, audio, video, application,
// Composite: multipart, message.
//
// multipart can be mixed, alternative, parallel, digest.
// message/rfc822 may also be of interest.
void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc, int depth)
{ {
string &out = docout.text;
if (depth > 5) { if (depth > 5) {
LOGINFO(("walkmime: max depth exceeded\n")); LOGINFO(("walkmime: max depth (5) exceeded\n"));
return; return;
} }
@ -255,12 +278,12 @@ void MimeHandlerMail::walkmime(string &out, Binc::MimePart& doc, int depth)
doc.isMultipart(), doc.getSubType().c_str())); doc.isMultipart(), doc.getSubType().c_str()));
// We only handle alternative, related and mixed for now. For // We only handle alternative, related and mixed for now. For
// alternative, we look for a text/plain part, else html and // alternative, we look for a text/plain part, else html and
// process it For mixed and related, we process each part. // process it. For mixed and related, we process each part.
std::vector<Binc::MimePart>::iterator it; std::vector<Binc::MimePart>::iterator it;
if (!stringicmp("mixed", doc.getSubType()) || if (!stringicmp("mixed", doc.getSubType()) ||
!stringicmp("related", doc.getSubType())) { !stringicmp("related", doc.getSubType())) {
for (it = doc.members.begin(); it != doc.members.end();it++) { for (it = doc.members.begin(); it != doc.members.end();it++) {
walkmime(out, *it, depth+1); walkmime(docout, *it, depth+1);
} }
} else if (!stringicmp("alternative", doc.getSubType())) { } else if (!stringicmp("alternative", doc.getSubType())) {
std::vector<Binc::MimePart>::iterator ittxt, ithtml; std::vector<Binc::MimePart>::iterator ittxt, ithtml;
@ -283,137 +306,165 @@ void MimeHandlerMail::walkmime(string &out, Binc::MimePart& doc, int depth)
} }
if (ittxt != doc.members.end()) { if (ittxt != doc.members.end()) {
LOGDEB2(("walkmime: alternative: chose text/plain part\n")) LOGDEB2(("walkmime: alternative: chose text/plain part\n"))
walkmime(out, *ittxt, depth+1); walkmime(docout, *ittxt, depth+1);
} else if (ithtml != doc.members.end()) { } else if (ithtml != doc.members.end()) {
LOGDEB2(("walkmime: alternative: chose text/html part\n")) LOGDEB2(("walkmime: alternative: chose text/html part\n"))
walkmime(out, *ithtml, depth+1); walkmime(docout, *ithtml, depth+1);
} }
} }
} else { return;
// "Simple" part. See what it is: }
// Part is not multipart: it must be either simple or message. Take
// a look at interesting headers and a possible filename parameter
// Get and parse content-type header. // Get and parse content-type header.
Binc::HeaderItem hi; Binc::HeaderItem hi;
string ctt = "text/plain"; string ctt = "text/plain";
if (doc.h.getFirstHeader("Content-Type", hi)) { if (doc.h.getFirstHeader("Content-Type", hi)) {
ctt = hi.getValue(); ctt = hi.getValue();
} }
LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str())); LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
MimeHeaderValue content_type; MimeHeaderValue content_type;
parseMimeHeaderValue(ctt, content_type); parseMimeHeaderValue(ctt, content_type);
// Get and parse Content-Disposition header // Get and parse Content-Disposition header
string ctd = "inline"; string ctd = "inline";
if (doc.h.getFirstHeader("Content-Disposition", hi)) { if (doc.h.getFirstHeader("Content-Disposition", hi)) {
ctd = hi.getValue(); ctd = hi.getValue();
}
MimeHeaderValue content_disposition;
parseMimeHeaderValue(ctd, content_disposition);
LOGDEB2(("Content_disposition:[%s]\n", content_disposition.value.c_str()));
string dispindic;
if (stringlowercmp("inline", content_disposition.value))
dispindic = "Attachment";
else
dispindic = "Inline";
// See if we have a filename.
string filename;
map<string,string>::const_iterator it;
it = content_disposition.params.find(string("filename"));
if (it != content_disposition.params.end())
filename = it->second;
if (doc.isMessageRFC822()) {
LOGDEB2(("walkmime: message/RFC822 part\n"));
// The first part is the already parsed message.
// Call processMsg instead of walkmime so that mail headers get
// printed. The depth will tell it what to do
if (doc.members.empty()) {
//??
return;
} }
MimeHeaderValue content_disposition; out += "\n";
parseMimeHeaderValue(ctd, content_disposition); if (m_forPreview)
out += "[" + dispindic + " " + content_type.value + ": ";
out += filename;
if (m_forPreview)
out += "]";
out += "\n\n";
processMsg(docout, doc.members[0], depth+1);
return;
}
LOGDEB2(("Content_disposition:[%s]\n", // "Simple" part.
content_disposition.value.c_str())); LOGDEB2(("walkmime: simple part\n"));
// If this is an attachment, we index the file name if any and, when // If the Content-Disposition is not inline, we treat it as
// previewing, at least show that it was there. // attachment, as per rfc2183. We don't process attachments
if (!stringlowercmp("attachment", content_disposition.value)) { // for now, except for indexing/displaying the file name
string afn; // If it is inline but not text or html, same thing.
map<string,string>::const_iterator it; if (stringlowercmp("inline", content_disposition.value) ||
it = content_disposition.params.find(string("filename")); (stringlowercmp("text/plain", content_type.value) &&
if (it != content_disposition.params.end()) stringlowercmp("text/html", content_type.value)) ) {
afn = it->second; if (!filename.empty()) {
out += "\n"; out += "\n";
if (m_forPreview) if (m_forPreview)
out += "[Attachment: "; out += "[" + dispindic + " " + content_type.value + ": ";
out += afn; out += filename;
if (m_forPreview) if (m_forPreview)
out += "]"; out += "]";
out += "\n\n"; out += "\n\n";
// Attachment: we're done with this part
return;
} }
// We're done with this part
// The only other disposition that interests us is "inline", and then return;
// this has to be plain text or html
if (stringlowercmp("inline", content_disposition.value)) {
return;
}
if (stringlowercmp("text/plain", content_type.value) &&
stringlowercmp("text/html", content_type.value)) {
return;
}
// Normally the default charset is us-ascii. But it happens that
// 8 bit chars exist in a message that is stated as us-ascii. Ie the
// mailer used by yahoo support ('KANA') does this. We could convert
// to iso-8859 only if the transfer-encoding is 8 bit, or test for
// actual 8 bit chars, but what the heck, le'ts use 8859-1 as default
string charset = "iso-8859-1";
map<string,string>::const_iterator it;
it = content_type.params.find(string("charset"));
if (it != content_type.params.end())
charset = it->second;
if (charset.empty() ||
!stringlowercmp("us-ascii", charset) ||
!stringlowercmp("default", charset) ||
!stringlowercmp("x-user-defined", charset) ||
!stringlowercmp("x-unknown", charset) ||
!stringlowercmp("unknown", charset) ) {
charset = "iso-8859-1";
}
// Content transfer encoding
string cte = "7bit";
if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
cte = hi.getValue();
}
LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
doc.getBodyStartOffset(), doc.getBodyLength()));
string body;
doc.getBody(body, 0, doc.bodylength);
// Decode according to content transfer encoding
if (!stringlowercmp("quoted-printable", cte)) {
string decoded;
if (!qp_decode(body, decoded)) {
LOGERR(("walkmime: quoted-printable decoding failed !\n"));
return;
}
body = decoded;
} else if (!stringlowercmp("base64", cte)) {
string decoded;
if (!base64_decode(body, decoded)) {
LOGERR(("walkmime: base64 decoding failed !\n"));
#if 0
FILE *fp = fopen("/tmp/recoll_decodefail", "w");
if (fp) {
fprintf(fp, "%s", body.c_str());
fclose(fp);
}
#endif
return;
}
body = decoded;
}
// Handle html stripping and transcoding to utf8
string utf8;
if (!stringlowercmp("text/html", content_type.value)) {
MimeHandlerHtml mh;
Rcl::Doc hdoc;
mh.charsethint = charset;
mh.mkDoc(m_conf, "", body, content_type.value, hdoc);
utf8 = hdoc.text;
} else {
// Transcode to utf-8
if (!transcode(body, utf8, charset, "UTF-8")) {
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
charset.c_str()));
utf8 = body;
}
}
out += string("\r\n") + utf8;
LOGDEB2(("walkmime: out now: [%s]\n", out.c_str()));
} }
// We are dealing with an inline part of text/plain or text/html type
// Normally the default charset is us-ascii. But it happens that
// 8 bit chars exist in a message that is stated as us-ascii. Ie the
// mailer used by yahoo support ('KANA') does this. We could convert
// to iso-8859 only if the transfer-encoding is 8 bit, or test for
// actual 8 bit chars, but what the heck, let's use 8859-1 as default
string charset = "iso-8859-1";
it = content_type.params.find(string("charset"));
if (it != content_type.params.end())
charset = it->second;
if (charset.empty() ||
!stringlowercmp("us-ascii", charset) ||
!stringlowercmp("default", charset) ||
!stringlowercmp("x-user-defined", charset) ||
!stringlowercmp("x-unknown", charset) ||
!stringlowercmp("unknown", charset) ) {
charset = "iso-8859-1";
}
// Content transfer encoding
string cte = "7bit";
if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
cte = hi.getValue();
}
LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
doc.getBodyStartOffset(), doc.getBodyLength()));
string body;
doc.getBody(body, 0, doc.bodylength);
// Decode according to content transfer encoding
if (!stringlowercmp("quoted-printable", cte)) {
string decoded;
if (!qp_decode(body, decoded)) {
LOGERR(("walkmime: quoted-printable decoding failed !\n"));
return;
}
body = decoded;
} else if (!stringlowercmp("base64", cte)) {
string decoded;
if (!base64_decode(body, decoded)) {
LOGERR(("walkmime: base64 decoding failed !\n"));
#if 0
FILE *fp = fopen("/tmp/recoll_decodefail", "w");
if (fp) {
fprintf(fp, "%s", body.c_str());
fclose(fp);
}
#endif
return;
}
body = decoded;
}
// Handle html stripping and transcoding to utf8
string utf8;
if (!stringlowercmp("text/html", content_type.value)) {
MimeHandlerHtml mh;
Rcl::Doc hdoc;
mh.charsethint = charset;
mh.mkDoc(m_conf, "", body, content_type.value, hdoc);
utf8 = hdoc.text;
} else {
// Transcode to utf-8
if (!transcode(body, utf8, charset, "UTF-8")) {
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
charset.c_str()));
utf8 = body;
}
}
out += string("\r\n") + utf8;
LOGDEB2(("walkmime: out now: [%s]\n", out.c_str()));
} }

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _MAIL_H_INCLUDED_ #ifndef _MAIL_H_INCLUDED_
#define _MAIL_H_INCLUDED_ #define _MAIL_H_INCLUDED_
/* @(#$Id: mh_mail.h,v 1.7 2006-09-05 08:05:02 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: mh_mail.h,v 1.8 2006-09-19 14:30:39 dockes Exp $ (C) 2004 J.F.Dockes */
#include "mimehandler.h" #include "mimehandler.h"
@ -45,11 +45,11 @@ class MimeHandlerMail : public MimeHandler {
int m_msgnum; // Current message number in folder. Starts at 1 int m_msgnum; // Current message number in folder. Starts at 1
RclConfig *m_conf; // Keep pointer to rclconfig around RclConfig *m_conf; // Keep pointer to rclconfig around
MimeHandler::Status processone(const string &fn, Binc::MimeDocument& doc,
Rcl::Doc &docout);
MimeHandler::Status processmbox(const string &fn, Rcl::Doc &docout, MimeHandler::Status processmbox(const string &fn, Rcl::Doc &docout,
string &ipath); string &ipath);
void walkmime(string &out, Binc::MimePart& doc, int depth); MimeHandler::Status processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
int depth);
void walkmime(Rcl::Doc &docout, Binc::MimePart& doc, int depth);
}; };
#endif /* _MAIL_H_INCLUDED_ */ #endif /* _MAIL_H_INCLUDED_ */