#ifndef lint static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.22 2006-12-07 07:06:28 dockes Exp $ (C) 2005 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include #include #include "mimehandler.h" #include "debuglog.h" #include "csguess.h" #include "readfile.h" #include "transcode.h" #include "mimeparse.h" #include "indextext.h" #include "mh_mail.h" #include "debuglog.h" #include "smallut.h" #include "mimeparse.h" #include "mh_html.h" // binc imap mime definitions #include "mime.h" #ifndef NO_NAMESPACES using namespace std; #endif /* NO_NAMESPACES */ static const int maxdepth = 20; MimeHandlerMail::~MimeHandlerMail() { if (m_vfp) { fclose((FILE *)m_vfp); m_vfp = 0; } } // We are called for two different file types: mbox-type folders // holding multiple messages, and maildir-type files with one message // ipath is non empty only when we are called for retrieving a single message // for preview. It is always empty during indexing, and we fill it up with // the message number for the returned doc MimeHandler::Status MimeHandlerMail::mkDoc(RclConfig *cnf, const string &fn, const string &mtype, Rcl::Doc &docout, string& ipath) { LOGDEB2(("MimeHandlerMail::mkDoc: %s [%s]\n", mtype.c_str(), fn.c_str())); m_conf = cnf; if (!stringlowercmp("message/rfc822", mtype)) { ipath = ""; int fd; if ((fd = open(fn.c_str(), 0)) < 0) { LOGERR(("MimeHandlerMail::mkDoc: open(%s) errno %d\n", fn.c_str(), errno)); return MimeHandler::MHError; } Binc::MimeDocument doc; doc.parseFull(fd); if (!doc.isHeaderParsed() && !doc.isAllParsed()) { LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n", fn.c_str())); return MimeHandler::MHError; } MimeHandler::Status ret = processMsg(docout, doc, 0); close(fd); return ret; } else if (!stringlowercmp("text/x-mail", mtype)) { return processmbox(fn, docout, ipath); } else // hu ho return MimeHandler::MHError; } static const char *frompat = "^From .* [1-2][0-9][0-9][0-9]\n$"; static regex_t fromregex; static bool regcompiled; MimeHandler::Status MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath) { int mtarg = 0; if (ipath != "") { sscanf(ipath.c_str(), "%d", &mtarg); } LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(), mtarg)); FILE *fp; // Open the file on first call, then save/reuse the file pointer if (!m_vfp) { fp = fopen(fn.c_str(), "r"); if (fp == 0) { LOGERR(("MimeHandlerMail::processmbox: error opening %s\n", fn.c_str())); return MimeHandler::MHError; } m_vfp = fp; } else { fp = (FILE *)m_vfp; } if (!regcompiled) { regcomp(&fromregex, frompat, REG_NOSUB); regcompiled = true; } // If we are called to retrieve a specific message, seek to bof // (then scan up to the message). This is for the case where the // same object is reused to fetch several messages (else the fp is // just opened no need for a seek). We could also check if the // current message number is lower than the requested one and // avoid rereading the whole thing in this case. But I'm not sure // we're ever used in this way (multiple retrieves on same // object). So: if (mtarg > 0) { fseek(fp, 0, SEEK_SET); m_msgnum = 0; } off_t start, end; bool iseof = false; bool hademptyline = true; string msgtxt; do { // Look for next 'From ' Line, start of message. Set start to // line after this char line[501]; for (;;) { if (!fgets(line, 500, fp)) { // Eof hit while looking for 'From ' -> file done. We'd need // another return code here return MimeHandler::MHError; } if (line[0] == '\n') { hademptyline = true; continue; } if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { start = ftello(fp); m_msgnum++; break; } hademptyline = false; } // Look for next 'From ' line or eof, end of message. for (;;) { end = ftello(fp); if (!fgets(line, 500, fp)) { if (ferror(fp) || feof(fp)) iseof = true; break; } if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) { break; } if (mtarg <= 0 || m_msgnum == mtarg) { msgtxt += line; } if (line[0] == '\n') { hademptyline = true; } else { hademptyline = false; } } fseek(fp, end, SEEK_SET); } while (mtarg > 0 && m_msgnum < mtarg); stringstream s(msgtxt); LOGDEB2(("Message text: [%s]\n", msgtxt.c_str())); Binc::MimeDocument doc; doc.parseFull(s); if (!doc.isHeaderParsed() && !doc.isAllParsed()) { LOGERR(("MimeHandlerMail::processMbox: mime parse error for %s\n", fn.c_str())); return MimeHandler::MHError; } MimeHandler::Status ret = processMsg(docout, doc, 0); if (ret == MimeHandler::MHError) return ret; char buf[20]; sprintf(buf, "%d", m_msgnum); ipath = buf; return iseof ? MimeHandler::MHDone : (mtarg > 0) ? MimeHandler::MHDone : MimeHandler::MHAgain; } // Transform a single message into a document. The subject becomes the // title, and any simple body part with a content-type of text or html // and content-disposition inline gets concatenated as text. // // If depth is not zero, we're called recursively for an // message/rfc822 part and we must not touch the doc fields except the // text MimeHandler::Status MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc, int depth) { LOGDEB2(("MimeHandlerMail::processMsg: depth %d\n", depth)); if (depth++ >= maxdepth) { // Have to stop somewhere LOGDEB(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n", maxdepth)); return MimeHandler::MHDone; } // Handle some headers. Binc::HeaderItem hi; string transcoded; if (doc.h.getFirstHeader("From", hi)) { rfc2047_decode(hi.getValue(), transcoded); docout.text += string("From: ") + transcoded + string("\n"); } if (doc.h.getFirstHeader("To", hi)) { rfc2047_decode(hi.getValue(), transcoded); docout.text += string("To: ") + transcoded + string("\n"); } if (doc.h.getFirstHeader("Date", hi)) { rfc2047_decode(hi.getValue(), transcoded); if (depth == 1) { time_t t = rfc2822DateToUxTime(transcoded); if (t != (time_t)-1) { char ascuxtime[100]; sprintf(ascuxtime, "%ld", (long)t); docout.dmtime = ascuxtime; } else { // Leave mtime field alone, ftime will be used instead. LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str())); } } docout.text += string("Date: ") + transcoded + string("\n"); } if (doc.h.getFirstHeader("Subject", hi)) { rfc2047_decode(hi.getValue(), transcoded); if (depth == 1) docout.title = transcoded; docout.text += string("Subject: ") + transcoded + string("\n"); } docout.text += '\n'; LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n", doc.isMultipart(), doc.getSubType().c_str())); walkmime(docout, doc, depth); LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", docout.text.c_str())); return MimeHandler::MHDone; } // Recursively walk the message mime parts and concatenate all the // inline html or text that we find anywhere. // // RFC2046 reminder: // Top level media types: // Simple: text, image, audio, video, application, // Composite: multipart, message. // // multipart can be mixed, alternative, parallel, digest. // message/rfc822 may also be of interest. void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth) { LOGDEB2(("MimeHandlerMail::walkmime: depth %d\n", depth)); if (depth++ >= maxdepth) { LOGINFO(("walkmime: max depth (%d) exceeded\n", maxdepth)); return; } string &out = docout.text; if (doc.isMultipart()) { LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n", doc.isMultipart(), doc.getSubType().c_str())); // We only handle alternative, related and mixed (no digests). std::vector::iterator it; if (!stringicmp("mixed", doc.getSubType()) || !stringicmp("related", doc.getSubType())) { // Multipart mixed and related: process each part. for (it = doc.members.begin(); it != doc.members.end();it++) { walkmime(docout, *it, depth); } } else if (!stringicmp("alternative", doc.getSubType())) { // Multipart/alternative: look for a text/plain part, then html. // Process if found std::vector::iterator ittxt, ithtml; ittxt = ithtml = doc.members.end(); int i = 1; for (it = doc.members.begin(); it != doc.members.end();it++, i++) { // Get and parse content-type header Binc::HeaderItem hi; if (!it->h.getFirstHeader("Content-Type", hi)) { LOGDEB(("No content-type header for part %d\n", i)); continue; } MimeHeaderValue content_type; parseMimeHeaderValue(hi.getValue(), content_type); LOGDEB2(("walkmime: C-type: %s\n",content_type.value.c_str())); if (!stringlowercmp("text/plain", content_type.value)) ittxt = it; else if (!stringlowercmp("text/html", content_type.value)) ithtml = it; } if (ittxt != doc.members.end()) { LOGDEB2(("walkmime: alternative: chose text/plain part\n")) walkmime(docout, *ittxt, depth); } else if (ithtml != doc.members.end()) { LOGDEB2(("walkmime: alternative: chose text/html part\n")) walkmime(docout, *ithtml, depth); } } return; } // Part is not multipart: it must be either simple or message. Take // a look at interesting headers and a possible filename parameter // Get and parse content-type header. Binc::HeaderItem hi; string ctt = "text/plain"; if (doc.h.getFirstHeader("Content-Type", hi)) { ctt = hi.getValue(); } LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str())); MimeHeaderValue content_type; parseMimeHeaderValue(ctt, content_type); // Get and parse Content-Disposition header string ctd = "inline"; if (doc.h.getFirstHeader("Content-Disposition", hi)) { ctd = hi.getValue(); } MimeHeaderValue content_disposition; parseMimeHeaderValue(ctd, content_disposition); LOGDEB2(("Content_disposition:[%s]\n", content_disposition.value.c_str())); string dispindic; if (stringlowercmp("inline", content_disposition.value)) dispindic = "Attachment"; else dispindic = "Inline"; // See if we have a filename. string filename; map::const_iterator it; it = content_disposition.params.find(string("filename")); if (it != content_disposition.params.end()) filename = it->second; if (doc.isMessageRFC822()) { LOGDEB2(("walkmime: message/RFC822 part\n")); // The first part is the already parsed message. Call // processMsg instead of walkmime so that mail headers get // printed. The depth will tell it what to do if (doc.members.empty()) { //?? return; } out += "\n"; if (m_forPreview) out += "[" + dispindic + " " + content_type.value + ": "; out += filename; if (m_forPreview) out += "]"; out += "\n\n"; processMsg(docout, doc.members[0], depth); return; } // "Simple" part. LOGDEB2(("walkmime: simple part\n")); // If the Content-Disposition is not inline, we treat it as // attachment, as per rfc2183. We don't process attachments // for now, except for indexing/displaying the file name // If it is inline but not text or html, same thing. if (stringlowercmp("inline", content_disposition.value) || (stringlowercmp("text/plain", content_type.value) && stringlowercmp("text/html", content_type.value)) ) { if (!filename.empty()) { out += "\n"; if (m_forPreview) out += "[" + dispindic + " " + content_type.value + ": "; out += filename; if (m_forPreview) out += "]"; out += "\n\n"; } // We're done with this part return; } // We are dealing with an inline part of text/plain or text/html type // Normally the default charset is us-ascii. But it happens that // 8 bit chars exist in a message that is stated as us-ascii. Ie the // mailer used by yahoo support ('KANA') does this. We could convert // to iso-8859 only if the transfer-encoding is 8 bit, or test for // actual 8 bit chars, but what the heck, le'ts use 8859-1 as default string charset = "iso-8859-1"; it = content_type.params.find(string("charset")); if (it != content_type.params.end()) charset = it->second; if (charset.empty() || !stringlowercmp("us-ascii", charset) || !stringlowercmp("default", charset) || !stringlowercmp("x-user-defined", charset) || !stringlowercmp("x-unknown", charset) || !stringlowercmp("unknown", charset) ) { charset = "iso-8859-1"; } // Content transfer encoding string cte = "7bit"; if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) { cte = hi.getValue(); } LOGDEB2(("walkmime: final: body start offset %d, length %d\n", doc.getBodyStartOffset(), doc.getBodyLength())); string body; doc.getBody(body, 0, doc.bodylength); // Decode according to content transfer encoding if (!stringlowercmp("quoted-printable", cte)) { string decoded; if (!qp_decode(body, decoded)) { LOGERR(("walkmime: quoted-printable decoding failed !\n")); return; } body = decoded; } else if (!stringlowercmp("base64", cte)) { string decoded; if (!base64_decode(body, decoded)) { LOGERR(("walkmime: base64 decoding failed !\n")); #if 0 FILE *fp = fopen("/tmp/recoll_decodefail", "w"); if (fp) { fprintf(fp, "%s", body.c_str()); fclose(fp); } #endif return; } body = decoded; } // Handle html stripping and transcoding to utf8 string utf8; if (!stringlowercmp("text/html", content_type.value)) { MimeHandlerHtml mh; Rcl::Doc hdoc; mh.charsethint = charset; mh.mkDoc(m_conf, "", body, content_type.value, hdoc); utf8 = hdoc.text; } else { // Transcode to utf-8 if (!transcode(body, utf8, charset, "UTF-8")) { LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n", charset.c_str())); utf8 = body; } } out += utf8; if (out.length() && out[out.length()-1] != '\n') out += '\n'; LOGDEB2(("walkmime: out now: [%s]\n", out.c_str())); }