496 lines
15 KiB
C++
496 lines
15 KiB
C++
#ifndef lint
|
|
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.22 2006-12-07 07:06:28 dockes Exp $ (C) 2005 J.F.Dockes";
|
|
#endif
|
|
/*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the
|
|
* Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <fcntl.h>
|
|
#include <errno.h>
|
|
#include <unistd.h>
|
|
#include <time.h>
|
|
#include <regex.h>
|
|
|
|
#include <map>
|
|
#include <sstream>
|
|
|
|
#include "mimehandler.h"
|
|
#include "debuglog.h"
|
|
#include "csguess.h"
|
|
#include "readfile.h"
|
|
#include "transcode.h"
|
|
#include "mimeparse.h"
|
|
#include "indextext.h"
|
|
#include "mh_mail.h"
|
|
#include "debuglog.h"
|
|
#include "smallut.h"
|
|
#include "mimeparse.h"
|
|
#include "mh_html.h"
|
|
|
|
// binc imap mime definitions
|
|
#include "mime.h"
|
|
|
|
#ifndef NO_NAMESPACES
|
|
using namespace std;
|
|
#endif /* NO_NAMESPACES */
|
|
|
|
// Upper bound for the depth counter that processMsg() and walkmime() pass
// along while recursing through nested MIME parts: both stop descending
// once the counter reaches this value.
static const int maxdepth = 20;
|
|
|
|
// Close the mbox stream cached in m_vfp by processmbox(), if there is one.
MimeHandlerMail::~MimeHandlerMail()
{
    if (!m_vfp)
        return;

    // Clear the member first, then release the stream it pointed to.
    FILE *stream = (FILE *)m_vfp;
    m_vfp = 0;
    fclose(stream);
}
|
|
|
|
// We are called for two different file types: mbox-type folders
|
|
// holding multiple messages, and maildir-type files with one message
|
|
// ipath is non empty only when we are called for retrieving a single message
|
|
// for preview. It is always empty during indexing, and we fill it up with
|
|
// the message number for the returned doc
|
|
MimeHandler::Status
|
|
MimeHandlerMail::mkDoc(RclConfig *cnf, const string &fn,
|
|
const string &mtype, Rcl::Doc &docout, string& ipath)
|
|
{
|
|
LOGDEB2(("MimeHandlerMail::mkDoc: %s [%s]\n", mtype.c_str(), fn.c_str()));
|
|
m_conf = cnf;
|
|
|
|
if (!stringlowercmp("message/rfc822", mtype)) {
|
|
ipath = "";
|
|
int fd;
|
|
if ((fd = open(fn.c_str(), 0)) < 0) {
|
|
LOGERR(("MimeHandlerMail::mkDoc: open(%s) errno %d\n",
|
|
fn.c_str(), errno));
|
|
return MimeHandler::MHError;
|
|
}
|
|
Binc::MimeDocument doc;
|
|
doc.parseFull(fd);
|
|
if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
|
|
LOGERR(("MimeHandlerMail::mkDoc: mime parse error for %s\n",
|
|
fn.c_str()));
|
|
return MimeHandler::MHError;
|
|
}
|
|
MimeHandler::Status ret = processMsg(docout, doc, 0);
|
|
close(fd);
|
|
return ret;
|
|
} else if (!stringlowercmp("text/x-mail", mtype)) {
|
|
return processmbox(fn, docout, ipath);
|
|
} else // hu ho
|
|
return MimeHandler::MHError;
|
|
}
|
|
|
|
static const char *frompat = "^From .* [1-2][0-9][0-9][0-9]\n$";
|
|
static regex_t fromregex;
|
|
static bool regcompiled;
|
|
|
|
MimeHandler::Status
|
|
MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
|
|
{
|
|
int mtarg = 0;
|
|
if (ipath != "") {
|
|
sscanf(ipath.c_str(), "%d", &mtarg);
|
|
}
|
|
LOGDEB2(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(),
|
|
mtarg));
|
|
|
|
FILE *fp;
|
|
// Open the file on first call, then save/reuse the file pointer
|
|
if (!m_vfp) {
|
|
fp = fopen(fn.c_str(), "r");
|
|
if (fp == 0) {
|
|
LOGERR(("MimeHandlerMail::processmbox: error opening %s\n",
|
|
fn.c_str()));
|
|
return MimeHandler::MHError;
|
|
}
|
|
m_vfp = fp;
|
|
} else {
|
|
fp = (FILE *)m_vfp;
|
|
}
|
|
if (!regcompiled) {
|
|
regcomp(&fromregex, frompat, REG_NOSUB);
|
|
regcompiled = true;
|
|
}
|
|
|
|
// If we are called to retrieve a specific message, seek to bof
|
|
// (then scan up to the message). This is for the case where the
|
|
// same object is reused to fetch several messages (else the fp is
|
|
// just opened no need for a seek). We could also check if the
|
|
// current message number is lower than the requested one and
|
|
// avoid rereading the whole thing in this case. But I'm not sure
|
|
// we're ever used in this way (multiple retrieves on same
|
|
// object). So:
|
|
if (mtarg > 0) {
|
|
fseek(fp, 0, SEEK_SET);
|
|
m_msgnum = 0;
|
|
}
|
|
|
|
off_t start, end;
|
|
bool iseof = false;
|
|
bool hademptyline = true;
|
|
string msgtxt;
|
|
do {
|
|
// Look for next 'From ' Line, start of message. Set start to
|
|
// line after this
|
|
char line[501];
|
|
for (;;) {
|
|
if (!fgets(line, 500, fp)) {
|
|
// Eof hit while looking for 'From ' -> file done. We'd need
|
|
// another return code here
|
|
return MimeHandler::MHError;
|
|
}
|
|
if (line[0] == '\n') {
|
|
hademptyline = true;
|
|
continue;
|
|
}
|
|
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
|
|
start = ftello(fp);
|
|
m_msgnum++;
|
|
break;
|
|
}
|
|
hademptyline = false;
|
|
}
|
|
|
|
// Look for next 'From ' line or eof, end of message.
|
|
for (;;) {
|
|
end = ftello(fp);
|
|
if (!fgets(line, 500, fp)) {
|
|
if (ferror(fp) || feof(fp))
|
|
iseof = true;
|
|
break;
|
|
}
|
|
if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
|
|
break;
|
|
}
|
|
if (mtarg <= 0 || m_msgnum == mtarg) {
|
|
msgtxt += line;
|
|
}
|
|
if (line[0] == '\n') {
|
|
hademptyline = true;
|
|
} else {
|
|
hademptyline = false;
|
|
}
|
|
}
|
|
fseek(fp, end, SEEK_SET);
|
|
} while (mtarg > 0 && m_msgnum < mtarg);
|
|
|
|
stringstream s(msgtxt);
|
|
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
|
|
Binc::MimeDocument doc;
|
|
doc.parseFull(s);
|
|
if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
|
|
LOGERR(("MimeHandlerMail::processMbox: mime parse error for %s\n",
|
|
fn.c_str()));
|
|
return MimeHandler::MHError;
|
|
}
|
|
|
|
MimeHandler::Status ret = processMsg(docout, doc, 0);
|
|
|
|
if (ret == MimeHandler::MHError)
|
|
return ret;
|
|
char buf[20];
|
|
sprintf(buf, "%d", m_msgnum);
|
|
ipath = buf;
|
|
return iseof ? MimeHandler::MHDone :
|
|
(mtarg > 0) ? MimeHandler::MHDone : MimeHandler::MHAgain;
|
|
}
|
|
|
|
|
|
// Transform a single message into a document. The subject becomes the
|
|
// title, and any simple body part with a content-type of text or html
|
|
// and content-disposition inline gets concatenated as text.
|
|
//
|
|
// If depth is not zero, we're called recursively for an
|
|
// message/rfc822 part and we must not touch the doc fields except the
|
|
// text
|
|
MimeHandler::Status
|
|
MimeHandlerMail::processMsg(Rcl::Doc &docout, Binc::MimePart& doc,
|
|
int depth)
|
|
{
|
|
LOGDEB2(("MimeHandlerMail::processMsg: depth %d\n", depth));
|
|
if (depth++ >= maxdepth) {
|
|
// Have to stop somewhere
|
|
LOGDEB(("MimeHandlerMail::processMsg: maxdepth %d exceeded\n",
|
|
maxdepth));
|
|
return MimeHandler::MHDone;
|
|
}
|
|
|
|
// Handle some headers.
|
|
Binc::HeaderItem hi;
|
|
string transcoded;
|
|
if (doc.h.getFirstHeader("From", hi)) {
|
|
rfc2047_decode(hi.getValue(), transcoded);
|
|
docout.text += string("From: ") + transcoded + string("\n");
|
|
}
|
|
if (doc.h.getFirstHeader("To", hi)) {
|
|
rfc2047_decode(hi.getValue(), transcoded);
|
|
docout.text += string("To: ") + transcoded + string("\n");
|
|
}
|
|
if (doc.h.getFirstHeader("Date", hi)) {
|
|
rfc2047_decode(hi.getValue(), transcoded);
|
|
if (depth == 1) {
|
|
time_t t = rfc2822DateToUxTime(transcoded);
|
|
if (t != (time_t)-1) {
|
|
char ascuxtime[100];
|
|
sprintf(ascuxtime, "%ld", (long)t);
|
|
docout.dmtime = ascuxtime;
|
|
} else {
|
|
// Leave mtime field alone, ftime will be used instead.
|
|
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
|
|
}
|
|
}
|
|
docout.text += string("Date: ") + transcoded + string("\n");
|
|
}
|
|
if (doc.h.getFirstHeader("Subject", hi)) {
|
|
rfc2047_decode(hi.getValue(), transcoded);
|
|
if (depth == 1)
|
|
docout.title = transcoded;
|
|
docout.text += string("Subject: ") + transcoded + string("\n");
|
|
}
|
|
docout.text += '\n';
|
|
|
|
LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
|
|
doc.isMultipart(), doc.getSubType().c_str()));
|
|
walkmime(docout, doc, depth);
|
|
|
|
LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", docout.text.c_str()));
|
|
return MimeHandler::MHDone;
|
|
}
|
|
|
|
// Recursively walk the message mime parts and concatenate all the
|
|
// inline html or text that we find anywhere.
|
|
//
|
|
// RFC2046 reminder:
|
|
// Top level media types:
|
|
// Simple: text, image, audio, video, application,
|
|
// Composite: multipart, message.
|
|
//
|
|
// multipart can be mixed, alternative, parallel, digest.
|
|
// message/rfc822 may also be of interest.
|
|
|
|
void MimeHandlerMail::walkmime(Rcl::Doc& docout, Binc::MimePart& doc,int depth)
|
|
{
|
|
LOGDEB2(("MimeHandlerMail::walkmime: depth %d\n", depth));
|
|
if (depth++ >= maxdepth) {
|
|
LOGINFO(("walkmime: max depth (%d) exceeded\n", maxdepth));
|
|
return;
|
|
}
|
|
|
|
string &out = docout.text;
|
|
|
|
if (doc.isMultipart()) {
|
|
LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n",
|
|
doc.isMultipart(), doc.getSubType().c_str()));
|
|
// We only handle alternative, related and mixed (no digests).
|
|
std::vector<Binc::MimePart>::iterator it;
|
|
|
|
if (!stringicmp("mixed", doc.getSubType()) ||
|
|
!stringicmp("related", doc.getSubType())) {
|
|
// Multipart mixed and related: process each part.
|
|
for (it = doc.members.begin(); it != doc.members.end();it++) {
|
|
walkmime(docout, *it, depth);
|
|
}
|
|
|
|
} else if (!stringicmp("alternative", doc.getSubType())) {
|
|
// Multipart/alternative: look for a text/plain part, then html.
|
|
// Process if found
|
|
std::vector<Binc::MimePart>::iterator ittxt, ithtml;
|
|
ittxt = ithtml = doc.members.end();
|
|
int i = 1;
|
|
for (it = doc.members.begin(); it != doc.members.end();it++, i++) {
|
|
// Get and parse content-type header
|
|
Binc::HeaderItem hi;
|
|
if (!it->h.getFirstHeader("Content-Type", hi)) {
|
|
LOGDEB(("No content-type header for part %d\n", i));
|
|
continue;
|
|
}
|
|
MimeHeaderValue content_type;
|
|
parseMimeHeaderValue(hi.getValue(), content_type);
|
|
LOGDEB2(("walkmime: C-type: %s\n",content_type.value.c_str()));
|
|
if (!stringlowercmp("text/plain", content_type.value))
|
|
ittxt = it;
|
|
else if (!stringlowercmp("text/html", content_type.value))
|
|
ithtml = it;
|
|
}
|
|
if (ittxt != doc.members.end()) {
|
|
LOGDEB2(("walkmime: alternative: chose text/plain part\n"))
|
|
walkmime(docout, *ittxt, depth);
|
|
} else if (ithtml != doc.members.end()) {
|
|
LOGDEB2(("walkmime: alternative: chose text/html part\n"))
|
|
walkmime(docout, *ithtml, depth);
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Part is not multipart: it must be either simple or message. Take
|
|
// a look at interesting headers and a possible filename parameter
|
|
|
|
// Get and parse content-type header.
|
|
Binc::HeaderItem hi;
|
|
string ctt = "text/plain";
|
|
if (doc.h.getFirstHeader("Content-Type", hi)) {
|
|
ctt = hi.getValue();
|
|
}
|
|
LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
|
|
MimeHeaderValue content_type;
|
|
parseMimeHeaderValue(ctt, content_type);
|
|
|
|
// Get and parse Content-Disposition header
|
|
string ctd = "inline";
|
|
if (doc.h.getFirstHeader("Content-Disposition", hi)) {
|
|
ctd = hi.getValue();
|
|
}
|
|
MimeHeaderValue content_disposition;
|
|
parseMimeHeaderValue(ctd, content_disposition);
|
|
LOGDEB2(("Content_disposition:[%s]\n", content_disposition.value.c_str()));
|
|
string dispindic;
|
|
if (stringlowercmp("inline", content_disposition.value))
|
|
dispindic = "Attachment";
|
|
else
|
|
dispindic = "Inline";
|
|
|
|
// See if we have a filename.
|
|
string filename;
|
|
map<string,string>::const_iterator it;
|
|
it = content_disposition.params.find(string("filename"));
|
|
if (it != content_disposition.params.end())
|
|
filename = it->second;
|
|
|
|
if (doc.isMessageRFC822()) {
|
|
LOGDEB2(("walkmime: message/RFC822 part\n"));
|
|
|
|
// The first part is the already parsed message. Call
|
|
// processMsg instead of walkmime so that mail headers get
|
|
// printed. The depth will tell it what to do
|
|
if (doc.members.empty()) {
|
|
//??
|
|
return;
|
|
}
|
|
out += "\n";
|
|
if (m_forPreview)
|
|
out += "[" + dispindic + " " + content_type.value + ": ";
|
|
out += filename;
|
|
if (m_forPreview)
|
|
out += "]";
|
|
out += "\n\n";
|
|
processMsg(docout, doc.members[0], depth);
|
|
return;
|
|
}
|
|
|
|
// "Simple" part.
|
|
LOGDEB2(("walkmime: simple part\n"));
|
|
|
|
// If the Content-Disposition is not inline, we treat it as
|
|
// attachment, as per rfc2183. We don't process attachments
|
|
// for now, except for indexing/displaying the file name
|
|
// If it is inline but not text or html, same thing.
|
|
if (stringlowercmp("inline", content_disposition.value) ||
|
|
(stringlowercmp("text/plain", content_type.value) &&
|
|
stringlowercmp("text/html", content_type.value)) ) {
|
|
if (!filename.empty()) {
|
|
out += "\n";
|
|
if (m_forPreview)
|
|
out += "[" + dispindic + " " + content_type.value + ": ";
|
|
out += filename;
|
|
if (m_forPreview)
|
|
out += "]";
|
|
out += "\n\n";
|
|
}
|
|
// We're done with this part
|
|
return;
|
|
}
|
|
|
|
// We are dealing with an inline part of text/plain or text/html type
|
|
|
|
// Normally the default charset is us-ascii. But it happens that
|
|
// 8 bit chars exist in a message that is stated as us-ascii. Ie the
|
|
// mailer used by yahoo support ('KANA') does this. We could convert
|
|
// to iso-8859 only if the transfer-encoding is 8 bit, or test for
|
|
// actual 8 bit chars, but what the heck, le'ts use 8859-1 as default
|
|
string charset = "iso-8859-1";
|
|
it = content_type.params.find(string("charset"));
|
|
if (it != content_type.params.end())
|
|
charset = it->second;
|
|
if (charset.empty() ||
|
|
!stringlowercmp("us-ascii", charset) ||
|
|
!stringlowercmp("default", charset) ||
|
|
!stringlowercmp("x-user-defined", charset) ||
|
|
!stringlowercmp("x-unknown", charset) ||
|
|
!stringlowercmp("unknown", charset) ) {
|
|
charset = "iso-8859-1";
|
|
}
|
|
|
|
// Content transfer encoding
|
|
string cte = "7bit";
|
|
if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
|
|
cte = hi.getValue();
|
|
}
|
|
|
|
LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
|
|
doc.getBodyStartOffset(), doc.getBodyLength()));
|
|
string body;
|
|
doc.getBody(body, 0, doc.bodylength);
|
|
|
|
// Decode according to content transfer encoding
|
|
if (!stringlowercmp("quoted-printable", cte)) {
|
|
string decoded;
|
|
if (!qp_decode(body, decoded)) {
|
|
LOGERR(("walkmime: quoted-printable decoding failed !\n"));
|
|
return;
|
|
}
|
|
body = decoded;
|
|
} else if (!stringlowercmp("base64", cte)) {
|
|
string decoded;
|
|
if (!base64_decode(body, decoded)) {
|
|
LOGERR(("walkmime: base64 decoding failed !\n"));
|
|
#if 0
|
|
FILE *fp = fopen("/tmp/recoll_decodefail", "w");
|
|
if (fp) {
|
|
fprintf(fp, "%s", body.c_str());
|
|
fclose(fp);
|
|
}
|
|
#endif
|
|
return;
|
|
}
|
|
body = decoded;
|
|
}
|
|
|
|
// Handle html stripping and transcoding to utf8
|
|
string utf8;
|
|
if (!stringlowercmp("text/html", content_type.value)) {
|
|
MimeHandlerHtml mh;
|
|
Rcl::Doc hdoc;
|
|
mh.charsethint = charset;
|
|
mh.mkDoc(m_conf, "", body, content_type.value, hdoc);
|
|
utf8 = hdoc.text;
|
|
} else {
|
|
// Transcode to utf-8
|
|
if (!transcode(body, utf8, charset, "UTF-8")) {
|
|
LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
|
|
charset.c_str()));
|
|
utf8 = body;
|
|
}
|
|
}
|
|
|
|
out += utf8;
|
|
if (out.length() && out[out.length()-1] != '\n')
|
|
out += '\n';
|
|
|
|
LOGDEB2(("walkmime: out now: [%s]\n", out.c_str()));
|
|
}
|