diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index b31216af..55756c01 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.11 2005-04-06 10:20:11 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.12 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -172,10 +172,14 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, if (fis == FileInterner::FIError) break; - // Set up common fields: - char ascdate[20]; - sprintf(ascdate, "%ld", long(stp->st_ctime)); - doc.mtime = ascdate; + // Set the date if this was not done in the document handler + // (ie: date from Date: mail header). + if (doc.mtime.empty()) { + char ascdate[20]; + sprintf(ascdate, "%ld", long(stp->st_ctime)); + doc.mtime = ascdate; + } + // Internal access path for multi-document files doc.ipath = ipath; // Do database-specific work to update document data diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index 8ebe4cac..073ef37e 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.5 2005-04-06 10:20:11 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.6 2005-10-15 12:18:04 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #include @@ -172,30 +172,42 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc, return MimeHandler::MHError; } - // Handle some headers. We should process rfc2047 encoding here - // Also there should be no 8bit chars, but there sometimes are. So - // we transcode as if from iso-8859-1, which is better than - // getting utf8 conversion errors later on + // Handle some headers. Binc::HeaderItem hi; string transcoded; if (doc.h.getFirstHeader("Subject", hi)) { - transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + rfc2047_decode(hi.getValue(), transcoded); docout.title = transcoded; } if (doc.h.getFirstHeader("From", hi)) { - transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + rfc2047_decode(hi.getValue(), transcoded); docout.text += string("From: ") + transcoded + string("\n"); } if (doc.h.getFirstHeader("To", hi)) { - transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + rfc2047_decode(hi.getValue(), transcoded); docout.text += string("To: ") + transcoded + string("\n"); } if (doc.h.getFirstHeader("Date", hi)) { - transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + rfc2047_decode(hi.getValue(), transcoded); + // Try to set the mtime from the date field. + string date = transcoded; + string::size_type pos; + // Possibly get rid of the day + if ((pos = date.find(",")) != string::npos) + date = date.substr(pos+1); + struct tm tm; + if (strptime(date.c_str(), " %d %b %Y %H:%M:%S %z ", &tm)) { + char ascuxtime[100]; + sprintf(ascuxtime, "%ld", (long)mktime(&tm)); + docout.mtime = ascuxtime; + } else { + LOGDEB(("strptime failed for [%s]\n", date.c_str())); + } + docout.text += string("Date: ") + transcoded + string("\n"); } if (doc.h.getFirstHeader("Subject", hi)) { - transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8"); + rfc2047_decode(hi.getValue(), transcoded); docout.text += string("Subject: ") + transcoded + string("\n"); } diff --git a/src/utils/mimeparse.cpp b/src/utils/mimeparse.cpp index 2e3d6895..287eb0ef 100644 --- a/src/utils/mimeparse.cpp +++ b/src/utils/mimeparse.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.3 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.4 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_MIMEPARSE @@ -348,6 +348,148 @@ bool base64_decode(const string& in, string& out) return true; } +#include "transcode.h" +#include "smallut.h" + +// Decode a parsed encoded word +static bool rfc2047_decodeParsed(const std::string& charset, + const std::string& encoding, + const std::string& value, + std::string &utf8) +{ + // fprintf(stderr, "DecodeParsed: charset [%s] enc [%s] val [%s]\n", + // charset.c_str(), encoding.c_str(), value.c_str()); + utf8 = ""; + + string decoded; + if (!stringlowercmp("b", encoding)) { + if (!base64_decode(value, decoded)) + return false; + // fprintf(stderr, "FromB64: [%s]\n", decoded.c_str()); + } else if (!stringlowercmp("q", encoding)) { + if (!qp_decode(value, decoded)) + return false; + // Need to translate _ to ' ' here + string temp; + for (string::size_type pos = 0; pos < decoded.length(); pos++) + if (decoded[pos] == '_') + temp += ' '; + else + temp += decoded[pos]; + decoded = temp; + // fprintf(stderr, "FromQP: [%s]\n", decoded.c_str()); + } else { + // fprintf(stderr, "Bad encoding [%s]\n", encoding.c_str()); + return false; + } + + if (!transcode(decoded, utf8, charset, "UTF-8")) { + // fprintf(stderr, "Transcode failed\n"); + return false; + } + return true; +} + +// Parse a mail header encoded value +typedef enum {rfc2047base, rfc2047open_eq, rfc2047charset, rfc2047encoding, + rfc2047value, rfc2047close_q} Rfc2047States; + +bool rfc2047_decode(const std::string& in, std::string &out) +{ + Rfc2047States state = rfc2047base; + string encoding, charset, value, utf8; + + out = ""; + + for (unsigned int ii = 0; ii < in.length(); ii++) { + char ch = in[ii]; + switch (state) { + case rfc2047base: + { + switch (ch) { + case '=': state = rfc2047open_eq; break; + default: value += ch; + } + } + break; + case rfc2047open_eq: + { + switch (ch) { + case '?': + { + // Transcode current (unencoded part) value: + // we sometimes find 8-bit chars in + // there. Interpret as Iso8859. + if (value.length() > 0) { + transcode(value, utf8, "ISO8859-1", "UTF-8"); + out += utf8; + value = ""; + } + state = rfc2047charset; + } + break; + default: state = rfc2047base; out += '='; out += ch;break; + } + } + break; + case rfc2047charset: + { + switch (ch) { + case '?': state = rfc2047encoding; break; + default: charset += ch; break; + } + } + break; + case rfc2047encoding: + { + switch (ch) { + case '?': state = rfc2047value; break; + default: encoding += ch; break; + } + } + break; + case rfc2047value: + { + switch (ch) { + case '?': state = rfc2047close_q; break; + default: value += ch;break; + } + } + break; + case rfc2047close_q: + { + switch (ch) { + case '=': + { + string utf8; + state = rfc2047base; + if (!rfc2047_decodeParsed(charset, encoding, value, + utf8)) { + return false; + } + out += utf8; + charset = encoding = value = ""; + } + break; + default: state = rfc2047value; value += '?';value += ch;break; + } + } + break; + default: // ?? + return false; + } + } + + if (value.length() > 0) { + transcode(value, utf8, "ISO8859-1", "UTF-8"); + out += utf8; + value = ""; + } + if (state != rfc2047base) + return false; + return true; +} + #else #include @@ -382,7 +524,7 @@ main(int argc, const char **argv) fprintf(stderr, "qp_decode returned error\n"); } printf("Decoded: '%s'\n", out.c_str()); -#else +#elif 0 //'C'est à boire qu'il nous faut éviter l'excès.' //'Deuxième ligne' //'Troisième ligne' @@ -396,6 +538,18 @@ main(int argc, const char **argv) fprintf(stderr, "base64_decode returned error\n"); } printf("Decoded: '%s'\n", out.c_str()); +#elif 1 + char line [1024]; + string out; + while (fgets(line, 1023, stdin)) { + int l = strlen(line); + if (l == 0) + continue; + line[l-1] = 0; + fprintf(stderr, "Line: [%s]\n", line); + rfc2047_decode(line, out); + fprintf(stderr, "Out: [%s]\n", out.c_str()); + } #endif } diff --git a/src/utils/mimeparse.h b/src/utils/mimeparse.h index 081b62ca..adc3af3d 100644 --- a/src/utils/mimeparse.h +++ b/src/utils/mimeparse.h @@ -1,6 +1,6 @@ #ifndef _MIME_H_INCLUDED_ #define _MIME_H_INCLUDED_ -/* @(#$Id: mimeparse.h,v 1.2 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: mimeparse.h,v 1.3 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -15,5 +15,6 @@ extern bool parseMimeHeaderValue(const std::string& in, MimeHeaderValue& psd); bool qp_decode(const std::string& in, std::string &out); bool base64_decode(const std::string& in, std::string &out); +bool rfc2047_decode(const std::string& in, std::string &out); #endif /* _MIME_H_INCLUDED_ */