decode encoded mail headers, plus use message date instead of file mtime

2005-10-15 12:18:04 +00:00 · 2005-10-15 12:18:04 +00:00 · 763b5f58c7
commit 763b5f58c7
parent 8493933aef
4 changed files with 189 additions and 18 deletions
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: indexer.cpp,v 1.11 2005-04-06 10:20:11 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: indexer.cpp,v 1.12 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -172,10 +172,14 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
 	if (fis == FileInterner::FIError)
 	    break;
-	// Set up common fields:
+	// Set the date if this was not done in the document handler
-	char ascdate[20];
+	// (ie: date from Date: mail header).
-	sprintf(ascdate, "%ld", long(stp->st_ctime));
+	if (doc.mtime.empty()) {
-	doc.mtime = ascdate;
+	    char ascdate[20];
 	    sprintf(ascdate, "%ld", long(stp->st_ctime));
 	    doc.mtime = ascdate;
 	}
 	// Internal access path for multi-document files
 	doc.ipath = ipath;
 	// Do database-specific work to update document data
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.5 2005-04-06 10:20:11 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.6 2005-10-15 12:18:04 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 #include <stdio.h>
@ -172,30 +172,42 @@ MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc,
 	return MimeHandler::MHError;
    }
-    // Handle some headers. We should process rfc2047 encoding here
+    // Handle some headers. 
    // Also there should be no 8bit chars, but there sometimes are. So
    // we transcode as if from iso-8859-1, which is better than
    // getting utf8 conversion errors later on
    Binc::HeaderItem hi;
    string transcoded;
    if (doc.h.getFirstHeader("Subject", hi)) {
-	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	rfc2047_decode(hi.getValue(), transcoded);
 	docout.title = transcoded;
    }
    if (doc.h.getFirstHeader("From", hi)) {
-	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	rfc2047_decode(hi.getValue(), transcoded);
 	docout.text += string("From: ") + transcoded + string("\n");
    }
    if (doc.h.getFirstHeader("To", hi)) {
-	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	rfc2047_decode(hi.getValue(), transcoded);
 	docout.text += string("To: ") + transcoded + string("\n");
    }
    if (doc.h.getFirstHeader("Date", hi)) {
-	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	rfc2047_decode(hi.getValue(), transcoded);
 	// Try to set the mtime from the date field.
 	string date = transcoded;
 	string::size_type pos;
 	// Possibly get rid of the day
 	if ((pos = date.find(",")) != string::npos)
 	    date = date.substr(pos+1);
 	struct tm tm;
 	if (strptime(date.c_str(), " %d %b %Y %H:%M:%S %z ", &tm)) {
 	    char ascuxtime[100];
 	    sprintf(ascuxtime, "%ld", (long)mktime(&tm));
 	    docout.mtime = ascuxtime;
 	} else {
 	    LOGDEB(("strptime failed for [%s]\n", date.c_str()));
 	}
 	docout.text += string("Date: ") + transcoded + string("\n");
    }
    if (doc.h.getFirstHeader("Subject", hi)) {
-	transcode(hi.getValue(), transcoded, "iso-8859-1", "UTF-8");
+	rfc2047_decode(hi.getValue(), transcoded);
 	docout.text += string("Subject: ") + transcoded + string("\n");
    }
--- a/src/utils/mimeparse.cpp
+++ b/src/utils/mimeparse.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.3 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.4 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_MIMEPARSE
@ -348,6 +348,148 @@ bool base64_decode(const string& in, string& out)
    return true;
 }
 #include "transcode.h"
 #include "smallut.h"
 // Decode a parsed encoded word
 static bool rfc2047_decodeParsed(const std::string& charset, 
 				 const std::string& encoding, 
 				 const std::string& value, 
 				 std::string &utf8)
 {
    //    fprintf(stderr, "DecodeParsed: charset [%s] enc [%s] val [%s]\n",
    //	    charset.c_str(), encoding.c_str(), value.c_str());
    utf8 = "";
    string decoded;
    if (!stringlowercmp("b", encoding)) {
 	if (!base64_decode(value, decoded))
 	    return false;
 	//	fprintf(stderr, "FromB64: [%s]\n", decoded.c_str());
    } else if (!stringlowercmp("q", encoding)) {
 	if (!qp_decode(value, decoded))
 	    return false;
 	// Need to translate _ to ' ' here
 	string temp;
 	for (string::size_type pos = 0; pos < decoded.length(); pos++)
 	    if (decoded[pos] == '_')
 		temp += ' ';
 	    else 
 		temp += decoded[pos];
 	decoded = temp;
 	//	fprintf(stderr, "FromQP: [%s]\n", decoded.c_str());
    } else {
 	//	fprintf(stderr, "Bad encoding [%s]\n", encoding.c_str());
 	return false;
    }
    if (!transcode(decoded, utf8, charset, "UTF-8")) {
 	//	fprintf(stderr, "Transcode failed\n");
 	return false;
    }
    return true;
 }
 // Parse a mail header encoded value
 typedef enum  {rfc2047base, rfc2047open_eq, rfc2047charset, rfc2047encoding, 
 	       rfc2047value, rfc2047close_q} Rfc2047States;
 bool rfc2047_decode(const std::string& in, std::string &out) 
 {
    Rfc2047States state = rfc2047base;
    string encoding, charset, value, utf8;
    out = "";
    for (unsigned int ii = 0; ii < in.length(); ii++) {
 	char ch = in[ii];
 	switch (state) {
 	case rfc2047base: 
 	    {
 		switch (ch) {
 		case '=': state = rfc2047open_eq; break;
 		default: value += ch;
 		}
 	    }
 	    break;
 	case rfc2047open_eq: 
 	    {
 		switch (ch) {
 		case '?': 
 		    {
 			// Transcode current (unencoded part) value:
 			// we sometimes find 8-bit chars in
 			// there. Interpret as Iso8859.
 			if (value.length() > 0) {
 			    transcode(value, utf8, "ISO8859-1", "UTF-8");
 			    out += utf8;
 			    value = "";
 			}
 			state = rfc2047charset; 
 		    }
 		    break;
 		default: state = rfc2047base; out += '='; out += ch;break;
 		}
 	    } 
 	    break;
 	case rfc2047charset: 
 	    {
 		switch (ch) {
 		case '?': state = rfc2047encoding; break;
 		default: charset += ch; break;
 		}
 	    } 
 	    break;
 	case rfc2047encoding: 
 	    {
 		switch (ch) {
 		case '?': state = rfc2047value; break;
 		default: encoding += ch; break;
 		}
 	    }
 	    break;
 	case rfc2047value: 
 	    {
 		switch (ch) {
 		case '?': state = rfc2047close_q; break;
 		default: value += ch;break;
 		}
 	    }
 	    break;
 	case rfc2047close_q: 
 	    {
 		switch (ch) {
 		case '=': 
 		    {
 			string utf8;
 			state = rfc2047base; 
 			if (!rfc2047_decodeParsed(charset, encoding, value, 
 						  utf8)) {
 			    return false;
 			}
 			out += utf8;
 			charset = encoding = value = "";
 		    }
 		    break;
 		default: state = rfc2047value; value += '?';value += ch;break;
 		}
 	    }
 	    break;
 	default: // ??
 	    return false;
 	}
    }
    if (value.length() > 0) {
 	transcode(value, utf8, "ISO8859-1", "UTF-8");
 	out += utf8;
 	value = "";
    }
    if (state != rfc2047base) 
 	return false;
    return true;
 }
 #else 
 #include <string>
@ -382,7 +524,7 @@ main(int argc, const char **argv)
 	fprintf(stderr, "qp_decode returned error\n");
    }
    printf("Decoded: '%s'\n", out.c_str());
-#else
+#elif 0
    //'C'est à boire qu'il nous faut éviter l'excès.'
    //'Deuxième ligne'
    //'Troisième ligne'
@ -396,6 +538,18 @@ main(int argc, const char **argv)
 	fprintf(stderr, "base64_decode returned error\n");
    }
    printf("Decoded: '%s'\n", out.c_str());
 #elif 1
    char line [1024];
    string out;
    while (fgets(line, 1023, stdin)) {
 	int l = strlen(line);
 	if (l == 0)
 	    continue;
 	line[l-1] = 0;
 	fprintf(stderr, "Line: [%s]\n", line);
 	rfc2047_decode(line, out);
 	fprintf(stderr, "Out:  [%s]\n", out.c_str());
    }
 #endif
 }
--- a/src/utils/mimeparse.h
+++ b/src/utils/mimeparse.h
@ -1,6 +1,6 @@
 #ifndef _MIME_H_INCLUDED_
 #define _MIME_H_INCLUDED_
-/* @(#$Id: mimeparse.h,v 1.2 2005-03-25 09:40:28 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: mimeparse.h,v 1.3 2005-10-15 12:18:04 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <map>
@ -15,5 +15,6 @@ extern bool parseMimeHeaderValue(const std::string& in, MimeHeaderValue& psd);
 // Decoders for mail data. Each one reads 'in', stores its result in 'out'
 // and returns false on error.
 // qp_decode: presumably quoted-printable input (the body is not in this
 // header; confirm against mimeparse.cpp)
 bool qp_decode(const std::string& in, std::string &out);
 // base64_decode: presumably base64 input (confirm against mimeparse.cpp)
 bool base64_decode(const std::string& in, std::string &out);
 // rfc2047_decode: decode a mail header value which may contain encoded
 // words ("=?charset?encoding?text?="). 'out' receives UTF-8 text. Returns
 // false if an encoded word can't be decoded or is not terminated.
 bool rfc2047_decode(const std::string& in, std::string &out);
 #endif /* _MIME_H_INCLUDED_ */