From 2fc294a9c6b5ee7b1bb609102b42eb41ad2a72f0 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Sat, 6 Oct 2012 12:14:04 +0200
Subject: [PATCH] factored out common charset handling code in exec and execm,
 cleaned up charset and textplain handling in mh_mail

---
 src/common/cstr.h              |  1 +
 src/internfile/mh_exec.cpp     | 41 ++++++++++++---------
 src/internfile/mh_exec.h       |  7 ++++
 src/internfile/mh_execm.cpp    | 24 ++++---------
 src/internfile/mh_html.cpp     |  2 +-
 src/internfile/mh_mail.cpp     | 65 ++++++++++++++++++----------------
 src/internfile/mimehandler.cpp |  3 +-
 src/internfile/mimehandler.h   |  1 +
 src/sampleconf/mimeconf        | 10 +++---
 9 files changed, 82 insertions(+), 72 deletions(-)

diff --git a/src/common/cstr.h b/src/common/cstr.h
index 641d1033..0b1c5cf6 100644
--- a/src/common/cstr.h
+++ b/src/common/cstr.h
@@ -48,6 +48,7 @@ DEF_CSTR(fbytes, "fbytes");
 DEF_CSTR(fileu, "file://");
 DEF_CSTR(fmtime, "fmtime");
 DEF_CSTR(iso_8859_1, "ISO-8859-1");
+DEF_CSTR(utf8, "UTF-8");
 DEF_CSTR(minwilds, "*?[");
 DEF_CSTR(newline, "\n");
 DEF_CSTR(null, "");
diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp
index 55d5644b..2b0fc03b 100644
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@@ -143,30 +143,37 @@ bool MimeHandlerExec::next_document()
     return true;
 }
 
-void MimeHandlerExec::finaldetails()
+void MimeHandlerExec::handle_cs(const string& mt, const string& icharset)
 {
-    m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
+    string charset(icharset);
 
     // cfgFilterOutputCharset comes from the mimeconf filter
-    // definition line If the value is "default", we use the charset
-    // value defined in recoll.conf (which may vary depending on
-    // directory)
-    string& charset = m_metaData[cstr_dj_keycharset];
-    charset = cfgFilterOutputCharset.empty() ? "UTF-8" : cfgFilterOutputCharset;
-    if (!stringlowercmp("default", charset)) {
-	charset = m_dfltInputCharset;
+    // definition line and defaults to UTF-8 if empty. If the value is
+    // "default", we use the default input charset value defined in
+    // recoll.conf (which may vary depending on directory)
+    if (charset.empty()) {
+	charset = cfgFilterOutputCharset.empty() ? cstr_utf8 : 
+	    cfgFilterOutputCharset;
+	if (!stringlowercmp("default", charset)) {
+	    charset = m_dfltInputCharset;
+	}
     }
-
-    // The output mime type is html except if defined otherwise in the filter
-    // definition.
-    string& mt = m_metaData[cstr_dj_keymt];
-    mt = cfgFilterOutputMtype.empty() ? "text/html" : 
-	cfgFilterOutputMtype;
+    m_metaData[cstr_dj_keyorigcharset] = charset;
 
     // If this is text/plain transcode_to/check utf-8
     if (!mt.compare(cstr_textplain)) {
-	(void)txtdcode("mh_exec");
+	(void)txtdcode("mh_exec/m");
+    } else {
+	m_metaData[cstr_dj_keycharset] = charset;
     }
+}
+
+void MimeHandlerExec::finaldetails()
+{
+    // The default output mime type is html, but it may be defined
+    // otherwise in the filter definition.
+    m_metaData[cstr_dj_keymt] = cfgFilterOutputMtype.empty() ? "text/html" : 
+	cfgFilterOutputMtype;
 
     string md5, xmd5, reason;
     if (MD5File(m_fn, md5, &reason)) {
@@ -175,4 +182,6 @@ void MimeHandlerExec::finaldetails()
 	LOGERR(("MimeHandlerExec: cant compute md5 for [%s]: %s\n", 
 		m_fn.c_str(), reason.c_str()));
     }
+
+    handle_cs(m_metaData[cstr_dj_keymt]);
 }
diff --git a/src/internfile/mh_exec.h b/src/internfile/mh_exec.h
index 41ba6b78..4c5a2654 100644
--- a/src/internfile/mh_exec.h
+++ b/src/internfile/mh_exec.h
@@ -77,6 +77,13 @@ protected:
     string m_fn;
     string m_ipath;
 
+    // Set up the character set metadata fields and possibly transcode
+    // text/plain output. @param mt the document output MIME type.
+    // @param charset when called from mh_execm, a possible explicit
+    //       value from the filter (else the data will come from the config)
+    virtual void handle_cs(const string& mt, const string& charset = string());
+
+private:
     virtual void finaldetails();
 };
 
diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp
index de0badf9..2b9f3846 100644
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@@ -120,7 +120,7 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
                 ibuf.c_str()));
         return false;
     }
-    LOGDEB1(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
+    LOGDEB(("MHExecMultiple: got name [%s] len: %d\n", name.c_str(), len));
     if (len / 1024 > m_maxmemberkb) {
         LOGERR(("MHExecMultiple: data len > maxmemberkb\n"));
         return false;
@@ -290,27 +290,15 @@ bool MimeHandlerExecMultiple::next_document()
         }
     }
 
-    // Charset. For many document types it doesn't matter. For text
-    // and html it does. We supply a default from the configuration. 
-    if (charset.empty()) {
-	charset = cfgFilterOutputCharset.empty() ? "utf-8" : 
-	    cfgFilterOutputCharset;
-	if (!stringlowercmp("default", charset)) {
-	    charset = m_dfltInputCharset;
-	}
-    }
-    m_metaData[cstr_dj_keyorigcharset] = charset;
-    m_metaData[cstr_dj_keycharset] = charset;
+    handle_cs(m_metaData[cstr_dj_keymt], charset);
 
-    if (!m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
-	(void)txtdcode("mh_execm");
-    }
-    
     if (eofnext_received)
         m_havedoc = false;
 
     LOGDEB0(("MHExecMultiple: returning %d bytes of content,"
-	    " mtype [%s] charset [%s]\n", m_metaData[cstr_dj_keycontent].size(), 
-     m_metaData[cstr_dj_keymt].c_str(), m_metaData[cstr_dj_keycharset].c_str()));
+	    " mtype [%s] charset [%s]\n", 
+	     m_metaData[cstr_dj_keycontent].size(), 
+	     m_metaData[cstr_dj_keymt].c_str(), 
+	     m_metaData[cstr_dj_keycharset].c_str()));
     return true;
 }
diff --git a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
index 31615397..5a9de050 100644
--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@@ -164,7 +164,7 @@ bool MimeHandlerHtml::next_document()
 
     m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
     m_metaData[cstr_dj_keycontent] = result.dump;
-    m_metaData[cstr_dj_keycharset] = "utf-8";
+    m_metaData[cstr_dj_keycharset] = cstr_utf8;
     // Avoid setting empty values which would crush ones possibly inherited
     // from parent (if we're an attachment)
     if (!result.dmtime.empty())
diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp
index 1ac0890a..7e177c3a 100644
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@@ -242,39 +242,27 @@ bool MimeHandlerMail::processAttach()
     MHMailAttach *att = m_attachments[m_idx];
 
     m_metaData[cstr_dj_keymt] = att->m_contentType;
+    m_metaData[cstr_dj_keyorigcharset] = att->m_charset;
     m_metaData[cstr_dj_keycharset] = att->m_charset;
     m_metaData[cstr_dj_keyfn] = att->m_filename;
-    // Change the title to something helpul
     m_metaData[cstr_dj_keytitle] = att->m_filename + "  (" + m_subject + ")";
     LOGDEB1(("  processAttach:ct [%s] cs [%s] fn [%s]\n", 
 	    att->m_contentType.c_str(),
 	    att->m_charset.c_str(),
 	    att->m_filename.c_str()));
 
+    // Erase current content and replace
     m_metaData[cstr_dj_keycontent] = string();
     string& body = m_metaData[cstr_dj_keycontent];
     att->m_part->getBody(body, 0, att->m_part->bodylength);
-    string decoded;
-    const string *bdp;
-    if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
-	return false;
-    }
-    if (bdp != &body)
-	body = decoded;
-
-    // Special case for text/plain content. Internfile should deal
-    // with this but it expects text/plain to be utf-8 already, so we
-    // handle the transcoding if needed
-    if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
-	string utf8;
-	if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], "UTF-8")) {
-	    LOGERR(("  processAttach: transcode to utf-8 failed "
-		    "for charset [%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
- 	    // can't transcode at all -> data is garbage just erase it
- 	    body.clear();
-	} else {
-	    body = utf8;
+    {
+	string decoded;
+	const string *bdp;
+	if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
+	    return false;
 	}
+	if (bdp != &body)
+	    body.swap(decoded);
     }
 
     // Special case for application/octet-stream: try to better
@@ -287,6 +275,22 @@ bool MimeHandlerMail::processAttach()
 	    m_metaData[cstr_dj_keymt] = mt;
     }
 
+    // Special case for text/plain content. Internfile should deal
+    // with this but it expects text/plain to be utf-8 already, so we
+    // handle the transcoding if needed
+    if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
+	string utf8;
+	if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], cstr_utf8)) {
+	    LOGERR(("  processAttach: transcode to utf-8 failed for charset "
+		    "[%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
+ 	    // can't transcode at all -> data is garbage just erase it
+ 	    body.clear();
+	} else {
+	    m_metaData[cstr_dj_keycharset] = cstr_utf8;
+	    body.swap(utf8);
+	}
+    }
+
     // Ipath
     char nbuf[20];
     sprintf(nbuf, "%d", m_idx);
@@ -527,11 +531,13 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 
     // "Simple" part. 
     LOGDEB2(("walkmime: simple  part\n"));
-    // Normally the default charset is us-ascii. But it happens that
-    // 8 bit chars exist in a message that is stated as us-ascii. Ie the 
-    // mailer used by yahoo support ('KANA') does this. We could convert 
-    // to iso-8859 only if the transfer-encoding is 8 bit, or test for
-    // actual 8 bit chars, but what the heck, le'ts use 8859-1 as default
+    // Normally the default charset is us-ascii. But it happens that 8
+    // bit chars exist in a message that is stated as us-ascii. E.g. the
+    // mailer used by yahoo support ('KANA') does this. We could
+    // convert to iso-8859 only if the transfer-encoding is 8 bit, or
+    // test for actual 8 bit chars, but what the heck, let's use
+    // 8859-1 (actually CP1252 which is compatible, but with more
+    // useful chars) as default.
     string charset;
     it = content_type.params.find(cstr_mail_charset);
     if (it != content_type.params.end())
@@ -544,7 +550,7 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 	!stringlowercmp("unknown", charset) ) {
         m_config->getConfParam("maildefcharset", charset);
         if (charset.empty())
-            charset = "iso-8859-1";
+            charset = "CP1252";
     }
 
     // Content transfer encoding
@@ -609,8 +615,6 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 	body = decoded;
 
     // Handle html stripping and transcoding to utf8
-    string utf8;
-    const string *putf8 = 0;
     if (!stringlowercmp("text/html", content_type.value)) {
 	MimeHandlerHtml mh(m_config, "text/html");
 	mh.set_property(Dijon::Filter::OPERATING_MODE, 
@@ -623,9 +627,10 @@ void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
 	if (it != mh.get_meta_data().end())
 	    out += it->second;
     } else {
+	string utf8;
 	// Transcode to utf-8 
 	LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
-	if (!transcode(body, utf8, charset, "UTF-8")) {
+	if (!transcode(body, utf8, charset, cstr_utf8)) {
 	    LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
 		    charset.c_str()));
 	    out += body;
diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp
index 83c464c7..55b2f498 100644
--- a/src/internfile/mimehandler.cpp
+++ b/src/internfile/mimehandler.cpp
@@ -288,8 +288,7 @@ Dijon::Filter *getMimeHandler(const string &mtype, RclConfig *cfg,
 
 out:
     if (h) {
-	string charset = cfg->getDefCharset();
-	h->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
+	h->set_property(Dijon::Filter::DEFAULT_CHARSET, cfg->getDefCharset());
     }
     return h;
 }
diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h
index 301815f6..4124f587 100644
--- a/src/internfile/mimehandler.h
+++ b/src/internfile/mimehandler.h
@@ -109,6 +109,7 @@ public:
     }
 
     // This only makes sense if the contents are currently txt/plain
+    // It converts from keyorigcharset to UTF-8 and sets keycharset.
     bool txtdcode(const string& who);
 
 protected:
diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf
index d0011916..b9b536e5 100644
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@@ -40,7 +40,7 @@ application/x-lzma = uncompress rcluncomp unxz %f %t
 # The default is now again to use rcldoc. Use raw antiword if speed is more
 # important for you than catching all data, 
 application/msword = exec rcldoc
-#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain;charset=utf-8
+#application/msword = exec antiword -t -i 1 -m UTF-8;mimetype=text/plain
 # You can also use wvware directly but it's much slower.
 # application/msword = exec wvWare --charset=utf-8 --nographics
 
@@ -52,8 +52,8 @@ application/vnd.ms-office = exec rcldoc
 application/ogg = execm rclaudio
 application/pdf = exec rclpdf
 application/postscript = exec pstotext;charset=iso-8859-1;mimetype=text/plain
-application/vnd.ms-excel = exec xls2csv -c "	" -d utf-8;charset=utf-8;mimetype=text/plain
-application/vnd.ms-powerpoint = exec catppt -d utf-8;charset=utf-8;mimetype=text/plain
+application/vnd.ms-excel = exec xls2csv -c "	" -d utf-8;mimetype=text/plain
+application/vnd.ms-powerpoint = exec catppt -d utf-8;mimetype=text/plain
 application/vn.oasis.opendocument.txt = exec rclsoff
 application/vnd.openxmlformats-officedocument.wordprocessingml.document = \
  exec rclopxml
@@ -81,7 +81,7 @@ application/vnd.wordperfect = exec wpd2html;mimetype=text/html
 application/x-abiword = exec rclabw
 application/x-awk = internal text/plain
 application/x-chm = execm rclchm
-application/x-dia-diagram = execm rcldia;mimetype=text/plain;charset=utf-8
+application/x-dia-diagram = execm rcldia;mimetype=text/plain
 application/x-dvi = exec rcldvi
 application/x-flac = execm rclaudio
 application/x-gnuinfo = execm rclinfo
@@ -109,7 +109,7 @@ image/vnd.djvu = exec rcldjvu
 image/svg+xml = exec rclsvg
 image/x-xcf = execm rclimg
 message/rfc822 = internal
-text/calendar = execm rclics;mimetype=text/plain;charset=utf-8
+text/calendar = execm rclics;mimetype=text/plain
 text/html  = internal 
 text/plain = internal 
 text/rtf = exec unrtf --nopict --html;mimetype=text/html