Transcode mh_execm text/plain output like we do for mh_exec. Adjust handling of transcoding errors. These changes should fix most cases of non-UTF-8 text making it to unac/index.

2011-10-20 14:00:38 +02:00 · 2011-10-20 14:00:38 +02:00 · f544b28b4a
commit f544b28b4a
parent d94a4ec315
7 changed files with 53 additions and 13 deletions
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@ -153,17 +153,24 @@ void MimeHandlerExec::finaldetails()
    // cfgFilterOutputCharset comes from the mimeconf filter definition line
    string charset = cfgFilterOutputCharset.empty() ? "utf-8" : 
 	cfgFilterOutputCharset;
+    bool trustcharset = true;
    if (!stringlowercmp("default", charset)) {
 	charset = m_dfltInputCharset;
+	trustcharset = false;
    }
    string mt = cfgFilterOutputMtype.empty() ? "text/html" : 
 	cfgFilterOutputMtype;
-    if (!mt.compare(cstr_textplain) && stringlowercmp("utf-8", charset)) {
+
+    // If this is text/plain and not utf-8 or untrusted, transcode to utf-8.
+    if (!mt.compare(cstr_textplain) && 
+	(!trustcharset || stringlowercmp("utf-8", charset))) {
 	string transcoded;
 	int ecnt;
 	if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
 	    LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n",
 		    charset.c_str()));
+	    // Erase text in this case: it's garbage
+	    output.clear();
 	} else {
 	    if (ecnt) {
 		LOGDEB(("mh_exec: %d transcoding errors  from [%s] to UTF-8\n",
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@ -284,15 +284,37 @@ bool MimeHandlerExecMultiple::next_document()

    // Charset. For many document types it doesn't matter. For text
    // and html it does. We supply a default from the
-    // configuration. We should do the text transcoding to utf-8 here
-    // like exec::finaldetails does.
+    // configuration. 
+    bool trustcharset = true;
    if (charset.empty()) {
 	charset = cfgFilterOutputCharset.empty() ? "utf-8" : 
 	    cfgFilterOutputCharset;
 	if (!stringlowercmp("default", charset)) {
+	    trustcharset = false;
 	    charset = m_dfltInputCharset;
 	}
    }
+
+    string& output = m_metaData[cstr_content];
+    if (!m_metaData[cstr_mimetype].compare(cstr_textplain) && 
+	(!trustcharset || stringlowercmp("utf-8", charset))) {
+	string transcoded;
+	int ecnt;
+	if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
+	    LOGERR(("mh_execm: transcode failed from [%s] to UTF-8\n",
+		    charset.c_str()));
+	    // Erase text in this case: it's garbage
+	    output.clear();
+	} else {
+	    if (ecnt) {
+		LOGDEB(("mh_exec: %d transcoding errors  from [%s] to UTF-8\n",
+			ecnt, charset.c_str()));
+	    }
+	    output = transcoded;
+	    charset = "utf-8";
+	}
+    }
+    
    m_metaData[cstr_charset] = charset;
    
    if (eofnext_received)
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -259,13 +259,13 @@ bool MimeHandlerMail::processAttach()
    // Special case for text/plain content. Internfile should deal
    // with this but it expects text/plain to be utf-8 already, so we
    // handle the transcoding if needed
-    if (m_metaData[cstr_mimetype] == cstr_textplain && 
-	stringicmp(m_metaData[cstr_charset], "UTF-8")) {
+    if (m_metaData[cstr_mimetype] == cstr_textplain) {
 	string utf8;
 	if (!transcode(body, utf8, m_metaData[cstr_charset], "UTF-8")) {
 	    LOGERR(("  processAttach: transcode to utf-8 failed "
 		    "for charset [%s]\n", m_metaData[cstr_charset].c_str()));
-	    // Just let it through and hope for the best...
+ 	    // Can't transcode at all -> data is garbage, just erase it
+ 	    body.clear();
 	} else {
 	    body = utf8;
 	}
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -78,7 +78,7 @@ application/x-kword = exec rclkwd
 application/x-lyx = exec rcllyx
 application/x-mimehtml = internal message/rfc822
 application/x-perl = internal text/plain
-application/x-rar = execm rclrar
+application/x-rar = execm rclrar;charset=default
 application/x-scribus = exec rclscribus
 application/x-shellscript = internal text/plain
 application/x-tex = exec rcltex
--- a/src/utils/transcode.cpp
+++ b/src/utils/transcode.cpp
@ -116,7 +116,13 @@ bool transcode(const string &in, string &out, const string &icode,
 		ip++;isiz--;
 		continue;
 	    }
-	    goto error;
+	    // Normally only EINVAL is possible here (incomplete multibyte
+	    // sequence at the end of input), which is not fatal. Any other
+	    // errno is supposedly impossible, so we return an error.
+	    if (errno == EINVAL)
+		goto out;
+	    else
+		goto error;
 	}

 	out.append(obuf, OBSIZ - osiz);
@ -131,9 +137,11 @@ bool transcode(const string &in, string &out, const string &icode,
    }
 #endif

+out:
    ret = true;

- error:
+error:
+
    if (icopen) {
 #ifndef ICONV_CACHE_OPEN
 	iconv_close(ic);
--- a/tests/txt/txt.sh
+++ b/tests/txt/txt.sh
@ -5,9 +5,11 @@ topdir=`dirname $0`/..

 initvariables $0

-# Should find the file where its unaccented and the other
-recollq Anemometre 2> $mystderr | 
-	egrep -v '^Recoll query: ' > $mystdout
+# Should find the file where it's unaccented, the other one, and also an
+# instance in misc.zip
+(
+    recollq Anemometre 
+) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout

 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1

--- a/tests/txt/txt.txt
+++ b/tests/txt/txt.txt
@ -1,3 +1,4 @@
-2 results
+3 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/txt/liste.txt]	[liste.txt]	1182	bytes	
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/txt/liste1.txt]	[liste1.txt]	893	bytes	
+text/plain	[file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip]	[misc.zip]	168155	bytes