Transcode mh_execm text/plain output like we do for mh_exec. Adjust handling of transcoding errors. These changes should fix most cases of non-UTF-8 text making it to unac/index.

2011-10-20 14:00:38 +02:00 · 2011-10-20 14:00:38 +02:00 · f544b28b4a
commit f544b28b4a
parent d94a4ec315
7 changed files with 53 additions and 13 deletions
--- a/src/internfile/mh_exec.cpp
+++ b/src/internfile/mh_exec.cpp
@ -153,17 +153,24 @@ void MimeHandlerExec::finaldetails()
    // cfgFilterOutputCharset comes from the mimeconf filter definition line
    string charset = cfgFilterOutputCharset.empty() ? "utf-8" : 
 	cfgFilterOutputCharset;
    bool trustcharset = true;
    if (!stringlowercmp("default", charset)) {
 	charset = m_dfltInputCharset;
 	trustcharset = false;
    }
    string mt = cfgFilterOutputMtype.empty() ? "text/html" : 
 	cfgFilterOutputMtype;
-    if (!mt.compare(cstr_textplain) && stringlowercmp("utf-8", charset)) {
+
    // If this is text/plain and not utf-8 or untrusted, transcode to utf-8.
    if (!mt.compare(cstr_textplain) && 
 	(!trustcharset || stringlowercmp("utf-8", charset))) {
 	string transcoded;
 	int ecnt;
 	if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
 	    LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n",
 		    charset.c_str()));
 	    // Erase text in this case: it's garbage
 	    output.clear();
 	} else {
 	    if (ecnt) {
 		LOGDEB(("mh_exec: %d transcoding errors  from [%s] to UTF-8\n",
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@ -284,15 +284,37 @@ bool MimeHandlerExecMultiple::next_document()
    // Charset. For many document types it doesn't matter. For text
    // and html it does. We supply a default from the
-    // configuration. We should do the text transcoding to utf-8 here
+    // configuration. 
-    // like exec::finaldetails does.
+    bool trustcharset = true;
    if (charset.empty()) {
 	charset = cfgFilterOutputCharset.empty() ? "utf-8" : 
 	    cfgFilterOutputCharset;
 	if (!stringlowercmp("default", charset)) {
 	    trustcharset = false;
 	    charset = m_dfltInputCharset;
 	}
    }
    string& output = m_metaData[cstr_content];
    if (!m_metaData[cstr_mimetype].compare(cstr_textplain) && 
 	(!trustcharset || stringlowercmp("utf-8", charset))) {
 	string transcoded;
 	int ecnt;
 	if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
 	    LOGERR(("mh_execm: transcode failed from [%s] to UTF-8\n",
 		    charset.c_str()));
 	    // Erase text in this case: it's garbage
 	    output.clear();
 	} else {
 	    if (ecnt) {
 		LOGDEB(("mh_exec: %d transcoding errors  from [%s] to UTF-8\n",
 			ecnt, charset.c_str()));
 	    }
 	    output = transcoded;
 	    charset = "utf-8";
 	}
    }
    m_metaData[cstr_charset] = charset;
    if (eofnext_received)
--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@ -259,13 +259,13 @@ bool MimeHandlerMail::processAttach()
    // Special case for text/plain content. Internfile should deal
    // with this but it expects text/plain to be utf-8 already, so we
    // handle the transcoding if needed
-    if (m_metaData[cstr_mimetype] == cstr_textplain && 
+    if (m_metaData[cstr_mimetype] == cstr_textplain) {
 	stringicmp(m_metaData[cstr_charset], "UTF-8")) {
 	string utf8;
 	if (!transcode(body, utf8, m_metaData[cstr_charset], "UTF-8")) {
 	    LOGERR(("  processAttach: transcode to utf-8 failed "
 		    "for charset [%s]\n", m_metaData[cstr_charset].c_str()));
-	    // Just let it through and hope for the best...
+ 	    // Can't transcode at all -> data is garbage, just erase it
 	    body.clear();
 	} else {
 	    body = utf8;
 	}
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -78,7 +78,7 @@ application/x-kword = exec rclkwd
 application/x-lyx = exec rcllyx
 application/x-mimehtml = internal message/rfc822
 application/x-perl = internal text/plain
-application/x-rar = execm rclrar
+application/x-rar = execm rclrar;charset=default
 application/x-scribus = exec rclscribus
 application/x-shellscript = internal text/plain
 application/x-tex = exec rcltex
--- a/src/utils/transcode.cpp
+++ b/src/utils/transcode.cpp
@ -116,7 +116,13 @@ bool transcode(const string &in, string &out, const string &icode,
 		ip++;isiz--;
 		continue;
 	    }
-	    goto error;
+	    // Normally only EINVAL is possible here: an incomplete
 	    // multibyte sequence at the end of the input. This is not fatal.
 	    // Any other errno is supposedly impossible, so we return an error
 	    if (errno == EINVAL)
 		goto out;
 	    else
 		goto error;
 	}
 	out.append(obuf, OBSIZ - osiz);
@ -131,9 +137,11 @@ bool transcode(const string &in, string &out, const string &icode,
    }
 #endif
 out:
    ret = true;
- error:
+error:
    if (icopen) {
 #ifndef ICONV_CACHE_OPEN
 	iconv_close(ic);
--- a/tests/txt/txt.sh
+++ b/tests/txt/txt.sh
@ -5,9 +5,11 @@ topdir=`dirname $0`/..
 initvariables $0
-# Should find the file where its unaccented and the other
+# Should find the file where it's unaccented, the other one, and also an instance
-recollq Anemometre 2> $mystderr | 
+# in misc.zip
-	egrep -v '^Recoll query: ' > $mystdout
+(
    recollq Anemometre 
 ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
--- a/tests/txt/txt.txt
+++ b/tests/txt/txt.txt
@ -1,3 +1,4 @@
-2 results
+3 results
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/txt/liste.txt]	[liste.txt]	1182	bytes	
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/txt/liste1.txt]	[liste1.txt]	893	bytes	
 text/plain	[file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip]	[misc.zip]	168155	bytes