diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index 81cbc8cf..d0c321b3 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -153,17 +153,24 @@ void MimeHandlerExec::finaldetails() // cfgFilterOutputCharset comes from the mimeconf filter definition line string charset = cfgFilterOutputCharset.empty() ? "utf-8" : cfgFilterOutputCharset; + bool trustcharset = true; if (!stringlowercmp("default", charset)) { charset = m_dfltInputCharset; + trustcharset = false; } string mt = cfgFilterOutputMtype.empty() ? "text/html" : cfgFilterOutputMtype; - if (!mt.compare(cstr_textplain) && stringlowercmp("utf-8", charset)) { + + // If this is text/plain and not utf-8 or untrusted, transcode to utf-8. + if (!mt.compare(cstr_textplain) && + (!trustcharset || stringlowercmp("utf-8", charset))) { string transcoded; int ecnt; if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) { LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n", charset.c_str())); + // Erase text in this case: it's garbage + output.clear(); } else { if (ecnt) { LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n", diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index 7bf89042..9b2899c9 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -284,15 +284,37 @@ bool MimeHandlerExecMultiple::next_document() // Charset. For many document types it doesn't matter. For text // and html it does. We supply a default from the - // configuration. We should do the text transcoding to utf-8 here - // like exec::finaldetails does. + // configuration. + bool trustcharset = true; if (charset.empty()) { charset = cfgFilterOutputCharset.empty() ? 
"utf-8" : cfgFilterOutputCharset; if (!stringlowercmp("default", charset)) { + trustcharset = false; charset = m_dfltInputCharset; } } + + string& output = m_metaData[cstr_content]; + if (!m_metaData[cstr_mimetype].compare(cstr_textplain) && + (!trustcharset || stringlowercmp("utf-8", charset))) { + string transcoded; + int ecnt; + if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) { + LOGERR(("mh_execm: transcode failed from [%s] to UTF-8\n", + charset.c_str())); + // Erase text in this case: it's garbage + output.clear(); + } else { + if (ecnt) { + LOGDEB(("mh_execm: %d transcoding errors from [%s] to UTF-8\n", + ecnt, charset.c_str())); + } + output = transcoded; + charset = "utf-8"; + } + } + m_metaData[cstr_charset] = charset; if (eofnext_received) diff --git a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp index e0b8228e..6520e768 100644 --- a/src/internfile/mh_mail.cpp +++ b/src/internfile/mh_mail.cpp @@ -259,13 +259,13 @@ bool MimeHandlerMail::processAttach() // Special case for text/plain content. Internfile should deal // with this but it expects text/plain to be utf-8 already, so we // handle the transcoding if needed - if (m_metaData[cstr_mimetype] == cstr_textplain && - stringicmp(m_metaData[cstr_charset], "UTF-8")) { + if (m_metaData[cstr_mimetype] == cstr_textplain) { string utf8; if (!transcode(body, utf8, m_metaData[cstr_charset], "UTF-8")) { LOGERR((" processAttach: transcode to utf-8 failed " "for charset [%s]\n", m_metaData[cstr_charset].c_str())); - // Just let it through and hope for the best... 
+ // Can't transcode at all: the data is garbage, so just erase it + body.clear(); } else { body = utf8; } diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index a9a21aef..1412af5d 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -78,7 +78,7 @@ application/x-kword = exec rclkwd application/x-lyx = exec rcllyx application/x-mimehtml = internal message/rfc822 application/x-perl = internal text/plain -application/x-rar = execm rclrar +application/x-rar = execm rclrar;charset=default application/x-scribus = exec rclscribus application/x-shellscript = internal text/plain application/x-tex = exec rcltex diff --git a/src/utils/transcode.cpp b/src/utils/transcode.cpp index 35ddd7fd..aec847d7 100644 --- a/src/utils/transcode.cpp +++ b/src/utils/transcode.cpp @@ -116,7 +116,13 @@ bool transcode(const string &in, string &out, const string &icode, ip++;isiz--; continue; } - goto error; + // Normally only EINVAL is possible here: incomplete + // multibyte sequence at the end. This is not fatal. Any + // other errno is supposedly impossible, so we return an error + if (errno == EINVAL) + goto out; + else + goto error; } out.append(obuf, OBSIZ - osiz); @@ -131,9 +137,11 @@ bool transcode(const string &in, string &out, const string &icode, } #endif +out: ret = true; - error: +error: + if (icopen) { #ifndef ICONV_CACHE_OPEN iconv_close(ic); diff --git a/tests/txt/txt.sh b/tests/txt/txt.sh index 2b2ed29e..e1140920 100755 --- a/tests/txt/txt.sh +++ b/tests/txt/txt.sh @@ -5,9 +5,11 @@ topdir=`dirname $0`/.. 
initvariables $0 -# Should find the file where its unaccented and the other -recollq Anemometre 2> $mystderr | - egrep -v '^Recoll query: ' > $mystdout +# Should find the file where it's unaccented and the other and also an instance +# in misc.zip +( + recollq Anemometre +) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 diff --git a/tests/txt/txt.txt b/tests/txt/txt.txt index 188294ac..31cd9b51 100644 --- a/tests/txt/txt.txt +++ b/tests/txt/txt.txt @@ -1,3 +1,4 @@ -2 results +3 results text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste.txt] [liste.txt] 1182 bytes text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste1.txt] [liste1.txt] 893 bytes +text/plain [file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip] [misc.zip] 168155 bytes