Transcode mh_execm text/plain output like we do for mh_exec. Adjust handling of transcoding errors. These changes should fix most cases of non-UTF-8 text making it to unac/index.

This commit is contained in:
Jean-Francois Dockes 2011-10-20 14:00:38 +02:00
parent d94a4ec315
commit f544b28b4a
7 changed files with 53 additions and 13 deletions

View File

@ -153,17 +153,24 @@ void MimeHandlerExec::finaldetails()
// cfgFilterOutputCharset comes from the mimeconf filter definition line
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
cfgFilterOutputCharset;
bool trustcharset = true;
if (!stringlowercmp("default", charset)) {
charset = m_dfltInputCharset;
trustcharset = false;
}
string mt = cfgFilterOutputMtype.empty() ? "text/html" :
cfgFilterOutputMtype;
if (!mt.compare(cstr_textplain) && stringlowercmp("utf-8", charset)) {
// If this is text/plain and not utf-8 or untrusted, transcode to utf-8.
if (!mt.compare(cstr_textplain) &&
(!trustcharset || stringlowercmp("utf-8", charset))) {
string transcoded;
int ecnt;
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n",
charset.c_str()));
// Erase text in this case: it's garbage
output.clear();
} else {
if (ecnt) {
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",

View File

@ -284,15 +284,37 @@ bool MimeHandlerExecMultiple::next_document()
// Charset. For many document types it doesn't matter. For text
// and html it does. We supply a default from the
// configuration. We should do the text transcoding to utf-8 here
// like exec::finaldetails does.
// configuration.
bool trustcharset = true;
if (charset.empty()) {
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
cfgFilterOutputCharset;
if (!stringlowercmp("default", charset)) {
trustcharset = false;
charset = m_dfltInputCharset;
}
}
string& output = m_metaData[cstr_content];
if (!m_metaData[cstr_mimetype].compare(cstr_textplain) &&
(!trustcharset || stringlowercmp("utf-8", charset))) {
string transcoded;
int ecnt;
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
LOGERR(("mh_execm: transcode failed from [%s] to UTF-8\n",
charset.c_str()));
// Erase text in this case: it's garbage
output.clear();
} else {
if (ecnt) {
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
ecnt, charset.c_str()));
}
output = transcoded;
charset = "utf-8";
}
}
m_metaData[cstr_charset] = charset;
if (eofnext_received)

View File

@ -259,13 +259,13 @@ bool MimeHandlerMail::processAttach()
// Special case for text/plain content. Internfile should deal
// with this but it expects text/plain to be utf-8 already, so we
// handle the transcoding if needed
if (m_metaData[cstr_mimetype] == cstr_textplain &&
stringicmp(m_metaData[cstr_charset], "UTF-8")) {
if (m_metaData[cstr_mimetype] == cstr_textplain) {
string utf8;
if (!transcode(body, utf8, m_metaData[cstr_charset], "UTF-8")) {
LOGERR((" processAttach: transcode to utf-8 failed "
"for charset [%s]\n", m_metaData[cstr_charset].c_str()));
// Just let it through and hope for the best...
// Can't transcode at all -> the data is garbage, so just erase it
body.clear();
} else {
body = utf8;
}

View File

@ -78,7 +78,7 @@ application/x-kword = exec rclkwd
application/x-lyx = exec rcllyx
application/x-mimehtml = internal message/rfc822
application/x-perl = internal text/plain
application/x-rar = execm rclrar
application/x-rar = execm rclrar;charset=default
application/x-scribus = exec rclscribus
application/x-shellscript = internal text/plain
application/x-tex = exec rcltex

View File

@ -116,7 +116,13 @@ bool transcode(const string &in, string &out, const string &icode,
ip++;isiz--;
continue;
}
goto error;
// Normally only EINVAL is possible here: incomplete
// multibyte sequence at the end. This is not fatal. Any
// other errno is supposedly impossible, so we return an error.
if (errno == EINVAL)
goto out;
else
goto error;
}
out.append(obuf, OBSIZ - osiz);
@ -131,9 +137,11 @@ bool transcode(const string &in, string &out, const string &icode,
}
#endif
out:
ret = true;
error:
error:
if (icopen) {
#ifndef ICONV_CACHE_OPEN
iconv_close(ic);

View File

@ -5,9 +5,11 @@ topdir=`dirname $0`/..
initvariables $0
# Should find the file where its unaccented and the other
recollq Anemometre 2> $mystderr |
egrep -v '^Recoll query: ' > $mystdout
# Should find the file where it's unaccented and the other, and also an instance
# in misc.zip
(
recollq Anemometre
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1

View File

@ -1,3 +1,4 @@
2 results
3 results
text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste.txt] [liste.txt] 1182 bytes
text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste1.txt] [liste1.txt] 893 bytes
text/plain [file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip] [misc.zip] 168155 bytes