Transcode mh_execm text/plain output as we already do for mh_exec. Adjust the handling of transcoding errors. These changes should fix most cases of non-UTF-8 text making it to unac/index.
This commit is contained in:
parent
d94a4ec315
commit
f544b28b4a
@ -153,17 +153,24 @@ void MimeHandlerExec::finaldetails()
|
||||
// cfgFilterOutputCharset comes from the mimeconf filter definition line
|
||||
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
||||
cfgFilterOutputCharset;
|
||||
bool trustcharset = true;
|
||||
if (!stringlowercmp("default", charset)) {
|
||||
charset = m_dfltInputCharset;
|
||||
trustcharset = false;
|
||||
}
|
||||
string mt = cfgFilterOutputMtype.empty() ? "text/html" :
|
||||
cfgFilterOutputMtype;
|
||||
if (!mt.compare(cstr_textplain) && stringlowercmp("utf-8", charset)) {
|
||||
|
||||
// If this is text/plain and not utf-8 or untrusted, transcode to utf-8.
|
||||
if (!mt.compare(cstr_textplain) &&
|
||||
(!trustcharset || stringlowercmp("utf-8", charset))) {
|
||||
string transcoded;
|
||||
int ecnt;
|
||||
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
|
||||
LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n",
|
||||
charset.c_str()));
|
||||
// Erase text in this case: it's garbage
|
||||
output.clear();
|
||||
} else {
|
||||
if (ecnt) {
|
||||
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
|
||||
|
||||
@ -284,15 +284,37 @@ bool MimeHandlerExecMultiple::next_document()
|
||||
|
||||
// Charset. For many document types it doesn't matter. For text
|
||||
// and html it does. We supply a default from the
|
||||
// configuration. We should do the text transcoding to utf-8 here
|
||||
// like exec::finaldetails does.
|
||||
// configuration.
|
||||
bool trustcharset = true;
|
||||
if (charset.empty()) {
|
||||
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
||||
cfgFilterOutputCharset;
|
||||
if (!stringlowercmp("default", charset)) {
|
||||
trustcharset = false;
|
||||
charset = m_dfltInputCharset;
|
||||
}
|
||||
}
|
||||
|
||||
string& output = m_metaData[cstr_content];
|
||||
if (!m_metaData[cstr_mimetype].compare(cstr_textplain) &&
|
||||
(!trustcharset || stringlowercmp("utf-8", charset))) {
|
||||
string transcoded;
|
||||
int ecnt;
|
||||
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
|
||||
LOGERR(("mh_execm: transcode failed from [%s] to UTF-8\n",
|
||||
charset.c_str()));
|
||||
// Erase text in this case: it's garbage
|
||||
output.clear();
|
||||
} else {
|
||||
if (ecnt) {
|
||||
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
|
||||
ecnt, charset.c_str()));
|
||||
}
|
||||
output = transcoded;
|
||||
charset = "utf-8";
|
||||
}
|
||||
}
|
||||
|
||||
m_metaData[cstr_charset] = charset;
|
||||
|
||||
if (eofnext_received)
|
||||
|
||||
@ -259,13 +259,13 @@ bool MimeHandlerMail::processAttach()
|
||||
// Special case for text/plain content. Internfile should deal
|
||||
// with this but it expects text/plain to be utf-8 already, so we
|
||||
// handle the transcoding if needed
|
||||
if (m_metaData[cstr_mimetype] == cstr_textplain &&
|
||||
stringicmp(m_metaData[cstr_charset], "UTF-8")) {
|
||||
if (m_metaData[cstr_mimetype] == cstr_textplain) {
|
||||
string utf8;
|
||||
if (!transcode(body, utf8, m_metaData[cstr_charset], "UTF-8")) {
|
||||
LOGERR((" processAttach: transcode to utf-8 failed "
|
||||
"for charset [%s]\n", m_metaData[cstr_charset].c_str()));
|
||||
// Just let it through and hope for the best...
|
||||
// can't transcode at all -> data is garbage just erase it
|
||||
body.clear();
|
||||
} else {
|
||||
body = utf8;
|
||||
}
|
||||
|
||||
@ -78,7 +78,7 @@ application/x-kword = exec rclkwd
|
||||
application/x-lyx = exec rcllyx
|
||||
application/x-mimehtml = internal message/rfc822
|
||||
application/x-perl = internal text/plain
|
||||
application/x-rar = execm rclrar
|
||||
application/x-rar = execm rclrar;charset=default
|
||||
application/x-scribus = exec rclscribus
|
||||
application/x-shellscript = internal text/plain
|
||||
application/x-tex = exec rcltex
|
||||
|
||||
@ -116,7 +116,13 @@ bool transcode(const string &in, string &out, const string &icode,
|
||||
ip++;isiz--;
|
||||
continue;
|
||||
}
|
||||
goto error;
|
||||
// Normally only EINVAL is possible here: incomplete
|
||||
// multibyte sequence at the end. This is not fatal. Any
|
||||
// other is supposedly impossible, we return an error
|
||||
if (errno == EINVAL)
|
||||
goto out;
|
||||
else
|
||||
goto error;
|
||||
}
|
||||
|
||||
out.append(obuf, OBSIZ - osiz);
|
||||
@ -131,9 +137,11 @@ bool transcode(const string &in, string &out, const string &icode,
|
||||
}
|
||||
#endif
|
||||
|
||||
out:
|
||||
ret = true;
|
||||
|
||||
error:
|
||||
error:
|
||||
|
||||
if (icopen) {
|
||||
#ifndef ICONV_CACHE_OPEN
|
||||
iconv_close(ic);
|
||||
|
||||
@ -5,9 +5,11 @@ topdir=`dirname $0`/..
|
||||
|
||||
initvariables $0
|
||||
|
||||
# Should find the file where its unaccented and the other
|
||||
recollq Anemometre 2> $mystderr |
|
||||
egrep -v '^Recoll query: ' > $mystdout
|
||||
# Should find the file where its unaccented and the other and also an instance
|
||||
# in misc.zip
|
||||
(
|
||||
recollq Anemometre
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
2 results
|
||||
3 results
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste.txt] [liste.txt] 1182 bytes
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste1.txt] [liste1.txt] 893 bytes
|
||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip] [misc.zip] 168155 bytes
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user