Transcode mh_execm text/plain output like we do for mh_exec. Adjust handling of transcoding errors. These changes should fix most cases of non-UTF-8 text making it to unac/index.
This commit is contained in:
parent
d94a4ec315
commit
f544b28b4a
@ -153,17 +153,24 @@ void MimeHandlerExec::finaldetails()
|
|||||||
// cfgFilterOutputCharset comes from the mimeconf filter definition line
|
// cfgFilterOutputCharset comes from the mimeconf filter definition line
|
||||||
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
string charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
||||||
cfgFilterOutputCharset;
|
cfgFilterOutputCharset;
|
||||||
|
bool trustcharset = true;
|
||||||
if (!stringlowercmp("default", charset)) {
|
if (!stringlowercmp("default", charset)) {
|
||||||
charset = m_dfltInputCharset;
|
charset = m_dfltInputCharset;
|
||||||
|
trustcharset = false;
|
||||||
}
|
}
|
||||||
string mt = cfgFilterOutputMtype.empty() ? "text/html" :
|
string mt = cfgFilterOutputMtype.empty() ? "text/html" :
|
||||||
cfgFilterOutputMtype;
|
cfgFilterOutputMtype;
|
||||||
if (!mt.compare(cstr_textplain) && stringlowercmp("utf-8", charset)) {
|
|
||||||
|
// If this is text/plain and not utf-8 or untrusted, transcode to utf-8.
|
||||||
|
if (!mt.compare(cstr_textplain) &&
|
||||||
|
(!trustcharset || stringlowercmp("utf-8", charset))) {
|
||||||
string transcoded;
|
string transcoded;
|
||||||
int ecnt;
|
int ecnt;
|
||||||
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
|
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
|
||||||
LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n",
|
LOGERR(("mh_exec: transcode failed from [%s] to UTF-8\n",
|
||||||
charset.c_str()));
|
charset.c_str()));
|
||||||
|
// Erase text in this case: it's garbage
|
||||||
|
output.clear();
|
||||||
} else {
|
} else {
|
||||||
if (ecnt) {
|
if (ecnt) {
|
||||||
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
|
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
|
||||||
|
|||||||
@ -284,15 +284,37 @@ bool MimeHandlerExecMultiple::next_document()
|
|||||||
|
|
||||||
// Charset. For many document types it doesn't matter. For text
|
// Charset. For many document types it doesn't matter. For text
|
||||||
// and html it does. We supply a default from the
|
// and html it does. We supply a default from the
|
||||||
// configuration. We should do the text transcoding to utf-8 here
|
// configuration.
|
||||||
// like exec::finaldetails does.
|
bool trustcharset = true;
|
||||||
if (charset.empty()) {
|
if (charset.empty()) {
|
||||||
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
charset = cfgFilterOutputCharset.empty() ? "utf-8" :
|
||||||
cfgFilterOutputCharset;
|
cfgFilterOutputCharset;
|
||||||
if (!stringlowercmp("default", charset)) {
|
if (!stringlowercmp("default", charset)) {
|
||||||
|
trustcharset = false;
|
||||||
charset = m_dfltInputCharset;
|
charset = m_dfltInputCharset;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
string& output = m_metaData[cstr_content];
|
||||||
|
if (!m_metaData[cstr_mimetype].compare(cstr_textplain) &&
|
||||||
|
(!trustcharset || stringlowercmp("utf-8", charset))) {
|
||||||
|
string transcoded;
|
||||||
|
int ecnt;
|
||||||
|
if (!transcode(output, transcoded, charset, "UTF-8", &ecnt)) {
|
||||||
|
LOGERR(("mh_execm: transcode failed from [%s] to UTF-8\n",
|
||||||
|
charset.c_str()));
|
||||||
|
// Erase text in this case: it's garbage
|
||||||
|
output.clear();
|
||||||
|
} else {
|
||||||
|
if (ecnt) {
|
||||||
|
LOGDEB(("mh_exec: %d transcoding errors from [%s] to UTF-8\n",
|
||||||
|
ecnt, charset.c_str()));
|
||||||
|
}
|
||||||
|
output = transcoded;
|
||||||
|
charset = "utf-8";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
m_metaData[cstr_charset] = charset;
|
m_metaData[cstr_charset] = charset;
|
||||||
|
|
||||||
if (eofnext_received)
|
if (eofnext_received)
|
||||||
|
|||||||
@ -259,13 +259,13 @@ bool MimeHandlerMail::processAttach()
|
|||||||
// Special case for text/plain content. Internfile should deal
|
// Special case for text/plain content. Internfile should deal
|
||||||
// with this but it expects text/plain to be utf-8 already, so we
|
// with this but it expects text/plain to be utf-8 already, so we
|
||||||
// handle the transcoding if needed
|
// handle the transcoding if needed
|
||||||
if (m_metaData[cstr_mimetype] == cstr_textplain &&
|
if (m_metaData[cstr_mimetype] == cstr_textplain) {
|
||||||
stringicmp(m_metaData[cstr_charset], "UTF-8")) {
|
|
||||||
string utf8;
|
string utf8;
|
||||||
if (!transcode(body, utf8, m_metaData[cstr_charset], "UTF-8")) {
|
if (!transcode(body, utf8, m_metaData[cstr_charset], "UTF-8")) {
|
||||||
LOGERR((" processAttach: transcode to utf-8 failed "
|
LOGERR((" processAttach: transcode to utf-8 failed "
|
||||||
"for charset [%s]\n", m_metaData[cstr_charset].c_str()));
|
"for charset [%s]\n", m_metaData[cstr_charset].c_str()));
|
||||||
// Just let it through and hope for the best...
|
// can't transcode at all -> data is garbage just erase it
|
||||||
|
body.clear();
|
||||||
} else {
|
} else {
|
||||||
body = utf8;
|
body = utf8;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -78,7 +78,7 @@ application/x-kword = exec rclkwd
|
|||||||
application/x-lyx = exec rcllyx
|
application/x-lyx = exec rcllyx
|
||||||
application/x-mimehtml = internal message/rfc822
|
application/x-mimehtml = internal message/rfc822
|
||||||
application/x-perl = internal text/plain
|
application/x-perl = internal text/plain
|
||||||
application/x-rar = execm rclrar
|
application/x-rar = execm rclrar;charset=default
|
||||||
application/x-scribus = exec rclscribus
|
application/x-scribus = exec rclscribus
|
||||||
application/x-shellscript = internal text/plain
|
application/x-shellscript = internal text/plain
|
||||||
application/x-tex = exec rcltex
|
application/x-tex = exec rcltex
|
||||||
|
|||||||
@ -116,7 +116,13 @@ bool transcode(const string &in, string &out, const string &icode,
|
|||||||
ip++;isiz--;
|
ip++;isiz--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
goto error;
|
// Normally only EINVAL is possible here: incomplete
|
||||||
|
// multibyte sequence at the end. This is not fatal. Any
|
||||||
|
// other error is supposedly impossible, so we return an error
|
||||||
|
if (errno == EINVAL)
|
||||||
|
goto out;
|
||||||
|
else
|
||||||
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
out.append(obuf, OBSIZ - osiz);
|
out.append(obuf, OBSIZ - osiz);
|
||||||
@ -131,9 +137,11 @@ bool transcode(const string &in, string &out, const string &icode,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
out:
|
||||||
ret = true;
|
ret = true;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
|
|
||||||
if (icopen) {
|
if (icopen) {
|
||||||
#ifndef ICONV_CACHE_OPEN
|
#ifndef ICONV_CACHE_OPEN
|
||||||
iconv_close(ic);
|
iconv_close(ic);
|
||||||
|
|||||||
@ -5,9 +5,11 @@ topdir=`dirname $0`/..
|
|||||||
|
|
||||||
initvariables $0
|
initvariables $0
|
||||||
|
|
||||||
# Should find the file where its unaccented and the other
|
# Should find the file where its unaccented and the other and also an instance
|
||||||
recollq Anemometre 2> $mystderr |
|
# in misc.zip
|
||||||
egrep -v '^Recoll query: ' > $mystdout
|
(
|
||||||
|
recollq Anemometre
|
||||||
|
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||||
|
|
||||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||||
|
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
2 results
|
3 results
|
||||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste.txt] [liste.txt] 1182 bytes
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste.txt] [liste.txt] 1182 bytes
|
||||||
text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste1.txt] [liste1.txt] 893 bytes
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/txt/liste1.txt] [liste1.txt] 893 bytes
|
||||||
|
text/plain [file:///home/dockes/projets/fulltext/testrecoll/zip/misc.zip] [misc.zip] 168155 bytes
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user