Discard the data for documents where the alternate transcode from an 8-bit charset also fails after the transcode from UTF-8 has failed
This commit is contained in:
parent
17d0a6cbba
commit
4713c3e488
@ -26,14 +26,18 @@
|
|||||||
// where this is a good old 8bit-encoded text document left-over when
|
// where this is a good old 8bit-encoded text document left-over when
|
||||||
// the locale was switched to utf-8. We try to guess a charset
|
// the locale was switched to utf-8. We try to guess a charset
|
||||||
// according to the locale language and use it. This is a very rough
|
// according to the locale language and use it. This is a very rough
|
||||||
// heuristic, but may be better than discarding the data.
|
// heuristic, but may be better than discarding the data.
|
||||||
|
// If we still get a significant number of decode errors, the doc is
|
||||||
|
// quite probably binary, so just fail.
|
||||||
static bool alternate_decode(const string& in, string& out)
|
static bool alternate_decode(const string& in, string& out)
|
||||||
{
|
{
|
||||||
string lang = localelang();
|
string lang = localelang();
|
||||||
string code = langtocode(lang);
|
string code = langtocode(lang);
|
||||||
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
|
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
|
||||||
code.c_str()));
|
code.c_str()));
|
||||||
return transcode(in, out, code, cstr_utf8);
|
int ecnt;
|
||||||
|
bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
|
||||||
|
return ecnt > 5 ? false : ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool RecollFilter::txtdcode(const string& who)
|
bool RecollFilter::txtdcode(const string& who)
|
||||||
@ -58,8 +62,11 @@ bool RecollFilter::txtdcode(const string& who)
|
|||||||
|
|
||||||
if (samecharset(ocs, cstr_utf8)) {
|
if (samecharset(ocs, cstr_utf8)) {
|
||||||
ret = alternate_decode(itext, otext);
|
ret = alternate_decode(itext, otext);
|
||||||
|
} else {
|
||||||
|
ret = false;
|
||||||
}
|
}
|
||||||
if (!ret) {
|
if (!ret) {
|
||||||
|
LOGDEB(("txtdcode: failed. Doc is not text?\n"));
|
||||||
itext.erase();
|
itext.erase();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user