From 4713c3e4881e414db9e74b6cf3c9fcd0a821886f Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Thu, 13 Aug 2015 18:14:47 +0200
Subject: [PATCH] Discard data for docs where the alternate transcode-from-8bit
 trial fails after the transcode from utf-8 has failed

---
 src/internfile/txtdcode.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/internfile/txtdcode.cpp b/src/internfile/txtdcode.cpp
index 307b5d34..3bd95ff7 100644
--- a/src/internfile/txtdcode.cpp
+++ b/src/internfile/txtdcode.cpp
@@ -26,14 +26,20 @@
 // where this is a good old 8bit-encoded text document left-over when
 // the locale was switched to utf-8. We try to guess a charset
 // according to the locale language and use it. This is a very rough
-// heuristic, but may be better than discarding the data.
+// heuristic, but may be better than discarding the data.
+// If we still get a significant number of decode errors, the doc is
+// quite probably binary, so just fail.
 static bool alternate_decode(const string& in, string& out)
 {
     string lang = localelang();
     string code = langtocode(lang);
     LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
 	    code.c_str()));
-    return transcode(in, out, code, cstr_utf8);
+    // Zero-initialized so that the threshold test below never reads an
+    // indeterminate value, whatever transcode() does with the pointer.
+    int ecnt = 0;
+    bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
+    return ret && ecnt <= 5;
 }
 
 bool RecollFilter::txtdcode(const string& who)
@@ -58,8 +64,11 @@ bool RecollFilter::txtdcode(const string& who)
 
 	if (samecharset(ocs, cstr_utf8)) {
 	    ret = alternate_decode(itext, otext);
+	} else {
+	    ret = false;
 	}
 	if (!ret) {
+	    LOGDEB(("txtdcode: failed. Doc is not text?\n"));
 	    itext.erase();
 	    return false;
 	}