Windows: text/plain: check if already utf-8 before trying transcode

2020-08-15 11:16:10 +01:00 · 2020-08-15 11:16:10 +01:00 · cca69cbd31
commit cca69cbd31
parent d9c1a9648c
1 changed files with 23 additions and 8 deletions
--- a/src/internfile/txtdcode.cpp
+++ b/src/internfile/txtdcode.cpp
@ -24,6 +24,10 @@
 #include "smallut.h"
 #include "listmem.h"
 #ifdef _WIN32
 #include "utf8iter.h"
 #endif
 using std::string;
 // Called after decoding from utf-8 failed. Handle the common case
@ -99,9 +103,9 @@ static string bomtocode(const string& itext)
 bool RecollFilter::txtdcode(const string& who)
 {
    if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
-    LOGERR(who << "::txtdcode: called on non txt/plain: " <<
+        LOGERR(who << "::txtdcode: called on non txt/plain: " <<
               m_metaData[cstr_dj_keymt] << "\n");
-    return false;
+        return false;
    }
    string& ocs = m_metaData[cstr_dj_keyorigcharset];
@ -111,6 +115,17 @@ bool RecollFilter::txtdcode(const string& who)
    int ecnt;
    string otext;
 #ifdef _WIN32
    // Under Windows the environment charset will usually not be
    // utf-8. We check if the text is actually utf-8. This is worth
    // it, else the conversion from 8-bit is going to succeed if the
    // text is already utf-8, and produce bogus data.
    if (utf8check(itext, otext) >= 0) {
        m_metaData[cstr_dj_keycharset] = cstr_utf8;
        return true;
    }
 #endif
    string bomfromcode = bomtocode(itext);
    if (!bomfromcode.empty()) {
        LOGDEB(who << "::txtdcode: " << " input charset changed from " <<
@ -120,17 +135,17 @@ bool RecollFilter::txtdcode(const string& who)
    bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
    if (!ret || ecnt > int(itext.size() / 100)) {
-    LOGERR(who << "::txtdcode: transcode " << itext.size() <<
+        LOGERR(who << "::txtdcode: transcode " << itext.size() <<
               " bytes to UTF-8 failed for input charset [" << ocs <<
               "] ret " << ret << " ecnt "  << ecnt << "\n");
        ret = alternate_decode(itext, otext, ocs);
-    if (!ret) {
+        if (!ret) {
-        LOGDEB("txtdcode: failed. Doc is not text?\n" );
+            LOGDEB("txtdcode: failed. Doc is not text?\n" );
-        itext.erase();
+            itext.erase();
-        return false;
+            return false;
-    }
+        }
    }
    itext.swap(otext);