Windows: text/plain: check if already utf-8 before trying transcode

This commit is contained in:
Jean-Francois Dockes 2020-08-15 11:16:10 +01:00
parent d9c1a9648c
commit cca69cbd31

View File

@ -24,6 +24,10 @@
#include "smallut.h" #include "smallut.h"
#include "listmem.h" #include "listmem.h"
#ifdef _WIN32
#include "utf8iter.h"
#endif
using std::string; using std::string;
// Called after decoding from utf-8 failed. Handle the common case // Called after decoding from utf-8 failed. Handle the common case
@ -99,9 +103,9 @@ static string bomtocode(const string& itext)
bool RecollFilter::txtdcode(const string& who) bool RecollFilter::txtdcode(const string& who)
{ {
if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) { if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
LOGERR(who << "::txtdcode: called on non txt/plain: " << LOGERR(who << "::txtdcode: called on non txt/plain: " <<
m_metaData[cstr_dj_keymt] << "\n"); m_metaData[cstr_dj_keymt] << "\n");
return false; return false;
} }
string& ocs = m_metaData[cstr_dj_keyorigcharset]; string& ocs = m_metaData[cstr_dj_keyorigcharset];
@ -111,6 +115,17 @@ bool RecollFilter::txtdcode(const string& who)
int ecnt; int ecnt;
string otext; string otext;
#ifdef _WIN32
// Under Windows the environment charset will usually not be
// utf-8. We check if the text is actually utf-8. This is worth
// it, else the conversion from 8-bit is going to succeed if the
// text is already utf-8, and produce bogus data.
if (utf8check(itext, otext) >= 0) {
m_metaData[cstr_dj_keycharset] = cstr_utf8;
return true;
}
#endif
string bomfromcode = bomtocode(itext); string bomfromcode = bomtocode(itext);
if (!bomfromcode.empty()) { if (!bomfromcode.empty()) {
LOGDEB(who << "::txtdcode: " << " input charset changed from " << LOGDEB(who << "::txtdcode: " << " input charset changed from " <<
@ -120,17 +135,17 @@ bool RecollFilter::txtdcode(const string& who)
bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt); bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
if (!ret || ecnt > int(itext.size() / 100)) { if (!ret || ecnt > int(itext.size() / 100)) {
LOGERR(who << "::txtdcode: transcode " << itext.size() << LOGERR(who << "::txtdcode: transcode " << itext.size() <<
" bytes to UTF-8 failed for input charset [" << ocs << " bytes to UTF-8 failed for input charset [" << ocs <<
"] ret " << ret << " ecnt " << ecnt << "\n"); "] ret " << ret << " ecnt " << ecnt << "\n");
ret = alternate_decode(itext, otext, ocs); ret = alternate_decode(itext, otext, ocs);
if (!ret) { if (!ret) {
LOGDEB("txtdcode: failed. Doc is not text?\n" ); LOGDEB("txtdcode: failed. Doc is not text?\n" );
itext.erase(); itext.erase();
return false; return false;
} }
} }
itext.swap(otext); itext.swap(otext);