Windows: text/plain: check whether the text is already UTF-8 before trying to transcode it
This commit is contained in:
parent
d9c1a9648c
commit
cca69cbd31
@ -24,6 +24,10 @@
|
||||
#include "smallut.h"
|
||||
#include "listmem.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#include "utf8iter.h"
|
||||
#endif
|
||||
|
||||
using std::string;
|
||||
|
||||
// Called after decoding from utf-8 failed. Handle the common case
|
||||
@ -99,9 +103,9 @@ static string bomtocode(const string& itext)
|
||||
bool RecollFilter::txtdcode(const string& who)
|
||||
{
|
||||
if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
|
||||
LOGERR(who << "::txtdcode: called on non txt/plain: " <<
|
||||
LOGERR(who << "::txtdcode: called on non txt/plain: " <<
|
||||
m_metaData[cstr_dj_keymt] << "\n");
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
string& ocs = m_metaData[cstr_dj_keyorigcharset];
|
||||
@ -111,6 +115,17 @@ bool RecollFilter::txtdcode(const string& who)
|
||||
int ecnt;
|
||||
string otext;
|
||||
|
||||
#ifdef _WIN32
|
||||
// Under Windows the environment charset will usually not be
|
||||
// UTF-8, so we first check whether the text is already valid UTF-8.
|
||||
// This is worth the cost: otherwise the conversion from an 8-bit charset
|
||||
// would succeed on text that is already UTF-8 and produce bogus data.
|
||||
if (utf8check(itext, otext) >= 0) {
|
||||
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
string bomfromcode = bomtocode(itext);
|
||||
if (!bomfromcode.empty()) {
|
||||
LOGDEB(who << "::txtdcode: " << " input charset changed from " <<
|
||||
@ -120,17 +135,17 @@ bool RecollFilter::txtdcode(const string& who)
|
||||
|
||||
bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
|
||||
if (!ret || ecnt > int(itext.size() / 100)) {
|
||||
LOGERR(who << "::txtdcode: transcode " << itext.size() <<
|
||||
LOGERR(who << "::txtdcode: transcode " << itext.size() <<
|
||||
" bytes to UTF-8 failed for input charset [" << ocs <<
|
||||
"] ret " << ret << " ecnt " << ecnt << "\n");
|
||||
|
||||
ret = alternate_decode(itext, otext, ocs);
|
||||
|
||||
if (!ret) {
|
||||
LOGDEB("txtdcode: failed. Doc is not text?\n" );
|
||||
itext.erase();
|
||||
return false;
|
||||
}
|
||||
if (!ret) {
|
||||
LOGDEB("txtdcode: failed. Doc is not text?\n" );
|
||||
itext.erase();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
itext.swap(otext);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user