indents and readability
This commit is contained in:
parent
4cc0bc90b6
commit
a24fc7bacc
@ -163,6 +163,9 @@ private:
|
||||
// Current span. Might be jf.dockes@wanadoo.f
|
||||
std::string m_span;
|
||||
|
||||
// Words in span: byte positions of start and end of words in m_span. For example:
|
||||
// 0 4 9
|
||||
// bill@some.com -> (0,4) (5,9) (10,13)
|
||||
std::vector <std::pair<int, int> > m_words_in_span;
|
||||
|
||||
// Current word: no punctuation at all in there. Byte offset
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2004-2019 J.F.Dockes
|
||||
/* Copyright (C) 2004-2021 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
@ -29,7 +29,7 @@
|
||||
using namespace std;
|
||||
|
||||
bool unacmaybefold(const string &in, string &out,
|
||||
const char *encoding, UnacOp what)
|
||||
const char *encoding, UnacOp what)
|
||||
{
|
||||
char *cout = 0;
|
||||
size_t out_len;
|
||||
@ -37,16 +37,13 @@ bool unacmaybefold(const string &in, string &out,
|
||||
|
||||
switch (what) {
|
||||
case UNACOP_UNAC:
|
||||
status = unac_string(encoding, in.c_str(), in.length(),
|
||||
&cout, &out_len);
|
||||
status = unac_string(encoding, in.c_str(), in.length(), &cout, &out_len);
|
||||
break;
|
||||
case UNACOP_UNACFOLD:
|
||||
status = unacfold_string(encoding, in.c_str(), in.length(),
|
||||
&cout, &out_len);
|
||||
status = unacfold_string(encoding, in.c_str(), in.length(), &cout, &out_len);
|
||||
break;
|
||||
case UNACOP_FOLD:
|
||||
status = fold_string(encoding, in.c_str(), in.length(),
|
||||
&cout, &out_len);
|
||||
status = fold_string(encoding, in.c_str(), in.length(), &cout, &out_len);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2005 J.F.Dockes
|
||||
/* Copyright (C) 2005-2021 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
@ -78,8 +78,7 @@ public:
|
||||
string dumb = term;
|
||||
if (o_index_stripchars) {
|
||||
if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
|
||||
LOGINFO("PlainToRich::takeword: unac failed for [" << term <<
|
||||
"]\n");
|
||||
LOGINFO("PlainToRich::takeword: unac failed for [" << term << "]\n");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -173,30 +172,25 @@ static string activate_urls(const string& in)
|
||||
}
|
||||
#endif
|
||||
|
||||
// Fix result text for display inside the gui text window.
|
||||
// Enrich result text for display inside the gui text window.
|
||||
//
|
||||
// We call overridden functions to output header data, beginnings and ends of
|
||||
// matches etc.
|
||||
// We call overridden functions to output header data, beginnings and ends of matches etc.
|
||||
//
|
||||
// If the input is text, we output the result in chunks, arranging not
|
||||
// to cut in the middle of a tag, which would confuse qtextedit. If
|
||||
// the input is html, the body is always a single output chunk.
|
||||
bool PlainToRich::plaintorich(const string& in,
|
||||
list<string>& out, // Output chunk list
|
||||
const HighlightData& hdata,
|
||||
int chunksize)
|
||||
// If the input is text, we output the result in chunks, arranging not to cut in the middle of a
|
||||
// tag, which would confuse qtextedit. If the input is html, the body is always a single output
|
||||
// chunk.
|
||||
bool PlainToRich::plaintorich(
|
||||
const string& in, list<string>& out, const HighlightData& hdata, int chunksize)
|
||||
{
|
||||
Chrono chron;
|
||||
bool ret = true;
|
||||
LOGDEB1("plaintorichich: in: [" << in << "]\n");
|
||||
|
||||
m_hdata = &hdata;
|
||||
// Compute the positions for the query terms. We use the text
|
||||
// splitter to break the text into words, and compare the words to
|
||||
// the search terms,
|
||||
// Compute the positions for the query terms. We use the text splitter to break the text into
|
||||
// words, and compare the words to the search terms,
|
||||
TextSplitPTR splitter(hdata);
|
||||
// Note: the splitter returns the term locations in byte, not
|
||||
// character, offsets.
|
||||
// Note: the splitter returns the term locations in byte, not character, offsets.
|
||||
splitter.text_to_words(in);
|
||||
LOGDEB2("plaintorich: split done " << chron.millis() << " mS\n");
|
||||
// Compute the positions for NEAR and PHRASE groups.
|
||||
@ -205,7 +199,7 @@ bool PlainToRich::plaintorich(const string& in,
|
||||
|
||||
out.clear();
|
||||
out.push_back("");
|
||||
list<string>::iterator olit = out.begin();
|
||||
auto olit = out.begin();
|
||||
|
||||
// Rich text output
|
||||
*olit = header();
|
||||
@ -225,9 +219,10 @@ bool PlainToRich::plaintorich(const string& in,
|
||||
vector<GroupMatchEntry>::iterator tPosEnd = splitter.m_tboffs.end();
|
||||
|
||||
#if 0
|
||||
for (vector<pair<int, int> >::const_iterator it = splitter.m_tboffs.begin();
|
||||
it != splitter.m_tboffs.end(); it++) {
|
||||
LOGDEB2("plaintorich: region: " << it->first << " "<<it->second<< "\n");
|
||||
for (const auto& region : splitter.m_tboffs) {
|
||||
auto st = region.offs.first;
|
||||
auto nd = region.offs.second;
|
||||
LOGDEB0("plaintorich: region: " << st << " " << nd << "\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -276,8 +271,7 @@ bool PlainToRich::plaintorich(const string& in,
|
||||
}
|
||||
// Skip all highlight areas that would overlap this one
|
||||
int crend = tPosIt->offs.second;
|
||||
while (tPosIt != splitter.m_tboffs.end() &&
|
||||
tPosIt->offs.first < crend)
|
||||
while (tPosIt != splitter.m_tboffs.end() && tPosIt->offs.first < crend)
|
||||
tPosIt++;
|
||||
inrcltag = 0;
|
||||
}
|
||||
|
||||
702
src/unac/unac.c
702
src/unac/unac.c
@ -14109,7 +14109,7 @@ static int debug_level = UNAC_DEBUG_LOW;
|
||||
*/
|
||||
static void debug_doprint_default(const char* message, void* data)
|
||||
{
|
||||
fprintf(stderr, "%s", message);
|
||||
fprintf(stderr, "%s", message);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -14130,30 +14130,30 @@ static void* debug_appdata = (void*)0;
|
||||
static void debug_print(const char* message, ...)
|
||||
{
|
||||
#define UNAC_MAXIMUM_MESSAGE_SIZE 512
|
||||
/*
|
||||
* UNAC_MAXIMUM_MESSAGE_SIZE is supposedly enough but I
|
||||
* do trust some vsnprintf implementations to be buggy.
|
||||
*/
|
||||
char unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE+1] = { '\0' };
|
||||
va_list args;
|
||||
va_start(args, message);
|
||||
if(vsnprintf(unac_message_buffer, UNAC_MAXIMUM_MESSAGE_SIZE, message, args) < 0) {
|
||||
char tmp[UNAC_MAXIMUM_MESSAGE_SIZE];
|
||||
sprintf(tmp, "[message larger than %d, truncated]", UNAC_MAXIMUM_MESSAGE_SIZE);
|
||||
debug_doprint(tmp, debug_appdata);
|
||||
}
|
||||
va_end(args);
|
||||
unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE] = '\0';
|
||||
/*
|
||||
* UNAC_MAXIMUM_MESSAGE_SIZE is supposedly enough but I
|
||||
* do trust some vsnprintf implementations to be buggy.
|
||||
*/
|
||||
char unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE+1] = { '\0' };
|
||||
va_list args;
|
||||
va_start(args, message);
|
||||
if(vsnprintf(unac_message_buffer, UNAC_MAXIMUM_MESSAGE_SIZE, message, args) < 0) {
|
||||
char tmp[UNAC_MAXIMUM_MESSAGE_SIZE];
|
||||
sprintf(tmp, "[message larger than %d, truncated]", UNAC_MAXIMUM_MESSAGE_SIZE);
|
||||
debug_doprint(tmp, debug_appdata);
|
||||
}
|
||||
va_end(args);
|
||||
unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE] = '\0';
|
||||
|
||||
debug_doprint(unac_message_buffer, debug_appdata);
|
||||
debug_doprint(unac_message_buffer, debug_appdata);
|
||||
}
|
||||
|
||||
void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
||||
{
|
||||
debug_level = level;
|
||||
if(function)
|
||||
debug_doprint = function;
|
||||
debug_appdata = data;
|
||||
debug_level = level;
|
||||
if(function)
|
||||
debug_doprint = function;
|
||||
debug_appdata = data;
|
||||
}
|
||||
|
||||
#else /* UNAC_DEBUG_AVAILABLE */
|
||||
@ -14167,146 +14167,140 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
||||
#define UNAC_FOLD 2
|
||||
|
||||
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp, int what)
|
||||
char** outp, size_t* out_lengthp, int what)
|
||||
{
|
||||
char* out;
|
||||
size_t out_size;
|
||||
size_t out_length;
|
||||
size_t i;
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = (char*)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
/* *outp is still valid. Let the caller free it */
|
||||
return -1;
|
||||
}
|
||||
|
||||
out_length = 0;
|
||||
|
||||
for(i = 0; i < in_length; i += 2) {
|
||||
unsigned short c;
|
||||
unsigned short* p;
|
||||
size_t l;
|
||||
size_t k;
|
||||
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
||||
/*
|
||||
* Lookup the tables for decomposition information
|
||||
*/
|
||||
#ifdef BUILDING_RECOLL
|
||||
// Exception unac/fold values set by user. There should be 3 arrays for
|
||||
// unac/fold/unac+fold. For now there is only one array, which used to
|
||||
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
||||
// removal for some chars and languages where it should not be done.
|
||||
// In conformance with current usage, but incorrectly, we do the following
|
||||
// things for the special chars depending on the operation requested:
|
||||
// - unaccenting: do nothing (copy original char)
|
||||
// - unac+fold: use table
|
||||
// - fold: use the unicode data.
|
||||
string trans;
|
||||
if (what != UNAC_FOLD && except_trans.size() != 0 &&
|
||||
is_except_char(c, trans)) {
|
||||
if (what == UNAC_UNAC) {
|
||||
// Unaccent only. Do nothing
|
||||
p = 0;
|
||||
l = 0;
|
||||
} else {
|
||||
// Has to be UNAC_UNACFOLD: use table
|
||||
p = (unsigned short *)trans.c_str();
|
||||
l = trans.size() / 2;
|
||||
}
|
||||
} else {
|
||||
#endif /* BUILDING_RECOLL */
|
||||
unac_uf_char_utf16_(c, p, l, what)
|
||||
#ifdef BUILDING_RECOLL
|
||||
}
|
||||
#endif /* BUILDING_RECOLL */
|
||||
|
||||
/*
|
||||
* Explain what's done in great detail
|
||||
*/
|
||||
if(debug_level == UNAC_DEBUG_HIGH) {
|
||||
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
||||
unsigned char position = (c) & UNAC_BLOCK_MASK;
|
||||
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
||||
DEBUG_APPEND("0x%04x => ", (c));
|
||||
if(l == 0) {
|
||||
DEBUG_APPEND("untouched\n");
|
||||
} else {
|
||||
char* out;
|
||||
size_t out_size;
|
||||
size_t out_length;
|
||||
size_t i;
|
||||
for(i = 0; i < l; i++)
|
||||
DEBUG_APPEND("0x%04x ", p[i]);
|
||||
DEBUG_APPEND("\n");
|
||||
}
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = (char*)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
/* *outp is still valid. Let the caller free it */
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure there is enough space to hold the decomposition
|
||||
* Note: a previous realloc may have succeeded, which means that *outp
|
||||
* is not valid any more. We have to do the freeing and zero out *outp
|
||||
*/
|
||||
if(out_length + ((l + 1) * 2) > out_size) {
|
||||
char *saved;
|
||||
out_size += ((l + 1) * 2) + 1024;
|
||||
saved = out;
|
||||
out = (char *)realloc(out, out_size);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size);
|
||||
free(saved);
|
||||
*outp = 0;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if(l > 0) {
|
||||
/* l == 1 && *p == 0 is the special case generated for
|
||||
mark characters (which may be found if the input is
|
||||
already in decomposed form). Output nothing */
|
||||
if (l != 1 || *p != 0) {
|
||||
out_length = 0;
|
||||
|
||||
for(i = 0; i < in_length; i += 2) {
|
||||
unsigned short c;
|
||||
unsigned short* p;
|
||||
size_t l;
|
||||
size_t k;
|
||||
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
||||
/*
|
||||
* If there is a decomposition, insert it in the output
|
||||
* string.
|
||||
* Lookup the tables for decomposition information
|
||||
*/
|
||||
for(k = 0; k < l; k++) {
|
||||
out[out_length++] = (p[k] >> 8) & 0xff;
|
||||
out[out_length++] = (p[k] & 0xff);
|
||||
#ifdef BUILDING_RECOLL
|
||||
// Exception unac/fold values set by user. There should be 3 arrays for
|
||||
// unac/fold/unac+fold. For now there is only one array, which used to
|
||||
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
||||
// removal for some chars and languages where it should not be done.
|
||||
// In conformance with current usage, but incorrectly, we do the following
|
||||
// things for the special chars depending on the operation requested:
|
||||
// - unaccenting: do nothing (copy original char)
|
||||
// - unac+fold: use table
|
||||
// - fold: use the unicode data.
|
||||
string trans;
|
||||
if (what != UNAC_FOLD && except_trans.size() != 0 &&
|
||||
is_except_char(c, trans)) {
|
||||
if (what == UNAC_UNAC) {
|
||||
// Unaccent only. Do nothing
|
||||
p = 0;
|
||||
l = 0;
|
||||
} else {
|
||||
// Has to be UNAC_UNACFOLD: use table
|
||||
p = (unsigned short *)trans.c_str();
|
||||
l = trans.size() / 2;
|
||||
}
|
||||
} else {
|
||||
#endif /* BUILDING_RECOLL */
|
||||
unac_uf_char_utf16_(c, p, l, what)
|
||||
#ifdef BUILDING_RECOLL
|
||||
}
|
||||
#endif /* BUILDING_RECOLL */
|
||||
|
||||
/*
|
||||
* Explain what's done in great detail
|
||||
*/
|
||||
if(debug_level == UNAC_DEBUG_HIGH) {
|
||||
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
||||
unsigned char position = (c) & UNAC_BLOCK_MASK;
|
||||
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
||||
DEBUG_APPEND("0x%04x => ", (c));
|
||||
if(l == 0) {
|
||||
DEBUG_APPEND("untouched\n");
|
||||
} else {
|
||||
size_t i;
|
||||
for(i = 0; i < l; i++)
|
||||
DEBUG_APPEND("0x%04x ", p[i]);
|
||||
DEBUG_APPEND("\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure there is enough space to hold the decomposition
|
||||
* Note: a previous realloc may have succeeded, which means that *outp
|
||||
* is not valid any more. We have to do the freeing and zero out *outp
|
||||
*/
|
||||
if(out_length + ((l + 1) * 2) > out_size) {
|
||||
char *saved;
|
||||
out_size += ((l + 1) * 2) + 1024;
|
||||
saved = out;
|
||||
out = (char *)realloc(out, out_size);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size);
|
||||
free(saved);
|
||||
*outp = 0;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if(l > 0) {
|
||||
/* l == 1 && *p == 0 is the special case generated for
|
||||
mark characters (which may be found if the input is
|
||||
already in decomposed form). Output nothing */
|
||||
if (l != 1 || *p != 0) {
|
||||
/*
|
||||
* If there is a decomposition, insert it in the output
|
||||
* string.
|
||||
*/
|
||||
for(k = 0; k < l; k++) {
|
||||
out[out_length++] = (p[k] >> 8) & 0xff;
|
||||
out[out_length++] = (p[k] & 0xff);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If there is no decomposition leave it unchanged
|
||||
*/
|
||||
out[out_length++] = in[i];
|
||||
out[out_length++] = in[i + 1];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If there is no decomposition leave it unchanged
|
||||
*/
|
||||
out[out_length++] = in[i];
|
||||
out[out_length++] = in[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
*outp = out;
|
||||
*out_lengthp = out_length;
|
||||
(*outp)[*out_lengthp] = '\0';
|
||||
*outp = out;
|
||||
*out_lengthp = out_length;
|
||||
(*outp)[*out_lengthp] = '\0';
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
int unac_string_utf16(const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int unac_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string_utf16(in, in_length,
|
||||
outp, out_lengthp, UNAC_UNAC);
|
||||
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_UNAC);
|
||||
}
|
||||
int unacfold_string_utf16(const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int unacfold_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string_utf16(in, in_length,
|
||||
outp, out_lengthp, UNAC_UNACFOLD);
|
||||
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_UNACFOLD);
|
||||
}
|
||||
int fold_string_utf16(const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int fold_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string_utf16(in, in_length,
|
||||
outp, out_lengthp, UNAC_FOLD);
|
||||
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_FOLD);
|
||||
}
|
||||
|
||||
static const char *utf16be = "UTF-16BE";
|
||||
@ -14322,229 +14316,223 @@ static std::mutex o_unac_mutex;
|
||||
* The out string is always null terminated.
|
||||
*/
|
||||
static int convert(const char* from, const char* to,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
{
|
||||
int ret = -1;
|
||||
iconv_t cd;
|
||||
char* out;
|
||||
size_t out_remain;
|
||||
size_t out_size;
|
||||
char* out_base;
|
||||
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
||||
const char space[] = { 0x00, 0x20 };
|
||||
int ret = -1;
|
||||
iconv_t cd;
|
||||
char* out;
|
||||
size_t out_remain;
|
||||
size_t out_size;
|
||||
char* out_base;
|
||||
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
||||
const char space[] = { 0x00, 0x20 };
|
||||
|
||||
std::unique_lock<std::mutex> lock(o_unac_mutex);
|
||||
std::unique_lock<std::mutex> lock(o_unac_mutex);
|
||||
|
||||
if (!strcmp(utf16be, from)) {
|
||||
from_utf8 = 0;
|
||||
from_utf16 = 1;
|
||||
} else if (!strcasecmp("UTF-8", from)) {
|
||||
from_utf8 = 1;
|
||||
from_utf16 = 0;
|
||||
} else {
|
||||
from_utf8 = from_utf16 = 0;
|
||||
}
|
||||
if (!strcmp(utf16be, to)) {
|
||||
to_utf8 = 0;
|
||||
to_utf16 = 1;
|
||||
} else if (!strcasecmp("UTF-8", to)) {
|
||||
to_utf8 = 1;
|
||||
to_utf16 = 0;
|
||||
} else {
|
||||
to_utf8 = to_utf16 = 0;
|
||||
}
|
||||
u16tou8 = from_utf16 && to_utf8;
|
||||
u8tou16 = from_utf8 && to_utf16;
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = (char *)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
/* *outp still valid, no freeing */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
goto out;
|
||||
}
|
||||
|
||||
out_remain = out_size;
|
||||
out_base = out;
|
||||
|
||||
if (u8tou16) {
|
||||
if (u8tou16_cd == (iconv_t)-1) {
|
||||
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
iconv(u8tou16_cd, 0, 0, 0, 0);
|
||||
}
|
||||
cd = u8tou16_cd;
|
||||
} else if (u16tou8) {
|
||||
if (u16tou8_cd == (iconv_t)-1) {
|
||||
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
iconv(u16tou8_cd, 0, 0, 0, 0);
|
||||
}
|
||||
cd = u16tou8_cd;
|
||||
} else {
|
||||
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
do {
|
||||
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
||||
switch(errno) {
|
||||
case EILSEQ:
|
||||
/*
|
||||
* If an illegal sequence is found in the context of unac_string
|
||||
* it means the unaccented version of a character contains
|
||||
* a sequence that cannot be mapped back to the original charset.
|
||||
* For instance, the 1/4 character in ISO-8859-1 is decomposed
|
||||
* in three characters including the FRACTION SLASH (2044) which
|
||||
* has no equivalent in the ISO-8859-1 map. One can argue that
|
||||
* the conversions tables should map it to the regular / character
|
||||
* or that a <compat> entry should be associated with it.
|
||||
*
|
||||
* To cope with this situation, convert() silently transforms all
|
||||
* illegal sequences (EILSEQ) into a SPACE character 0x0020.
|
||||
*
|
||||
* In the general conversion case this behaviour is not desirable.
|
||||
* However, it is not the responsibility of this program to cope
|
||||
* with inconsistencies of the Unicode description and a bug report
|
||||
* should be submitted to Unicode so that they can fix the problem.
|
||||
*
|
||||
*/
|
||||
if(from_utf16) {
|
||||
const char* tmp = space;
|
||||
size_t tmp_length = 2;
|
||||
if(iconv(cd, (ICONV_CONST char **) &tmp, &tmp_length, &out, &out_remain) == (size_t)-1) {
|
||||
if(errno == E2BIG) {
|
||||
/* fall thru to the E2BIG case below */;
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
/* The offending character was replaced by a SPACE, skip it. */
|
||||
in += 2;
|
||||
in_length -= 2;
|
||||
/* And continue conversion. */
|
||||
break;
|
||||
}
|
||||
if (!strcmp(utf16be, from)) {
|
||||
from_utf8 = 0;
|
||||
from_utf16 = 1;
|
||||
} else if (!strcasecmp("UTF-8", from)) {
|
||||
from_utf8 = 1;
|
||||
from_utf16 = 0;
|
||||
} else {
|
||||
goto out;
|
||||
from_utf8 = from_utf16 = 0;
|
||||
}
|
||||
case E2BIG:
|
||||
{
|
||||
/*
|
||||
* The output does not fit in the current out buffer, enlarge it.
|
||||
*/
|
||||
size_t length = out - out_base;
|
||||
out_size *= 2;
|
||||
{
|
||||
char *saved = out_base;
|
||||
/* +1 for null */
|
||||
out_base = (char *)realloc(out_base, out_size + 1);
|
||||
if (out_base == 0) {
|
||||
/* *outp potentially not valid any more. Free here,
|
||||
* and zero out */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
free(saved);
|
||||
*outp = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
out = out_base + length;
|
||||
out_remain = out_size - length;
|
||||
if (!strcmp(utf16be, to)) {
|
||||
to_utf8 = 0;
|
||||
to_utf16 = 1;
|
||||
} else if (!strcasecmp("UTF-8", to)) {
|
||||
to_utf8 = 1;
|
||||
to_utf16 = 0;
|
||||
} else {
|
||||
to_utf8 = to_utf16 = 0;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
goto out;
|
||||
break;
|
||||
}
|
||||
u16tou8 = from_utf16 && to_utf8;
|
||||
u8tou16 = from_utf8 && to_utf16;
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = (char *)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
/* *outp still valid, no freeing */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
goto out;
|
||||
}
|
||||
} while(in_length > 0);
|
||||
|
||||
if (!u8tou16 && !u16tou8)
|
||||
iconv_close(cd);
|
||||
out_remain = out_size;
|
||||
out_base = out;
|
||||
|
||||
*outp = out_base;
|
||||
*out_lengthp = out - out_base;
|
||||
(*outp)[*out_lengthp] = '\0';
|
||||
if (u8tou16) {
|
||||
if (u8tou16_cd == (iconv_t)-1) {
|
||||
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
iconv(u8tou16_cd, 0, 0, 0, 0);
|
||||
}
|
||||
cd = u8tou16_cd;
|
||||
} else if (u16tou8) {
|
||||
if (u16tou8_cd == (iconv_t)-1) {
|
||||
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
iconv(u16tou8_cd, 0, 0, 0, 0);
|
||||
}
|
||||
cd = u16tou8_cd;
|
||||
} else {
|
||||
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
do {
|
||||
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
||||
switch(errno) {
|
||||
case EILSEQ:
|
||||
/*
|
||||
* If an illegal sequence is found in the context of unac_string
|
||||
* it means the unaccented version of a character contains
|
||||
* a sequence that cannot be mapped back to the original charset.
|
||||
* For instance, the 1/4 character in ISO-8859-1 is decomposed
|
||||
* in three characters including the FRACTION SLASH (2044) which
|
||||
* has no equivalent in the ISO-8859-1 map. One can argue that
|
||||
* the conversions tables should map it to the regular / character
|
||||
* or that a <compat> entry should be associated with it.
|
||||
*
|
||||
* To cope with this situation, convert() silently transforms all
|
||||
* illegal sequences (EILSEQ) into a SPACE character 0x0020.
|
||||
*
|
||||
* In the general conversion case this behaviour is not desirable.
|
||||
* However, it is not the responsibility of this program to cope
|
||||
* with inconsistencies of the Unicode description and a bug report
|
||||
* should be submitted to Unicode so that they can fix the problem.
|
||||
*
|
||||
*/
|
||||
if (from_utf16) {
|
||||
const char* tmp = space;
|
||||
size_t tmp_length = 2;
|
||||
if (iconv(cd, (ICONV_CONST char **)&tmp, &tmp_length, &out, &out_remain) ==
|
||||
(size_t)-1) {
|
||||
if(errno == E2BIG) {
|
||||
/* fall thru to the E2BIG case below */;
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
/* The offending character was replaced by a SPACE, skip it. */
|
||||
in += 2;
|
||||
in_length -= 2;
|
||||
/* And continue conversion. */
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
case E2BIG:
|
||||
{
|
||||
/*
|
||||
* The output does not fit in the current out buffer, enlarge it.
|
||||
*/
|
||||
size_t length = out - out_base;
|
||||
out_size *= 2;
|
||||
{
|
||||
char *saved = out_base;
|
||||
/* +1 for null */
|
||||
out_base = (char *)realloc(out_base, out_size + 1);
|
||||
if (out_base == 0) {
|
||||
/* *outp potentially not valid any more. Free here,
|
||||
* and zero out */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
free(saved);
|
||||
*outp = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
out = out_base + length;
|
||||
out_remain = out_size - length;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
goto out;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while(in_length > 0);
|
||||
|
||||
if (!u8tou16 && !u16tou8)
|
||||
iconv_close(cd);
|
||||
|
||||
*outp = out_base;
|
||||
*out_lengthp = out - out_base;
|
||||
(*outp)[*out_lengthp] = '\0';
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
int unacmaybefold_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp, int what)
|
||||
int unacmaybefold_string(const char* charset, const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp, int what)
|
||||
{
|
||||
/*
|
||||
* When converting an empty string, skip everything but alloc the
|
||||
* buffer if NULL pointer.
|
||||
*/
|
||||
if (in_length <= 0) {
|
||||
if(!*outp) {
|
||||
if ((*outp = (char*)malloc(32)) == 0)
|
||||
return -1;
|
||||
}
|
||||
(*outp)[0] = '\0';
|
||||
*out_lengthp = 0;
|
||||
if(!*outp) {
|
||||
if ((*outp = (char*)malloc(32)) == 0)
|
||||
return -1;
|
||||
}
|
||||
(*outp)[0] = '\0';
|
||||
*out_lengthp = 0;
|
||||
} else {
|
||||
char* utf16 = 0;
|
||||
size_t utf16_length = 0;
|
||||
char* utf16_unaccented = 0;
|
||||
size_t utf16_unaccented_length = 0;
|
||||
char* utf16 = 0;
|
||||
size_t utf16_length = 0;
|
||||
char* utf16_unaccented = 0;
|
||||
size_t utf16_unaccented_length = 0;
|
||||
|
||||
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
||||
return -1;
|
||||
}
|
||||
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented,
|
||||
&utf16_unaccented_length, what);
|
||||
free(utf16);
|
||||
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented,
|
||||
&utf16_unaccented_length, what);
|
||||
free(utf16);
|
||||
|
||||
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length,
|
||||
outp, out_lengthp) < 0) {
|
||||
return -1;
|
||||
}
|
||||
free(utf16_unaccented);
|
||||
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length,
|
||||
outp, out_lengthp) < 0) {
|
||||
return -1;
|
||||
}
|
||||
free(utf16_unaccented);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int unac_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int unac_string(
|
||||
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string(charset, in, in_length,
|
||||
outp, out_lengthp, UNAC_UNAC);
|
||||
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_UNAC);
|
||||
}
|
||||
int unacfold_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int unacfold_string(
|
||||
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string(charset, in, in_length,
|
||||
outp, out_lengthp, UNAC_UNACFOLD);
|
||||
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_UNACFOLD);
|
||||
}
|
||||
int fold_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int fold_string(
|
||||
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string(charset, in, in_length,
|
||||
outp, out_lengthp, UNAC_FOLD);
|
||||
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_FOLD);
|
||||
}
|
||||
|
||||
const char* unac_version(void)
|
||||
{
|
||||
return UNAC_VERSION;
|
||||
return UNAC_VERSION;
|
||||
}
|
||||
|
||||
#ifdef BUILDING_RECOLL
|
||||
@ -14552,7 +14540,7 @@ void unac_set_except_translations(const char *spectrans)
|
||||
{
|
||||
except_trans.clear();
|
||||
if (!spectrans || !spectrans[0])
|
||||
return;
|
||||
return;
|
||||
|
||||
// The translation tables out of Unicode are in machine byte order (we
|
||||
// just let the compiler read the values).
|
||||
@ -14563,41 +14551,39 @@ void unac_set_except_translations(const char *spectrans)
|
||||
static const char *machinecoding = 0;
|
||||
bool littleendian = true;
|
||||
if (machinecoding == 0) {
|
||||
const char* charshort = "\001\002";
|
||||
short *ip = (short *)charshort;
|
||||
if (*ip == 0x0102) {
|
||||
littleendian = false;
|
||||
machinecoding = "UTF-16BE";
|
||||
} else {
|
||||
littleendian = true;
|
||||
machinecoding = "UTF-16LE";
|
||||
}
|
||||
const char* charshort = "\001\002";
|
||||
short *ip = (short *)charshort;
|
||||
if (*ip == 0x0102) {
|
||||
littleendian = false;
|
||||
machinecoding = "UTF-16BE";
|
||||
} else {
|
||||
littleendian = true;
|
||||
machinecoding = "UTF-16LE";
|
||||
}
|
||||
}
|
||||
|
||||
vector<string> vtrans;
|
||||
stringToStrings(spectrans, vtrans);
|
||||
|
||||
for (vector<string>::iterator it = vtrans.begin();
|
||||
it != vtrans.end(); it++) {
|
||||
for (const auto& trans : vtrans) {
|
||||
|
||||
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||
char *out = 0;
|
||||
size_t outsize;
|
||||
if (convert("UTF-8", machinecoding,
|
||||
it->c_str(), it->size(),
|
||||
&out, &outsize) != 0 || outsize < 2)
|
||||
continue;
|
||||
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||
char *out = 0;
|
||||
size_t outsize;
|
||||
if (convert("UTF-8", machinecoding, trans.c_str(), trans.size(), &out, &outsize) != 0 ||
|
||||
outsize < 2)
|
||||
continue;
|
||||
|
||||
/* The source char must be utf-16be as this is what we convert the
|
||||
input text to for internal processing */
|
||||
unsigned short ch;
|
||||
if (littleendian)
|
||||
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||
else
|
||||
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||
/* The source char must be utf-16be as this is what we convert the
|
||||
input text to for internal processing */
|
||||
unsigned short ch;
|
||||
if (littleendian)
|
||||
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||
else
|
||||
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||
|
||||
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
||||
free(out);
|
||||
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
||||
free(out);
|
||||
}
|
||||
}
|
||||
#endif /* BUILDING_RECOLL */
|
||||
|
||||
704
unac/unac.c
704
unac/unac.c
@ -13,7 +13,7 @@
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
|
||||
#ifdef BUILDING_RECOLL
|
||||
@ -14109,7 +14109,7 @@ static int debug_level = UNAC_DEBUG_LOW;
|
||||
*/
|
||||
static void debug_doprint_default(const char* message, void* data)
|
||||
{
|
||||
fprintf(stderr, "%s", message);
|
||||
fprintf(stderr, "%s", message);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -14130,30 +14130,30 @@ static void* debug_appdata = (void*)0;
|
||||
static void debug_print(const char* message, ...)
|
||||
{
|
||||
#define UNAC_MAXIMUM_MESSAGE_SIZE 512
|
||||
/*
|
||||
* UNAC_MAXIMUM_MESSAGE_SIZE is supposedly enough but I
|
||||
* do trust some vsnprintf implementations to be buggy.
|
||||
*/
|
||||
char unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE+1] = { '\0' };
|
||||
va_list args;
|
||||
va_start(args, message);
|
||||
if(vsnprintf(unac_message_buffer, UNAC_MAXIMUM_MESSAGE_SIZE, message, args) < 0) {
|
||||
char tmp[UNAC_MAXIMUM_MESSAGE_SIZE];
|
||||
sprintf(tmp, "[message larger than %d, truncated]", UNAC_MAXIMUM_MESSAGE_SIZE);
|
||||
debug_doprint(tmp, debug_appdata);
|
||||
}
|
||||
va_end(args);
|
||||
unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE] = '\0';
|
||||
/*
|
||||
* UNAC_MAXIMUM_MESSAGE_SIZE is supposedly enough but I
|
||||
* do trust some vsnprintf implementations to be buggy.
|
||||
*/
|
||||
char unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE+1] = { '\0' };
|
||||
va_list args;
|
||||
va_start(args, message);
|
||||
if(vsnprintf(unac_message_buffer, UNAC_MAXIMUM_MESSAGE_SIZE, message, args) < 0) {
|
||||
char tmp[UNAC_MAXIMUM_MESSAGE_SIZE];
|
||||
sprintf(tmp, "[message larger than %d, truncated]", UNAC_MAXIMUM_MESSAGE_SIZE);
|
||||
debug_doprint(tmp, debug_appdata);
|
||||
}
|
||||
va_end(args);
|
||||
unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE] = '\0';
|
||||
|
||||
debug_doprint(unac_message_buffer, debug_appdata);
|
||||
debug_doprint(unac_message_buffer, debug_appdata);
|
||||
}
|
||||
|
||||
void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
||||
{
|
||||
debug_level = level;
|
||||
if(function)
|
||||
debug_doprint = function;
|
||||
debug_appdata = data;
|
||||
debug_level = level;
|
||||
if(function)
|
||||
debug_doprint = function;
|
||||
debug_appdata = data;
|
||||
}
|
||||
|
||||
#else /* UNAC_DEBUG_AVAILABLE */
|
||||
@ -14167,146 +14167,140 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
||||
#define UNAC_FOLD 2
|
||||
|
||||
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp, int what)
|
||||
char** outp, size_t* out_lengthp, int what)
|
||||
{
|
||||
char* out;
|
||||
size_t out_size;
|
||||
size_t out_length;
|
||||
size_t i;
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = (char*)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
/* *outp is still valid. Let the caller free it */
|
||||
return -1;
|
||||
}
|
||||
|
||||
out_length = 0;
|
||||
|
||||
for(i = 0; i < in_length; i += 2) {
|
||||
unsigned short c;
|
||||
unsigned short* p;
|
||||
size_t l;
|
||||
size_t k;
|
||||
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
||||
/*
|
||||
* Lookup the tables for decomposition information
|
||||
*/
|
||||
#ifdef BUILDING_RECOLL
|
||||
// Exception unac/fold values set by user. There should be 3 arrays for
|
||||
// unac/fold/unac+fold. For now there is only one array, which used to
|
||||
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
||||
// removal for some chars and languages where it should not be done.
|
||||
// In conformance with current usage, but incorrectly, we do the following
|
||||
// things for the special chars depending on the operation requested:
|
||||
// - unaccenting: do nothing (copy original char)
|
||||
// - unac+fold: use table
|
||||
// - fold: use the unicode data.
|
||||
string trans;
|
||||
if (what != UNAC_FOLD && except_trans.size() != 0 &&
|
||||
is_except_char(c, trans)) {
|
||||
if (what == UNAC_UNAC) {
|
||||
// Unaccent only. Do nothing
|
||||
p = 0;
|
||||
l = 0;
|
||||
} else {
|
||||
// Has to be UNAC_UNACFOLD: use table
|
||||
p = (unsigned short *)trans.c_str();
|
||||
l = trans.size() / 2;
|
||||
}
|
||||
} else {
|
||||
#endif /* BUILDING_RECOLL */
|
||||
unac_uf_char_utf16_(c, p, l, what)
|
||||
#ifdef BUILDING_RECOLL
|
||||
}
|
||||
#endif /* BUILDING_RECOLL */
|
||||
|
||||
/*
|
||||
* Explain what's done in great detail
|
||||
*/
|
||||
if(debug_level == UNAC_DEBUG_HIGH) {
|
||||
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
||||
unsigned char position = (c) & UNAC_BLOCK_MASK;
|
||||
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
||||
DEBUG_APPEND("0x%04x => ", (c));
|
||||
if(l == 0) {
|
||||
DEBUG_APPEND("untouched\n");
|
||||
} else {
|
||||
char* out;
|
||||
size_t out_size;
|
||||
size_t out_length;
|
||||
size_t i;
|
||||
for(i = 0; i < l; i++)
|
||||
DEBUG_APPEND("0x%04x ", p[i]);
|
||||
DEBUG_APPEND("\n");
|
||||
}
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = (char*)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
/* *outp is still valid. Let the caller free it */
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure there is enough space to hold the decomposition
|
||||
* Note: a previous realloc may have succeeded, which means that *outp
|
||||
* is not valid any more. We have to do the freeing and zero out *outp
|
||||
*/
|
||||
if(out_length + ((l + 1) * 2) > out_size) {
|
||||
char *saved;
|
||||
out_size += ((l + 1) * 2) + 1024;
|
||||
saved = out;
|
||||
out = (char *)realloc(out, out_size);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size);
|
||||
free(saved);
|
||||
*outp = 0;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if(l > 0) {
|
||||
/* l == 1 && *p == 0 is the special case generated for
|
||||
mark characters (which may be found if the input is
|
||||
already in decomposed form). Output nothing */
|
||||
if (l != 1 || *p != 0) {
|
||||
out_length = 0;
|
||||
|
||||
for(i = 0; i < in_length; i += 2) {
|
||||
unsigned short c;
|
||||
unsigned short* p;
|
||||
size_t l;
|
||||
size_t k;
|
||||
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
||||
/*
|
||||
* If there is a decomposition, insert it in the output
|
||||
* string.
|
||||
* Lookup the tables for decomposition information
|
||||
*/
|
||||
for(k = 0; k < l; k++) {
|
||||
out[out_length++] = (p[k] >> 8) & 0xff;
|
||||
out[out_length++] = (p[k] & 0xff);
|
||||
#ifdef BUILDING_RECOLL
|
||||
// Exception unac/fold values set by user. There should be 3 arrays for
|
||||
// unac/fold/unac+fold. For now there is only one array, which used to
|
||||
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
||||
// removal for some chars and languages where it should not be done.
|
||||
// In conformance with current usage, but incorrectly, we do the following
|
||||
// things for the special chars depending on the operation requested:
|
||||
// - unaccenting: do nothing (copy original char)
|
||||
// - unac+fold: use table
|
||||
// - fold: use the unicode data.
|
||||
string trans;
|
||||
if (what != UNAC_FOLD && except_trans.size() != 0 &&
|
||||
is_except_char(c, trans)) {
|
||||
if (what == UNAC_UNAC) {
|
||||
// Unaccent only. Do nothing
|
||||
p = 0;
|
||||
l = 0;
|
||||
} else {
|
||||
// Has to be UNAC_UNACFOLD: use table
|
||||
p = (unsigned short *)trans.c_str();
|
||||
l = trans.size() / 2;
|
||||
}
|
||||
} else {
|
||||
#endif /* BUILDING_RECOLL */
|
||||
unac_uf_char_utf16_(c, p, l, what)
|
||||
#ifdef BUILDING_RECOLL
|
||||
}
|
||||
#endif /* BUILDING_RECOLL */
|
||||
|
||||
/*
|
||||
* Explain what's done in great detail
|
||||
*/
|
||||
if(debug_level == UNAC_DEBUG_HIGH) {
|
||||
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
||||
unsigned char position = (c) & UNAC_BLOCK_MASK;
|
||||
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
||||
DEBUG_APPEND("0x%04x => ", (c));
|
||||
if(l == 0) {
|
||||
DEBUG_APPEND("untouched\n");
|
||||
} else {
|
||||
size_t i;
|
||||
for(i = 0; i < l; i++)
|
||||
DEBUG_APPEND("0x%04x ", p[i]);
|
||||
DEBUG_APPEND("\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure there is enough space to hold the decomposition
|
||||
* Note: a previous realloc may have succeeded, which means that *outp
|
||||
* is not valid any more. We have to do the freeing and zero out *outp
|
||||
*/
|
||||
if(out_length + ((l + 1) * 2) > out_size) {
|
||||
char *saved;
|
||||
out_size += ((l + 1) * 2) + 1024;
|
||||
saved = out;
|
||||
out = (char *)realloc(out, out_size);
|
||||
if(out == 0) {
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size);
|
||||
free(saved);
|
||||
*outp = 0;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
if(l > 0) {
|
||||
/* l == 1 && *p == 0 is the special case generated for
|
||||
mark characters (which may be found if the input is
|
||||
already in decomposed form). Output nothing */
|
||||
if (l != 1 || *p != 0) {
|
||||
/*
|
||||
* If there is a decomposition, insert it in the output
|
||||
* string.
|
||||
*/
|
||||
for(k = 0; k < l; k++) {
|
||||
out[out_length++] = (p[k] >> 8) & 0xff;
|
||||
out[out_length++] = (p[k] & 0xff);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If there is no decomposition leave it unchanged
|
||||
*/
|
||||
out[out_length++] = in[i];
|
||||
out[out_length++] = in[i + 1];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If there is no decomposition leave it unchanged
|
||||
*/
|
||||
out[out_length++] = in[i];
|
||||
out[out_length++] = in[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
*outp = out;
|
||||
*out_lengthp = out_length;
|
||||
(*outp)[*out_lengthp] = '\0';
|
||||
*outp = out;
|
||||
*out_lengthp = out_length;
|
||||
(*outp)[*out_lengthp] = '\0';
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
int unac_string_utf16(const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int unac_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string_utf16(in, in_length,
|
||||
outp, out_lengthp, UNAC_UNAC);
|
||||
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_UNAC);
|
||||
}
|
||||
int unacfold_string_utf16(const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int unacfold_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string_utf16(in, in_length,
|
||||
outp, out_lengthp, UNAC_UNACFOLD);
|
||||
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_UNACFOLD);
|
||||
}
|
||||
int fold_string_utf16(const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int fold_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string_utf16(in, in_length,
|
||||
outp, out_lengthp, UNAC_FOLD);
|
||||
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_FOLD);
|
||||
}
|
||||
|
||||
static const char *utf16be = "UTF-16BE";
|
||||
@ -14322,229 +14316,223 @@ static std::mutex o_unac_mutex;
|
||||
* The out string is always null terminated.
|
||||
*/
|
||||
static int convert(const char* from, const char* to,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
{
|
||||
int ret = -1;
|
||||
iconv_t cd;
|
||||
char* out;
|
||||
size_t out_remain;
|
||||
size_t out_size;
|
||||
char* out_base;
|
||||
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
||||
const char space[] = { 0x00, 0x20 };
|
||||
int ret = -1;
|
||||
iconv_t cd;
|
||||
char* out;
|
||||
size_t out_remain;
|
||||
size_t out_size;
|
||||
char* out_base;
|
||||
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
||||
const char space[] = { 0x00, 0x20 };
|
||||
|
||||
std::unique_lock<std::mutex> lock(o_unac_mutex);
|
||||
std::unique_lock<std::mutex> lock(o_unac_mutex);
|
||||
|
||||
if (!strcmp(utf16be, from)) {
|
||||
from_utf8 = 0;
|
||||
from_utf16 = 1;
|
||||
} else if (!strcasecmp("UTF-8", from)) {
|
||||
from_utf8 = 1;
|
||||
from_utf16 = 0;
|
||||
} else {
|
||||
from_utf8 = from_utf16 = 0;
|
||||
}
|
||||
if (!strcmp(utf16be, to)) {
|
||||
to_utf8 = 0;
|
||||
to_utf16 = 1;
|
||||
} else if (!strcasecmp("UTF-8", to)) {
|
||||
to_utf8 = 1;
|
||||
to_utf16 = 0;
|
||||
} else {
|
||||
to_utf8 = to_utf16 = 0;
|
||||
}
|
||||
u16tou8 = from_utf16 && to_utf8;
|
||||
u8tou16 = from_utf8 && to_utf16;
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = (char *)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
/* *outp still valid, no freeing */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
goto out;
|
||||
}
|
||||
|
||||
out_remain = out_size;
|
||||
out_base = out;
|
||||
|
||||
if (u8tou16) {
|
||||
if (u8tou16_cd == (iconv_t)-1) {
|
||||
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
iconv(u8tou16_cd, 0, 0, 0, 0);
|
||||
}
|
||||
cd = u8tou16_cd;
|
||||
} else if (u16tou8) {
|
||||
if (u16tou8_cd == (iconv_t)-1) {
|
||||
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
iconv(u16tou8_cd, 0, 0, 0, 0);
|
||||
}
|
||||
cd = u16tou8_cd;
|
||||
} else {
|
||||
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
do {
|
||||
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
||||
switch(errno) {
|
||||
case EILSEQ:
|
||||
/*
|
||||
* If an illegal sequence is found in the context of unac_string
|
||||
* it means the unaccented version of a character contains
|
||||
* a sequence that cannot be mapped back to the original charset.
|
||||
* For instance, the 1/4 character in ISO-8859-1 is decomposed
|
||||
* in three characters including the FRACTION SLASH (2044) which
|
||||
* has no equivalent in the ISO-8859-1 map. One can argue that
|
||||
* the conversions tables should map it to the regular / character
|
||||
* or that a <compat> entry should be associated with it.
|
||||
*
|
||||
* To cope with this situation, convert silently transforms all
|
||||
* illegal sequences (EILSEQ) into a SPACE character 0x0020.
|
||||
*
|
||||
* In the general conversion case this behaviour is not desirable.
|
||||
* However, it is not the responsibility of this program to cope
|
||||
* with inconsistencies of the Unicode description and a bug report
|
||||
* should be submitted to Unicode so that they can fix the problem.
|
||||
*
|
||||
*/
|
||||
if(from_utf16) {
|
||||
const char* tmp = space;
|
||||
size_t tmp_length = 2;
|
||||
if(iconv(cd, (ICONV_CONST char **) &tmp, &tmp_length, &out, &out_remain) == (size_t)-1) {
|
||||
if(errno == E2BIG) {
|
||||
/* fall thru to the E2BIG case below */;
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
/* The offending character was replaced by a SPACE, skip it. */
|
||||
in += 2;
|
||||
in_length -= 2;
|
||||
/* And continue conversion. */
|
||||
break;
|
||||
}
|
||||
if (!strcmp(utf16be, from)) {
|
||||
from_utf8 = 0;
|
||||
from_utf16 = 1;
|
||||
} else if (!strcasecmp("UTF-8", from)) {
|
||||
from_utf8 = 1;
|
||||
from_utf16 = 0;
|
||||
} else {
|
||||
goto out;
|
||||
from_utf8 = from_utf16 = 0;
|
||||
}
|
||||
case E2BIG:
|
||||
{
|
||||
/*
|
||||
* The output does not fit in the current out buffer, enlarge it.
|
||||
*/
|
||||
size_t length = out - out_base;
|
||||
out_size *= 2;
|
||||
{
|
||||
char *saved = out_base;
|
||||
/* +1 for null */
|
||||
out_base = (char *)realloc(out_base, out_size + 1);
|
||||
if (out_base == 0) {
|
||||
/* *outp potentially not valid any more. Free here,
|
||||
* and zero out */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
free(saved);
|
||||
*outp = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
out = out_base + length;
|
||||
out_remain = out_size - length;
|
||||
if (!strcmp(utf16be, to)) {
|
||||
to_utf8 = 0;
|
||||
to_utf16 = 1;
|
||||
} else if (!strcasecmp("UTF-8", to)) {
|
||||
to_utf8 = 1;
|
||||
to_utf16 = 0;
|
||||
} else {
|
||||
to_utf8 = to_utf16 = 0;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
goto out;
|
||||
break;
|
||||
}
|
||||
u16tou8 = from_utf16 && to_utf8;
|
||||
u8tou16 = from_utf8 && to_utf16;
|
||||
|
||||
out_size = in_length > 0 ? in_length : 1024;
|
||||
|
||||
out = *outp;
|
||||
out = (char *)realloc(out, out_size + 1);
|
||||
if(out == 0) {
|
||||
/* *outp still valid, no freeing */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
goto out;
|
||||
}
|
||||
} while(in_length > 0);
|
||||
|
||||
if (!u8tou16 && !u16tou8)
|
||||
iconv_close(cd);
|
||||
out_remain = out_size;
|
||||
out_base = out;
|
||||
|
||||
*outp = out_base;
|
||||
*out_lengthp = out - out_base;
|
||||
(*outp)[*out_lengthp] = '\0';
|
||||
if (u8tou16) {
|
||||
if (u8tou16_cd == (iconv_t)-1) {
|
||||
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
iconv(u8tou16_cd, 0, 0, 0, 0);
|
||||
}
|
||||
cd = u8tou16_cd;
|
||||
} else if (u16tou8) {
|
||||
if (u16tou8_cd == (iconv_t)-1) {
|
||||
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
iconv(u16tou8_cd, 0, 0, 0, 0);
|
||||
}
|
||||
cd = u16tou8_cd;
|
||||
} else {
|
||||
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
do {
|
||||
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
||||
switch(errno) {
|
||||
case EILSEQ:
|
||||
/*
|
||||
* If an illegal sequence is found in the context of unac_string
|
||||
* it means the unaccented version of a character contains
|
||||
* a sequence that cannot be mapped back to the original charset.
|
||||
* For instance, the 1/4 character in ISO-8859-1 is decomposed
|
||||
* in three characters including the FRACTION SLASH (2044) which
|
||||
* has no equivalent in the ISO-8859-1 map. One can argue that
|
||||
* the conversions tables should map it to the regular / character
|
||||
* or that a <compat> entry should be associated with it.
|
||||
*
|
||||
* To cope with this situation, convert silently transforms all
|
||||
* illegal sequences (EILSEQ) into a SPACE character 0x0020.
|
||||
*
|
||||
* In the general conversion case this behaviour is not desirable.
|
||||
* However, it is not the responsibility of this program to cope
|
||||
* with inconsistencies of the Unicode description and a bug report
|
||||
* should be submitted to Unicode so that they can fix the problem.
|
||||
*
|
||||
*/
|
||||
if (from_utf16) {
|
||||
const char* tmp = space;
|
||||
size_t tmp_length = 2;
|
||||
if (iconv(cd, (ICONV_CONST char **)&tmp, &tmp_length, &out, &out_remain) ==
|
||||
(size_t)-1) {
|
||||
if(errno == E2BIG) {
|
||||
/* fall thru to the E2BIG case below */;
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
/* The offending character was replaced by a SPACE, skip it. */
|
||||
in += 2;
|
||||
in_length -= 2;
|
||||
/* And continue conversion. */
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
case E2BIG:
|
||||
{
|
||||
/*
|
||||
* The output does not fit in the current out buffer, enlarge it.
|
||||
*/
|
||||
size_t length = out - out_base;
|
||||
out_size *= 2;
|
||||
{
|
||||
char *saved = out_base;
|
||||
/* +1 for null */
|
||||
out_base = (char *)realloc(out_base, out_size + 1);
|
||||
if (out_base == 0) {
|
||||
/* *outp potentially not valid any more. Free here,
|
||||
* and zero out */
|
||||
if(debug_level >= UNAC_DEBUG_LOW)
|
||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||
free(saved);
|
||||
*outp = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
out = out_base + length;
|
||||
out_remain = out_size - length;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
goto out;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while(in_length > 0);
|
||||
|
||||
if (!u8tou16 && !u16tou8)
|
||||
iconv_close(cd);
|
||||
|
||||
*outp = out_base;
|
||||
*out_lengthp = out - out_base;
|
||||
(*outp)[*out_lengthp] = '\0';
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
int unacmaybefold_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp, int what)
|
||||
int unacmaybefold_string(const char* charset, const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp, int what)
|
||||
{
|
||||
/*
|
||||
* When converting an empty string, skip everything but alloc the
|
||||
* buffer if NULL pointer.
|
||||
*/
|
||||
if (in_length <= 0) {
|
||||
if(!*outp) {
|
||||
if ((*outp = (char*)malloc(32)) == 0)
|
||||
return -1;
|
||||
}
|
||||
(*outp)[0] = '\0';
|
||||
*out_lengthp = 0;
|
||||
if(!*outp) {
|
||||
if ((*outp = (char*)malloc(32)) == 0)
|
||||
return -1;
|
||||
}
|
||||
(*outp)[0] = '\0';
|
||||
*out_lengthp = 0;
|
||||
} else {
|
||||
char* utf16 = 0;
|
||||
size_t utf16_length = 0;
|
||||
char* utf16_unaccented = 0;
|
||||
size_t utf16_unaccented_length = 0;
|
||||
char* utf16 = 0;
|
||||
size_t utf16_length = 0;
|
||||
char* utf16_unaccented = 0;
|
||||
size_t utf16_unaccented_length = 0;
|
||||
|
||||
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
||||
return -1;
|
||||
}
|
||||
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented,
|
||||
&utf16_unaccented_length, what);
|
||||
free(utf16);
|
||||
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented,
|
||||
&utf16_unaccented_length, what);
|
||||
free(utf16);
|
||||
|
||||
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length,
|
||||
outp, out_lengthp) < 0) {
|
||||
return -1;
|
||||
}
|
||||
free(utf16_unaccented);
|
||||
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length,
|
||||
outp, out_lengthp) < 0) {
|
||||
return -1;
|
||||
}
|
||||
free(utf16_unaccented);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int unac_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int unac_string(
|
||||
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string(charset, in, in_length,
|
||||
outp, out_lengthp, UNAC_UNAC);
|
||||
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_UNAC);
|
||||
}
|
||||
int unacfold_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int unacfold_string(
|
||||
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string(charset, in, in_length,
|
||||
outp, out_lengthp, UNAC_UNACFOLD);
|
||||
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_UNACFOLD);
|
||||
}
|
||||
int fold_string(const char* charset,
|
||||
const char* in, size_t in_length,
|
||||
char** outp, size_t* out_lengthp)
|
||||
int fold_string(
|
||||
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||
{
|
||||
return unacmaybefold_string(charset, in, in_length,
|
||||
outp, out_lengthp, UNAC_FOLD);
|
||||
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_FOLD);
|
||||
}
|
||||
|
||||
const char* unac_version(void)
|
||||
{
|
||||
return UNAC_VERSION;
|
||||
return UNAC_VERSION;
|
||||
}
|
||||
|
||||
#ifdef BUILDING_RECOLL
|
||||
@ -14552,7 +14540,7 @@ void unac_set_except_translations(const char *spectrans)
|
||||
{
|
||||
except_trans.clear();
|
||||
if (!spectrans || !spectrans[0])
|
||||
return;
|
||||
return;
|
||||
|
||||
// The translation tables out of Unicode are in machine byte order (we
|
||||
// just let the compiler read the values).
|
||||
@ -14563,41 +14551,39 @@ void unac_set_except_translations(const char *spectrans)
|
||||
static const char *machinecoding = 0;
|
||||
bool littleendian = true;
|
||||
if (machinecoding == 0) {
|
||||
const char* charshort = "\001\002";
|
||||
short *ip = (short *)charshort;
|
||||
if (*ip == 0x0102) {
|
||||
littleendian = false;
|
||||
machinecoding = "UTF-16BE";
|
||||
} else {
|
||||
littleendian = true;
|
||||
machinecoding = "UTF-16LE";
|
||||
}
|
||||
const char* charshort = "\001\002";
|
||||
short *ip = (short *)charshort;
|
||||
if (*ip == 0x0102) {
|
||||
littleendian = false;
|
||||
machinecoding = "UTF-16BE";
|
||||
} else {
|
||||
littleendian = true;
|
||||
machinecoding = "UTF-16LE";
|
||||
}
|
||||
}
|
||||
|
||||
vector<string> vtrans;
|
||||
stringToStrings(spectrans, vtrans);
|
||||
|
||||
for (vector<string>::iterator it = vtrans.begin();
|
||||
it != vtrans.end(); it++) {
|
||||
for (const auto& trans : vtrans) {
|
||||
|
||||
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||
char *out = 0;
|
||||
size_t outsize;
|
||||
if (convert("UTF-8", machinecoding,
|
||||
it->c_str(), it->size(),
|
||||
&out, &outsize) != 0 || outsize < 2)
|
||||
continue;
|
||||
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||
char *out = 0;
|
||||
size_t outsize;
|
||||
if (convert("UTF-8", machinecoding, trans.c_str(), trans.size(), &out, &outsize) != 0 ||
|
||||
outsize < 2)
|
||||
continue;
|
||||
|
||||
/* The source char must be utf-16be as this is what we convert the
|
||||
input text to for internal processing */
|
||||
unsigned short ch;
|
||||
if (littleendian)
|
||||
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||
else
|
||||
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||
/* The source char must be utf-16be as this is what we convert the
|
||||
input text to for internal processing */
|
||||
unsigned short ch;
|
||||
if (littleendian)
|
||||
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||
else
|
||||
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||
|
||||
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
||||
free(out);
|
||||
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
||||
free(out);
|
||||
}
|
||||
}
|
||||
#endif /* BUILDING_RECOLL */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user