indents and readability
This commit is contained in:
parent
4cc0bc90b6
commit
a24fc7bacc
@ -163,6 +163,9 @@ private:
|
|||||||
// Current span. Might be jf.dockes@wanadoo.f
|
// Current span. Might be jf.dockes@wanadoo.f
|
||||||
std::string m_span;
|
std::string m_span;
|
||||||
|
|
||||||
|
// Words in span: byte positions of start and end of words in m_span. For example:
|
||||||
|
// 0 4 9
|
||||||
|
// bill@some.com -> (0,4) (5,9) (10,13)
|
||||||
std::vector <std::pair<int, int> > m_words_in_span;
|
std::vector <std::pair<int, int> > m_words_in_span;
|
||||||
|
|
||||||
// Current word: no punctuation at all in there. Byte offset
|
// Current word: no punctuation at all in there. Byte offset
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
/* Copyright (C) 2004-2019 J.F.Dockes
|
/* Copyright (C) 2004-2021 J.F.Dockes
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License as published by
|
* it under the terms of the GNU General Public License as published by
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -29,7 +29,7 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
bool unacmaybefold(const string &in, string &out,
|
bool unacmaybefold(const string &in, string &out,
|
||||||
const char *encoding, UnacOp what)
|
const char *encoding, UnacOp what)
|
||||||
{
|
{
|
||||||
char *cout = 0;
|
char *cout = 0;
|
||||||
size_t out_len;
|
size_t out_len;
|
||||||
@ -37,16 +37,13 @@ bool unacmaybefold(const string &in, string &out,
|
|||||||
|
|
||||||
switch (what) {
|
switch (what) {
|
||||||
case UNACOP_UNAC:
|
case UNACOP_UNAC:
|
||||||
status = unac_string(encoding, in.c_str(), in.length(),
|
status = unac_string(encoding, in.c_str(), in.length(), &cout, &out_len);
|
||||||
&cout, &out_len);
|
|
||||||
break;
|
break;
|
||||||
case UNACOP_UNACFOLD:
|
case UNACOP_UNACFOLD:
|
||||||
status = unacfold_string(encoding, in.c_str(), in.length(),
|
status = unacfold_string(encoding, in.c_str(), in.length(), &cout, &out_len);
|
||||||
&cout, &out_len);
|
|
||||||
break;
|
break;
|
||||||
case UNACOP_FOLD:
|
case UNACOP_FOLD:
|
||||||
status = fold_string(encoding, in.c_str(), in.length(),
|
status = fold_string(encoding, in.c_str(), in.length(), &cout, &out_len);
|
||||||
&cout, &out_len);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
/* Copyright (C) 2005 J.F.Dockes
|
/* Copyright (C) 2005-2021 J.F.Dockes
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License as published by
|
* it under the terms of the GNU General Public License as published by
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -78,8 +78,7 @@ public:
|
|||||||
string dumb = term;
|
string dumb = term;
|
||||||
if (o_index_stripchars) {
|
if (o_index_stripchars) {
|
||||||
if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
|
if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
LOGINFO("PlainToRich::takeword: unac failed for [" << term <<
|
LOGINFO("PlainToRich::takeword: unac failed for [" << term << "]\n");
|
||||||
"]\n");
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -173,30 +172,25 @@ static string activate_urls(const string& in)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Fix result text for display inside the gui text window.
|
// Enrich result text for display inside the gui text window.
|
||||||
//
|
//
|
||||||
// We call overridden functions to output header data, beginnings and ends of
|
// We call overridden functions to output header data, beginnings and ends of matches etc.
|
||||||
// matches etc.
|
|
||||||
//
|
//
|
||||||
// If the input is text, we output the result in chunks, arranging not
|
// If the input is text, we output the result in chunks, arranging not to cut in the middle of a
|
||||||
// to cut in the middle of a tag, which would confuse qtextedit. If
|
// tag, which would confuse qtextedit. If the input is html, the body is always a single output
|
||||||
// the input is html, the body is always a single output chunk.
|
// chunk.
|
||||||
bool PlainToRich::plaintorich(const string& in,
|
bool PlainToRich::plaintorich(
|
||||||
list<string>& out, // Output chunk list
|
const string& in, list<string>& out, const HighlightData& hdata, int chunksize)
|
||||||
const HighlightData& hdata,
|
|
||||||
int chunksize)
|
|
||||||
{
|
{
|
||||||
Chrono chron;
|
Chrono chron;
|
||||||
bool ret = true;
|
bool ret = true;
|
||||||
LOGDEB1("plaintorichich: in: [" << in << "]\n");
|
LOGDEB1("plaintorichich: in: [" << in << "]\n");
|
||||||
|
|
||||||
m_hdata = &hdata;
|
m_hdata = &hdata;
|
||||||
// Compute the positions for the query terms. We use the text
|
// Compute the positions for the query terms. We use the text splitter to break the text into
|
||||||
// splitter to break the text into words, and compare the words to
|
// words, and compare the words to the search terms,
|
||||||
// the search terms,
|
|
||||||
TextSplitPTR splitter(hdata);
|
TextSplitPTR splitter(hdata);
|
||||||
// Note: the splitter returns the term locations in byte, not
|
// Note: the splitter returns the term locations in byte, not character, offsets.
|
||||||
// character, offsets.
|
|
||||||
splitter.text_to_words(in);
|
splitter.text_to_words(in);
|
||||||
LOGDEB2("plaintorich: split done " << chron.millis() << " mS\n");
|
LOGDEB2("plaintorich: split done " << chron.millis() << " mS\n");
|
||||||
// Compute the positions for NEAR and PHRASE groups.
|
// Compute the positions for NEAR and PHRASE groups.
|
||||||
@ -205,7 +199,7 @@ bool PlainToRich::plaintorich(const string& in,
|
|||||||
|
|
||||||
out.clear();
|
out.clear();
|
||||||
out.push_back("");
|
out.push_back("");
|
||||||
list<string>::iterator olit = out.begin();
|
auto olit = out.begin();
|
||||||
|
|
||||||
// Rich text output
|
// Rich text output
|
||||||
*olit = header();
|
*olit = header();
|
||||||
@ -225,9 +219,10 @@ bool PlainToRich::plaintorich(const string& in,
|
|||||||
vector<GroupMatchEntry>::iterator tPosEnd = splitter.m_tboffs.end();
|
vector<GroupMatchEntry>::iterator tPosEnd = splitter.m_tboffs.end();
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
for (vector<pair<int, int> >::const_iterator it = splitter.m_tboffs.begin();
|
for (const auto& region : splitter.m_tboffs) {
|
||||||
it != splitter.m_tboffs.end(); it++) {
|
auto st = region.offs.first;
|
||||||
LOGDEB2("plaintorich: region: " << it->first << " "<<it->second<< "\n");
|
auto nd = region.offs.second;
|
||||||
|
LOGDEB0("plaintorich: region: " << st << " " << nd << "\n");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -276,8 +271,7 @@ bool PlainToRich::plaintorich(const string& in,
|
|||||||
}
|
}
|
||||||
// Skip all highlight areas that would overlap this one
|
// Skip all highlight areas that would overlap this one
|
||||||
int crend = tPosIt->offs.second;
|
int crend = tPosIt->offs.second;
|
||||||
while (tPosIt != splitter.m_tboffs.end() &&
|
while (tPosIt != splitter.m_tboffs.end() && tPosIt->offs.first < crend)
|
||||||
tPosIt->offs.first < crend)
|
|
||||||
tPosIt++;
|
tPosIt++;
|
||||||
inrcltag = 0;
|
inrcltag = 0;
|
||||||
}
|
}
|
||||||
|
|||||||
702
src/unac/unac.c
702
src/unac/unac.c
@ -14109,7 +14109,7 @@ static int debug_level = UNAC_DEBUG_LOW;
|
|||||||
*/
|
*/
|
||||||
static void debug_doprint_default(const char* message, void* data)
|
static void debug_doprint_default(const char* message, void* data)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "%s", message);
|
fprintf(stderr, "%s", message);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -14130,30 +14130,30 @@ static void* debug_appdata = (void*)0;
|
|||||||
static void debug_print(const char* message, ...)
|
static void debug_print(const char* message, ...)
|
||||||
{
|
{
|
||||||
#define UNAC_MAXIMUM_MESSAGE_SIZE 512
|
#define UNAC_MAXIMUM_MESSAGE_SIZE 512
|
||||||
/*
|
/*
|
||||||
* UNAC_MAXIMUM_MESSAGE_SIZE is supposedly enough but I
|
* UNAC_MAXIMUM_MESSAGE_SIZE is supposedly enough but I
|
||||||
* do trust some vsnprintf implementations to be buggy.
|
* do trust some vsnprintf implementations to be buggy.
|
||||||
*/
|
*/
|
||||||
char unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE+1] = { '\0' };
|
char unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE+1] = { '\0' };
|
||||||
va_list args;
|
va_list args;
|
||||||
va_start(args, message);
|
va_start(args, message);
|
||||||
if(vsnprintf(unac_message_buffer, UNAC_MAXIMUM_MESSAGE_SIZE, message, args) < 0) {
|
if(vsnprintf(unac_message_buffer, UNAC_MAXIMUM_MESSAGE_SIZE, message, args) < 0) {
|
||||||
char tmp[UNAC_MAXIMUM_MESSAGE_SIZE];
|
char tmp[UNAC_MAXIMUM_MESSAGE_SIZE];
|
||||||
sprintf(tmp, "[message larger than %d, truncated]", UNAC_MAXIMUM_MESSAGE_SIZE);
|
sprintf(tmp, "[message larger than %d, truncated]", UNAC_MAXIMUM_MESSAGE_SIZE);
|
||||||
debug_doprint(tmp, debug_appdata);
|
debug_doprint(tmp, debug_appdata);
|
||||||
}
|
}
|
||||||
va_end(args);
|
va_end(args);
|
||||||
unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE] = '\0';
|
unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE] = '\0';
|
||||||
|
|
||||||
debug_doprint(unac_message_buffer, debug_appdata);
|
debug_doprint(unac_message_buffer, debug_appdata);
|
||||||
}
|
}
|
||||||
|
|
||||||
void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
||||||
{
|
{
|
||||||
debug_level = level;
|
debug_level = level;
|
||||||
if(function)
|
if(function)
|
||||||
debug_doprint = function;
|
debug_doprint = function;
|
||||||
debug_appdata = data;
|
debug_appdata = data;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else /* UNAC_DEBUG_AVAILABLE */
|
#else /* UNAC_DEBUG_AVAILABLE */
|
||||||
@ -14167,146 +14167,140 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
|||||||
#define UNAC_FOLD 2
|
#define UNAC_FOLD 2
|
||||||
|
|
||||||
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||||
char** outp, size_t* out_lengthp, int what)
|
char** outp, size_t* out_lengthp, int what)
|
||||||
{
|
{
|
||||||
char* out;
|
char* out;
|
||||||
size_t out_size;
|
size_t out_size;
|
||||||
size_t out_length;
|
size_t out_length;
|
||||||
size_t i;
|
|
||||||
|
|
||||||
out_size = in_length > 0 ? in_length : 1024;
|
|
||||||
|
|
||||||
out = *outp;
|
|
||||||
out = (char*)realloc(out, out_size + 1);
|
|
||||||
if(out == 0) {
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
|
||||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
|
||||||
/* *outp is still valid. Let the caller free it */
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
out_length = 0;
|
|
||||||
|
|
||||||
for(i = 0; i < in_length; i += 2) {
|
|
||||||
unsigned short c;
|
|
||||||
unsigned short* p;
|
|
||||||
size_t l;
|
|
||||||
size_t k;
|
|
||||||
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
|
||||||
/*
|
|
||||||
* Lookup the tables for decomposition information
|
|
||||||
*/
|
|
||||||
#ifdef BUILDING_RECOLL
|
|
||||||
// Exception unac/fold values set by user. There should be 3 arrays for
|
|
||||||
// unac/fold/unac+fold. For now there is only one array, which used to
|
|
||||||
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
|
||||||
// removal for some chars and languages where it should not be done.
|
|
||||||
// In conformance with current usage, but incorrectly, we do the following
|
|
||||||
// things for the special chars depending on the operation requested:
|
|
||||||
// - unaccenting: do nothing (copy original char)
|
|
||||||
// - unac+fold: use table
|
|
||||||
// - fold: use the unicode data.
|
|
||||||
string trans;
|
|
||||||
if (what != UNAC_FOLD && except_trans.size() != 0 &&
|
|
||||||
is_except_char(c, trans)) {
|
|
||||||
if (what == UNAC_UNAC) {
|
|
||||||
// Unaccent only. Do nothing
|
|
||||||
p = 0;
|
|
||||||
l = 0;
|
|
||||||
} else {
|
|
||||||
// Has to be UNAC_UNACFOLD: use table
|
|
||||||
p = (unsigned short *)trans.c_str();
|
|
||||||
l = trans.size() / 2;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
#endif /* BUILDING_RECOLL */
|
|
||||||
unac_uf_char_utf16_(c, p, l, what)
|
|
||||||
#ifdef BUILDING_RECOLL
|
|
||||||
}
|
|
||||||
#endif /* BUILDING_RECOLL */
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Explain what's done in great detail
|
|
||||||
*/
|
|
||||||
if(debug_level == UNAC_DEBUG_HIGH) {
|
|
||||||
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
|
||||||
unsigned char position = (c) & UNAC_BLOCK_MASK;
|
|
||||||
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
|
||||||
DEBUG_APPEND("0x%04x => ", (c));
|
|
||||||
if(l == 0) {
|
|
||||||
DEBUG_APPEND("untouched\n");
|
|
||||||
} else {
|
|
||||||
size_t i;
|
size_t i;
|
||||||
for(i = 0; i < l; i++)
|
|
||||||
DEBUG_APPEND("0x%04x ", p[i]);
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
DEBUG_APPEND("\n");
|
|
||||||
}
|
out = *outp;
|
||||||
|
out = (char*)realloc(out, out_size + 1);
|
||||||
|
if(out == 0) {
|
||||||
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
|
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||||
|
/* *outp is still valid. Let the caller free it */
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
out_length = 0;
|
||||||
* Make sure there is enough space to hold the decomposition
|
|
||||||
* Note: a previous realloc may have succeeded, which means that *outp
|
for(i = 0; i < in_length; i += 2) {
|
||||||
* is not valid any more. We have to do the freeing and zero out *outp
|
unsigned short c;
|
||||||
*/
|
unsigned short* p;
|
||||||
if(out_length + ((l + 1) * 2) > out_size) {
|
size_t l;
|
||||||
char *saved;
|
size_t k;
|
||||||
out_size += ((l + 1) * 2) + 1024;
|
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
||||||
saved = out;
|
|
||||||
out = (char *)realloc(out, out_size);
|
|
||||||
if(out == 0) {
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
|
||||||
DEBUG("realloc %d bytes failed\n", out_size);
|
|
||||||
free(saved);
|
|
||||||
*outp = 0;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(l > 0) {
|
|
||||||
/* l == 1 && *p == 0 is the special case generated for
|
|
||||||
mark characters (which may be found if the input is
|
|
||||||
already in decomposed form). Output nothing */
|
|
||||||
if (l != 1 || *p != 0) {
|
|
||||||
/*
|
/*
|
||||||
* If there is a decomposition, insert it in the output
|
* Lookup the tables for decomposition information
|
||||||
* string.
|
|
||||||
*/
|
*/
|
||||||
for(k = 0; k < l; k++) {
|
#ifdef BUILDING_RECOLL
|
||||||
out[out_length++] = (p[k] >> 8) & 0xff;
|
// Exception unac/fold values set by user. There should be 3 arrays for
|
||||||
out[out_length++] = (p[k] & 0xff);
|
// unac/fold/unac+fold. For now there is only one array, which used to
|
||||||
|
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
||||||
|
// removal for some chars and languages where it should not be done.
|
||||||
|
// In conformance with current usage, but incorrectly, we do the following
|
||||||
|
// things for the special chars depending on the operation requested:
|
||||||
|
// - unaccenting: do nothing (copy original char)
|
||||||
|
// - unac+fold: use table
|
||||||
|
// - fold: use the unicode data.
|
||||||
|
string trans;
|
||||||
|
if (what != UNAC_FOLD && except_trans.size() != 0 &&
|
||||||
|
is_except_char(c, trans)) {
|
||||||
|
if (what == UNAC_UNAC) {
|
||||||
|
// Unaccent only. Do nothing
|
||||||
|
p = 0;
|
||||||
|
l = 0;
|
||||||
|
} else {
|
||||||
|
// Has to be UNAC_UNACFOLD: use table
|
||||||
|
p = (unsigned short *)trans.c_str();
|
||||||
|
l = trans.size() / 2;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
#endif /* BUILDING_RECOLL */
|
||||||
|
unac_uf_char_utf16_(c, p, l, what)
|
||||||
|
#ifdef BUILDING_RECOLL
|
||||||
|
}
|
||||||
|
#endif /* BUILDING_RECOLL */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Explain what's done in great detail
|
||||||
|
*/
|
||||||
|
if(debug_level == UNAC_DEBUG_HIGH) {
|
||||||
|
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
||||||
|
unsigned char position = (c) & UNAC_BLOCK_MASK;
|
||||||
|
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
||||||
|
DEBUG_APPEND("0x%04x => ", (c));
|
||||||
|
if(l == 0) {
|
||||||
|
DEBUG_APPEND("untouched\n");
|
||||||
|
} else {
|
||||||
|
size_t i;
|
||||||
|
for(i = 0; i < l; i++)
|
||||||
|
DEBUG_APPEND("0x%04x ", p[i]);
|
||||||
|
DEBUG_APPEND("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Make sure there is enough space to hold the decomposition
|
||||||
|
* Note: a previous realloc may have succeeded, which means that *outp
|
||||||
|
* is not valid any more. We have to do the freeing and zero out *outp
|
||||||
|
*/
|
||||||
|
if(out_length + ((l + 1) * 2) > out_size) {
|
||||||
|
char *saved;
|
||||||
|
out_size += ((l + 1) * 2) + 1024;
|
||||||
|
saved = out;
|
||||||
|
out = (char *)realloc(out, out_size);
|
||||||
|
if(out == 0) {
|
||||||
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
|
DEBUG("realloc %d bytes failed\n", out_size);
|
||||||
|
free(saved);
|
||||||
|
*outp = 0;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(l > 0) {
|
||||||
|
/* l == 1 && *p == 0 is the special case generated for
|
||||||
|
mark characters (which may be found if the input is
|
||||||
|
already in decomposed form). Output nothing */
|
||||||
|
if (l != 1 || *p != 0) {
|
||||||
|
/*
|
||||||
|
* If there is a decomposition, insert it in the output
|
||||||
|
* string.
|
||||||
|
*/
|
||||||
|
for(k = 0; k < l; k++) {
|
||||||
|
out[out_length++] = (p[k] >> 8) & 0xff;
|
||||||
|
out[out_length++] = (p[k] & 0xff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* If there is no decomposition leave it unchanged
|
||||||
|
*/
|
||||||
|
out[out_length++] = in[i];
|
||||||
|
out[out_length++] = in[i + 1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
/*
|
|
||||||
* If there is no decomposition leave it unchanged
|
|
||||||
*/
|
|
||||||
out[out_length++] = in[i];
|
|
||||||
out[out_length++] = in[i + 1];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
*outp = out;
|
*outp = out;
|
||||||
*out_lengthp = out_length;
|
*out_lengthp = out_length;
|
||||||
(*outp)[*out_lengthp] = '\0';
|
(*outp)[*out_lengthp] = '\0';
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
int unac_string_utf16(const char* in, size_t in_length,
|
int unac_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string_utf16(in, in_length,
|
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_UNAC);
|
||||||
outp, out_lengthp, UNAC_UNAC);
|
|
||||||
}
|
}
|
||||||
int unacfold_string_utf16(const char* in, size_t in_length,
|
int unacfold_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string_utf16(in, in_length,
|
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_UNACFOLD);
|
||||||
outp, out_lengthp, UNAC_UNACFOLD);
|
|
||||||
}
|
}
|
||||||
int fold_string_utf16(const char* in, size_t in_length,
|
int fold_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string_utf16(in, in_length,
|
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_FOLD);
|
||||||
outp, out_lengthp, UNAC_FOLD);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char *utf16be = "UTF-16BE";
|
static const char *utf16be = "UTF-16BE";
|
||||||
@ -14322,229 +14316,223 @@ static std::mutex o_unac_mutex;
|
|||||||
* The out string is always null terminated.
|
* The out string is always null terminated.
|
||||||
*/
|
*/
|
||||||
static int convert(const char* from, const char* to,
|
static int convert(const char* from, const char* to,
|
||||||
const char* in, size_t in_length,
|
const char* in, size_t in_length,
|
||||||
char** outp, size_t* out_lengthp)
|
char** outp, size_t* out_lengthp)
|
||||||
{
|
{
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
iconv_t cd;
|
iconv_t cd;
|
||||||
char* out;
|
char* out;
|
||||||
size_t out_remain;
|
size_t out_remain;
|
||||||
size_t out_size;
|
size_t out_size;
|
||||||
char* out_base;
|
char* out_base;
|
||||||
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
||||||
const char space[] = { 0x00, 0x20 };
|
const char space[] = { 0x00, 0x20 };
|
||||||
|
|
||||||
std::unique_lock<std::mutex> lock(o_unac_mutex);
|
std::unique_lock<std::mutex> lock(o_unac_mutex);
|
||||||
|
|
||||||
if (!strcmp(utf16be, from)) {
|
if (!strcmp(utf16be, from)) {
|
||||||
from_utf8 = 0;
|
from_utf8 = 0;
|
||||||
from_utf16 = 1;
|
from_utf16 = 1;
|
||||||
} else if (!strcasecmp("UTF-8", from)) {
|
} else if (!strcasecmp("UTF-8", from)) {
|
||||||
from_utf8 = 1;
|
from_utf8 = 1;
|
||||||
from_utf16 = 0;
|
from_utf16 = 0;
|
||||||
} else {
|
|
||||||
from_utf8 = from_utf16 = 0;
|
|
||||||
}
|
|
||||||
if (!strcmp(utf16be, to)) {
|
|
||||||
to_utf8 = 0;
|
|
||||||
to_utf16 = 1;
|
|
||||||
} else if (!strcasecmp("UTF-8", to)) {
|
|
||||||
to_utf8 = 1;
|
|
||||||
to_utf16 = 0;
|
|
||||||
} else {
|
|
||||||
to_utf8 = to_utf16 = 0;
|
|
||||||
}
|
|
||||||
u16tou8 = from_utf16 && to_utf8;
|
|
||||||
u8tou16 = from_utf8 && to_utf16;
|
|
||||||
|
|
||||||
out_size = in_length > 0 ? in_length : 1024;
|
|
||||||
|
|
||||||
out = *outp;
|
|
||||||
out = (char *)realloc(out, out_size + 1);
|
|
||||||
if(out == 0) {
|
|
||||||
/* *outp still valid, no freeing */
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
|
||||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
out_remain = out_size;
|
|
||||||
out_base = out;
|
|
||||||
|
|
||||||
if (u8tou16) {
|
|
||||||
if (u8tou16_cd == (iconv_t)-1) {
|
|
||||||
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
iconv(u8tou16_cd, 0, 0, 0, 0);
|
|
||||||
}
|
|
||||||
cd = u8tou16_cd;
|
|
||||||
} else if (u16tou8) {
|
|
||||||
if (u16tou8_cd == (iconv_t)-1) {
|
|
||||||
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
iconv(u16tou8_cd, 0, 0, 0, 0);
|
|
||||||
}
|
|
||||||
cd = u16tou8_cd;
|
|
||||||
} else {
|
|
||||||
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
do {
|
|
||||||
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
|
||||||
switch(errno) {
|
|
||||||
case EILSEQ:
|
|
||||||
/*
|
|
||||||
* If an illegal sequence is found in the context of unac_string
|
|
||||||
* it means the unaccented version of a character contains
|
|
||||||
* a sequence that cannot be mapped back to the original charset.
|
|
||||||
* For instance, the 1/4 character in ISO-8859-1 is decomposed
|
|
||||||
* in three characters including the FRACTION SLASH (2044) which
|
|
||||||
* has no equivalent in the ISO-8859-1 map. One can argue that
|
|
||||||
* the conversions tables should map it to the regular / character
|
|
||||||
* or that a <compat> entry should be associated with it.
|
|
||||||
*
|
|
||||||
* To cope with this situation, convert silently transforms all
|
|
||||||
* illegal sequences (EILSEQ) into a SPACE character 0x0020.
|
|
||||||
*
|
|
||||||
* In the general conversion case this behaviour is not desirable.
|
|
||||||
* However, it is not the responsibility of this program to cope
|
|
||||||
* with inconsistencies of the Unicode description and a bug report
|
|
||||||
* should be submitted to Unicode so that they can fix the problem.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
if(from_utf16) {
|
|
||||||
const char* tmp = space;
|
|
||||||
size_t tmp_length = 2;
|
|
||||||
if(iconv(cd, (ICONV_CONST char **) &tmp, &tmp_length, &out, &out_remain) == (size_t)-1) {
|
|
||||||
if(errno == E2BIG) {
|
|
||||||
/* fall thru to the E2BIG case below */;
|
|
||||||
} else {
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
/* The offending character was replaced by a SPACE, skip it. */
|
|
||||||
in += 2;
|
|
||||||
in_length -= 2;
|
|
||||||
/* And continue conversion. */
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
goto out;
|
from_utf8 = from_utf16 = 0;
|
||||||
}
|
}
|
||||||
case E2BIG:
|
if (!strcmp(utf16be, to)) {
|
||||||
{
|
to_utf8 = 0;
|
||||||
/*
|
to_utf16 = 1;
|
||||||
* The output does not fit in the current out buffer, enlarge it.
|
} else if (!strcasecmp("UTF-8", to)) {
|
||||||
*/
|
to_utf8 = 1;
|
||||||
size_t length = out - out_base;
|
to_utf16 = 0;
|
||||||
out_size *= 2;
|
} else {
|
||||||
{
|
to_utf8 = to_utf16 = 0;
|
||||||
char *saved = out_base;
|
|
||||||
/* +1 for null */
|
|
||||||
out_base = (char *)realloc(out_base, out_size + 1);
|
|
||||||
if (out_base == 0) {
|
|
||||||
/* *outp potentially not valid any more. Free here,
|
|
||||||
* and zero out */
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
|
||||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
|
||||||
free(saved);
|
|
||||||
*outp = 0;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out = out_base + length;
|
|
||||||
out_remain = out_size - length;
|
|
||||||
}
|
}
|
||||||
break;
|
u16tou8 = from_utf16 && to_utf8;
|
||||||
default:
|
u8tou16 = from_utf8 && to_utf16;
|
||||||
goto out;
|
|
||||||
break;
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
}
|
|
||||||
|
out = *outp;
|
||||||
|
out = (char *)realloc(out, out_size + 1);
|
||||||
|
if(out == 0) {
|
||||||
|
/* *outp still valid, no freeing */
|
||||||
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
|
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||||
|
goto out;
|
||||||
}
|
}
|
||||||
} while(in_length > 0);
|
|
||||||
|
|
||||||
if (!u8tou16 && !u16tou8)
|
out_remain = out_size;
|
||||||
iconv_close(cd);
|
out_base = out;
|
||||||
|
|
||||||
*outp = out_base;
|
if (u8tou16) {
|
||||||
*out_lengthp = out - out_base;
|
if (u8tou16_cd == (iconv_t)-1) {
|
||||||
(*outp)[*out_lengthp] = '\0';
|
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
iconv(u8tou16_cd, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
cd = u8tou16_cd;
|
||||||
|
} else if (u16tou8) {
|
||||||
|
if (u16tou8_cd == (iconv_t)-1) {
|
||||||
|
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
iconv(u16tou8_cd, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
cd = u16tou8_cd;
|
||||||
|
} else {
|
||||||
|
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ret = 0;
|
do {
|
||||||
|
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
||||||
|
switch(errno) {
|
||||||
|
case EILSEQ:
|
||||||
|
/*
|
||||||
|
* If an illegal sequence is found in the context of unac_string
|
||||||
|
* it means the unaccented version of a character contains
|
||||||
|
* a sequence that cannot be mapped back to the original charset.
|
||||||
|
* For instance, the 1/4 character in ISO-8859-1 is decomposed
|
||||||
|
* in three characters including the FRACTION SLASH (2044) which
|
||||||
|
* has no equivalent in the ISO-8859-1 map. One can argue that
|
||||||
|
* the conversions tables should map it to the regular / character
|
||||||
|
* or that a <compat> entry should be associated with it.
|
||||||
|
*
|
||||||
|
* To cope with this situation, convert silently transforms all
|
||||||
|
* illegal sequences (EILSEQ) into a SPACE character 0x0020.
|
||||||
|
*
|
||||||
|
* In the general conversion case this behaviour is not desirable.
|
||||||
|
* However, it is not the responsibility of this program to cope
|
||||||
|
* with inconsistencies of the Unicode description and a bug report
|
||||||
|
* should be submitted to Unicode so that they can fix the problem.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
if (from_utf16) {
|
||||||
|
const char* tmp = space;
|
||||||
|
size_t tmp_length = 2;
|
||||||
|
if (iconv(cd, (ICONV_CONST char **)&tmp, &tmp_length, &out, &out_remain) ==
|
||||||
|
(size_t)-1) {
|
||||||
|
if(errno == E2BIG) {
|
||||||
|
/* fall thru to the E2BIG case below */;
|
||||||
|
} else {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* The offending character was replaced by a SPACE, skip it. */
|
||||||
|
in += 2;
|
||||||
|
in_length -= 2;
|
||||||
|
/* And continue conversion. */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
case E2BIG:
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* The output does not fit in the current out buffer, enlarge it.
|
||||||
|
*/
|
||||||
|
size_t length = out - out_base;
|
||||||
|
out_size *= 2;
|
||||||
|
{
|
||||||
|
char *saved = out_base;
|
||||||
|
/* +1 for null */
|
||||||
|
out_base = (char *)realloc(out_base, out_size + 1);
|
||||||
|
if (out_base == 0) {
|
||||||
|
/* *outp potentially not valid any more. Free here,
|
||||||
|
* and zero out */
|
||||||
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
|
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||||
|
free(saved);
|
||||||
|
*outp = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = out_base + length;
|
||||||
|
out_remain = out_size - length;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
goto out;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while(in_length > 0);
|
||||||
|
|
||||||
|
if (!u8tou16 && !u16tou8)
|
||||||
|
iconv_close(cd);
|
||||||
|
|
||||||
|
*outp = out_base;
|
||||||
|
*out_lengthp = out - out_base;
|
||||||
|
(*outp)[*out_lengthp] = '\0';
|
||||||
|
|
||||||
|
ret = 0;
|
||||||
out:
|
out:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int unacmaybefold_string(const char* charset,
|
int unacmaybefold_string(const char* charset, const char* in, size_t in_length,
|
||||||
const char* in, size_t in_length,
|
char** outp, size_t* out_lengthp, int what)
|
||||||
char** outp, size_t* out_lengthp, int what)
|
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* When converting an empty string, skip everything but alloc the
|
* When converting an empty string, skip everything but alloc the
|
||||||
* buffer if NULL pointer.
|
* buffer if NULL pointer.
|
||||||
*/
|
*/
|
||||||
if (in_length <= 0) {
|
if (in_length <= 0) {
|
||||||
if(!*outp) {
|
if(!*outp) {
|
||||||
if ((*outp = (char*)malloc(32)) == 0)
|
if ((*outp = (char*)malloc(32)) == 0)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
(*outp)[0] = '\0';
|
(*outp)[0] = '\0';
|
||||||
*out_lengthp = 0;
|
*out_lengthp = 0;
|
||||||
} else {
|
} else {
|
||||||
char* utf16 = 0;
|
char* utf16 = 0;
|
||||||
size_t utf16_length = 0;
|
size_t utf16_length = 0;
|
||||||
char* utf16_unaccented = 0;
|
char* utf16_unaccented = 0;
|
||||||
size_t utf16_unaccented_length = 0;
|
size_t utf16_unaccented_length = 0;
|
||||||
|
|
||||||
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented,
|
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented,
|
||||||
&utf16_unaccented_length, what);
|
&utf16_unaccented_length, what);
|
||||||
free(utf16);
|
free(utf16);
|
||||||
|
|
||||||
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length,
|
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length,
|
||||||
outp, out_lengthp) < 0) {
|
outp, out_lengthp) < 0) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
free(utf16_unaccented);
|
free(utf16_unaccented);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int unac_string(const char* charset,
|
int unac_string(
|
||||||
const char* in, size_t in_length,
|
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string(charset, in, in_length,
|
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_UNAC);
|
||||||
outp, out_lengthp, UNAC_UNAC);
|
|
||||||
}
|
}
|
||||||
int unacfold_string(const char* charset,
|
int unacfold_string(
|
||||||
const char* in, size_t in_length,
|
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string(charset, in, in_length,
|
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_UNACFOLD);
|
||||||
outp, out_lengthp, UNAC_UNACFOLD);
|
|
||||||
}
|
}
|
||||||
int fold_string(const char* charset,
|
int fold_string(
|
||||||
const char* in, size_t in_length,
|
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string(charset, in, in_length,
|
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_FOLD);
|
||||||
outp, out_lengthp, UNAC_FOLD);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const char* unac_version(void)
|
const char* unac_version(void)
|
||||||
{
|
{
|
||||||
return UNAC_VERSION;
|
return UNAC_VERSION;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef BUILDING_RECOLL
|
#ifdef BUILDING_RECOLL
|
||||||
@ -14552,7 +14540,7 @@ void unac_set_except_translations(const char *spectrans)
|
|||||||
{
|
{
|
||||||
except_trans.clear();
|
except_trans.clear();
|
||||||
if (!spectrans || !spectrans[0])
|
if (!spectrans || !spectrans[0])
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// The translation tables out of Unicode are in machine byte order (we
|
// The translation tables out of Unicode are in machine byte order (we
|
||||||
// just let the compiler read the values).
|
// just let the compiler read the values).
|
||||||
@ -14563,41 +14551,39 @@ void unac_set_except_translations(const char *spectrans)
|
|||||||
static const char *machinecoding = 0;
|
static const char *machinecoding = 0;
|
||||||
bool littleendian = true;
|
bool littleendian = true;
|
||||||
if (machinecoding == 0) {
|
if (machinecoding == 0) {
|
||||||
const char* charshort = "\001\002";
|
const char* charshort = "\001\002";
|
||||||
short *ip = (short *)charshort;
|
short *ip = (short *)charshort;
|
||||||
if (*ip == 0x0102) {
|
if (*ip == 0x0102) {
|
||||||
littleendian = false;
|
littleendian = false;
|
||||||
machinecoding = "UTF-16BE";
|
machinecoding = "UTF-16BE";
|
||||||
} else {
|
} else {
|
||||||
littleendian = true;
|
littleendian = true;
|
||||||
machinecoding = "UTF-16LE";
|
machinecoding = "UTF-16LE";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<string> vtrans;
|
vector<string> vtrans;
|
||||||
stringToStrings(spectrans, vtrans);
|
stringToStrings(spectrans, vtrans);
|
||||||
|
|
||||||
for (vector<string>::iterator it = vtrans.begin();
|
for (const auto& trans : vtrans) {
|
||||||
it != vtrans.end(); it++) {
|
|
||||||
|
|
||||||
/* Convert the whole thing to utf-16be/le according to endianness */
|
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||||
char *out = 0;
|
char *out = 0;
|
||||||
size_t outsize;
|
size_t outsize;
|
||||||
if (convert("UTF-8", machinecoding,
|
if (convert("UTF-8", machinecoding, trans.c_str(), trans.size(), &out, &outsize) != 0 ||
|
||||||
it->c_str(), it->size(),
|
outsize < 2)
|
||||||
&out, &outsize) != 0 || outsize < 2)
|
continue;
|
||||||
continue;
|
|
||||||
|
|
||||||
/* The source char must be utf-16be as this is what we convert the
|
/* The source char must be utf-16be as this is what we convert the
|
||||||
input text to for internal processing */
|
input text to for internal processing */
|
||||||
unsigned short ch;
|
unsigned short ch;
|
||||||
if (littleendian)
|
if (littleendian)
|
||||||
ch = (out[1] << 8) | (out[0] & 0xff);
|
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||||
else
|
else
|
||||||
ch = (out[0] << 8) | (out[1] & 0xff);
|
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||||
|
|
||||||
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
||||||
free(out);
|
free(out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* BUILDING_RECOLL */
|
#endif /* BUILDING_RECOLL */
|
||||||
|
|||||||
704
unac/unac.c
704
unac/unac.c
@ -13,7 +13,7 @@
|
|||||||
*
|
*
|
||||||
* You should have received a copy of the GNU General Public License
|
* You should have received a copy of the GNU General Public License
|
||||||
* along with this program; if not, write to the Free Software
|
* along with this program; if not, write to the Free Software
|
||||||
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef BUILDING_RECOLL
|
#ifdef BUILDING_RECOLL
|
||||||
@ -14109,7 +14109,7 @@ static int debug_level = UNAC_DEBUG_LOW;
|
|||||||
*/
|
*/
|
||||||
static void debug_doprint_default(const char* message, void* data)
|
static void debug_doprint_default(const char* message, void* data)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "%s", message);
|
fprintf(stderr, "%s", message);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -14130,30 +14130,30 @@ static void* debug_appdata = (void*)0;
|
|||||||
static void debug_print(const char* message, ...)
|
static void debug_print(const char* message, ...)
|
||||||
{
|
{
|
||||||
#define UNAC_MAXIMUM_MESSAGE_SIZE 512
|
#define UNAC_MAXIMUM_MESSAGE_SIZE 512
|
||||||
/*
|
/*
|
||||||
* UNAC_MAXIMUM_MESSAGE_SIZE is supposedly enough but I
|
* UNAC_MAXIMUM_MESSAGE_SIZE is supposedly enough but I
|
||||||
* do trust some vsnprintf implementations to be bugous.
|
* do trust some vsnprintf implementations to be bugous.
|
||||||
*/
|
*/
|
||||||
char unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE+1] = { '\0' };
|
char unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE+1] = { '\0' };
|
||||||
va_list args;
|
va_list args;
|
||||||
va_start(args, message);
|
va_start(args, message);
|
||||||
if(vsnprintf(unac_message_buffer, UNAC_MAXIMUM_MESSAGE_SIZE, message, args) < 0) {
|
if(vsnprintf(unac_message_buffer, UNAC_MAXIMUM_MESSAGE_SIZE, message, args) < 0) {
|
||||||
char tmp[UNAC_MAXIMUM_MESSAGE_SIZE];
|
char tmp[UNAC_MAXIMUM_MESSAGE_SIZE];
|
||||||
sprintf(tmp, "[message larger than %d, truncated]", UNAC_MAXIMUM_MESSAGE_SIZE);
|
sprintf(tmp, "[message larger than %d, truncated]", UNAC_MAXIMUM_MESSAGE_SIZE);
|
||||||
debug_doprint(tmp, debug_appdata);
|
debug_doprint(tmp, debug_appdata);
|
||||||
}
|
}
|
||||||
va_end(args);
|
va_end(args);
|
||||||
unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE] = '\0';
|
unac_message_buffer[UNAC_MAXIMUM_MESSAGE_SIZE] = '\0';
|
||||||
|
|
||||||
debug_doprint(unac_message_buffer, debug_appdata);
|
debug_doprint(unac_message_buffer, debug_appdata);
|
||||||
}
|
}
|
||||||
|
|
||||||
void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
||||||
{
|
{
|
||||||
debug_level = level;
|
debug_level = level;
|
||||||
if(function)
|
if(function)
|
||||||
debug_doprint = function;
|
debug_doprint = function;
|
||||||
debug_appdata = data;
|
debug_appdata = data;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else /* UNAC_DEBUG_AVAILABLE */
|
#else /* UNAC_DEBUG_AVAILABLE */
|
||||||
@ -14167,146 +14167,140 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
|||||||
#define UNAC_FOLD 2
|
#define UNAC_FOLD 2
|
||||||
|
|
||||||
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||||
char** outp, size_t* out_lengthp, int what)
|
char** outp, size_t* out_lengthp, int what)
|
||||||
{
|
{
|
||||||
char* out;
|
char* out;
|
||||||
size_t out_size;
|
size_t out_size;
|
||||||
size_t out_length;
|
size_t out_length;
|
||||||
size_t i;
|
|
||||||
|
|
||||||
out_size = in_length > 0 ? in_length : 1024;
|
|
||||||
|
|
||||||
out = *outp;
|
|
||||||
out = (char*)realloc(out, out_size + 1);
|
|
||||||
if(out == 0) {
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
|
||||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
|
||||||
/* *outp is still valid. Let the caller free it */
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
out_length = 0;
|
|
||||||
|
|
||||||
for(i = 0; i < in_length; i += 2) {
|
|
||||||
unsigned short c;
|
|
||||||
unsigned short* p;
|
|
||||||
size_t l;
|
|
||||||
size_t k;
|
|
||||||
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
|
||||||
/*
|
|
||||||
* Lookup the tables for decomposition information
|
|
||||||
*/
|
|
||||||
#ifdef BUILDING_RECOLL
|
|
||||||
// Exception unac/fold values set by user. There should be 3 arrays for
|
|
||||||
// unac/fold/unac+fold. For now there is only one array, which used to
|
|
||||||
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
|
||||||
// removal for some chars and languages where it should not be done.
|
|
||||||
// In conformance with current usage, but incorrectly, we do the following
|
|
||||||
// things for the special chars depending on the operation requested:
|
|
||||||
// - unaccenting: do nothing (copy original char)
|
|
||||||
// - unac+fold: use table
|
|
||||||
// - fold: use the unicode data.
|
|
||||||
string trans;
|
|
||||||
if (what != UNAC_FOLD && except_trans.size() != 0 &&
|
|
||||||
is_except_char(c, trans)) {
|
|
||||||
if (what == UNAC_UNAC) {
|
|
||||||
// Unaccent only. Do nothing
|
|
||||||
p = 0;
|
|
||||||
l = 0;
|
|
||||||
} else {
|
|
||||||
// Has to be UNAC_UNACFOLD: use table
|
|
||||||
p = (unsigned short *)trans.c_str();
|
|
||||||
l = trans.size() / 2;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
#endif /* BUILDING_RECOLL */
|
|
||||||
unac_uf_char_utf16_(c, p, l, what)
|
|
||||||
#ifdef BUILDING_RECOLL
|
|
||||||
}
|
|
||||||
#endif /* BUILDING_RECOLL */
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Explain what's done in great detail
|
|
||||||
*/
|
|
||||||
if(debug_level == UNAC_DEBUG_HIGH) {
|
|
||||||
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
|
||||||
unsigned char position = (c) & UNAC_BLOCK_MASK;
|
|
||||||
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
|
||||||
DEBUG_APPEND("0x%04x => ", (c));
|
|
||||||
if(l == 0) {
|
|
||||||
DEBUG_APPEND("untouched\n");
|
|
||||||
} else {
|
|
||||||
size_t i;
|
size_t i;
|
||||||
for(i = 0; i < l; i++)
|
|
||||||
DEBUG_APPEND("0x%04x ", p[i]);
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
DEBUG_APPEND("\n");
|
|
||||||
}
|
out = *outp;
|
||||||
|
out = (char*)realloc(out, out_size + 1);
|
||||||
|
if(out == 0) {
|
||||||
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
|
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||||
|
/* *outp is still valid. Let the caller free it */
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
out_length = 0;
|
||||||
* Make sure there is enough space to hold the decomposition
|
|
||||||
* Note: a previous realloc may have succeeded, which means that *outp
|
for(i = 0; i < in_length; i += 2) {
|
||||||
* is not valid any more. We have to do the freeing and zero out *outp
|
unsigned short c;
|
||||||
*/
|
unsigned short* p;
|
||||||
if(out_length + ((l + 1) * 2) > out_size) {
|
size_t l;
|
||||||
char *saved;
|
size_t k;
|
||||||
out_size += ((l + 1) * 2) + 1024;
|
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
||||||
saved = out;
|
|
||||||
out = (char *)realloc(out, out_size);
|
|
||||||
if(out == 0) {
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
|
||||||
DEBUG("realloc %d bytes failed\n", out_size);
|
|
||||||
free(saved);
|
|
||||||
*outp = 0;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(l > 0) {
|
|
||||||
/* l == 1 && *p == 0 is the special case generated for
|
|
||||||
mark characters (which may be found if the input is
|
|
||||||
already in decomposed form. Output nothing */
|
|
||||||
if (l != 1 || *p != 0) {
|
|
||||||
/*
|
/*
|
||||||
* If there is a decomposition, insert it in the output
|
* Lookup the tables for decomposition information
|
||||||
* string.
|
|
||||||
*/
|
*/
|
||||||
for(k = 0; k < l; k++) {
|
#ifdef BUILDING_RECOLL
|
||||||
out[out_length++] = (p[k] >> 8) & 0xff;
|
// Exception unac/fold values set by user. There should be 3 arrays for
|
||||||
out[out_length++] = (p[k] & 0xff);
|
// unac/fold/unac+fold. For now there is only one array, which used to
|
||||||
|
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
||||||
|
// removal for some chars and languages where it should not be done.
|
||||||
|
// In conformance with current usage, but incorrectly, we do the following
|
||||||
|
// things for the special chars depending on the operation requested:
|
||||||
|
// - unaccenting: do nothing (copy original char)
|
||||||
|
// - unac+fold: use table
|
||||||
|
// - fold: use the unicode data.
|
||||||
|
string trans;
|
||||||
|
if (what != UNAC_FOLD && except_trans.size() != 0 &&
|
||||||
|
is_except_char(c, trans)) {
|
||||||
|
if (what == UNAC_UNAC) {
|
||||||
|
// Unaccent only. Do nothing
|
||||||
|
p = 0;
|
||||||
|
l = 0;
|
||||||
|
} else {
|
||||||
|
// Has to be UNAC_UNACFOLD: use table
|
||||||
|
p = (unsigned short *)trans.c_str();
|
||||||
|
l = trans.size() / 2;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
#endif /* BUILDING_RECOLL */
|
||||||
|
unac_uf_char_utf16_(c, p, l, what)
|
||||||
|
#ifdef BUILDING_RECOLL
|
||||||
|
}
|
||||||
|
#endif /* BUILDING_RECOLL */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Explain what's done in great detail
|
||||||
|
*/
|
||||||
|
if(debug_level == UNAC_DEBUG_HIGH) {
|
||||||
|
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
||||||
|
unsigned char position = (c) & UNAC_BLOCK_MASK;
|
||||||
|
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
||||||
|
DEBUG_APPEND("0x%04x => ", (c));
|
||||||
|
if(l == 0) {
|
||||||
|
DEBUG_APPEND("untouched\n");
|
||||||
|
} else {
|
||||||
|
size_t i;
|
||||||
|
for(i = 0; i < l; i++)
|
||||||
|
DEBUG_APPEND("0x%04x ", p[i]);
|
||||||
|
DEBUG_APPEND("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Make sure there is enough space to hold the decomposition
|
||||||
|
* Note: a previous realloc may have succeeded, which means that *outp
|
||||||
|
* is not valid any more. We have to do the freeing and zero out *outp
|
||||||
|
*/
|
||||||
|
if(out_length + ((l + 1) * 2) > out_size) {
|
||||||
|
char *saved;
|
||||||
|
out_size += ((l + 1) * 2) + 1024;
|
||||||
|
saved = out;
|
||||||
|
out = (char *)realloc(out, out_size);
|
||||||
|
if(out == 0) {
|
||||||
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
|
DEBUG("realloc %d bytes failed\n", out_size);
|
||||||
|
free(saved);
|
||||||
|
*outp = 0;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(l > 0) {
|
||||||
|
/* l == 1 && *p == 0 is the special case generated for
|
||||||
|
mark characters (which may be found if the input is
|
||||||
|
already in decomposed form. Output nothing */
|
||||||
|
if (l != 1 || *p != 0) {
|
||||||
|
/*
|
||||||
|
* If there is a decomposition, insert it in the output
|
||||||
|
* string.
|
||||||
|
*/
|
||||||
|
for(k = 0; k < l; k++) {
|
||||||
|
out[out_length++] = (p[k] >> 8) & 0xff;
|
||||||
|
out[out_length++] = (p[k] & 0xff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* If there is no decomposition leave it unchanged
|
||||||
|
*/
|
||||||
|
out[out_length++] = in[i];
|
||||||
|
out[out_length++] = in[i + 1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
/*
|
|
||||||
* If there is no decomposition leave it unchanged
|
|
||||||
*/
|
|
||||||
out[out_length++] = in[i];
|
|
||||||
out[out_length++] = in[i + 1];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
*outp = out;
|
*outp = out;
|
||||||
*out_lengthp = out_length;
|
*out_lengthp = out_length;
|
||||||
(*outp)[*out_lengthp] = '\0';
|
(*outp)[*out_lengthp] = '\0';
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
int unac_string_utf16(const char* in, size_t in_length,
|
int unac_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string_utf16(in, in_length,
|
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_UNAC);
|
||||||
outp, out_lengthp, UNAC_UNAC);
|
|
||||||
}
|
}
|
||||||
int unacfold_string_utf16(const char* in, size_t in_length,
|
int unacfold_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string_utf16(in, in_length,
|
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_UNACFOLD);
|
||||||
outp, out_lengthp, UNAC_UNACFOLD);
|
|
||||||
}
|
}
|
||||||
int fold_string_utf16(const char* in, size_t in_length,
|
int fold_string_utf16(const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string_utf16(in, in_length,
|
return unacmaybefold_string_utf16(in, in_length, outp, out_lengthp, UNAC_FOLD);
|
||||||
outp, out_lengthp, UNAC_FOLD);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char *utf16be = "UTF-16BE";
|
static const char *utf16be = "UTF-16BE";
|
||||||
@ -14322,229 +14316,223 @@ static std::mutex o_unac_mutex;
|
|||||||
* The out string is always null terminated.
|
* The out string is always null terminated.
|
||||||
*/
|
*/
|
||||||
static int convert(const char* from, const char* to,
|
static int convert(const char* from, const char* to,
|
||||||
const char* in, size_t in_length,
|
const char* in, size_t in_length,
|
||||||
char** outp, size_t* out_lengthp)
|
char** outp, size_t* out_lengthp)
|
||||||
{
|
{
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
iconv_t cd;
|
iconv_t cd;
|
||||||
char* out;
|
char* out;
|
||||||
size_t out_remain;
|
size_t out_remain;
|
||||||
size_t out_size;
|
size_t out_size;
|
||||||
char* out_base;
|
char* out_base;
|
||||||
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
||||||
const char space[] = { 0x00, 0x20 };
|
const char space[] = { 0x00, 0x20 };
|
||||||
|
|
||||||
std::unique_lock<std::mutex> lock(o_unac_mutex);
|
std::unique_lock<std::mutex> lock(o_unac_mutex);
|
||||||
|
|
||||||
if (!strcmp(utf16be, from)) {
|
if (!strcmp(utf16be, from)) {
|
||||||
from_utf8 = 0;
|
from_utf8 = 0;
|
||||||
from_utf16 = 1;
|
from_utf16 = 1;
|
||||||
} else if (!strcasecmp("UTF-8", from)) {
|
} else if (!strcasecmp("UTF-8", from)) {
|
||||||
from_utf8 = 1;
|
from_utf8 = 1;
|
||||||
from_utf16 = 0;
|
from_utf16 = 0;
|
||||||
} else {
|
|
||||||
from_utf8 = from_utf16 = 0;
|
|
||||||
}
|
|
||||||
if (!strcmp(utf16be, to)) {
|
|
||||||
to_utf8 = 0;
|
|
||||||
to_utf16 = 1;
|
|
||||||
} else if (!strcasecmp("UTF-8", to)) {
|
|
||||||
to_utf8 = 1;
|
|
||||||
to_utf16 = 0;
|
|
||||||
} else {
|
|
||||||
to_utf8 = to_utf16 = 0;
|
|
||||||
}
|
|
||||||
u16tou8 = from_utf16 && to_utf8;
|
|
||||||
u8tou16 = from_utf8 && to_utf16;
|
|
||||||
|
|
||||||
out_size = in_length > 0 ? in_length : 1024;
|
|
||||||
|
|
||||||
out = *outp;
|
|
||||||
out = (char *)realloc(out, out_size + 1);
|
|
||||||
if(out == 0) {
|
|
||||||
/* *outp still valid, no freeing */
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
|
||||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
out_remain = out_size;
|
|
||||||
out_base = out;
|
|
||||||
|
|
||||||
if (u8tou16) {
|
|
||||||
if (u8tou16_cd == (iconv_t)-1) {
|
|
||||||
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
iconv(u8tou16_cd, 0, 0, 0, 0);
|
|
||||||
}
|
|
||||||
cd = u8tou16_cd;
|
|
||||||
} else if (u16tou8) {
|
|
||||||
if (u16tou8_cd == (iconv_t)-1) {
|
|
||||||
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
iconv(u16tou8_cd, 0, 0, 0, 0);
|
|
||||||
}
|
|
||||||
cd = u16tou8_cd;
|
|
||||||
} else {
|
|
||||||
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
do {
|
|
||||||
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
|
||||||
switch(errno) {
|
|
||||||
case EILSEQ:
|
|
||||||
/*
|
|
||||||
* If an illegal sequence is found in the context of unac_string
|
|
||||||
* it means the unaccented version of a character contains
|
|
||||||
* a sequence that cannot be mapped back to the original charset.
|
|
||||||
* For instance, the 1/4 character in ISO-8859-1 is decomposed
|
|
||||||
* in three characters including the FRACTION SLASH (2044) which
|
|
||||||
* have no equivalent in the ISO-8859-1 map. One can argue that
|
|
||||||
* the conversions tables should map it to the regular / character
|
|
||||||
* or that a <compat> entry should be associated with it.
|
|
||||||
*
|
|
||||||
* To cope with this situation, convert silently transform all
|
|
||||||
* illegal sequences (EILSEQ) into a SPACE character 0x0020.
|
|
||||||
*
|
|
||||||
* In the general conversion case this behaviour is not desirable.
|
|
||||||
* However, it is not the responsibility of this program to cope
|
|
||||||
* with inconsistencies of the Unicode description and a bug report
|
|
||||||
* should be submited to Unicode so that they can fix the problem.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
if(from_utf16) {
|
|
||||||
const char* tmp = space;
|
|
||||||
size_t tmp_length = 2;
|
|
||||||
if(iconv(cd, (ICONV_CONST char **) &tmp, &tmp_length, &out, &out_remain) == (size_t)-1) {
|
|
||||||
if(errno == E2BIG) {
|
|
||||||
/* fall thru to the E2BIG case below */;
|
|
||||||
} else {
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
/* The offending character was replaced by a SPACE, skip it. */
|
|
||||||
in += 2;
|
|
||||||
in_length -= 2;
|
|
||||||
/* And continue conversion. */
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
goto out;
|
from_utf8 = from_utf16 = 0;
|
||||||
}
|
}
|
||||||
case E2BIG:
|
if (!strcmp(utf16be, to)) {
|
||||||
{
|
to_utf8 = 0;
|
||||||
/*
|
to_utf16 = 1;
|
||||||
* The output does not fit in the current out buffer, enlarge it.
|
} else if (!strcasecmp("UTF-8", to)) {
|
||||||
*/
|
to_utf8 = 1;
|
||||||
size_t length = out - out_base;
|
to_utf16 = 0;
|
||||||
out_size *= 2;
|
} else {
|
||||||
{
|
to_utf8 = to_utf16 = 0;
|
||||||
char *saved = out_base;
|
|
||||||
/* +1 for null */
|
|
||||||
out_base = (char *)realloc(out_base, out_size + 1);
|
|
||||||
if (out_base == 0) {
|
|
||||||
/* *outp potentially not valid any more. Free here,
|
|
||||||
* and zero out */
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW)
|
|
||||||
DEBUG("realloc %d bytes failed\n", out_size+1);
|
|
||||||
free(saved);
|
|
||||||
*outp = 0;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out = out_base + length;
|
|
||||||
out_remain = out_size - length;
|
|
||||||
}
|
}
|
||||||
break;
|
u16tou8 = from_utf16 && to_utf8;
|
||||||
default:
|
u8tou16 = from_utf8 && to_utf16;
|
||||||
goto out;
|
|
||||||
break;
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
}
|
|
||||||
|
out = *outp;
|
||||||
|
out = (char *)realloc(out, out_size + 1);
|
||||||
|
if(out == 0) {
|
||||||
|
/* *outp still valid, no freeing */
|
||||||
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
|
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||||
|
goto out;
|
||||||
}
|
}
|
||||||
} while(in_length > 0);
|
|
||||||
|
|
||||||
if (!u8tou16 && !u16tou8)
|
out_remain = out_size;
|
||||||
iconv_close(cd);
|
out_base = out;
|
||||||
|
|
||||||
*outp = out_base;
|
if (u8tou16) {
|
||||||
*out_lengthp = out - out_base;
|
if (u8tou16_cd == (iconv_t)-1) {
|
||||||
(*outp)[*out_lengthp] = '\0';
|
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
iconv(u8tou16_cd, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
cd = u8tou16_cd;
|
||||||
|
} else if (u16tou8) {
|
||||||
|
if (u16tou8_cd == (iconv_t)-1) {
|
||||||
|
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
iconv(u16tou8_cd, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
cd = u16tou8_cd;
|
||||||
|
} else {
|
||||||
|
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ret = 0;
|
do {
|
||||||
|
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
||||||
|
switch(errno) {
|
||||||
|
case EILSEQ:
|
||||||
|
/*
|
||||||
|
* If an illegal sequence is found in the context of unac_string
|
||||||
|
* it means the unaccented version of a character contains
|
||||||
|
* a sequence that cannot be mapped back to the original charset.
|
||||||
|
* For instance, the 1/4 character in ISO-8859-1 is decomposed
|
||||||
|
* in three characters including the FRACTION SLASH (2044) which
|
||||||
|
* have no equivalent in the ISO-8859-1 map. One can argue that
|
||||||
|
* the conversions tables should map it to the regular / character
|
||||||
|
* or that a <compat> entry should be associated with it.
|
||||||
|
*
|
||||||
|
* To cope with this situation, convert silently transform all
|
||||||
|
* illegal sequences (EILSEQ) into a SPACE character 0x0020.
|
||||||
|
*
|
||||||
|
* In the general conversion case this behaviour is not desirable.
|
||||||
|
* However, it is not the responsibility of this program to cope
|
||||||
|
* with inconsistencies of the Unicode description and a bug report
|
||||||
|
* should be submited to Unicode so that they can fix the problem.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
if (from_utf16) {
|
||||||
|
const char* tmp = space;
|
||||||
|
size_t tmp_length = 2;
|
||||||
|
if (iconv(cd, (ICONV_CONST char **)&tmp, &tmp_length, &out, &out_remain) ==
|
||||||
|
(size_t)-1) {
|
||||||
|
if(errno == E2BIG) {
|
||||||
|
/* fall thru to the E2BIG case below */;
|
||||||
|
} else {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* The offending character was replaced by a SPACE, skip it. */
|
||||||
|
in += 2;
|
||||||
|
in_length -= 2;
|
||||||
|
/* And continue conversion. */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
case E2BIG:
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* The output does not fit in the current out buffer, enlarge it.
|
||||||
|
*/
|
||||||
|
size_t length = out - out_base;
|
||||||
|
out_size *= 2;
|
||||||
|
{
|
||||||
|
char *saved = out_base;
|
||||||
|
/* +1 for null */
|
||||||
|
out_base = (char *)realloc(out_base, out_size + 1);
|
||||||
|
if (out_base == 0) {
|
||||||
|
/* *outp potentially not valid any more. Free here,
|
||||||
|
* and zero out */
|
||||||
|
if(debug_level >= UNAC_DEBUG_LOW)
|
||||||
|
DEBUG("realloc %d bytes failed\n", out_size+1);
|
||||||
|
free(saved);
|
||||||
|
*outp = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = out_base + length;
|
||||||
|
out_remain = out_size - length;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
goto out;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while(in_length > 0);
|
||||||
|
|
||||||
|
if (!u8tou16 && !u16tou8)
|
||||||
|
iconv_close(cd);
|
||||||
|
|
||||||
|
*outp = out_base;
|
||||||
|
*out_lengthp = out - out_base;
|
||||||
|
(*outp)[*out_lengthp] = '\0';
|
||||||
|
|
||||||
|
ret = 0;
|
||||||
out:
|
out:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int unacmaybefold_string(const char* charset,
|
int unacmaybefold_string(const char* charset, const char* in, size_t in_length,
|
||||||
const char* in, size_t in_length,
|
char** outp, size_t* out_lengthp, int what)
|
||||||
char** outp, size_t* out_lengthp, int what)
|
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* When converting an empty string, skip everything but alloc the
|
* When converting an empty string, skip everything but alloc the
|
||||||
* buffer if NULL pointer.
|
* buffer if NULL pointer.
|
||||||
*/
|
*/
|
||||||
if (in_length <= 0) {
|
if (in_length <= 0) {
|
||||||
if(!*outp) {
|
if(!*outp) {
|
||||||
if ((*outp = (char*)malloc(32)) == 0)
|
if ((*outp = (char*)malloc(32)) == 0)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
(*outp)[0] = '\0';
|
(*outp)[0] = '\0';
|
||||||
*out_lengthp = 0;
|
*out_lengthp = 0;
|
||||||
} else {
|
} else {
|
||||||
char* utf16 = 0;
|
char* utf16 = 0;
|
||||||
size_t utf16_length = 0;
|
size_t utf16_length = 0;
|
||||||
char* utf16_unaccented = 0;
|
char* utf16_unaccented = 0;
|
||||||
size_t utf16_unaccented_length = 0;
|
size_t utf16_unaccented_length = 0;
|
||||||
|
|
||||||
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented,
|
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented,
|
||||||
&utf16_unaccented_length, what);
|
&utf16_unaccented_length, what);
|
||||||
free(utf16);
|
free(utf16);
|
||||||
|
|
||||||
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length,
|
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length,
|
||||||
outp, out_lengthp) < 0) {
|
outp, out_lengthp) < 0) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
free(utf16_unaccented);
|
free(utf16_unaccented);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int unac_string(const char* charset,
|
int unac_string(
|
||||||
const char* in, size_t in_length,
|
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string(charset, in, in_length,
|
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_UNAC);
|
||||||
outp, out_lengthp, UNAC_UNAC);
|
|
||||||
}
|
}
|
||||||
int unacfold_string(const char* charset,
|
int unacfold_string(
|
||||||
const char* in, size_t in_length,
|
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string(charset, in, in_length,
|
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_UNACFOLD);
|
||||||
outp, out_lengthp, UNAC_UNACFOLD);
|
|
||||||
}
|
}
|
||||||
int fold_string(const char* charset,
|
int fold_string(
|
||||||
const char* in, size_t in_length,
|
const char* charset, const char* in, size_t in_length, char** outp, size_t* out_lengthp)
|
||||||
char** outp, size_t* out_lengthp)
|
|
||||||
{
|
{
|
||||||
return unacmaybefold_string(charset, in, in_length,
|
return unacmaybefold_string(charset, in, in_length, outp, out_lengthp, UNAC_FOLD);
|
||||||
outp, out_lengthp, UNAC_FOLD);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const char* unac_version(void)
|
const char* unac_version(void)
|
||||||
{
|
{
|
||||||
return UNAC_VERSION;
|
return UNAC_VERSION;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef BUILDING_RECOLL
|
#ifdef BUILDING_RECOLL
|
||||||
@ -14552,7 +14540,7 @@ void unac_set_except_translations(const char *spectrans)
|
|||||||
{
|
{
|
||||||
except_trans.clear();
|
except_trans.clear();
|
||||||
if (!spectrans || !spectrans[0])
|
if (!spectrans || !spectrans[0])
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// The translation tables out of Unicode are in machine byte order (we
|
// The translation tables out of Unicode are in machine byte order (we
|
||||||
// just let the compiler read the values).
|
// just let the compiler read the values).
|
||||||
@ -14563,41 +14551,39 @@ void unac_set_except_translations(const char *spectrans)
|
|||||||
static const char *machinecoding = 0;
|
static const char *machinecoding = 0;
|
||||||
bool littleendian = true;
|
bool littleendian = true;
|
||||||
if (machinecoding == 0) {
|
if (machinecoding == 0) {
|
||||||
const char* charshort = "\001\002";
|
const char* charshort = "\001\002";
|
||||||
short *ip = (short *)charshort;
|
short *ip = (short *)charshort;
|
||||||
if (*ip == 0x0102) {
|
if (*ip == 0x0102) {
|
||||||
littleendian = false;
|
littleendian = false;
|
||||||
machinecoding = "UTF-16BE";
|
machinecoding = "UTF-16BE";
|
||||||
} else {
|
} else {
|
||||||
littleendian = true;
|
littleendian = true;
|
||||||
machinecoding = "UTF-16LE";
|
machinecoding = "UTF-16LE";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<string> vtrans;
|
vector<string> vtrans;
|
||||||
stringToStrings(spectrans, vtrans);
|
stringToStrings(spectrans, vtrans);
|
||||||
|
|
||||||
for (vector<string>::iterator it = vtrans.begin();
|
for (const auto& trans : vtrans) {
|
||||||
it != vtrans.end(); it++) {
|
|
||||||
|
|
||||||
/* Convert the whole thing to utf-16be/le according to endianness */
|
/* Convert the whole thing to utf-16be/le according to endianness */
|
||||||
char *out = 0;
|
char *out = 0;
|
||||||
size_t outsize;
|
size_t outsize;
|
||||||
if (convert("UTF-8", machinecoding,
|
if (convert("UTF-8", machinecoding, trans.c_str(), trans.size(), &out, &outsize) != 0 ||
|
||||||
it->c_str(), it->size(),
|
outsize < 2)
|
||||||
&out, &outsize) != 0 || outsize < 2)
|
continue;
|
||||||
continue;
|
|
||||||
|
|
||||||
/* The source char must be utf-16be as this is what we convert the
|
/* The source char must be utf-16be as this is what we convert the
|
||||||
input text to for internal processing */
|
input text to for internal processing */
|
||||||
unsigned short ch;
|
unsigned short ch;
|
||||||
if (littleendian)
|
if (littleendian)
|
||||||
ch = (out[1] << 8) | (out[0] & 0xff);
|
ch = (out[1] << 8) | (out[0] & 0xff);
|
||||||
else
|
else
|
||||||
ch = (out[0] << 8) | (out[1] & 0xff);
|
ch = (out[0] << 8) | (out[1] & 0xff);
|
||||||
|
|
||||||
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
||||||
free(out);
|
free(out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* BUILDING_RECOLL */
|
#endif /* BUILDING_RECOLL */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user