perform some iconv_open caching
This commit is contained in:
parent
802ebc7704
commit
773ab56327
@ -10438,31 +10438,6 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
|||||||
#define DEBUG_APPEND
|
#define DEBUG_APPEND
|
||||||
#endif /* UNAC_DEBUG_AVAILABLE */
|
#endif /* UNAC_DEBUG_AVAILABLE */
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If UTF-16BE exists, use it. If not, use UTF-16 and hope it is
|
|
||||||
* encoded in big endian. This fallback is a iconv related
|
|
||||||
* compatibility hack introduced in some GNU/Linux distributions that
|
|
||||||
* did not know UTF-16BE.
|
|
||||||
*/
|
|
||||||
static const char* utf16be(void)
|
|
||||||
{
|
|
||||||
iconv_t cd;
|
|
||||||
static char* name = 0;
|
|
||||||
|
|
||||||
if(name == 0) {
|
|
||||||
if((cd = iconv_open("UTF-16BE", "UTF-16BE")) == (iconv_t)-1) {
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW) DEBUG("could not find UTF-16BE (see iconv -l), using UTF-16. If UTF-16 happens to be encoded in little endian, be prepared for an horrible mess.");
|
|
||||||
name = "UTF-16";
|
|
||||||
} else {
|
|
||||||
iconv_close(cd);
|
|
||||||
name = "UTF-16BE";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return name;
|
|
||||||
}
|
|
||||||
|
|
||||||
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||||
char** outp, size_t* out_lengthp, int dofold)
|
char** outp, size_t* out_lengthp, int dofold)
|
||||||
{
|
{
|
||||||
@ -10586,6 +10561,10 @@ static int convert(const char* from, const char* to,
|
|||||||
const char* in, size_t in_length,
|
const char* in, size_t in_length,
|
||||||
char** outp, size_t* out_lengthp);
|
char** outp, size_t* out_lengthp);
|
||||||
|
|
||||||
|
static const char *utf16be = "UTF-16BE";
|
||||||
|
static iconv_t u8tou16_cd = (iconv_t)-1;
|
||||||
|
static iconv_t u16tou8_cd = (iconv_t)-1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Convert buffer <in> containing string encoded in charset <from> into
|
* Convert buffer <in> containing string encoded in charset <from> into
|
||||||
* a string in charset <to> and return it in buffer <outp>. The <outp>
|
* a string in charset <to> and return it in buffer <outp>. The <outp>
|
||||||
@ -10602,9 +10581,30 @@ static int convert(const char* from, const char* to,
|
|||||||
size_t out_remain;
|
size_t out_remain;
|
||||||
size_t out_size;
|
size_t out_size;
|
||||||
char* out_base;
|
char* out_base;
|
||||||
int from_utf16 = !strcmp(utf16be(), from);
|
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
||||||
const char space[] = { 0x00, 0x20 };
|
const char space[] = { 0x00, 0x20 };
|
||||||
|
|
||||||
|
if (!strcmp(utf16be, from)) {
|
||||||
|
from_utf8 = 0;
|
||||||
|
from_utf16 = 1;
|
||||||
|
} else if (!strcasecmp("UTF-8", from)) {
|
||||||
|
from_utf8 = 1;
|
||||||
|
from_utf16 = 0;
|
||||||
|
} else {
|
||||||
|
from_utf8 = from_utf16 = 0;
|
||||||
|
}
|
||||||
|
if (!strcmp(utf16be, to)) {
|
||||||
|
to_utf8 = 0;
|
||||||
|
to_utf16 = 1;
|
||||||
|
} else if (!strcasecmp("UTF-8", to)) {
|
||||||
|
to_utf8 = 1;
|
||||||
|
to_utf16 = 0;
|
||||||
|
} else {
|
||||||
|
to_utf8 = to_utf16 = 0;
|
||||||
|
}
|
||||||
|
u16tou8 = from_utf16 && to_utf8;
|
||||||
|
u8tou16 = from_utf8 && to_utf16;
|
||||||
|
|
||||||
out_size = in_length > 0 ? in_length : 1024;
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
if(*outp) {
|
if(*outp) {
|
||||||
out = *outp;
|
out = *outp;
|
||||||
@ -10628,9 +10628,30 @@ static int convert(const char* from, const char* to,
|
|||||||
out_remain = out_size;
|
out_remain = out_size;
|
||||||
out_base = out;
|
out_base = out;
|
||||||
|
|
||||||
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
if (u8tou16) {
|
||||||
return -1;
|
if (u8tou16_cd == (iconv_t)-1) {
|
||||||
|
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
iconv(u8tou16_cd, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
cd = u8tou16_cd;
|
||||||
|
} else if (u16tou8) {
|
||||||
|
if (u16tou8_cd == (iconv_t)-1) {
|
||||||
|
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
iconv(u16tou8_cd, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
cd = u16tou8_cd;
|
||||||
|
} else {
|
||||||
|
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
do {
|
do {
|
||||||
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
||||||
switch(errno) {
|
switch(errno) {
|
||||||
@ -10703,7 +10724,9 @@ static int convert(const char* from, const char* to,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while(in_length > 0);
|
} while(in_length > 0);
|
||||||
iconv_close(cd);
|
|
||||||
|
if (!u8tou16 && !u16tou8)
|
||||||
|
iconv_close(cd);
|
||||||
|
|
||||||
*outp = out_base;
|
*outp = out_base;
|
||||||
*out_lengthp = out - out_base;
|
*out_lengthp = out - out_base;
|
||||||
@ -10733,14 +10756,14 @@ int unacmaybefold_string(const char* charset,
|
|||||||
char* utf16_unaccented = 0;
|
char* utf16_unaccented = 0;
|
||||||
size_t utf16_unaccented_length = 0;
|
size_t utf16_unaccented_length = 0;
|
||||||
|
|
||||||
if(convert(charset, utf16be(), in, in_length, &utf16, &utf16_length) < 0) {
|
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
|
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
|
||||||
free(utf16);
|
free(utf16);
|
||||||
|
|
||||||
if(convert(utf16be(), charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
|
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
free(utf16_unaccented);
|
free(utf16_unaccented);
|
||||||
|
|||||||
@ -37,8 +37,8 @@ using std::string;
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// We gain approximately 28% exec time for word at a time conversions by
|
// We gain approximately 28% exec time for word at a time conversions by
|
||||||
// caching the iconv_open thing. This is probably not worth it.
|
// caching the iconv_open thing.
|
||||||
//#define ICONV_CACHE_OPEN
|
#define ICONV_CACHE_OPEN
|
||||||
|
|
||||||
bool transcode(const string &in, string &out, const string &icode,
|
bool transcode(const string &in, string &out, const string &icode,
|
||||||
const string &ocode, int *ecnt)
|
const string &ocode, int *ecnt)
|
||||||
|
|||||||
85
unac/unac.c
85
unac/unac.c
@ -10438,31 +10438,6 @@ void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
|||||||
#define DEBUG_APPEND
|
#define DEBUG_APPEND
|
||||||
#endif /* UNAC_DEBUG_AVAILABLE */
|
#endif /* UNAC_DEBUG_AVAILABLE */
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If UTF-16BE exists, use it. If not, use UTF-16 and hope it is
|
|
||||||
* encoded in big endian. This fallback is a iconv related
|
|
||||||
* compatibility hack introduced in some GNU/Linux distributions that
|
|
||||||
* did not know UTF-16BE.
|
|
||||||
*/
|
|
||||||
static const char* utf16be(void)
|
|
||||||
{
|
|
||||||
iconv_t cd;
|
|
||||||
static char* name = 0;
|
|
||||||
|
|
||||||
if(name == 0) {
|
|
||||||
if((cd = iconv_open("UTF-16BE", "UTF-16BE")) == (iconv_t)-1) {
|
|
||||||
if(debug_level >= UNAC_DEBUG_LOW) DEBUG("could not find UTF-16BE (see iconv -l), using UTF-16. If UTF-16 happens to be encoded in little endian, be prepared for an horrible mess.");
|
|
||||||
name = "UTF-16";
|
|
||||||
} else {
|
|
||||||
iconv_close(cd);
|
|
||||||
name = "UTF-16BE";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return name;
|
|
||||||
}
|
|
||||||
|
|
||||||
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
||||||
char** outp, size_t* out_lengthp, int dofold)
|
char** outp, size_t* out_lengthp, int dofold)
|
||||||
{
|
{
|
||||||
@ -10586,6 +10561,10 @@ static int convert(const char* from, const char* to,
|
|||||||
const char* in, size_t in_length,
|
const char* in, size_t in_length,
|
||||||
char** outp, size_t* out_lengthp);
|
char** outp, size_t* out_lengthp);
|
||||||
|
|
||||||
|
static const char *utf16be = "UTF-16BE";
|
||||||
|
static iconv_t u8tou16_cd = (iconv_t)-1;
|
||||||
|
static iconv_t u16tou8_cd = (iconv_t)-1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Convert buffer <in> containing string encoded in charset <from> into
|
* Convert buffer <in> containing string encoded in charset <from> into
|
||||||
* a string in charset <to> and return it in buffer <outp>. The <outp>
|
* a string in charset <to> and return it in buffer <outp>. The <outp>
|
||||||
@ -10602,9 +10581,30 @@ static int convert(const char* from, const char* to,
|
|||||||
size_t out_remain;
|
size_t out_remain;
|
||||||
size_t out_size;
|
size_t out_size;
|
||||||
char* out_base;
|
char* out_base;
|
||||||
int from_utf16 = !strcmp(utf16be(), from);
|
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
||||||
const char space[] = { 0x00, 0x20 };
|
const char space[] = { 0x00, 0x20 };
|
||||||
|
|
||||||
|
if (!strcmp(utf16be, from)) {
|
||||||
|
from_utf8 = 0;
|
||||||
|
from_utf16 = 1;
|
||||||
|
} else if (!strcasecmp("UTF-8", from)) {
|
||||||
|
from_utf8 = 1;
|
||||||
|
from_utf16 = 0;
|
||||||
|
} else {
|
||||||
|
from_utf8 = from_utf16 = 0;
|
||||||
|
}
|
||||||
|
if (!strcmp(utf16be, to)) {
|
||||||
|
to_utf8 = 0;
|
||||||
|
to_utf16 = 1;
|
||||||
|
} else if (!strcasecmp("UTF-8", to)) {
|
||||||
|
to_utf8 = 1;
|
||||||
|
to_utf16 = 0;
|
||||||
|
} else {
|
||||||
|
to_utf8 = to_utf16 = 0;
|
||||||
|
}
|
||||||
|
u16tou8 = from_utf16 && to_utf8;
|
||||||
|
u8tou16 = from_utf8 && to_utf16;
|
||||||
|
|
||||||
out_size = in_length > 0 ? in_length : 1024;
|
out_size = in_length > 0 ? in_length : 1024;
|
||||||
if(*outp) {
|
if(*outp) {
|
||||||
out = *outp;
|
out = *outp;
|
||||||
@ -10628,9 +10628,30 @@ static int convert(const char* from, const char* to,
|
|||||||
out_remain = out_size;
|
out_remain = out_size;
|
||||||
out_base = out;
|
out_base = out;
|
||||||
|
|
||||||
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
if (u8tou16) {
|
||||||
return -1;
|
if (u8tou16_cd == (iconv_t)-1) {
|
||||||
|
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
iconv(u8tou16_cd, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
cd = u8tou16_cd;
|
||||||
|
} else if (u16tou8) {
|
||||||
|
if (u16tou8_cd == (iconv_t)-1) {
|
||||||
|
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
iconv(u16tou8_cd, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
cd = u16tou8_cd;
|
||||||
|
} else {
|
||||||
|
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
do {
|
do {
|
||||||
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
||||||
switch(errno) {
|
switch(errno) {
|
||||||
@ -10703,7 +10724,9 @@ static int convert(const char* from, const char* to,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while(in_length > 0);
|
} while(in_length > 0);
|
||||||
iconv_close(cd);
|
|
||||||
|
if (!u8tou16 && !u16tou8)
|
||||||
|
iconv_close(cd);
|
||||||
|
|
||||||
*outp = out_base;
|
*outp = out_base;
|
||||||
*out_lengthp = out - out_base;
|
*out_lengthp = out - out_base;
|
||||||
@ -10733,14 +10756,14 @@ int unacmaybefold_string(const char* charset,
|
|||||||
char* utf16_unaccented = 0;
|
char* utf16_unaccented = 0;
|
||||||
size_t utf16_unaccented_length = 0;
|
size_t utf16_unaccented_length = 0;
|
||||||
|
|
||||||
if(convert(charset, utf16be(), in, in_length, &utf16, &utf16_length) < 0) {
|
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
|
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
|
||||||
free(utf16);
|
free(utf16);
|
||||||
|
|
||||||
if(convert(utf16be(), charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
|
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
free(utf16_unaccented);
|
free(utf16_unaccented);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user