Avoid calling isalpha() with values above 255: it may crash, depending on the version

This commit is contained in:
Jean-Francois Dockes 2020-04-25 11:19:52 +02:00
parent b063f6e45d
commit 07e3387fc1

View File

@ -1001,25 +1001,26 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// We use an offset buffer to remember the starts of the utf-8 // We use an offset buffer to remember the starts of the utf-8
// characters which we still need to use. // characters which we still need to use.
assert(o_CJKNgramLen < o_CJKMaxNgramLen); assert(o_CJKNgramLen < o_CJKMaxNgramLen);
unsigned int boffs[o_CJKMaxNgramLen+1]; string::size_type boffs[o_CJKMaxNgramLen+1];
string mybuf; string mybuf;
unsigned int myboffs[o_CJKMaxNgramLen+1]; string::size_type myboffs[o_CJKMaxNgramLen+1];
// Current number of valid offsets; // Current number of valid offsets;
unsigned int nchars = 0; unsigned int nchars = 0;
unsigned int c = 0; unsigned int c = 0;
for (; !it.eof() && !it.error(); it++) { for (; !it.eof() && !it.error(); it++) {
c = *it; c = *it;
// We had a version which ignored whitespace for some time, // We had a version which ignored whitespace for some time,
// but this was a bad idea. Only break on an non-cjk // but this was a bad idea. Only break on an non-cjk
// alphabetic character. // alphabetic character.
if (!UNICODE_IS_CJK(c) && isalpha(c)) { if (!UNICODE_IS_CJK(c) && (c > 255 || isalpha(c))) {
// Return to normal handler // Return to normal handler
break; break;
} }
if (whatcc(c) == SPACE) { if (whatcc(c) == SPACE) {
// Flush the ngram buffer and go on // Flush the ngram buffer and go on
nchars = 0; nchars = 0;
mybuf.clear();
continue; continue;
} }
if (nchars == o_CJKNgramLen) { if (nchars == o_CJKNgramLen) {
@ -1040,13 +1041,13 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
myboffs[nchars-1] = mybuf.size(); myboffs[nchars-1] = mybuf.size();
it.appendchartostring(mybuf); it.appendchartostring(mybuf);
// Take note of document byte offset for this character. // Take note of document byte offset for this character.
boffs[nchars-1] = int(it.getBpos()); boffs[nchars-1] = it.getBpos();
// Output all new ngrams: they begin at each existing position // Output all new ngrams: they begin at each existing position
// and end after the new character. onlyspans->only output // and end after the new character. onlyspans->only output
// maximum words, nospans=> single chars // maximum words, nospans=> single chars
if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) { if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
int btend = int(it.getBpos() + it.getBlen()); int btend = it.getBpos() + it.getBlen();
int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0; int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars; int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
for (int i = loopbeg; i < loopend; i++) { for (int i = loopbeg; i < loopend; i++) {