From 07e3387fc10696e4ead290ef2614b1d618cf7196 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 25 Apr 2020 11:19:52 +0200 Subject: [PATCH] Avoid calling isalpha() with big ints, may crash, depending on version --- src/common/textsplit.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 3a515466..13dd1c71 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1001,25 +1001,26 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) // We use an offset buffer to remember the starts of the utf-8 // characters which we still need to use. assert(o_CJKNgramLen < o_CJKMaxNgramLen); - unsigned int boffs[o_CJKMaxNgramLen+1]; + string::size_type boffs[o_CJKMaxNgramLen+1]; string mybuf; - unsigned int myboffs[o_CJKMaxNgramLen+1]; + string::size_type myboffs[o_CJKMaxNgramLen+1]; // Current number of valid offsets; unsigned int nchars = 0; unsigned int c = 0; for (; !it.eof() && !it.error(); it++) { c = *it; - // We had a version which ignored whitespace for some time, - // but this was a bad idea. Only break on an non-cjk - // alphabetic character. - if (!UNICODE_IS_CJK(c) && isalpha(c)) { + // We had a version which ignored whitespace for some time, + // but this was a bad idea. Only break on an non-cjk + // alphabetic character. + if (!UNICODE_IS_CJK(c) && (c > 255 || isalpha(c))) { // Return to normal handler break; } if (whatcc(c) == SPACE) { // Flush the ngram buffer and go on nchars = 0; + mybuf.clear(); continue; } if (nchars == o_CJKNgramLen) { @@ -1040,13 +1041,13 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) myboffs[nchars-1] = mybuf.size(); it.appendchartostring(mybuf); // Take note of document byte offset for this character. - boffs[nchars-1] = int(it.getBpos()); + boffs[nchars-1] = it.getBpos(); // Output all new ngrams: they begin at each existing position // and end after the new character. onlyspans->only output // maximum words, nospans=> single chars if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) { - int btend = int(it.getBpos() + it.getBlen()); + int btend = it.getBpos() + it.getBlen(); int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0; int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars; for (int i = loopbeg; i < loopend; i++) {