CJK indexing: return to western word indexing if encountering numeric after punctuation

This commit is contained in:
Jean-Francois Dockes 2020-11-25 17:56:32 +01:00
parent b25dab7cab
commit 8285e18039
2 changed files with 15 additions and 10 deletions

View File

@ -684,7 +684,7 @@ bool TextSplit::text_to_words(const string &in)
return false; return false;
} }
} else { } else {
if (!cjk_to_words(&it, &c)) { if (!cjk_to_words(it, &c)) {
LOGERR("Textsplit: scan error in cjk handler\n"); LOGERR("Textsplit: scan error in cjk handler\n");
return false; return false;
} }
@ -993,9 +993,6 @@ bool TextSplit::text_to_words(const string &in)
return true; return true;
} }
// Using an utf8iter pointer just to avoid needing its definition in
// textsplit.h
//
// We output ngrams for example for char input a b c and ngramlen== 2, // We output ngrams for example for char input a b c and ngramlen== 2,
// we generate: a ab b bc c as words // we generate: a ab b bc c as words
// //
@ -1004,10 +1001,9 @@ bool TextSplit::text_to_words(const string &in)
// //
// The routine is sort of a mess and goes to show that we'd probably // The routine is sort of a mess and goes to show that we'd probably
// be better off converting the whole buffer to utf32 on entry... // be better off converting the whole buffer to utf32 on entry...
bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) bool TextSplit::cjk_to_words(Utf8Iter& it, unsigned int *cp)
{ {
LOGDEB1("cjk_to_words: m_wordpos " << m_wordpos << "\n"); LOGDEB1("cjk_to_words: m_wordpos " << m_wordpos << "\n");
Utf8Iter &it = *itp;
// We use an offset buffer to remember the starts of the utf-8 // We use an offset buffer to remember the starts of the utf-8
// characters which we still need to use. // characters which we still need to use.
@ -1019,12 +1015,18 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// Current number of valid offsets; // Current number of valid offsets;
unsigned int nchars = 0; unsigned int nchars = 0;
unsigned int c = 0; unsigned int c = 0;
bool spacebefore{false};
for (; !it.eof() && !it.error(); it++) { for (; !it.eof() && !it.error(); it++) {
c = *it; c = *it;
// We had a version which ignored whitespace for some time, // We had a version which ignored whitespace for some time,
// but this was a bad idea. Only break on an non-cjk // but this was a bad idea. Only break on a non-cjk
// alphabetic character. // *alphabetic* character, except if following punctuation, in
if (!UNICODE_IS_CJK(c) && (c > 255 || isalpha(c))) { // which case we return for any non-cjk. This allows compound
// cjk+numeric spans, or punctuated cjk spans to be
// continually indexed as cjk. The best approach is a matter
// of appreciation...
if (!UNICODE_IS_CJK(c) &&
(spacebefore || (c > 255 || isalpha(c)))) {
// Return to normal handler // Return to normal handler
break; break;
} }
@ -1032,7 +1034,10 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// Flush the ngram buffer and go on // Flush the ngram buffer and go on
nchars = 0; nchars = 0;
mybuf.clear(); mybuf.clear();
spacebefore = true;
continue; continue;
} else {
spacebefore = false;
} }
if (nchars == o_CJKNgramLen) { if (nchars == o_CJKNgramLen) {
// Offset buffer full, shift it. Might be more efficient // Offset buffer full, shift it. Might be more efficient

View File

@ -200,7 +200,7 @@ private:
} }
// This processes cjk text: // This processes cjk text:
bool cjk_to_words(Utf8Iter *it, unsigned int *cp); bool cjk_to_words(Utf8Iter& it, unsigned int *cp);
// Experimental Korean splitter. This uses an external Python tokenizer // Experimental Korean splitter. This uses an external Python tokenizer
bool ko_to_words(Utf8Iter *it, unsigned int *cp); bool ko_to_words(Utf8Iter *it, unsigned int *cp);