CJK indexing: return to western word indexing if encountering numeric after punctuation
This commit is contained in:
parent
b25dab7cab
commit
8285e18039
@ -684,7 +684,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!cjk_to_words(&it, &c)) {
|
if (!cjk_to_words(it, &c)) {
|
||||||
LOGERR("Textsplit: scan error in cjk handler\n");
|
LOGERR("Textsplit: scan error in cjk handler\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -993,9 +993,6 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Using an utf8iter pointer just to avoid needing its definition in
|
|
||||||
// textsplit.h
|
|
||||||
//
|
|
||||||
// We output ngrams: for example, for char input a b c and ngramlen == 2,
|
// We output ngrams: for example, for char input a b c and ngramlen == 2,
|
||||||
// we generate: a ab b bc c as words
|
// we generate: a ab b bc c as words
|
||||||
//
|
//
|
||||||
@ -1004,10 +1001,9 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
//
|
//
|
||||||
// The routine is sort of a mess and goes to show that we'd probably
|
// The routine is sort of a mess and goes to show that we'd probably
|
||||||
// be better off converting the whole buffer to utf32 on entry...
|
// be better off converting the whole buffer to utf32 on entry...
|
||||||
bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
bool TextSplit::cjk_to_words(Utf8Iter& it, unsigned int *cp)
|
||||||
{
|
{
|
||||||
LOGDEB1("cjk_to_words: m_wordpos " << m_wordpos << "\n");
|
LOGDEB1("cjk_to_words: m_wordpos " << m_wordpos << "\n");
|
||||||
Utf8Iter &it = *itp;
|
|
||||||
|
|
||||||
// We use an offset buffer to remember the starts of the utf-8
|
// We use an offset buffer to remember the starts of the utf-8
|
||||||
// characters which we still need to use.
|
// characters which we still need to use.
|
||||||
@ -1019,12 +1015,18 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
// Current number of valid offsets;
|
// Current number of valid offsets;
|
||||||
unsigned int nchars = 0;
|
unsigned int nchars = 0;
|
||||||
unsigned int c = 0;
|
unsigned int c = 0;
|
||||||
|
bool spacebefore{false};
|
||||||
for (; !it.eof() && !it.error(); it++) {
|
for (; !it.eof() && !it.error(); it++) {
|
||||||
c = *it;
|
c = *it;
|
||||||
// We had a version which ignored whitespace for some time,
|
// We had a version which ignored whitespace for some time,
|
||||||
// but this was a bad idea. Only break on an non-cjk
|
// but this was a bad idea. Only break on a non-cjk
|
||||||
// alphabetic character.
|
// *alphabetic* character, except if following punctuation, in
|
||||||
if (!UNICODE_IS_CJK(c) && (c > 255 || isalpha(c))) {
|
// which case we return for any non-cjk. This allows compound
|
||||||
|
// cjk+numeric spans, or punctuated cjk spans to be
|
||||||
|
// continually indexed as cjk. The best approach is a matter
|
||||||
|
// of appreciation...
|
||||||
|
if (!UNICODE_IS_CJK(c) &&
|
||||||
|
(spacebefore || (c > 255 || isalpha(c)))) {
|
||||||
// Return to normal handler
|
// Return to normal handler
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1032,7 +1034,10 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
// Flush the ngram buffer and go on
|
// Flush the ngram buffer and go on
|
||||||
nchars = 0;
|
nchars = 0;
|
||||||
mybuf.clear();
|
mybuf.clear();
|
||||||
|
spacebefore = true;
|
||||||
continue;
|
continue;
|
||||||
|
} else {
|
||||||
|
spacebefore = false;
|
||||||
}
|
}
|
||||||
if (nchars == o_CJKNgramLen) {
|
if (nchars == o_CJKNgramLen) {
|
||||||
// Offset buffer full, shift it. Might be more efficient
|
// Offset buffer full, shift it. Might be more efficient
|
||||||
|
|||||||
@ -200,7 +200,7 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// This processes cjk text:
|
// This processes cjk text:
|
||||||
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
bool cjk_to_words(Utf8Iter& it, unsigned int *cp);
|
||||||
|
|
||||||
// Experimental Korean splitter. This uses an external Python tokenizer
|
// Experimental Korean splitter. This uses an external Python tokenizer
|
||||||
bool ko_to_words(Utf8Iter *it, unsigned int *cp);
|
bool ko_to_words(Utf8Iter *it, unsigned int *cp);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user