textsplit: create isNGRAMMED() method to replace isCJK() and let the latter actually return what it says

This commit is contained in:
Jean-Francois Dockes 2020-04-14 09:27:26 +02:00
parent 5dd8774b3c
commit 9565663f09
4 changed files with 13 additions and 8 deletions

View File

@ -319,8 +319,7 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
bool TextSplit::isCJK(int c)
{
return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
!UNICODE_IS_HANGUL(c);
return UNICODE_IS_CJK(c);
}
bool TextSplit::isKATAKANA(int c)
{
@ -330,6 +329,11 @@ bool TextSplit::isHANGUL(int c)
{
return UNICODE_IS_HANGUL(c);
}
// True when UNICODE_IS_CJK holds for c while neither UNICODE_IS_KATAKANA
// nor UNICODE_IS_HANGUL does.
bool TextSplit::isNGRAMMED(int c)
{
    // Anything outside the CJK test is rejected right away.
    if (!UNICODE_IS_CJK(c)) {
        return false;
    }
    // Katakana and Hangul are the two exclusions from the CJK set.
    return !(UNICODE_IS_KATAKANA(c) || UNICODE_IS_HANGUL(c));
}
// This is used to detect katakana/other transitions, which must

View File

@ -89,11 +89,13 @@ public:
static bool stringToStrings(const std::string &s,
std::vector<std::string> &tokens);
/** Is char CJK ? (excluding Katakana) */
/** Is char CJK ? */
static bool isCJK(int c);
static bool isKATAKANA(int c);
static bool isHANGUL(int c);
/** Is char n-grammed (not split into words) ? */
static bool isNGRAMMED(int c);
/** Statistics about word length (average and dispersion) can
* detect bad data like undecoded base64 or other mis-identified
* pieces of data taken as text. In practice, this keeps some junk out

View File

@ -478,7 +478,7 @@ void Query::Native::abstractCreateSnippetsVector(
}
Utf8Iter uit(ent.second);
bool newcjk = false;
if (TextSplit::isCJK(*uit))
if (TextSplit::isNGRAMMED(*uit))
newcjk = true;
if (!incjk || (incjk && !newcjk))
chunk += " ";

View File

@ -202,9 +202,8 @@ public:
Utf8Iter u8i(term);
if (with_aspell) {
// If spelling with aspell, neither katakana nor other cjk
// scripts are candidates
if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
// If spelling with aspell, CJK scripts are not candidates
if (TextSplit::isCJK(*u8i))
return false;
} else {
#ifdef TESTING_XAPIAN_SPELL