textsplit: create isNGRAMMED() method to replace isCJK() and let the latter actually return what it says
This commit is contained in:
parent
5dd8774b3c
commit
9565663f09
@ -319,8 +319,7 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
|
||||
|
||||
bool TextSplit::isCJK(int c)
|
||||
{
|
||||
return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
|
||||
!UNICODE_IS_HANGUL(c);
|
||||
return UNICODE_IS_CJK(c);
|
||||
}
|
||||
bool TextSplit::isKATAKANA(int c)
|
||||
{
|
||||
@ -330,6 +329,11 @@ bool TextSplit::isHANGUL(int c)
|
||||
{
|
||||
return UNICODE_IS_HANGUL(c);
|
||||
}
|
||||
// Return true when c is matched by UNICODE_IS_CJK() but by neither
// UNICODE_IS_KATAKANA() nor UNICODE_IS_HANGUL(). This is the test that
// isCJK() performed before it was reduced to plain UNICODE_IS_CJK().
// NOTE(review): judging by the name, these are presumably the characters
// indexed as n-grams rather than as words - confirm against the splitter.
bool TextSplit::isNGRAMMED(int c)
|
||||
{
|
||||
return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
|
||||
!UNICODE_IS_HANGUL(c);
|
||||
}
|
||||
|
||||
|
||||
// This is used to detect katakana/other transitions, which must
|
||||
|
||||
@ -89,11 +89,13 @@ public:
|
||||
static bool stringToStrings(const std::string &s,
|
||||
std::vector<std::string> &tokens);
|
||||
|
||||
/** Is char CJK ? (excluding Katakana) */
|
||||
/** Is char CJK ? */
|
||||
static bool isCJK(int c);
|
||||
static bool isKATAKANA(int c);
|
||||
static bool isHANGUL(int c);
|
||||
|
||||
/** Is char handled as n-grams, i.e. not split into words ? (CJK, excluding Katakana and Hangul) */
|
||||
static bool isNGRAMMED(int c);
|
||||
|
||||
/** Statistics about word length (average and dispersion) can
|
||||
* detect bad data like undecoded base64 or other mis-identified
|
||||
* pieces of data taken as text. In practice, this keeps some junk out
|
||||
|
||||
@ -478,7 +478,7 @@ void Query::Native::abstractCreateSnippetsVector(
|
||||
}
|
||||
Utf8Iter uit(ent.second);
|
||||
bool newcjk = false;
|
||||
if (TextSplit::isCJK(*uit))
|
||||
if (TextSplit::isNGRAMMED(*uit))
|
||||
newcjk = true;
|
||||
if (!incjk || (incjk && !newcjk))
|
||||
chunk += " ";
|
||||
|
||||
@ -202,9 +202,8 @@ public:
|
||||
|
||||
Utf8Iter u8i(term);
|
||||
if (with_aspell) {
|
||||
// If spelling with aspell, neither katakana nor other cjk
|
||||
// scripts are candidates
|
||||
if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
|
||||
// If spelling with aspell, CJK scripts are not candidates
|
||||
if (TextSplit::isCJK(*u8i))
|
||||
return false;
|
||||
} else {
|
||||
#ifdef TESTING_XAPIAN_SPELL
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user