textsplit: add isNGRAMMED() to take over the previous isCJK() semantics (CJK excluding Katakana and Hangul), and make isCJK() return true for any CJK character, as its name says
This commit is contained in:
parent
5dd8774b3c
commit
9565663f09
@ -319,8 +319,7 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
|
|||||||
|
|
||||||
bool TextSplit::isCJK(int c)
|
bool TextSplit::isCJK(int c)
|
||||||
{
|
{
|
||||||
return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
|
return UNICODE_IS_CJK(c);
|
||||||
!UNICODE_IS_HANGUL(c);
|
|
||||||
}
|
}
|
||||||
bool TextSplit::isKATAKANA(int c)
|
bool TextSplit::isKATAKANA(int c)
|
||||||
{
|
{
|
||||||
@ -330,6 +329,11 @@ bool TextSplit::isHANGUL(int c)
|
|||||||
{
|
{
|
||||||
return UNICODE_IS_HANGUL(c);
|
return UNICODE_IS_HANGUL(c);
|
||||||
}
|
}
|
||||||
|
bool TextSplit::isNGRAMMED(int c)
|
||||||
|
{
|
||||||
|
return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
|
||||||
|
!UNICODE_IS_HANGUL(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// This is used to detect katakana/other transitions, which must
|
// This is used to detect katakana/other transitions, which must
|
||||||
|
|||||||
@ -89,10 +89,12 @@ public:
|
|||||||
static bool stringToStrings(const std::string &s,
|
static bool stringToStrings(const std::string &s,
|
||||||
std::vector<std::string> &tokens);
|
std::vector<std::string> &tokens);
|
||||||
|
|
||||||
/** Is char CJK ? (excluding Katakana) */
|
/** Is char CJK? (Katakana and Hangul are no longer excluded, see isNGRAMMED()) */
|
||||||
static bool isCJK(int c);
|
static bool isCJK(int c);
|
||||||
static bool isKATAKANA(int c);
|
static bool isKATAKANA(int c);
|
||||||
static bool isHANGUL(int c);
|
static bool isHANGUL(int c);
|
||||||
|
/** Is char from a script which is not split into words? (CJK, excluding Katakana and Hangul) */
|
||||||
|
static bool isNGRAMMED(int c);
|
||||||
|
|
||||||
/** Statistics about word length (average and dispersion) can
|
/** Statistics about word length (average and dispersion) can
|
||||||
* detect bad data like undecoded base64 or other mis-identified
|
* detect bad data like undecoded base64 or other mis-identified
|
||||||
|
|||||||
@ -478,7 +478,7 @@ void Query::Native::abstractCreateSnippetsVector(
|
|||||||
}
|
}
|
||||||
Utf8Iter uit(ent.second);
|
Utf8Iter uit(ent.second);
|
||||||
bool newcjk = false;
|
bool newcjk = false;
|
||||||
if (TextSplit::isCJK(*uit))
|
if (TextSplit::isNGRAMMED(*uit))
|
||||||
newcjk = true;
|
newcjk = true;
|
||||||
if (!incjk || (incjk && !newcjk))
|
if (!incjk || (incjk && !newcjk))
|
||||||
chunk += " ";
|
chunk += " ";
|
||||||
|
|||||||
@ -202,9 +202,8 @@ public:
|
|||||||
|
|
||||||
Utf8Iter u8i(term);
|
Utf8Iter u8i(term);
|
||||||
if (with_aspell) {
|
if (with_aspell) {
|
||||||
// If spelling with aspell, neither katakana nor other cjk
|
// If spelling with aspell, terms beginning with a CJK character (Katakana included) are not candidates
|
||||||
// scripts are candidates
|
if (TextSplit::isCJK(*u8i))
|
||||||
if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
|
|
||||||
return false;
|
return false;
|
||||||
} else {
|
} else {
|
||||||
#ifdef TESTING_XAPIAN_SPELL
|
#ifdef TESTING_XAPIAN_SPELL
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user