From 9565663f09ede1b20c628d3cf245f900326778de Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 14 Apr 2020 09:27:26 +0200 Subject: [PATCH] textsplit: create isNGRAMMED() method to replace isCJK() and let the latter actually return what it says --- src/common/textsplit.cpp | 8 ++++++-- src/common/textsplit.h | 6 ++++-- src/rcldb/rclabstract.cpp | 2 +- src/rcldb/rcldb.h | 5 ++--- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 614a2b5f..1c782fca 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -319,8 +319,7 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr) bool TextSplit::isCJK(int c) { - return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) && - !UNICODE_IS_HANGUL(c); + return UNICODE_IS_CJK(c); } bool TextSplit::isKATAKANA(int c) { @@ -330,6 +329,11 @@ bool TextSplit::isHANGUL(int c) { return UNICODE_IS_HANGUL(c); } +bool TextSplit::isNGRAMMED(int c) +{ + return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) && + !UNICODE_IS_HANGUL(c); +} // This is used to detect katakana/other transitions, which must diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 3cf7adf3..573b723c 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -89,11 +89,13 @@ public: static bool stringToStrings(const std::string &s, std::vector &tokens); - /** Is char CJK ? (excluding Katakana) */ + /** Is char CJK ? */ static bool isCJK(int c); static bool isKATAKANA(int c); static bool isHANGUL(int c); - + /* Not split in words */ + static bool isNGRAMMED(int c); + /** Statistics about word length (average and dispersion) can * detect bad data like undecoded base64 or other mis-identified * pieces of data taken as text. In practise, this keeps some junk out diff --git a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp index b6fb729a..6f333eba 100644 --- a/src/rcldb/rclabstract.cpp +++ b/src/rcldb/rclabstract.cpp @@ -478,7 +478,7 @@ void Query::Native::abstractCreateSnippetsVector( } Utf8Iter uit(ent.second); bool newcjk = false; - if (TextSplit::isCJK(*uit)) + if (TextSplit::isNGRAMMED(*uit)) newcjk = true; if (!incjk || (incjk && !newcjk)) chunk += " "; diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 10d446e0..520f516f 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -202,9 +202,8 @@ public: Utf8Iter u8i(term); if (with_aspell) { - // If spelling with aspell, neither katakana nor other cjk - // scripts are candidates - if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i)) + // If spelling with aspell, CJK scripts are not candidates + if (TextSplit::isCJK(*u8i)) return false; } else { #ifdef TESTING_XAPIAN_SPELL