From f853f39ef3e9caf9857a1693caafb65bf32bfb90 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 25 Apr 2017 10:20:38 +0200 Subject: [PATCH] Partially revert change treating Katakana as words, going back to n-grams. Did not work well because of separator-less compounds mostly --- src/Makefile.am | 1 - src/common/textsplit.cpp | 24 ++++++++++++++++++++++-- src/qtgui/recoll.pro.in | 2 +- src/rcldb/rcldb.h | 22 +++++++++++++++++++--- 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index e63d56cd..56b49b4e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -32,7 +32,6 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \ $(X_CFLAGS) \ -DRECOLL_DATADIR=\"${pkgdatadir}\" \ -D_GNU_SOURCE \ - -DTESTING_XAPIAN_SPELL \ $(DEFS) ACLOCAL_AMFLAGS = -I m4 diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 52bc26f2..081e05d4 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -33,6 +33,16 @@ #include "uproplist.h" #include "smallut.h" +// Decide if we treat katakana as western scripts, splitting into +// words instead of n-grams. This is not absurd (katakana is a kind of +// alphabet, albeit phonetic and syllabic and is mostly used to +// transcribe western words), but it does not work well because +// Japanese uses separator-less compound katakana words, and because +// the plural endings are irregular and would need a specialized +// stemmer. So for now we process katakana like the rest of CJK, using +// n-grams +#undef KATAKANA_AS_WORDS + using namespace std; /** @@ -209,11 +219,15 @@ static inline int whatcc(unsigned int c) // katakana variants' to something else. Look up "Kuromoji" Lucene // filter, KuromojiNormalizeFilter.java // 309F is Hiragana. 
+#ifdef KATAKANA_AS_WORDS #define UNICODE_IS_KATAKANA(p) \ ((p) != 0x309F && \ (((p) >= 0x3099 && (p) <= 0x30FF) || \ ((p) >= 0x31F0 && (p) <= 0x31FF))) - +#else +#define UNICODE_IS_KATAKANA(p) false +#endif + bool TextSplit::isCJK(int c) { return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c); @@ -520,6 +534,7 @@ bool TextSplit::text_to_words(const string &in) LOGERR("Textsplit: error occured while scanning UTF-8 string\n"); return false; } + CharSpanClass csc; if (UNICODE_IS_KATAKANA(c)) { csc = CSC_KATAKANA; @@ -528,6 +543,7 @@ bool TextSplit::text_to_words(const string &in) } else { csc = CSC_OTHER; } + if (o_processCJK && csc == CSC_CJK) { // CJK excluding Katakana character hit. // Do like at EOF with the current non-cjk data. @@ -548,6 +564,9 @@ bool TextSplit::text_to_words(const string &in) break; } +#ifdef KATAKANA_AS_WORDS + // Only needed if we have script transitions inside this + // routine, else the call to cjk_to_words does the job. if (csc != prev_csc && (m_wordLen || m_span.length())) { LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " << m_wordLen << " spl " << m_span.length() << endl); @@ -555,8 +574,9 @@ bool TextSplit::text_to_words(const string &in) return false; } } +#endif + prev_csc = csc; - int cc = whatcc(c); switch (cc) { diff --git a/src/qtgui/recoll.pro.in b/src/qtgui/recoll.pro.in index e8dc3345..c58105f0 100644 --- a/src/qtgui/recoll.pro.in +++ b/src/qtgui/recoll.pro.in @@ -2,7 +2,7 @@ TEMPLATE = app LANGUAGE = C++ VPATH = @srcdir@ -DEFINES += BUILDING_RECOLL TESTING_XAPIAN_SPELL +DEFINES += BUILDING_RECOLL @QMAKE_ENABLE_WEBKIT@ QT += webkit @QMAKE_DISABLE_WEBKIT@ QMAKE_CXXFLAGS += -DRESLIST_TEXTBROWSER -DSNIPPETS_TEXTBROWSER diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 89a5c360..ebfcfc74 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -200,21 +200,37 @@ class Db { vector getStemLangs(); /** Test word for spelling correction candidate: not too long, no - special chars... 
*/ - static bool isSpellingCandidate(const string& term, bool aspell=true) + * special chars... + * @param with_aspell test for use with aspell, else for xapian speller + */ + static bool isSpellingCandidate(const string& term, bool with_aspell=true) { if (term.empty() || term.length() > 50) return false; if (has_prefix(term)) return false; Utf8Iter u8i(term); - if (aspell) { + if (with_aspell) { + // If spelling with aspell, neither katakana nor other cjk + // scripts are candidates if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i)) return false; } else { +#ifdef TESTING_XAPIAN_SPELL + // The Xapian speller (purely proximity-based) can be used + // for Katakana (when split as words which is not always + // completely feasible because of separator-less + // compounds). Currently we don't try to use the Xapian + // speller with other scripts with which it would be usable + // in the absence of aspell (it would indeed be better + // than nothing with e.g. european languages). This would + // require a few more config variables, maybe one day. if (!TextSplit::isKATAKANA(*u8i)) { return false; } +#else + return false; +#endif } if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") != string::npos)