Partially revert the change treating Katakana as words, going back to n-grams. It did not work well, mostly because of separator-less compounds.
commit f853f39ef3
parent 59e5cc4150
@@ -32,7 +32,6 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
        $(X_CFLAGS) \
        -DRECOLL_DATADIR=\"${pkgdatadir}\" \
        -D_GNU_SOURCE \
-       -DTESTING_XAPIAN_SPELL \
        $(DEFS)

 ACLOCAL_AMFLAGS = -I m4
@@ -33,6 +33,16 @@
 #include "uproplist.h"
 #include "smallut.h"

+// Decide if we treat katakana as western scripts, splitting into
+// words instead of n-grams. This is not absurd (katakana is a kind of
+// alphabet, albeit phonetic and syllabic, and is mostly used to
+// transcribe western words), but it does not work well because
+// Japanese uses separator-less compound katakana words, and because
+// the plural endings are irregular and would need a specialized
+// stemmer. So, for now, we process katakana like the rest of CJK,
+// using n-grams.
+#undef KATAKANA_AS_WORDS
+
 using namespace std;

 /**
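For background on what "using n-grams" means here: a katakana (or other CJK) run is indexed as overlapping character n-grams rather than dictionary words, so separator-less compounds still produce matchable terms. The sketch below only illustrates that idea; it is not Recoll's actual cjk_to_words routine, and the bigram size and std::u32string input are assumptions made for the example.

#include <iostream>
#include <string>
#include <vector>

// Cut a run of CJK/katakana code points into overlapping n-grams
// (bigrams by default). Runs shorter than n are emitted whole.
static std::vector<std::u32string> ngrams(const std::u32string& run, size_t n = 2)
{
    std::vector<std::u32string> out;
    if (run.size() < n) {
        if (!run.empty())
            out.push_back(run);
        return out;
    }
    for (size_t i = 0; i + n <= run.size(); i++)
        out.push_back(run.substr(i, n));
    return out;
}

int main()
{
    // "tekisuto" (text) as four katakana code points: yields 3 bigrams.
    std::u32string run = U"\u30C6\u30AD\u30B9\u30C8";
    std::cout << ngrams(run).size() << " bigrams\n";
    return 0;
}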
@@ -209,11 +219,15 @@ static inline int whatcc(unsigned int c)
 // katakana variants' to something else. Look up "Kuromoji" Lucene
 // filter, KuromojiNormalizeFilter.java
 // 309F is Hiragana.
+#ifdef KATAKANA_AS_WORDS
 #define UNICODE_IS_KATAKANA(p) \
     ((p) != 0x309F && \
      (((p) >= 0x3099 && (p) <= 0x30FF) || \
       ((p) >= 0x31F0 && (p) <= 0x31FF)))
+#else
+#define UNICODE_IS_KATAKANA(p) false
+#endif

 bool TextSplit::isCJK(int c)
 {
     return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
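As a quick sanity check of the ranges used by the macro, here is a standalone copy of the KATAKANA_AS_WORDS variant (the shipped build now defines UNICODE_IS_KATAKANA(p) to false, so this is for illustration only):

#include <cassert>

// Copy of the word-splitting variant of the macro, for testing only.
#define UNICODE_IS_KATAKANA(p) \
    ((p) != 0x309F && \
     (((p) >= 0x3099 && (p) <= 0x30FF) || \
      ((p) >= 0x31F0 && (p) <= 0x31FF)))

int main()
{
    assert(UNICODE_IS_KATAKANA(0x30AB));   // KATAKANA LETTER KA
    assert(UNICODE_IS_KATAKANA(0x31F0));   // Katakana Phonetic Extensions block
    assert(!UNICODE_IS_KATAKANA(0x309F));  // HIRAGANA DIGRAPH YORI, explicitly excluded
    assert(!UNICODE_IS_KATAKANA(0x0041));  // plain ASCII 'A'
    return 0;
}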
@@ -520,6 +534,7 @@ bool TextSplit::text_to_words(const string &in)
            LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
            return false;
        }
+
        CharSpanClass csc;
        if (UNICODE_IS_KATAKANA(c)) {
            csc = CSC_KATAKANA;
@@ -528,6 +543,7 @@ bool TextSplit::text_to_words(const string &in)
        } else {
            csc = CSC_OTHER;
        }
+
        if (o_processCJK && csc == CSC_CJK) {
            // CJK excluding Katakana character hit.
            // Do like at EOF with the current non-cjk data.
@@ -548,6 +564,9 @@ bool TextSplit::text_to_words(const string &in)
                break;
            }

+#ifdef KATAKANA_AS_WORDS
+        // Only needed if we have script transitions inside this
+        // routine, else the call to cjk_to_words does the job.
        if (csc != prev_csc && (m_wordLen || m_span.length())) {
            LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
                   m_wordLen << " spl " << m_span.length() << endl);
@@ -555,8 +574,9 @@ bool TextSplit::text_to_words(const string &in)
                return false;
            }
        }
+#endif

        prev_csc = csc;

        int cc = whatcc(c);

        switch (cc) {
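The block guarded above only matters when katakana is split as words: in that mode a single pass of text_to_words can see a transition between katakana and another script and must flush the span accumulated so far, so that no term straddles the boundary. Below is a minimal, simplified sketch of that flush-on-transition idea; digits versus other characters stand in for the script classes, and all names are illustrative only (Recoll's real loop works on UTF-8 and carries much more state):

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

enum CharSpanClass { CSC_OTHER, CSC_DIGIT };

static CharSpanClass classOf(char c)
{
    return std::isdigit(static_cast<unsigned char>(c)) ? CSC_DIGIT : CSC_OTHER;
}

int main()
{
    std::string in = "abc123def";
    std::vector<std::string> words;
    std::string span;
    CharSpanClass prev_csc = CSC_OTHER;
    for (char c : in) {
        CharSpanClass csc = classOf(c);
        if (csc != prev_csc && !span.empty()) {
            words.push_back(span);  // flush the pending span on a class transition
            span.clear();
        }
        prev_csc = csc;
        span += c;
    }
    if (!span.empty())
        words.push_back(span);
    for (const auto& w : words)
        std::cout << w << "\n";     // prints: abc, 123, def
    return 0;
}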
@@ -2,7 +2,7 @@ TEMPLATE = app
 LANGUAGE = C++

 VPATH = @srcdir@
-DEFINES += BUILDING_RECOLL TESTING_XAPIAN_SPELL
+DEFINES += BUILDING_RECOLL

 @QMAKE_ENABLE_WEBKIT@ QT += webkit
 @QMAKE_DISABLE_WEBKIT@ QMAKE_CXXFLAGS += -DRESLIST_TEXTBROWSER -DSNIPPETS_TEXTBROWSER
@@ -200,21 +200,37 @@ class Db {
     vector<string> getStemLangs();

     /** Test word for spelling correction candidate: not too long, no
-        special chars... */
-    static bool isSpellingCandidate(const string& term, bool aspell=true)
+     * special chars...
+     * @param with_aspell test for use with aspell, else for the Xapian speller
+     */
+    static bool isSpellingCandidate(const string& term, bool with_aspell=true)
     {
         if (term.empty() || term.length() > 50)
             return false;
         if (has_prefix(term))
             return false;
         Utf8Iter u8i(term);
-        if (aspell) {
+        if (with_aspell) {
+            // If spelling with aspell, neither katakana nor other CJK
+            // scripts are candidates.
             if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
                 return false;
         } else {
+#ifdef TESTING_XAPIAN_SPELL
+            // The Xapian speller (purely proximity-based) can be used
+            // for katakana (when split as words, which is not always
+            // completely feasible because of separator-less
+            // compounds). Currently we don't try to use the Xapian
+            // speller with other scripts with which it would be usable
+            // in the absence of aspell (it would indeed be better
+            // than nothing with e.g. European languages). This would
+            // require a few more config variables, maybe one day.
             if (!TextSplit::isKATAKANA(*u8i)) {
                 return false;
             }
+#else
+            return false;
+#endif
         }
         if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
             != string::npos)
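To make the aspell/Xapian distinction concrete, here is a standalone sketch of the policy the comments above describe, applied to the term's first code point as the real method does. isKatakana() and isCJK() are hypothetical stand-ins with approximate ranges, not TextSplit::isKATAKANA()/isCJK():

#include <iostream>

// Rough stand-ins for the TextSplit predicates, for illustration only.
static bool isKatakana(char32_t c) { return c >= 0x30A0 && c <= 0x30FF; }
static bool isCJK(char32_t c)      { return c >= 0x2E80 && c <= 0x9FFF; }

static bool spellingCandidate(char32_t first, bool with_aspell)
{
    if (with_aspell) {
        // aspell: neither katakana nor other CJK scripts are candidates.
        return !(isCJK(first) || isKatakana(first));
    }
    // Xapian speller (TESTING_XAPIAN_SPELL builds): only katakana terms
    // are let through; everything else is rejected.
    return isKatakana(first);
}

int main()
{
    std::cout << spellingCandidate(U'a', true) << "\n";    // 1: Latin term, aspell
    std::cout << spellingCandidate(0x30AB, true) << "\n";  // 0: katakana, aspell
    std::cout << spellingCandidate(0x30AB, false) << "\n"; // 1: katakana, Xapian speller
    return 0;
}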