Partially revert the change treating Katakana as words, going back to n-grams. It did not work well, mostly because of separator-less compounds

2017-04-25 10:20:38 +02:00 · 2017-04-25 10:20:38 +02:00 · f853f39ef3
commit f853f39ef3
parent 59e5cc4150
4 changed files with 42 additions and 7 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -32,7 +32,6 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
    $(X_CFLAGS) \
    -DRECOLL_DATADIR=\"${pkgdatadir}\" \
    -D_GNU_SOURCE \
-    -DTESTING_XAPIAN_SPELL \
    $(DEFS)

 ACLOCAL_AMFLAGS = -I m4
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -33,6 +33,16 @@
 #include "uproplist.h"
 #include "smallut.h"

+// Decide if we treat katakana as western scripts, splitting into
+// words instead of n-grams. This is not absurd (katakana is a kind of
+// alphabet, albeit phonetic and syllabic, and is mostly used to
+// transcribe western words), but it does not work well because
+// Japanese uses separator-less compound katakana words, and because
+// the plural endings are irregular and would need a specialized
+// stemmer. So, for now, we process katakana like the rest of CJK,
+// using n-grams.
+#undef KATAKANA_AS_WORDS
+
 using namespace std;

 /**
@ -209,11 +219,15 @@ static inline int whatcc(unsigned int c)
 // katakana variants' to something else.  Look up "Kuromoji" Lucene
 // filter, KuromojiNormalizeFilter.java
 // 309F is Hiragana.
+#ifdef KATAKANA_AS_WORDS
 #define UNICODE_IS_KATAKANA(p)                                          \
    ((p) != 0x309F &&                                                   \
     (((p) >= 0x3099 && (p) <= 0x30FF) ||                               \
      ((p) >= 0x31F0 && (p) <= 0x31FF)))
-    
+#else
+#define UNICODE_IS_KATAKANA(p) false
+#endif
+
 bool TextSplit::isCJK(int c)
 {
    return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
@ -520,6 +534,7 @@ bool TextSplit::text_to_words(const string &in)
 	    LOGERR("Textsplit: error occured while scanning UTF-8 string\n");
 	    return false;
 	}
+
        CharSpanClass csc;
        if (UNICODE_IS_KATAKANA(c)) {
            csc = CSC_KATAKANA;
@ -528,6 +543,7 @@ bool TextSplit::text_to_words(const string &in)
        } else {
            csc = CSC_OTHER;
        }
+
 	if (o_processCJK && csc == CSC_CJK) {
 	    // CJK excluding Katakana character hit. 
 	    // Do like at EOF with the current non-cjk data.
@ -548,6 +564,9 @@ bool TextSplit::text_to_words(const string &in)
 		break;
 	}

+#ifdef KATAKANA_AS_WORDS
+        // Only needed if we have script transitions inside this
+        // routine, else the call to cjk_to_words does the job.
        if (csc != prev_csc && (m_wordLen || m_span.length())) {
            LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
                   m_wordLen << " spl " << m_span.length() << endl);
@ -555,8 +574,9 @@ bool TextSplit::text_to_words(const string &in)
                return false;
            }
        }
+#endif
+
        prev_csc = csc;
-        
 	int cc = whatcc(c);

 	switch (cc) {
--- a/src/qtgui/recoll.pro.in
+++ b/src/qtgui/recoll.pro.in
@ -2,7 +2,7 @@ TEMPLATE        = app
 LANGUAGE        = C++

 VPATH = @srcdir@
-DEFINES += BUILDING_RECOLL  TESTING_XAPIAN_SPELL
+DEFINES += BUILDING_RECOLL

@QMAKE_ENABLE_WEBKIT@ QT += webkit
@QMAKE_DISABLE_WEBKIT@ QMAKE_CXXFLAGS += -DRESLIST_TEXTBROWSER -DSNIPPETS_TEXTBROWSER
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -200,21 +200,37 @@ class Db {
    vector<string> getStemLangs();

    /** Test word for spelling correction candidate: not too long, no 
-	special chars... */
-    static bool isSpellingCandidate(const string& term, bool aspell=true)
+     * special chars... 
+     * @param with_aspell test for use with aspell, else for xapian speller
+     */
+    static bool isSpellingCandidate(const string& term, bool with_aspell=true)
    {
 	if (term.empty() || term.length() > 50)
 	    return false;
 	if (has_prefix(term))
 	    return false;
 	Utf8Iter u8i(term);
-        if (aspell) {
+        if (with_aspell) {
+            // If spelling with aspell, neither katakana nor other cjk
+            // scripts are candidates
            if (TextSplit::isCJK(*u8i) || TextSplit::isKATAKANA(*u8i))
                return false;
        } else {
+#ifdef TESTING_XAPIAN_SPELL
+            // The Xapian speller (purely proximity-based) can be used
+            // for Katakana (when split as words, which is not always
+            // completely feasible because of separator-less
+            // compounds). Currently we don't try to use the Xapian
+            // speller with other scripts for which it could be used
+            // in the absence of aspell (it would indeed be better
+            // than nothing with e.g. European languages). This would
+            // require a few more config variables, maybe one day.
            if (!TextSplit::isKATAKANA(*u8i)) {
                return false;
            }
+#else
+            return false;
+#endif
        }
 	if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~") 
 	    != string::npos)