Simplify initialization by moving static config textsplit init from rclconfig to textsplit

2019-02-01 09:09:15 +01:00 · 2019-02-01 09:09:15 +01:00 · b1ff34407d
commit b1ff34407d
parent 04f3449f99
4 changed files with 62 additions and 80 deletions
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -47,7 +47,6 @@
 #include "conftree.h"
 #include "log.h"
 #include "smallut.h"
 #include "textsplit.h"
 #include "readfile.h"
 #include "fstreewalk.h"
 #include "cpuconf.h"
@ -394,32 +393,7 @@ bool RclConfig::updateMainConfig()
    setKeyDir(cstr_null);
-    // Texsplit customization
+    bool bvalue = true;
    bool bvalue = false;
    if (getConfParam("nocjk", &bvalue) && bvalue == true) {
        TextSplit::cjkProcessing(false);
    } else {
        int ngramlen;
        if (getConfParam("cjkngramlen", &ngramlen)) {
            TextSplit::cjkProcessing(true, (unsigned int)ngramlen);
        } else {
            TextSplit::cjkProcessing(true);
        }
    }
    bvalue = false;
    if (getConfParam("nonumbers", &bvalue) && bvalue == true) {
        TextSplit::noNumbers();
    }
    bvalue = false;
    if (getConfParam("dehyphenate", &bvalue)) {
        TextSplit::deHyphenate(bvalue);
    }
    bvalue = false;
    if (getConfParam("backslashasletter", &bvalue)) {
        TextSplit::backslashAsLetter(bvalue);
    }
    bvalue = true;
    if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
        FsTreeWalker::setNoFnmPathname();
    }
--- a/src/common/rclinit.cpp
+++ b/src/common/rclinit.cpp
@ -37,6 +37,7 @@
 #include "unac.h"
 #include "smallut.h"
 #include "execmd.h"
 #include "textsplit.h"
 std::thread::id mainthread_id;
@ -273,6 +274,8 @@ RclConfig *recollinit(int flags,
 	return 0;
    }
    TextSplit::staticConfInit(config);
    // Retrieve the log file name and level. Daemon and batch indexing
    // processes may use specific values, else fall back on common
    // ones.
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,4 +1,4 @@
-/* Copyright (C) 2004 J.F.Dockes
+/* Copyright (C) 2004-2019 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@ -32,6 +32,7 @@
 #include "utf8iter.h"
 #include "uproplist.h"
 #include "smallut.h"
 #include "rclconfig.h"
 // Decide if we treat katakana as western scripts, splitting into
 // words instead of n-grams. This is not absurd (katakana is a kind of
@ -137,14 +138,6 @@ public:
 };
 static const CharClassInit charClassInitInstance;
 // Choose the character class assigned to the backslash: A_LLETTER
 // when 'on' is true, SPACE otherwise.
 void TextSplit::backslashAsLetter(bool on) {
     charclasses[int('\\')] = on ? A_LLETTER : SPACE;
 }
 static inline int whatcc(unsigned int c)
 {
    if (c <= 127) {
@ -251,10 +244,47 @@ bool TextSplit::isKATAKANA(int c)
 // which has its span reader causing a word break)
 enum CharSpanClass {CSC_CJK, CSC_KATAKANA, CSC_OTHER};
-bool          TextSplit::o_processCJK = true;
+bool          TextSplit::o_processCJK{true};
-unsigned int  TextSplit::o_CJKNgramLen = 2;
+unsigned int  TextSplit::o_CJKNgramLen{2};
-bool          TextSplit::o_noNumbers = false;
+bool          TextSplit::o_noNumbers{false};
-bool          TextSplit::o_deHyphenate = false;
+bool          TextSplit::o_deHyphenate{false};
 int           TextSplit::o_maxWordLength{40};
 static const int o_CJKMaxNgramLen{5};
 /**
  * Read the text splitter's static settings from the configuration.
  *
  * Configuration keys read: "maxtermlength", "nocjk", "cjkngramlen",
  * "nonumbers", "dehyphenate" and "backslashasletter". A static value
  * is left untouched when its key is not found, except that CJK
  * processing is switched on whenever "nocjk" is absent or false.
  *
  * @param config configuration object to query; must not be null.
  */
 void TextSplit::staticConfInit(RclConfig *config)
 {
     config->getConfParam("maxtermlength", &o_maxWordLength);

     bool bvalue{false};
     if (config->getConfParam("nocjk", &bvalue) && bvalue == true) {
         o_processCJK = false;
     } else {
         o_processCJK = true;
         int ngramlen{0};
         if (config->getConfParam("cjkngramlen", &ngramlen)) {
             // Clamp using unsigned arithmetic: a negative configured
             // value would otherwise pass a signed "<= max" test and
             // then wrap to a huge n-gram length when converted.
             const unsigned int maxlen =
                 static_cast<unsigned int>(o_CJKMaxNgramLen);
             const unsigned int len = static_cast<unsigned int>(ngramlen);
             o_CJKNgramLen = len <= maxlen ? len : maxlen;
         }
     }

     bvalue = false;
     if (config->getConfParam("nonumbers", &bvalue)) {
         o_noNumbers = bvalue;
     }

     bvalue = false;
     if (config->getConfParam("dehyphenate", &bvalue)) {
         o_deHyphenate = bvalue;
     }

     bvalue = false;
     if (config->getConfParam("backslashasletter", &bvalue)) {
         // Both settings must update the table: the backslash is a
         // letter when the option is on, and a SPACE when it is off.
         charclasses[int('\\')] = bvalue ? A_LLETTER : SPACE;
     }
 }
 // Final term checkpoint: do some checking (the kind which is simpler
 // to do here than in the main loop), then send term to our client.
@ -272,7 +302,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
 	m_stats.newsamp(m_wordChars);
 #endif
-    if (l > 0 && l < m_maxWordLength) {
+    if (l > 0 && l <= o_maxWordLength) {
 	// 1 byte word: we index single ascii letters and digits, but
 	// nothing else. We might want to turn this into a test for a
 	// single utf8 character instead ?
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -23,6 +23,7 @@
 #include <vector>
 class Utf8Iter;
 class RclConfig;
 /** 
 * Split text into words. 
@ -32,38 +33,6 @@ class Utf8Iter;
 */
 class TextSplit {
 public:
    // Should we activate special processing of Chinese characters ? This
    // needs a little more cpu, so it can be turned off globally. This is set
    // by rclconfig, changing it means reindexing
    static bool o_processCJK;
    static unsigned int  o_CJKNgramLen;
    static const unsigned int o_CJKMaxNgramLen =  5;
    /** Switch CJK processing on or off and set the n-gram length,
     *  which is capped at o_CJKMaxNgramLen. */
    static void cjkProcessing(bool onoff, unsigned int ngramlen = 2)
    {
        o_processCJK = onoff;
        if (ngramlen > o_CJKMaxNgramLen) {
            ngramlen = o_CJKMaxNgramLen;
        }
        o_CJKNgramLen = ngramlen;
    }
    // Are we indexing numbers ? Set by rclconfig. Change needs reindex
    static bool o_noNumbers;
    /** Raise the o_noNumbers flag. There is no way to clear it again
     *  through this call. */
    static void noNumbers() {
        o_noNumbers = true;
    }
    // Given [co-worker] as input, do we also generate [coworker] ?
    // Set by rclconfig
    static bool o_deHyphenate;
    /** Store the requested dehyphenation setting in o_deHyphenate. */
    static void deHyphenate(bool on)
    {
        o_deHyphenate = on;
    }
    // Process backslashes as letters? Default is off, but it may be
    // useful for searching for tex commands. Config variable:
    // backslashasletter
    static void backslashAsLetter(bool on);
    enum Flags {
        // Default: will return spans and words (a_b, a, b)
        TXTS_NONE = 0, 
@ -79,11 +48,13 @@ public:
    };
    TextSplit(Flags flags = Flags(TXTS_NONE))
-	: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
+	: m_flags(flags) {}
    {
    }
    virtual ~TextSplit() {}
    /** Call at program initialization to read non default values from the 
        configuration */
    static void staticConfInit(RclConfig *config);
    /** Split text, emit words and positions. */
    virtual bool text_to_words(const std::string &in);
@ -97,8 +68,7 @@ public:
    /** Called when we encounter formfeed \f 0x0c. Override to use the event.
     * Mostly or exclusively used with pdftoxx output. Other filters mostly 
     * just don't know about pages. */
-    virtual void newpage(int /*pos*/) {
+    virtual void newpage(int /*pos*/) {}
    }
    // Static utility functions:
@ -184,8 +154,13 @@ public:
 #endif // TEXTSPLIT_STATS
 private:
    static bool o_processCJK; // true
    static bool o_noNumbers;  // false
    static bool o_deHyphenate; // false
    static unsigned int o_CJKNgramLen; // 2
    static int o_maxWordLength; // 40
    Flags         m_flags;
    int           m_maxWordLength;
    // Current span. Might be jf.dockes@wanadoo.f
    std::string        m_span; 
@ -206,7 +181,7 @@ private:
    // It may happen that our cleanup would result in emitting the
    // same term twice. We try to avoid this
-    int           m_prevpos;
+    int           m_prevpos{-1};
    int           m_prevlen;
 #ifdef TEXTSPLIT_STATS