From b1ff34407d039be8875075fce1c3b553f56f9035 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Fri, 1 Feb 2019 09:09:15 +0100
Subject: [PATCH] Simplify initialization by moving static config textsplit
 init from rclconfig to textsplit

---
 src/common/rclconfig.cpp | 28 +------------------
 src/common/rclinit.cpp   |  3 +++
 src/common/textsplit.cpp | 58 ++++++++++++++++++++++++++++++----------
 src/common/textsplit.h   | 53 ++++++++++--------------------------
 4 files changed, 62 insertions(+), 80 deletions(-)

diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp
index ec255624..f65a723c 100644
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@@ -47,7 +47,6 @@
 #include "conftree.h"
 #include "log.h"
 #include "smallut.h"
-#include "textsplit.h"
 #include "readfile.h"
 #include "fstreewalk.h"
 #include "cpuconf.h"
@@ -394,32 +393,7 @@ bool RclConfig::updateMainConfig()
 
     setKeyDir(cstr_null);
 
-    // Texsplit customization
-    bool bvalue = false;
-    if (getConfParam("nocjk", &bvalue) && bvalue == true) {
-        TextSplit::cjkProcessing(false);
-    } else {
-        int ngramlen;
-        if (getConfParam("cjkngramlen", &ngramlen)) {
-            TextSplit::cjkProcessing(true, (unsigned int)ngramlen);
-        } else {
-            TextSplit::cjkProcessing(true);
-        }
-    }
-    bvalue = false;
-    if (getConfParam("nonumbers", &bvalue) && bvalue == true) {
-        TextSplit::noNumbers();
-    }
-    bvalue = false;
-    if (getConfParam("dehyphenate", &bvalue)) {
-        TextSplit::deHyphenate(bvalue);
-    }
-    bvalue = false;
-    if (getConfParam("backslashasletter", &bvalue)) {
-        TextSplit::backslashAsLetter(bvalue);
-    }
-
-    bvalue = true;
+    bool bvalue = true;
     if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
         FsTreeWalker::setNoFnmPathname();
     }
diff --git a/src/common/rclinit.cpp b/src/common/rclinit.cpp
index 070be98f..8cdc5bcb 100644
--- a/src/common/rclinit.cpp
+++ b/src/common/rclinit.cpp
@@ -37,6 +37,7 @@
 #include "unac.h"
 #include "smallut.h"
 #include "execmd.h"
+#include "textsplit.h"
 
 std::thread::id mainthread_id;
 
@@ -273,6 +274,8 @@ RclConfig *recollinit(int flags,
 	return 0;
     }
 
+    TextSplit::staticConfInit(config);
+    
     // Retrieve the log file name and level. Daemon and batch indexing
     // processes may use specific values, else fall back on common
     // ones.
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index f4fee2db..755ef5ce 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2004 J.F.Dockes
+/* Copyright (C) 2004-2019 J.F.Dockes
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation; either version 2 of the License, or
@@ -32,6 +32,7 @@
 #include "utf8iter.h"
 #include "uproplist.h"
 #include "smallut.h"
+#include "rclconfig.h"
 
 // Decide if we treat katakana as western scripts, splitting into
 // words instead of n-grams. This is not absurd (katakana is a kind of
@@ -137,14 +138,6 @@ public:
 };
 static const CharClassInit charClassInitInstance;
 
-void TextSplit::backslashAsLetter(bool on) {
-    if (on) {
-        charclasses[int('\\')] = A_LLETTER;
-    } else {
-        charclasses[int('\\')] = SPACE;
-    }
-}
-
 static inline int whatcc(unsigned int c)
 {
     if (c <= 127) {
@@ -251,10 +244,47 @@ bool TextSplit::isKATAKANA(int c)
 // which has its span reader causing a word break)
 enum CharSpanClass {CSC_CJK, CSC_KATAKANA, CSC_OTHER};
 
-bool          TextSplit::o_processCJK = true;
-unsigned int  TextSplit::o_CJKNgramLen = 2;
-bool          TextSplit::o_noNumbers = false;
-bool          TextSplit::o_deHyphenate = false;
+bool          TextSplit::o_processCJK{true}; // Special CJK processing ("nocjk" turns it off)
+unsigned int  TextSplit::o_CJKNgramLen{2}; // CJK n-gram length ("cjkngramlen")
+bool          TextSplit::o_noNumbers{false}; // Set from "nonumbers"
+bool          TextSplit::o_deHyphenate{false}; // For [co-worker], also emit [coworker]
+int           TextSplit::o_maxWordLength{40}; // Set from "maxtermlength", tested in emitterm()
+static const int o_CJKMaxNgramLen{5}; // Upper bound applied to "cjkngramlen"
+
+// Set the static splitter parameters from the values found in the
+// configuration. Meant to be called at program initialization.
+void TextSplit::staticConfInit(RclConfig *config)
+{
+    config->getConfParam("maxtermlength", &o_maxWordLength);
+
+    bool bvalue{false};
+    if (config->getConfParam("nocjk", &bvalue) && bvalue == true) {
+        o_processCJK = false;
+    } else {
+        o_processCJK = true;
+        int ngramlen;
+        if (config->getConfParam("cjkngramlen", &ngramlen)) {
+            // Convert to unsigned before clamping: a negative value then
+            // ends up at the maximum instead of wrapping to a huge length.
+            const unsigned int maxlen = o_CJKMaxNgramLen;
+            const unsigned int len = (unsigned int)ngramlen;
+            o_CJKNgramLen = len <= maxlen ? len : maxlen;
+        }
+    }
+    bvalue = false;
+    if (config->getConfParam("nonumbers", &bvalue)) {
+        o_noNumbers = bvalue;
+    }
+    bvalue = false;
+    if (config->getConfParam("dehyphenate", &bvalue)) {
+        o_deHyphenate = bvalue;
+    }
+    bvalue = false;
+    if (config->getConfParam("backslashasletter", &bvalue)) {
+        // Backslash is a letter when the option is on, else a separator.
+        charclasses[int('\\')] = bvalue ? A_LLETTER : SPACE;
+    }
+}
 
 // Final term checkpoint: do some checking (the kind which is simpler
 // to do here than in the main loop), then send term to our client.
@@ -272,7 +302,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
 	m_stats.newsamp(m_wordChars);
 #endif
 
-    if (l > 0 && l < m_maxWordLength) {
+    if (l > 0 && l <= o_maxWordLength) {
 	// 1 byte word: we index single ascii letters and digits, but
 	// nothing else. We might want to turn this into a test for a
 	// single utf8 character instead ?
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index 5bc7964a..853cb936 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -23,6 +23,7 @@
 #include <vector>
 
 class Utf8Iter;
+class RclConfig;
 
 /** 
  * Split text into words. 
@@ -32,38 +33,6 @@ class Utf8Iter;
  */
 class TextSplit {
 public:
-    // Should we activate special processing of Chinese characters ? This
-    // needs a little more cpu, so it can be turned off globally. This is set
-    // by rclconfig, changing it means reindexing
-    static bool o_processCJK;
-    static unsigned int  o_CJKNgramLen;
-    static const unsigned int o_CJKMaxNgramLen =  5;
-    static void cjkProcessing(bool onoff, unsigned int ngramlen = 2) 
-    {
-	o_processCJK = onoff;
-	o_CJKNgramLen = ngramlen <= o_CJKMaxNgramLen ? 
-	    ngramlen : o_CJKMaxNgramLen;
-    }
-
-    // Are we indexing numbers ? Set by rclconfig. Change needs reindex
-    static bool o_noNumbers;
-    static void noNumbers()
-    {
-	o_noNumbers = true;
-    }
-
-    // Given [co-worker] as input, do we also generate [coworker] ?
-    // Set by rclconfig
-    static bool o_deHyphenate;
-    static void deHyphenate(bool on) {
-	o_deHyphenate = on;
-    }
-
-    // Process backslashes as letters? Default is off, but it may be
-    // useful for searching for tex commands. Config variable:
-    // backslashasletter
-    static void backslashAsLetter(bool on);
-    
     enum Flags {
         // Default: will return spans and words (a_b, a, b)
         TXTS_NONE = 0, 
@@ -79,11 +48,13 @@ public:
     };
     
     TextSplit(Flags flags = Flags(TXTS_NONE))
-	: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
-    {
-    }
+	: m_flags(flags) {}
     virtual ~TextSplit() {}
 
+    /** Call at program initialization to read non-default values from the
+        configuration */
+    static void staticConfInit(RclConfig *config);
+    
     /** Split text, emit words and positions. */
     virtual bool text_to_words(const std::string &in);
 
@@ -97,8 +68,7 @@ public:
     /** Called when we encounter formfeed \f 0x0c. Override to use the event.
      * Mostly or exclusively used with pdftoxx output. Other filters mostly 
      * just don't know about pages. */
-    virtual void newpage(int /*pos*/) {
-    }
+    virtual void newpage(int /*pos*/) {}
 
     // Static utility functions:
 
@@ -184,8 +154,13 @@ public:
 #endif // TEXTSPLIT_STATS
 
 private:
+    static bool o_processCJK; // true
+    static bool o_noNumbers;  // false
+    static bool o_deHyphenate; // false
+    static unsigned int o_CJKNgramLen; // 2
+    static int o_maxWordLength; // 40
+
     Flags         m_flags;
-    int           m_maxWordLength;
 
     // Current span. Might be jf.dockes@wanadoo.f
     std::string        m_span; 
@@ -206,7 +181,7 @@ private:
 
     // It may happen that our cleanup would result in emitting the
     // same term twice. We try to avoid this
-    int           m_prevpos;
+    int           m_prevpos{-1};
     int           m_prevlen;
 
 #ifdef TEXTSPLIT_STATS