From b1ff34407d039be8875075fce1c3b553f56f9035 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Fri, 1 Feb 2019 09:09:15 +0100
Subject: [PATCH] Simplify initialization by moving static config textsplit init from rclconfig to textsplit

---
Review notes (amendments to the patch as originally posted):
 - TextSplit::staticConfInit() now sets the backslash character class to
   A_LLETTER when "backslashasletter" is true. That branch was empty, so
   enabling the option had no effect, unlike the removed
   TextSplit::backslashAsLetter().
 - "cjkngramlen" is range-checked before being converted to unsigned, so
   that a negative value selects the maximum n-gram length, as it did with
   the removed TextSplit::cjkProcessing(), instead of wrapping around to a
   huge length.
 - Hunk line counts and the diffstat were updated for these changes. The
   textsplit.cpp "index" blob id predates the amendments.

 src/common/rclconfig.cpp | 28 +------------------
 src/common/rclinit.cpp   |  3 +++
 src/common/textsplit.cpp | 63 ++++++++++++++++++++++++++++++++---------
 src/common/textsplit.h   | 53 ++++++++++--------------------------
 4 files changed, 67 insertions(+), 80 deletions(-)

diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp
index ec255624..f65a723c 100644
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@@ -47,7 +47,6 @@
 #include "conftree.h"
 #include "log.h"
 #include "smallut.h"
-#include "textsplit.h"
 #include "readfile.h"
 #include "fstreewalk.h"
 #include "cpuconf.h"
@@ -394,32 +393,7 @@ bool RclConfig::updateMainConfig()
 
     setKeyDir(cstr_null);
 
-    // Texsplit customization
-    bool bvalue = false;
-    if (getConfParam("nocjk", &bvalue) && bvalue == true) {
-        TextSplit::cjkProcessing(false);
-    } else {
-        int ngramlen;
-        if (getConfParam("cjkngramlen", &ngramlen)) {
-            TextSplit::cjkProcessing(true, (unsigned int)ngramlen);
-        } else {
-            TextSplit::cjkProcessing(true);
-        }
-    }
-    bvalue = false;
-    if (getConfParam("nonumbers", &bvalue) && bvalue == true) {
-        TextSplit::noNumbers();
-    }
-    bvalue = false;
-    if (getConfParam("dehyphenate", &bvalue)) {
-        TextSplit::deHyphenate(bvalue);
-    }
-    bvalue = false;
-    if (getConfParam("backslashasletter", &bvalue)) {
-        TextSplit::backslashAsLetter(bvalue);
-    }
-
-    bvalue = true;
+    bool bvalue = true;
     if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
         FsTreeWalker::setNoFnmPathname();
     }
diff --git a/src/common/rclinit.cpp b/src/common/rclinit.cpp
index 070be98f..8cdc5bcb 100644
--- a/src/common/rclinit.cpp
+++ b/src/common/rclinit.cpp
@@ -37,6 +37,7 @@
 #include "unac.h"
 #include "smallut.h"
 #include "execmd.h"
+#include "textsplit.h"
 
 std::thread::id mainthread_id;
 
@@ -273,6 +274,8 @@ RclConfig *recollinit(int flags,
         return 0;
     }
 
+    TextSplit::staticConfInit(config);
+
     // Retrieve the log file name and level. Daemon and batch indexing
     // processes may use specific values, else fall back on common
     // ones.
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index f4fee2db..755ef5ce 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2004 J.F.Dockes
+/* Copyright (C) 2004-2019 J.F.Dockes
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -32,6 +32,7 @@
 #include "utf8iter.h"
 #include "uproplist.h"
 #include "smallut.h"
+#include "rclconfig.h"
 
 // Decide if we treat katakana as western scripts, splitting into
 // words instead of n-grams. This is not absurd (katakana is a kind of
@@ -137,14 +138,6 @@ public:
 };
 static const CharClassInit charClassInitInstance;
 
-void TextSplit::backslashAsLetter(bool on) {
-    if (on) {
-        charclasses[int('\\')] = A_LLETTER;
-    } else {
-        charclasses[int('\\')] = SPACE;
-    }
-}
-
 static inline int whatcc(unsigned int c)
 {
     if (c <= 127) {
@@ -251,10 +244,52 @@ bool TextSplit::isKATAKANA(int c)
 // which has its span reader causing a word break)
 enum CharSpanClass {CSC_CJK, CSC_KATAKANA, CSC_OTHER};
 
-bool TextSplit::o_processCJK = true;
-unsigned int TextSplit::o_CJKNgramLen = 2;
-bool TextSplit::o_noNumbers = false;
-bool TextSplit::o_deHyphenate = false;
+bool TextSplit::o_processCJK{true};
+unsigned int TextSplit::o_CJKNgramLen{2};
+bool TextSplit::o_noNumbers{false};
+bool TextSplit::o_deHyphenate{false};
+int TextSplit::o_maxWordLength{40};
+static const int o_CJKMaxNgramLen{5};
+
+void TextSplit::staticConfInit(RclConfig *config)
+{
+    config->getConfParam("maxtermlength", &o_maxWordLength);
+
+    bool bvalue{false};
+    if (config->getConfParam("nocjk", &bvalue) && bvalue == true) {
+        o_processCJK = false;
+    } else {
+        o_processCJK = true;
+        int ngramlen;
+        if (config->getConfParam("cjkngramlen", &ngramlen)) {
+            // Check the range before converting to unsigned: any value
+            // outside [0, o_CJKMaxNgramLen] selects the maximum length.
+            o_CJKNgramLen = (ngramlen >= 0 && ngramlen <= o_CJKMaxNgramLen) ?
+                (unsigned int)ngramlen : (unsigned int)o_CJKMaxNgramLen;
+        }
+    }
+
+    bvalue = false;
+    if (config->getConfParam("nonumbers", &bvalue)) {
+        o_noNumbers = bvalue;
+    }
+
+    bvalue = false;
+    if (config->getConfParam("dehyphenate", &bvalue)) {
+        o_deHyphenate = bvalue;
+    }
+
+    // Backslash is a letter (useful when searching for TeX commands) if
+    // "backslashasletter" is set to true, a separator if set to false.
+    bvalue = false;
+    if (config->getConfParam("backslashasletter", &bvalue)) {
+        if (bvalue) {
+            charclasses[int('\\')] = A_LLETTER;
+        } else {
+            charclasses[int('\\')] = SPACE;
+        }
+    }
+}
 
 // Final term checkpoint: do some checking (the kind which is simpler
 // to do here than in the main loop), then send term to our client.
@@ -272,7 +307,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
     m_stats.newsamp(m_wordChars);
 #endif
 
-    if (l > 0 && l < m_maxWordLength) {
+    if (l > 0 && l <= o_maxWordLength) {
         // 1 byte word: we index single ascii letters and digits, but
         // nothing else. We might want to turn this into a test for a
         // single utf8 character instead ?
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index 5bc7964a..853cb936 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -23,6 +23,7 @@
 #include
 
 class Utf8Iter;
+class RclConfig;
 
 /**
  * Split text into words.
@@ -32,38 +33,6 @@ class Utf8Iter;
  */
 class TextSplit {
 public:
-    // Should we activate special processing of Chinese characters ? This
-    // needs a little more cpu, so it can be turned off globally. This is set
-    // by rclconfig, changing it means reindexing
-    static bool o_processCJK;
-    static unsigned int o_CJKNgramLen;
-    static const unsigned int o_CJKMaxNgramLen = 5;
-    static void cjkProcessing(bool onoff, unsigned int ngramlen = 2)
-    {
-        o_processCJK = onoff;
-        o_CJKNgramLen = ngramlen <= o_CJKMaxNgramLen ?
-            ngramlen : o_CJKMaxNgramLen;
-    }
-
-    // Are we indexing numbers ? Set by rclconfig. Change needs reindex
-    static bool o_noNumbers;
-    static void noNumbers()
-    {
-        o_noNumbers = true;
-    }
-
-    // Given [co-worker] as input, do we also generate [coworker] ?
-    // Set by rclconfig
-    static bool o_deHyphenate;
-    static void deHyphenate(bool on) {
-        o_deHyphenate = on;
-    }
-
-    // Process backslashes as letters? Default is off, but it may be
-    // useful for searching for tex commands. Config variable:
-    // backslashasletter
-    static void backslashAsLetter(bool on);
-
     enum Flags {
         // Default: will return spans and words (a_b, a, b)
         TXTS_NONE = 0,
@@ -79,11 +48,13 @@ public:
     };
 
     TextSplit(Flags flags = Flags(TXTS_NONE))
-        : m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
-    {
-    }
+        : m_flags(flags) {}
     virtual ~TextSplit() {}
 
+    /** Call at program initialization to read non default values from the
+        configuration */
+    static void staticConfInit(RclConfig *config);
+
     /** Split text, emit words and positions. */
     virtual bool text_to_words(const std::string &in);
 
@@ -97,8 +68,7 @@ public:
     /** Called when we encounter formfeed \f 0x0c. Override to use the event.
      * Mostly or exclusively used with pdftoxx output. Other filters mostly
      * just don't know about pages. */
-    virtual void newpage(int /*pos*/) {
-    }
+    virtual void newpage(int /*pos*/) {}
 
     // Static utility functions:
 
@@ -184,8 +154,13 @@ public:
 #endif // TEXTSPLIT_STATS
 
 private:
+    static bool o_processCJK; // true
+    static bool o_noNumbers; // false
+    static bool o_deHyphenate; // false
+    static unsigned int o_CJKNgramLen; // 2
+    static int o_maxWordLength; // 40
+
     Flags m_flags;
-    int m_maxWordLength;
 
     // Current span. Might be jf.dockes@wanadoo.f
     std::string m_span;
@@ -206,7 +181,7 @@ private:
 
     // It may happen that our cleanup would result in emitting the
     // same term twice. We try to avoid this
-    int m_prevpos;
+    int m_prevpos{-1};
     int m_prevlen;
 
 #ifdef TEXTSPLIT_STATS