Simplify initialization by moving static config textsplit init from rclconfig to textsplit

This commit is contained in:
Jean-Francois Dockes 2019-02-01 09:09:15 +01:00
parent 04f3449f99
commit b1ff34407d
4 changed files with 62 additions and 80 deletions

View File

@ -47,7 +47,6 @@
#include "conftree.h"
#include "log.h"
#include "smallut.h"
#include "textsplit.h"
#include "readfile.h"
#include "fstreewalk.h"
#include "cpuconf.h"
@ -394,32 +393,7 @@ bool RclConfig::updateMainConfig()
setKeyDir(cstr_null);
// TextSplit customization
bool bvalue = false;
if (getConfParam("nocjk", &bvalue) && bvalue == true) {
TextSplit::cjkProcessing(false);
} else {
int ngramlen;
if (getConfParam("cjkngramlen", &ngramlen)) {
TextSplit::cjkProcessing(true, (unsigned int)ngramlen);
} else {
TextSplit::cjkProcessing(true);
}
}
bvalue = false;
if (getConfParam("nonumbers", &bvalue) && bvalue == true) {
TextSplit::noNumbers();
}
bvalue = false;
if (getConfParam("dehyphenate", &bvalue)) {
TextSplit::deHyphenate(bvalue);
}
bvalue = false;
if (getConfParam("backslashasletter", &bvalue)) {
TextSplit::backslashAsLetter(bvalue);
}
bvalue = true;
bool bvalue = true;
if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
FsTreeWalker::setNoFnmPathname();
}

View File

@ -37,6 +37,7 @@
#include "unac.h"
#include "smallut.h"
#include "execmd.h"
#include "textsplit.h"
std::thread::id mainthread_id;
@ -273,6 +274,8 @@ RclConfig *recollinit(int flags,
return 0;
}
TextSplit::staticConfInit(config);
// Retrieve the log file name and level. Daemon and batch indexing
// processes may use specific values, else fall back on common
// ones.

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2004 J.F.Dockes
/* Copyright (C) 2004-2019 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -32,6 +32,7 @@
#include "utf8iter.h"
#include "uproplist.h"
#include "smallut.h"
#include "rclconfig.h"
// Decide if we treat katakana as western scripts, splitting into
// words instead of n-grams. This is not absurd (katakana is a kind of
@ -137,14 +138,6 @@ public:
};
static const CharClassInit charClassInitInstance;
// Choose how the backslash character is classified by the splitter:
// as A_LLETTER when 'on' is true (backslashes are processed as
// letters), as SPACE otherwise.
void TextSplit::backslashAsLetter(bool on)
{
    const int backslash = int('\\');
    if (!on) {
        charclasses[backslash] = SPACE;
        return;
    }
    charclasses[backslash] = A_LLETTER;
}
static inline int whatcc(unsigned int c)
{
if (c <= 127) {
@ -251,10 +244,47 @@ bool TextSplit::isKATAKANA(int c)
// which has its span reader causing a word break)
enum CharSpanClass {CSC_CJK, CSC_KATAKANA, CSC_OTHER};
bool TextSplit::o_processCJK = true;
unsigned int TextSplit::o_CJKNgramLen = 2;
bool TextSplit::o_noNumbers = false;
bool TextSplit::o_deHyphenate = false;
// Definitions and built-in defaults for the splitter's static settings.
// TextSplit::staticConfInit() overrides them from the configuration.

// Special (n-gram) processing of CJK text is on unless "nocjk" is set.
bool TextSplit::o_processCJK{true};
// N-gram length used for CJK text ("cjkngramlen").
unsigned int TextSplit::o_CJKNgramLen{2};
// When true, numbers are not indexed ("nonumbers").
bool TextSplit::o_noNumbers{false};
// Given [co-worker] as input, also generate [coworker] ("dehyphenate").
bool TextSplit::o_deHyphenate{false};
// Longest term length accepted by emitterm() ("maxtermlength").
int TextSplit::o_maxWordLength{40};
// Upper bound applied to the configured CJK n-gram length.
static const int o_CJKMaxNgramLen{5};
/**
 * Read the splitter's static (class-wide) settings from the configuration.
 *
 * Parameters consulted: "maxtermlength", "nocjk", "cjkngramlen",
 * "nonumbers", "dehyphenate" and "backslashasletter". A static setting
 * is only assigned here when the corresponding getConfParam() call
 * reports success; otherwise its built-in default is kept.
 *
 * @param config configuration object to query. Must not be null.
 */
void TextSplit::staticConfInit(RclConfig *config)
{
    // NOTE(review): presumably getConfParam() leaves the target untouched
    // when the parameter is absent, so the default survives -- confirm.
    config->getConfParam("maxtermlength", &o_maxWordLength);

    bool bvalue{false};
    if (config->getConfParam("nocjk", &bvalue) && bvalue == true) {
        o_processCJK = false;
    } else {
        o_processCJK = true;
        int ngramlen{0};
        // Only accept strictly positive lengths: a negative value would
        // wrap to a huge number when converted to unsigned, and a zero
        // length n-gram is meaningless. The default is kept in those cases.
        if (config->getConfParam("cjkngramlen", &ngramlen) && ngramlen > 0) {
            o_CJKNgramLen = static_cast<unsigned int>(
                ngramlen <= o_CJKMaxNgramLen ? ngramlen : o_CJKMaxNgramLen);
        }
    }

    bvalue = false;
    if (config->getConfParam("nonumbers", &bvalue)) {
        o_noNumbers = bvalue;
    }

    bvalue = false;
    if (config->getConfParam("dehyphenate", &bvalue)) {
        o_deHyphenate = bvalue;
    }

    bvalue = false;
    if (config->getConfParam("backslashasletter", &bvalue)) {
        // Both values must have an effect: classify the backslash as a
        // letter when the option is enabled, as a separator when disabled.
        if (bvalue) {
            charclasses[int('\\')] = A_LLETTER;
        } else {
            charclasses[int('\\')] = SPACE;
        }
    }
}
// Final term checkpoint: do some checking (the kind which is simpler
// to do here than in the main loop), then send term to our client.
@ -272,7 +302,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
m_stats.newsamp(m_wordChars);
#endif
if (l > 0 && l < m_maxWordLength) {
if (l > 0 && l <= o_maxWordLength) {
// 1 byte word: we index single ascii letters and digits, but
// nothing else. We might want to turn this into a test for a
// single utf8 character instead ?

View File

@ -23,6 +23,7 @@
#include <vector>
class Utf8Iter;
class RclConfig;
/**
* Split text into words.
@ -32,38 +33,6 @@ class Utf8Iter;
*/
class TextSplit {
public:
// Should we activate special processing of Chinese characters ? This
// needs a little more cpu, so it can be turned off globally. This is set
// by rclconfig, changing it means reindexing
static bool o_processCJK;
static unsigned int o_CJKNgramLen;
static const unsigned int o_CJKMaxNgramLen = 5;
/** Enable or disable CJK processing and set the n-gram length, which is
 *  capped at o_CJKMaxNgramLen. */
static void cjkProcessing(bool onoff, unsigned int ngramlen = 2)
{
    if (ngramlen > o_CJKMaxNgramLen) {
        ngramlen = o_CJKMaxNgramLen;
    }
    o_CJKNgramLen = ngramlen;
    o_processCJK = onoff;
}
// Are we indexing numbers ? Set by rclconfig. Change needs reindex
static bool o_noNumbers;
/** Switch number indexing off for the whole process (sets o_noNumbers). */
static void noNumbers() { o_noNumbers = true; }
// Given [co-worker] as input, do we also generate [coworker] ?
// Set by rclconfig
static bool o_deHyphenate;
/** Record in o_deHyphenate whether the dehyphenated form of a hyphenated
 *  word should also be generated. */
static void deHyphenate(bool on)
{
    o_deHyphenate = on;
}
// Process backslashes as letters? Default is off, but it may be
// useful for searching for tex commands. Config variable:
// backslashasletter
static void backslashAsLetter(bool on);
enum Flags {
// Default: will return spans and words (a_b, a, b)
TXTS_NONE = 0,
@ -79,11 +48,13 @@ public:
};
TextSplit(Flags flags = Flags(TXTS_NONE))
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
{
}
: m_flags(flags) {}
virtual ~TextSplit() {}
/** Call at program initialization to read non default values from the
configuration */
static void staticConfInit(RclConfig *config);
/** Split text, emit words and positions. */
virtual bool text_to_words(const std::string &in);
@ -97,8 +68,7 @@ public:
/** Called when we encounter formfeed \f 0x0c. Override to use the event.
* Mostly or exclusively used with pdftoxx output. Other filters mostly
* just don't know about pages. */
virtual void newpage(int /*pos*/) {
}
virtual void newpage(int /*pos*/) {}
// Static utility functions:
@ -184,8 +154,13 @@ public:
#endif // TEXTSPLIT_STATS
private:
static bool o_processCJK; // true
static bool o_noNumbers; // false
static bool o_deHyphenate; // false
static unsigned int o_CJKNgramLen; // 2
static int o_maxWordLength; // 40
Flags m_flags;
int m_maxWordLength;
// Current span. Might be jf.dockes@wanadoo.f
std::string m_span;
@ -206,7 +181,7 @@ private:
// It may happen that our cleanup would result in emitting the
// same term twice. We try to avoid this
int m_prevpos;
int m_prevpos{-1};
int m_prevlen;
#ifdef TEXTSPLIT_STATS