Simplify initialization by moving static config textsplit init from rclconfig to textsplit
This commit is contained in:
parent
04f3449f99
commit
b1ff34407d
@ -47,7 +47,6 @@
|
||||
#include "conftree.h"
|
||||
#include "log.h"
|
||||
#include "smallut.h"
|
||||
#include "textsplit.h"
|
||||
#include "readfile.h"
|
||||
#include "fstreewalk.h"
|
||||
#include "cpuconf.h"
|
||||
@ -394,32 +393,7 @@ bool RclConfig::updateMainConfig()
|
||||
|
||||
setKeyDir(cstr_null);
|
||||
|
||||
// TextSplit customization
|
||||
bool bvalue = false;
|
||||
if (getConfParam("nocjk", &bvalue) && bvalue == true) {
|
||||
TextSplit::cjkProcessing(false);
|
||||
} else {
|
||||
int ngramlen;
|
||||
if (getConfParam("cjkngramlen", &ngramlen)) {
|
||||
TextSplit::cjkProcessing(true, (unsigned int)ngramlen);
|
||||
} else {
|
||||
TextSplit::cjkProcessing(true);
|
||||
}
|
||||
}
|
||||
bvalue = false;
|
||||
if (getConfParam("nonumbers", &bvalue) && bvalue == true) {
|
||||
TextSplit::noNumbers();
|
||||
}
|
||||
bvalue = false;
|
||||
if (getConfParam("dehyphenate", &bvalue)) {
|
||||
TextSplit::deHyphenate(bvalue);
|
||||
}
|
||||
bvalue = false;
|
||||
if (getConfParam("backslashasletter", &bvalue)) {
|
||||
TextSplit::backslashAsLetter(bvalue);
|
||||
}
|
||||
|
||||
bvalue = true;
|
||||
bool bvalue = true;
|
||||
if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
|
||||
FsTreeWalker::setNoFnmPathname();
|
||||
}
|
||||
|
||||
@ -37,6 +37,7 @@
|
||||
#include "unac.h"
|
||||
#include "smallut.h"
|
||||
#include "execmd.h"
|
||||
#include "textsplit.h"
|
||||
|
||||
std::thread::id mainthread_id;
|
||||
|
||||
@ -273,6 +274,8 @@ RclConfig *recollinit(int flags,
|
||||
return 0;
|
||||
}
|
||||
|
||||
TextSplit::staticConfInit(config);
|
||||
|
||||
// Retrieve the log file name and level. Daemon and batch indexing
|
||||
// processes may use specific values, else fall back on common
|
||||
// ones.
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* Copyright (C) 2004 J.F.Dockes
|
||||
/* Copyright (C) 2004-2019 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
@ -32,6 +32,7 @@
|
||||
#include "utf8iter.h"
|
||||
#include "uproplist.h"
|
||||
#include "smallut.h"
|
||||
#include "rclconfig.h"
|
||||
|
||||
// Decide if we treat katakana as western scripts, splitting into
|
||||
// words instead of n-grams. This is not absurd (katakana is a kind of
|
||||
@ -137,14 +138,6 @@ public:
|
||||
};
|
||||
static const CharClassInit charClassInitInstance;
|
||||
|
||||
void TextSplit::backslashAsLetter(bool on) {
|
||||
if (on) {
|
||||
charclasses[int('\\')] = A_LLETTER;
|
||||
} else {
|
||||
charclasses[int('\\')] = SPACE;
|
||||
}
|
||||
}
|
||||
|
||||
static inline int whatcc(unsigned int c)
|
||||
{
|
||||
if (c <= 127) {
|
||||
@ -251,10 +244,47 @@ bool TextSplit::isKATAKANA(int c)
|
||||
// which has its span reader causing a word break)
|
||||
enum CharSpanClass {CSC_CJK, CSC_KATAKANA, CSC_OTHER};
|
||||
|
||||
bool TextSplit::o_processCJK = true;
|
||||
unsigned int TextSplit::o_CJKNgramLen = 2;
|
||||
bool TextSplit::o_noNumbers = false;
|
||||
bool TextSplit::o_deHyphenate = false;
|
||||
bool TextSplit::o_processCJK{true};
|
||||
unsigned int TextSplit::o_CJKNgramLen{2};
|
||||
bool TextSplit::o_noNumbers{false};
|
||||
bool TextSplit::o_deHyphenate{false};
|
||||
int TextSplit::o_maxWordLength{40};
|
||||
static const int o_CJKMaxNgramLen{5};
|
||||
|
||||
/**
 * Load the process-wide TextSplit settings from the configuration.
 *
 * Reads the following parameters and stores them in the static members:
 *  - maxtermlength     -> o_maxWordLength
 *  - nocjk             -> o_processCJK (true disables CJK processing)
 *  - cjkngramlen       -> o_CJKNgramLen (clamped to [1, o_CJKMaxNgramLen])
 *  - nonumbers         -> o_noNumbers
 *  - dehyphenate       -> o_deHyphenate
 *  - backslashasletter -> character class used for '\\'
 *
 * Parameters for which getConfParam() reports failure leave the
 * corresponding setting untouched.
 *
 * @param config configuration object to query. A null pointer is ignored
 *               and leaves all settings unchanged.
 */
void TextSplit::staticConfInit(RclConfig *config)
{
    if (config == nullptr) {
        return;
    }

    // Return value not checked, as in the original code.
    // NOTE(review): presumably getConfParam() leaves the target untouched
    // when the parameter is not set -- confirm against RclConfig.
    config->getConfParam("maxtermlength", &o_maxWordLength);

    bool bvalue{false};
    if (config->getConfParam("nocjk", &bvalue) && bvalue) {
        o_processCJK = false;
    } else {
        o_processCJK = true;
        int ngramlen{0};
        // Only accept strictly positive lengths: converting a negative int
        // to unsigned would yield a huge n-gram length instead of being
        // limited by o_CJKMaxNgramLen.
        if (config->getConfParam("cjkngramlen", &ngramlen) && ngramlen > 0) {
            o_CJKNgramLen = static_cast<unsigned int>(
                ngramlen <= o_CJKMaxNgramLen ? ngramlen : o_CJKMaxNgramLen);
        }
    }

    bvalue = false;
    if (config->getConfParam("nonumbers", &bvalue)) {
        o_noNumbers = bvalue;
    }

    bvalue = false;
    if (config->getConfParam("dehyphenate", &bvalue)) {
        o_deHyphenate = bvalue;
    }

    bvalue = false;
    if (config->getConfParam("backslashasletter", &bvalue)) {
        // Treat the backslash either as a letter (e.g. to search for TeX
        // commands) or as a separator.
        charclasses[int('\\')] = bvalue ? A_LLETTER : SPACE;
    }
}
|
||||
|
||||
// Final term checkpoint: do some checking (the kind which is simpler
|
||||
// to do here than in the main loop), then send term to our client.
|
||||
@ -272,7 +302,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||
m_stats.newsamp(m_wordChars);
|
||||
#endif
|
||||
|
||||
if (l > 0 && l < m_maxWordLength) {
|
||||
if (l > 0 && l <= o_maxWordLength) {
|
||||
// 1 byte word: we index single ascii letters and digits, but
|
||||
// nothing else. We might want to turn this into a test for a
|
||||
// single utf8 character instead ?
|
||||
|
||||
@ -23,6 +23,7 @@
|
||||
#include <vector>
|
||||
|
||||
class Utf8Iter;
|
||||
class RclConfig;
|
||||
|
||||
/**
|
||||
* Split text into words.
|
||||
@ -32,38 +33,6 @@ class Utf8Iter;
|
||||
*/
|
||||
class TextSplit {
|
||||
public:
|
||||
// Should we activate special processing of Chinese characters ? This
|
||||
// needs a little more cpu, so it can be turned off globally. This is set
|
||||
// by rclconfig, changing it means reindexing
|
||||
static bool o_processCJK;
|
||||
static unsigned int o_CJKNgramLen;
|
||||
static const unsigned int o_CJKMaxNgramLen = 5;
|
||||
static void cjkProcessing(bool onoff, unsigned int ngramlen = 2)
|
||||
{
|
||||
o_processCJK = onoff;
|
||||
o_CJKNgramLen = ngramlen <= o_CJKMaxNgramLen ?
|
||||
ngramlen : o_CJKMaxNgramLen;
|
||||
}
|
||||
|
||||
// Are we indexing numbers ? Set by rclconfig. Change needs reindex
|
||||
static bool o_noNumbers;
|
||||
static void noNumbers()
|
||||
{
|
||||
o_noNumbers = true;
|
||||
}
|
||||
|
||||
// Given [co-worker] as input, do we also generate [coworker] ?
|
||||
// Set by rclconfig
|
||||
static bool o_deHyphenate;
|
||||
static void deHyphenate(bool on) {
|
||||
o_deHyphenate = on;
|
||||
}
|
||||
|
||||
// Process backslashes as letters? Default is off, but it may be
|
||||
// useful for searching for tex commands. Config variable:
|
||||
// backslashasletter
|
||||
static void backslashAsLetter(bool on);
|
||||
|
||||
enum Flags {
|
||||
// Default: will return spans and words (a_b, a, b)
|
||||
TXTS_NONE = 0,
|
||||
@ -79,11 +48,13 @@ public:
|
||||
};
|
||||
|
||||
TextSplit(Flags flags = Flags(TXTS_NONE))
|
||||
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
||||
{
|
||||
}
|
||||
: m_flags(flags) {}
|
||||
virtual ~TextSplit() {}
|
||||
|
||||
/** Call at program initialization to read non default values from the
|
||||
configuration */
|
||||
static void staticConfInit(RclConfig *config);
|
||||
|
||||
/** Split text, emit words and positions. */
|
||||
virtual bool text_to_words(const std::string &in);
|
||||
|
||||
@ -97,8 +68,7 @@ public:
|
||||
/** Called when we encounter formfeed \f 0x0c. Override to use the event.
|
||||
* Mostly or exclusively used with pdftoxx output. Other filters mostly
|
||||
* just don't know about pages. */
|
||||
virtual void newpage(int /*pos*/) {
|
||||
}
|
||||
virtual void newpage(int /*pos*/) {}
|
||||
|
||||
// Static utility functions:
|
||||
|
||||
@ -184,8 +154,13 @@ public:
|
||||
#endif // TEXTSPLIT_STATS
|
||||
|
||||
private:
|
||||
static bool o_processCJK; // true
|
||||
static bool o_noNumbers; // false
|
||||
static bool o_deHyphenate; // false
|
||||
static unsigned int o_CJKNgramLen; // 2
|
||||
static int o_maxWordLength; // 40
|
||||
|
||||
Flags m_flags;
|
||||
int m_maxWordLength;
|
||||
|
||||
// Current span. Might be jf.dockes@wanadoo.f
|
||||
std::string m_span;
|
||||
@ -206,7 +181,7 @@ private:
|
||||
|
||||
// It may happen that our cleanup would result in emitting the
|
||||
// same term twice. We try to avoid this
|
||||
int m_prevpos;
|
||||
int m_prevpos{-1};
|
||||
int m_prevlen;
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user