Simplify initialization by moving static config textsplit init from rclconfig to textsplit
This commit is contained in:
parent
04f3449f99
commit
b1ff34407d
@ -47,7 +47,6 @@
|
|||||||
#include "conftree.h"
|
#include "conftree.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "textsplit.h"
|
|
||||||
#include "readfile.h"
|
#include "readfile.h"
|
||||||
#include "fstreewalk.h"
|
#include "fstreewalk.h"
|
||||||
#include "cpuconf.h"
|
#include "cpuconf.h"
|
||||||
@ -394,32 +393,7 @@ bool RclConfig::updateMainConfig()
|
|||||||
|
|
||||||
setKeyDir(cstr_null);
|
setKeyDir(cstr_null);
|
||||||
|
|
||||||
// Texsplit customization
|
bool bvalue = true;
|
||||||
bool bvalue = false;
|
|
||||||
if (getConfParam("nocjk", &bvalue) && bvalue == true) {
|
|
||||||
TextSplit::cjkProcessing(false);
|
|
||||||
} else {
|
|
||||||
int ngramlen;
|
|
||||||
if (getConfParam("cjkngramlen", &ngramlen)) {
|
|
||||||
TextSplit::cjkProcessing(true, (unsigned int)ngramlen);
|
|
||||||
} else {
|
|
||||||
TextSplit::cjkProcessing(true);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
bvalue = false;
|
|
||||||
if (getConfParam("nonumbers", &bvalue) && bvalue == true) {
|
|
||||||
TextSplit::noNumbers();
|
|
||||||
}
|
|
||||||
bvalue = false;
|
|
||||||
if (getConfParam("dehyphenate", &bvalue)) {
|
|
||||||
TextSplit::deHyphenate(bvalue);
|
|
||||||
}
|
|
||||||
bvalue = false;
|
|
||||||
if (getConfParam("backslashasletter", &bvalue)) {
|
|
||||||
TextSplit::backslashAsLetter(bvalue);
|
|
||||||
}
|
|
||||||
|
|
||||||
bvalue = true;
|
|
||||||
if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
|
if (getConfParam("skippedPathsFnmPathname", &bvalue) && bvalue == false) {
|
||||||
FsTreeWalker::setNoFnmPathname();
|
FsTreeWalker::setNoFnmPathname();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -37,6 +37,7 @@
|
|||||||
#include "unac.h"
|
#include "unac.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "execmd.h"
|
#include "execmd.h"
|
||||||
|
#include "textsplit.h"
|
||||||
|
|
||||||
std::thread::id mainthread_id;
|
std::thread::id mainthread_id;
|
||||||
|
|
||||||
@ -273,6 +274,8 @@ RclConfig *recollinit(int flags,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TextSplit::staticConfInit(config);
|
||||||
|
|
||||||
// Retrieve the log file name and level. Daemon and batch indexing
|
// Retrieve the log file name and level. Daemon and batch indexing
|
||||||
// processes may use specific values, else fall back on common
|
// processes may use specific values, else fall back on common
|
||||||
// ones.
|
// ones.
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
/* Copyright (C) 2004 J.F.Dockes
|
/* Copyright (C) 2004-2019 J.F.Dockes
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License as published by
|
* it under the terms of the GNU General Public License as published by
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -32,6 +32,7 @@
|
|||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
#include "uproplist.h"
|
#include "uproplist.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
|
#include "rclconfig.h"
|
||||||
|
|
||||||
// Decide if we treat katakana as western scripts, splitting into
|
// Decide if we treat katakana as western scripts, splitting into
|
||||||
// words instead of n-grams. This is not absurd (katakana is a kind of
|
// words instead of n-grams. This is not absurd (katakana is a kind of
|
||||||
@ -137,14 +138,6 @@ public:
|
|||||||
};
|
};
|
||||||
static const CharClassInit charClassInitInstance;
|
static const CharClassInit charClassInitInstance;
|
||||||
|
|
||||||
void TextSplit::backslashAsLetter(bool on) {
|
|
||||||
if (on) {
|
|
||||||
charclasses[int('\\')] = A_LLETTER;
|
|
||||||
} else {
|
|
||||||
charclasses[int('\\')] = SPACE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int whatcc(unsigned int c)
|
static inline int whatcc(unsigned int c)
|
||||||
{
|
{
|
||||||
if (c <= 127) {
|
if (c <= 127) {
|
||||||
@ -251,10 +244,47 @@ bool TextSplit::isKATAKANA(int c)
|
|||||||
// which has its span reader causing a word break)
|
// which has its span reader causing a word break)
|
||||||
enum CharSpanClass {CSC_CJK, CSC_KATAKANA, CSC_OTHER};
|
enum CharSpanClass {CSC_CJK, CSC_KATAKANA, CSC_OTHER};
|
||||||
|
|
||||||
bool TextSplit::o_processCJK = true;
|
bool TextSplit::o_processCJK{true};
|
||||||
unsigned int TextSplit::o_CJKNgramLen = 2;
|
unsigned int TextSplit::o_CJKNgramLen{2};
|
||||||
bool TextSplit::o_noNumbers = false;
|
bool TextSplit::o_noNumbers{false};
|
||||||
bool TextSplit::o_deHyphenate = false;
|
bool TextSplit::o_deHyphenate{false};
|
||||||
|
int TextSplit::o_maxWordLength{40};
|
||||||
|
static const int o_CJKMaxNgramLen{5};
|
||||||
|
|
||||||
|
/**
 * Load the text splitter's process-wide settings from the configuration.
 *
 * Reads the following configuration parameters:
 *  - "maxtermlength": stored directly into o_maxWordLength by getConfParam()
 *    (the call's return value is not checked).
 *  - "nocjk": when set and true, CJK processing is switched off; in every
 *    other case it is switched on.
 *  - "cjkngramlen": only consulted when CJK processing is on; the value is
 *    capped at o_CJKMaxNgramLen.
 *  - "nonumbers", "dehyphenate": copied to o_noNumbers / o_deHyphenate when
 *    getConfParam() reports success.
 *  - "backslashasletter": selects the character class used for the
 *    backslash when getConfParam() reports success.
 *
 * @param config configuration object to read from. It is dereferenced
 *        unconditionally, so it must not be null.
 */
void TextSplit::staticConfInit(RclConfig *config)
{
    config->getConfParam("maxtermlength", &o_maxWordLength);

    bool bvalue{false};
    if (config->getConfParam("nocjk", &bvalue) && bvalue == true) {
        o_processCJK = false;
    } else {
        o_processCJK = true;
        int ngramlen{0};
        if (config->getConfParam("cjkngramlen", &ngramlen)) {
            // Clamp in the unsigned domain: a negative configured value
            // would pass a signed "<= max" test and then turn into a huge
            // n-gram length when converted. Converting first makes it
            // exceed the maximum, so it is capped instead.
            const unsigned int requested = static_cast<unsigned int>(ngramlen);
            const unsigned int maxlen =
                static_cast<unsigned int>(o_CJKMaxNgramLen);
            o_CJKNgramLen = requested <= maxlen ? requested : maxlen;
        }
    }

    bvalue = false;
    if (config->getConfParam("nonumbers", &bvalue)) {
        o_noNumbers = bvalue;
    }

    bvalue = false;
    if (config->getConfParam("dehyphenate", &bvalue)) {
        o_deHyphenate = bvalue;
    }

    bvalue = false;
    if (config->getConfParam("backslashasletter", &bvalue)) {
        // Both branches must assign: leaving the "true" case empty would
        // make the option impossible to enable.
        if (bvalue) {
            charclasses[int('\\')] = A_LLETTER;
        } else {
            charclasses[int('\\')] = SPACE;
        }
    }
}
|
||||||
|
|
||||||
// Final term checkpoint: do some checking (the kind which is simpler
|
// Final term checkpoint: do some checking (the kind which is simpler
|
||||||
// to do here than in the main loop), then send term to our client.
|
// to do here than in the main loop), then send term to our client.
|
||||||
@ -272,7 +302,7 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
m_stats.newsamp(m_wordChars);
|
m_stats.newsamp(m_wordChars);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (l > 0 && l < m_maxWordLength) {
|
if (l > 0 && l <= o_maxWordLength) {
|
||||||
// 1 byte word: we index single ascii letters and digits, but
|
// 1 byte word: we index single ascii letters and digits, but
|
||||||
// nothing else. We might want to turn this into a test for a
|
// nothing else. We might want to turn this into a test for a
|
||||||
// single utf8 character instead ?
|
// single utf8 character instead ?
|
||||||
|
|||||||
@ -23,6 +23,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
class Utf8Iter;
|
class Utf8Iter;
|
||||||
|
class RclConfig;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Split text into words.
|
* Split text into words.
|
||||||
@ -32,38 +33,6 @@ class Utf8Iter;
|
|||||||
*/
|
*/
|
||||||
class TextSplit {
|
class TextSplit {
|
||||||
public:
|
public:
|
||||||
// Should we activate special processing of Chinese characters ? This
|
|
||||||
// needs a little more cpu, so it can be turned off globally. This is set
|
|
||||||
// by rclconfig, changing it means reindexing
|
|
||||||
static bool o_processCJK;
|
|
||||||
static unsigned int o_CJKNgramLen;
|
|
||||||
static const unsigned int o_CJKMaxNgramLen = 5;
|
|
||||||
static void cjkProcessing(bool onoff, unsigned int ngramlen = 2)
|
|
||||||
{
|
|
||||||
o_processCJK = onoff;
|
|
||||||
o_CJKNgramLen = ngramlen <= o_CJKMaxNgramLen ?
|
|
||||||
ngramlen : o_CJKMaxNgramLen;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Are we indexing numbers ? Set by rclconfig. Change needs reindex
|
|
||||||
static bool o_noNumbers;
|
|
||||||
static void noNumbers()
|
|
||||||
{
|
|
||||||
o_noNumbers = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Given [co-worker] as input, do we also generate [coworker] ?
|
|
||||||
// Set by rclconfig
|
|
||||||
static bool o_deHyphenate;
|
|
||||||
static void deHyphenate(bool on) {
|
|
||||||
o_deHyphenate = on;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process backslashes as letters? Default is off, but it may be
|
|
||||||
// useful for searching for tex commands. Config variable:
|
|
||||||
// backslashasletter
|
|
||||||
static void backslashAsLetter(bool on);
|
|
||||||
|
|
||||||
enum Flags {
|
enum Flags {
|
||||||
// Default: will return spans and words (a_b, a, b)
|
// Default: will return spans and words (a_b, a, b)
|
||||||
TXTS_NONE = 0,
|
TXTS_NONE = 0,
|
||||||
@ -79,11 +48,13 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
TextSplit(Flags flags = Flags(TXTS_NONE))
|
TextSplit(Flags flags = Flags(TXTS_NONE))
|
||||||
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
: m_flags(flags) {}
|
||||||
{
|
|
||||||
}
|
|
||||||
virtual ~TextSplit() {}
|
virtual ~TextSplit() {}
|
||||||
|
|
||||||
|
/** Call at program initialization to read non default values from the
|
||||||
|
configuration */
|
||||||
|
static void staticConfInit(RclConfig *config);
|
||||||
|
|
||||||
/** Split text, emit words and positions. */
|
/** Split text, emit words and positions. */
|
||||||
virtual bool text_to_words(const std::string &in);
|
virtual bool text_to_words(const std::string &in);
|
||||||
|
|
||||||
@ -97,8 +68,7 @@ public:
|
|||||||
/** Called when we encounter formfeed \f 0x0c. Override to use the event.
|
/** Called when we encounter formfeed \f 0x0c. Override to use the event.
|
||||||
* Mostly or exclusively used with pdftoxx output. Other filters mostly
|
* Mostly or exclusively used with pdftoxx output. Other filters mostly
|
||||||
* just don't know about pages. */
|
* just don't know about pages. */
|
||||||
virtual void newpage(int /*pos*/) {
|
virtual void newpage(int /*pos*/) {}
|
||||||
}
|
|
||||||
|
|
||||||
// Static utility functions:
|
// Static utility functions:
|
||||||
|
|
||||||
@ -184,8 +154,13 @@ public:
|
|||||||
#endif // TEXTSPLIT_STATS
|
#endif // TEXTSPLIT_STATS
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
static bool o_processCJK; // true
|
||||||
|
static bool o_noNumbers; // false
|
||||||
|
static bool o_deHyphenate; // false
|
||||||
|
static unsigned int o_CJKNgramLen; // 2
|
||||||
|
static int o_maxWordLength; // 40
|
||||||
|
|
||||||
Flags m_flags;
|
Flags m_flags;
|
||||||
int m_maxWordLength;
|
|
||||||
|
|
||||||
// Current span. Might be jf.dockes@wanadoo.f
|
// Current span. Might be jf.dockes@wanadoo.f
|
||||||
std::string m_span;
|
std::string m_span;
|
||||||
@ -206,7 +181,7 @@ private:
|
|||||||
|
|
||||||
// It may happen that our cleanup would result in emitting the
|
// It may happen that our cleanup would result in emitting the
|
||||||
// same term twice. We try to avoid this
|
// same term twice. We try to avoid this
|
||||||
int m_prevpos;
|
int m_prevpos{-1};
|
||||||
int m_prevlen;
|
int m_prevlen;
|
||||||
|
|
||||||
#ifdef TEXTSPLIT_STATS
|
#ifdef TEXTSPLIT_STATS
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user