From a6a2abd251ab30d394ae4c7392dff5b5c5365985 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 1 Jun 2022 09:37:51 +0200 Subject: [PATCH] Make max words in span a parameter (it was hard-coded at 6 which remains the default value) --- src/common/textsplit.cpp | 7 +++++-- src/common/textsplit.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index be61c917..075a7489 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -81,6 +81,8 @@ unsigned int TextSplit::o_CJKNgramLen{2}; bool TextSplit::o_noNumbers{false}; bool TextSplit::o_deHyphenate{false}; int TextSplit::o_maxWordLength{40}; +int TextSplit::o_maxWordsInSpan{6}; + static const int o_CJKMaxNgramLen{5}; bool o_exthangultagger{false}; @@ -90,6 +92,7 @@ static char underscoreatend = '_'; void TextSplit::staticConfInit(RclConfig *config) { config->getConfParam("maxtermlength", &o_maxWordLength); + config->getConfParam("maxwordsinspan", &o_maxWordsInSpan); bool bvalue{false}; if (config->getConfParam("nocjk", &bvalue) && bvalue == true) { @@ -505,7 +508,7 @@ bool TextSplit::words_from_span(size_t bp) inline bool TextSplit::doemit(bool spanerase, size_t _bp) { int bp = int(_bp); - LOGDEB2("TextSplit::doemit: sper " << spanerase << " bp " << bp << + LOGERR("TextSplit::doemit: sper " << spanerase << " bp " << bp << " spp " << m_spanpos << " spanwords " << m_words_in_span.size() << " wS " << m_wordStart << " wL " << m_wordLen << " inn " << m_inNumber << " span [" << m_span << "]\n"); @@ -513,7 +516,7 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp) if (m_wordLen) { // We have a current word. Remember it - if (m_words_in_span.size() >= 6) { + if (int(m_words_in_span.size()) >= o_maxWordsInSpan) { // Limit max span word count spanerase = true; } diff --git a/src/common/textsplit.h b/src/common/textsplit.h index c09e867f..dcc576f0 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -162,6 +162,7 @@ private: static bool o_deHyphenate; // false static unsigned int o_CJKNgramLen; // 2 static int o_maxWordLength; // 40 + static int o_maxWordsInSpan; // 6 Flags m_flags;