From 16a9d8eba81905dfe4f542bd708cc9046135ddb5 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 13 Sep 2020 17:53:59 +0200 Subject: [PATCH] fix span trimming loop when underscoreasletter is set --- src/common/textsplit.cpp | 32 ++++++++++++++++++-------------- src/testmains/trtextsplit.cpp | 1 + 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 04af5396..aa725910 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40}; static const int o_CJKMaxNgramLen{5}; bool o_exthangultagger{false}; +// This is changed to 0 if _ is processed as a letter +static char underscoreatend = '_'; + void TextSplit::staticConfInit(RclConfig *config) { config->getConfParam("maxtermlength", &o_maxWordLength); @@ -122,6 +125,7 @@ void TextSplit::staticConfInit(RclConfig *config) if (config->getConfParam("underscoreasletter", &bvalue)) { if (bvalue) { charclasses[int('_')] = A_LLETTER; + underscoreatend = 0; } } @@ -557,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp) // Maybe trim at end. These are chars that we might keep // inside a span, but not at the end. - while (m_span.length() > 0) { - switch (*(m_span.rbegin())) { - case '.': - case '-': - case ',': - case '@': - case '_': - case '\'': - m_span.resize(m_span.length()-1); + string::size_type trimsz{0}; + while (trimsz < m_span.length()) { + auto c = m_span[m_span.length() - 1 - trimsz]; + if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' || + c == underscoreatend) { + trimsz++; if (m_words_in_span.size() && - m_words_in_span.back().second > int(m_span.size())) + m_words_in_span.back().second > int(m_span.size())) { m_words_in_span.back().second = int(m_span.size()); - if (--bp < 0) + } + if (--bp < 0) { bp = 0; + } + } else { break; - default: - goto breaktrimloop; } } -breaktrimloop: + if (trimsz > 0) { + m_span.resize(m_span.length() - trimsz); + } if (!words_from_span(bp)) { return false; diff --git a/src/testmains/trtextsplit.cpp b/src/testmains/trtextsplit.cpp index 730d2011..ab8056e3 100644 --- a/src/testmains/trtextsplit.cpp +++ b/src/testmains/trtextsplit.cpp @@ -265,6 +265,7 @@ int main(int argc, char **argv) if (!kotagger.empty()) { fprintf(fp, "hangultagger = %s\n", kotagger.c_str()); } + fprintf(fp, "underscoreasletter = 0\n"); fclose(fp); Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));