Fix the span-trimming loop when underscoreasletter is set

This commit is contained in:
Jean-Francois Dockes 2020-09-13 17:53:59 +02:00
parent df09d65a4e
commit 16a9d8eba8
2 changed files with 19 additions and 14 deletions

View File

@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40};
static const int o_CJKMaxNgramLen{5}; static const int o_CJKMaxNgramLen{5};
bool o_exthangultagger{false}; bool o_exthangultagger{false};
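// Character compared against trailing span characters when trimming: '_' by
// default. It is changed to 0 if _ is processed as a letter (underscoreasletter).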
static char underscoreatend = '_';
void TextSplit::staticConfInit(RclConfig *config) void TextSplit::staticConfInit(RclConfig *config)
{ {
config->getConfParam("maxtermlength", &o_maxWordLength); config->getConfParam("maxtermlength", &o_maxWordLength);
@ -122,6 +125,7 @@ void TextSplit::staticConfInit(RclConfig *config)
if (config->getConfParam("underscoreasletter", &bvalue)) { if (config->getConfParam("underscoreasletter", &bvalue)) {
if (bvalue) { if (bvalue) {
charclasses[int('_')] = A_LLETTER; charclasses[int('_')] = A_LLETTER;
underscoreatend = 0;
} }
} }
@ -557,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
// Maybe trim at end. These are chars that we might keep // Maybe trim at end. These are chars that we might keep
// inside a span, but not at the end. // inside a span, but not at the end.
while (m_span.length() > 0) { string::size_type trimsz{0};
switch (*(m_span.rbegin())) { while (trimsz < m_span.length()) {
case '.': auto c = m_span[m_span.length() - 1 - trimsz];
case '-': if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
case ',': c == underscoreatend) {
case '@': trimsz++;
case '_':
case '\'':
m_span.resize(m_span.length()-1);
if (m_words_in_span.size() && if (m_words_in_span.size() &&
m_words_in_span.back().second > int(m_span.size())) m_words_in_span.back().second > int(m_span.size())) {
m_words_in_span.back().second = int(m_span.size()); m_words_in_span.back().second = int(m_span.size());
if (--bp < 0) }
if (--bp < 0) {
bp = 0; bp = 0;
}
} else {
break; break;
default:
goto breaktrimloop;
} }
} }
breaktrimloop: if (trimsz > 0) {
m_span.resize(m_span.length() - trimsz);
}
if (!words_from_span(bp)) { if (!words_from_span(bp)) {
return false; return false;

View File

@ -265,6 +265,7 @@ int main(int argc, char **argv)
if (!kotagger.empty()) { if (!kotagger.empty()) {
fprintf(fp, "hangultagger = %s\n", kotagger.c_str()); fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
} }
fprintf(fp, "underscoreasletter = 0\n");
fclose(fp); fclose(fp);
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel)); Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));