fix span trimming loop when underscoreasletter is set

This commit is contained in:
Jean-Francois Dockes 2020-09-13 17:53:59 +02:00
parent df09d65a4e
commit 16a9d8eba8
2 changed files with 19 additions and 14 deletions

View File

@@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40};
static const int o_CJKMaxNgramLen{5};
bool o_exthangultagger{false};
// Character compared against the end of a span when trimming. It is '_' by
// default and is set to 0 when underscoreasletter makes '_' a letter.
static char underscoreatend = '_';
void TextSplit::staticConfInit(RclConfig *config)
{
config->getConfParam("maxtermlength", &o_maxWordLength);
@@ -122,6 +125,7 @@ void TextSplit::staticConfInit(RclConfig *config)
if (config->getConfParam("underscoreasletter", &bvalue)) {
if (bvalue) {
charclasses[int('_')] = A_LLETTER;
underscoreatend = 0;
}
}
@@ -557,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
// Maybe trim at end. These are chars that we might keep
// inside a span, but not at the end.
while (m_span.length() > 0) {
switch (*(m_span.rbegin())) {
case '.':
case '-':
case ',':
case '@':
case '_':
case '\'':
m_span.resize(m_span.length()-1);
string::size_type trimsz{0};
while (trimsz < m_span.length()) {
auto c = m_span[m_span.length() - 1 - trimsz];
if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
c == underscoreatend) {
trimsz++;
if (m_words_in_span.size() &&
m_words_in_span.back().second > int(m_span.size()))
m_words_in_span.back().second > int(m_span.size())) {
m_words_in_span.back().second = int(m_span.size());
if (--bp < 0)
}
if (--bp < 0) {
bp = 0;
}
} else {
break;
default:
goto breaktrimloop;
}
}
breaktrimloop:
if (trimsz > 0) {
m_span.resize(m_span.length() - trimsz);
}
if (!words_from_span(bp)) {
return false;

View File

@@ -265,6 +265,7 @@ int main(int argc, char **argv)
if (!kotagger.empty()) {
fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
}
fprintf(fp, "underscoreasletter = 0\n");
fclose(fp);
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));