Fix the span-trimming loop when underscoreasletter is set
This commit is contained in:
parent
df09d65a4e
commit
16a9d8eba8
@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40};
|
||||
static const int o_CJKMaxNgramLen{5};
|
||||
bool o_exthangultagger{false};
|
||||
|
||||
// Compared against trailing span characters when trimming; changed to 0 if _ is processed as a letter
|
||||
static char underscoreatend = '_';
|
||||
|
||||
void TextSplit::staticConfInit(RclConfig *config)
|
||||
{
|
||||
config->getConfParam("maxtermlength", &o_maxWordLength);
|
||||
@ -122,6 +125,7 @@ void TextSplit::staticConfInit(RclConfig *config)
|
||||
if (config->getConfParam("underscoreasletter", &bvalue)) {
|
||||
if (bvalue) {
|
||||
charclasses[int('_')] = A_LLETTER;
|
||||
underscoreatend = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -557,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
|
||||
|
||||
// Maybe trim at end. These are chars that we might keep
|
||||
// inside a span, but not at the end.
|
||||
while (m_span.length() > 0) {
|
||||
switch (*(m_span.rbegin())) {
|
||||
case '.':
|
||||
case '-':
|
||||
case ',':
|
||||
case '@':
|
||||
case '_':
|
||||
case '\'':
|
||||
m_span.resize(m_span.length()-1);
|
||||
string::size_type trimsz{0};
|
||||
while (trimsz < m_span.length()) {
|
||||
auto c = m_span[m_span.length() - 1 - trimsz];
|
||||
if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
|
||||
c == underscoreatend) {
|
||||
trimsz++;
|
||||
if (m_words_in_span.size() &&
|
||||
m_words_in_span.back().second > int(m_span.size()))
|
||||
m_words_in_span.back().second > int(m_span.size())) {
|
||||
m_words_in_span.back().second = int(m_span.size());
|
||||
if (--bp < 0)
|
||||
}
|
||||
if (--bp < 0) {
|
||||
bp = 0;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
default:
|
||||
goto breaktrimloop;
|
||||
}
|
||||
}
|
||||
breaktrimloop:
|
||||
if (trimsz > 0) {
|
||||
m_span.resize(m_span.length() - trimsz);
|
||||
}
|
||||
|
||||
if (!words_from_span(bp)) {
|
||||
return false;
|
||||
|
||||
@ -265,6 +265,7 @@ int main(int argc, char **argv)
|
||||
if (!kotagger.empty()) {
|
||||
fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
|
||||
}
|
||||
fprintf(fp, "underscoreasletter = 0\n");
|
||||
fclose(fp);
|
||||
|
||||
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user