fix span trimming loop when underscoreasletter is set
This commit is contained in:
parent
df09d65a4e
commit
16a9d8eba8
@ -84,6 +84,9 @@ int TextSplit::o_maxWordLength{40};
|
|||||||
static const int o_CJKMaxNgramLen{5};
|
static const int o_CJKMaxNgramLen{5};
|
||||||
bool o_exthangultagger{false};
|
bool o_exthangultagger{false};
|
||||||
|
|
||||||
|
// Character trimmed from the end of a span along with other trailing punctuation; set to 0 if _ is processed as a letter
|
||||||
|
static char underscoreatend = '_';
|
||||||
|
|
||||||
void TextSplit::staticConfInit(RclConfig *config)
|
void TextSplit::staticConfInit(RclConfig *config)
|
||||||
{
|
{
|
||||||
config->getConfParam("maxtermlength", &o_maxWordLength);
|
config->getConfParam("maxtermlength", &o_maxWordLength);
|
||||||
@ -122,6 +125,7 @@ void TextSplit::staticConfInit(RclConfig *config)
|
|||||||
if (config->getConfParam("underscoreasletter", &bvalue)) {
|
if (config->getConfParam("underscoreasletter", &bvalue)) {
|
||||||
if (bvalue) {
|
if (bvalue) {
|
||||||
charclasses[int('_')] = A_LLETTER;
|
charclasses[int('_')] = A_LLETTER;
|
||||||
|
underscoreatend = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -557,26 +561,26 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
|
|||||||
|
|
||||||
// Maybe trim at end. These are chars that we might keep
|
// Maybe trim at end. These are chars that we might keep
|
||||||
// inside a span, but not at the end.
|
// inside a span, but not at the end.
|
||||||
while (m_span.length() > 0) {
|
string::size_type trimsz{0};
|
||||||
switch (*(m_span.rbegin())) {
|
while (trimsz < m_span.length()) {
|
||||||
case '.':
|
auto c = m_span[m_span.length() - 1 - trimsz];
|
||||||
case '-':
|
if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
|
||||||
case ',':
|
c == underscoreatend) {
|
||||||
case '@':
|
trimsz++;
|
||||||
case '_':
|
|
||||||
case '\'':
|
|
||||||
m_span.resize(m_span.length()-1);
|
|
||||||
if (m_words_in_span.size() &&
|
if (m_words_in_span.size() &&
|
||||||
m_words_in_span.back().second > int(m_span.size()))
|
m_words_in_span.back().second > int(m_span.size())) {
|
||||||
m_words_in_span.back().second = int(m_span.size());
|
m_words_in_span.back().second = int(m_span.size());
|
||||||
if (--bp < 0)
|
}
|
||||||
|
if (--bp < 0) {
|
||||||
bp = 0;
|
bp = 0;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
break;
|
break;
|
||||||
default:
|
|
||||||
goto breaktrimloop;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
breaktrimloop:
|
if (trimsz > 0) {
|
||||||
|
m_span.resize(m_span.length() - trimsz);
|
||||||
|
}
|
||||||
|
|
||||||
if (!words_from_span(bp)) {
|
if (!words_from_span(bp)) {
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@ -265,6 +265,7 @@ int main(int argc, char **argv)
|
|||||||
if (!kotagger.empty()) {
|
if (!kotagger.empty()) {
|
||||||
fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
|
fprintf(fp, "hangultagger = %s\n", kotagger.c_str());
|
||||||
}
|
}
|
||||||
|
fprintf(fp, "underscoreasletter = 0\n");
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
|
|
||||||
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));
|
Logger::getTheLog("")->setLogLevel(Logger::LogLevel(loglevel));
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user