Restore the nonumbers function (exclusion of numbers from indexing)

This commit is contained in:
Jean-Francois Dockes 2020-08-22 10:07:58 +02:00
parent cbcfa7e9a1
commit 3f1dfa564c

View File

@@ -521,60 +521,60 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
if (m_wordLen) { if (m_wordLen) {
// We have a current word. Remember it // We have a current word. Remember it
// Limit max span word count
if (m_words_in_span.size() >= 6) { if (m_words_in_span.size() >= 6) {
// Limit max span word count
spanerase = true; spanerase = true;
} }
m_words_in_span.push_back(pair<int,int>(m_wordStart, if (!(o_noNumbers && m_inNumber)) {
m_wordStart + m_wordLen)); m_words_in_span.push_back({m_wordStart, m_wordStart + m_wordLen});
m_wordpos++; m_wordpos++;
}
m_wordLen = m_wordChars = 0; m_wordLen = m_wordChars = 0;
} }
if (spanerase) { if (!spanerase) {
// We encountered a span-terminating character. Produce terms. // Not done with this span. Just update relative word start offset.
string acronym;
if (span_is_acronym(&acronym)) {
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(), bp))
return false;
}
// Maybe trim at end. These are chars that we might keep
// inside a span, but not at the end.
while (m_span.length() > 0) {
switch (*(m_span.rbegin())) {
case '.':
case '-':
case ',':
case '@':
case '_':
case '\'':
m_span.resize(m_span.length()-1);
if (m_words_in_span.size() &&
m_words_in_span.back().second > int(m_span.size()))
m_words_in_span.back().second = int(m_span.size());
if (--bp < 0)
bp = 0;
break;
default:
goto breaktrimloop;
}
}
breaktrimloop:
if (!words_from_span(bp)) {
return false;
}
discardspan();
} else {
m_wordStart = int(m_span.length()); m_wordStart = int(m_span.length());
return true;
} }
// Span is done (too long or span-terminating character). Produce
// terms and reset it.
string acronym;
if (span_is_acronym(&acronym)) {
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(), bp))
return false;
}
// Maybe trim at end. These are chars that we might keep
// inside a span, but not at the end.
while (m_span.length() > 0) {
switch (*(m_span.rbegin())) {
case '.':
case '-':
case ',':
case '@':
case '_':
case '\'':
m_span.resize(m_span.length()-1);
if (m_words_in_span.size() &&
m_words_in_span.back().second > int(m_span.size()))
m_words_in_span.back().second = int(m_span.size());
if (--bp < 0)
bp = 0;
break;
default:
goto breaktrimloop;
}
}
breaktrimloop:
if (!words_from_span(bp)) {
return false;
}
discardspan();
return true; return true;
} }