Restore the nonumbers function: exclude numbers from indexing
This commit is contained in:
parent
cbcfa7e9a1
commit
3f1dfa564c
@ -521,60 +521,60 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
|
|||||||
if (m_wordLen) {
|
if (m_wordLen) {
|
||||||
// We have a current word. Remember it
|
// We have a current word. Remember it
|
||||||
|
|
||||||
// Limit max span word count
|
|
||||||
if (m_words_in_span.size() >= 6) {
|
if (m_words_in_span.size() >= 6) {
|
||||||
|
// Limit max span word count
|
||||||
spanerase = true;
|
spanerase = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
m_words_in_span.push_back(pair<int,int>(m_wordStart,
|
if (!(o_noNumbers && m_inNumber)) {
|
||||||
m_wordStart + m_wordLen));
|
m_words_in_span.push_back({m_wordStart, m_wordStart + m_wordLen});
|
||||||
m_wordpos++;
|
m_wordpos++;
|
||||||
|
}
|
||||||
m_wordLen = m_wordChars = 0;
|
m_wordLen = m_wordChars = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (spanerase) {
|
if (!spanerase) {
|
||||||
// We encountered a span-terminating character. Produce terms.
|
// Not done with this span. Just update relative word start offset.
|
||||||
|
|
||||||
string acronym;
|
|
||||||
if (span_is_acronym(&acronym)) {
|
|
||||||
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(), bp))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Maybe trim at end. These are chars that we might keep
|
|
||||||
// inside a span, but not at the end.
|
|
||||||
while (m_span.length() > 0) {
|
|
||||||
switch (*(m_span.rbegin())) {
|
|
||||||
case '.':
|
|
||||||
case '-':
|
|
||||||
case ',':
|
|
||||||
case '@':
|
|
||||||
case '_':
|
|
||||||
case '\'':
|
|
||||||
m_span.resize(m_span.length()-1);
|
|
||||||
if (m_words_in_span.size() &&
|
|
||||||
m_words_in_span.back().second > int(m_span.size()))
|
|
||||||
m_words_in_span.back().second = int(m_span.size());
|
|
||||||
if (--bp < 0)
|
|
||||||
bp = 0;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
goto breaktrimloop;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
breaktrimloop:
|
|
||||||
|
|
||||||
if (!words_from_span(bp)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
discardspan();
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
m_wordStart = int(m_span.length());
|
m_wordStart = int(m_span.length());
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Span is done (too long or span-terminating character). Produce
|
||||||
|
// terms and reset it.
|
||||||
|
string acronym;
|
||||||
|
if (span_is_acronym(&acronym)) {
|
||||||
|
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(), bp))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Maybe trim at end. These are chars that we might keep
|
||||||
|
// inside a span, but not at the end.
|
||||||
|
while (m_span.length() > 0) {
|
||||||
|
switch (*(m_span.rbegin())) {
|
||||||
|
case '.':
|
||||||
|
case '-':
|
||||||
|
case ',':
|
||||||
|
case '@':
|
||||||
|
case '_':
|
||||||
|
case '\'':
|
||||||
|
m_span.resize(m_span.length()-1);
|
||||||
|
if (m_words_in_span.size() &&
|
||||||
|
m_words_in_span.back().second > int(m_span.size()))
|
||||||
|
m_words_in_span.back().second = int(m_span.size());
|
||||||
|
if (--bp < 0)
|
||||||
|
bp = 0;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
goto breaktrimloop;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
breaktrimloop:
|
||||||
|
|
||||||
|
if (!words_from_span(bp)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
discardspan();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user