textsplit: discard '-' in front of words. Handle CJK punctuation characters
This commit is contained in:
parent
0e37f64a3c
commit
36516b091b
@ -228,6 +228,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
|||||||
case '-':
|
case '-':
|
||||||
case ',':
|
case ',':
|
||||||
case '@':
|
case '@':
|
||||||
|
case '_':
|
||||||
case '\'':
|
case '\'':
|
||||||
m_span.resize(m_span.length()-1);
|
m_span.resize(m_span.length()-1);
|
||||||
if (--bp < 0)
|
if (--bp < 0)
|
||||||
@ -357,21 +358,15 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
case '+':
|
case '+':
|
||||||
curspanglue = cc;
|
curspanglue = cc;
|
||||||
if (m_wordLen == 0) {
|
if (m_wordLen == 0) {
|
||||||
if (cc == '-') {
|
// + or - don't start a term except if this looks like
|
||||||
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
// it's going to be a number
|
||||||
// -10
|
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
||||||
m_inNumber = true;
|
// -10
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_inNumber = true;
|
||||||
} else {
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
goto SPACE;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if (nonalnumcnt > 2) {
|
goto SPACE;
|
||||||
discardspan();
|
}
|
||||||
} else {
|
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
||||||
m_span[m_span.length() - 1] == 'E')) {
|
m_span[m_span.length() - 1] == 'E')) {
|
||||||
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
||||||
@ -580,7 +575,11 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
// Return to normal handler
|
// Return to normal handler
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (whatcc(c) == SPACE) {
|
||||||
|
// Flush the ngram buffer and go on
|
||||||
|
nchars = 0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if (nchars == o_CJKNgramLen) {
|
if (nchars == o_CJKNgramLen) {
|
||||||
// Offset buffer full, shift it. Might be more efficient
|
// Offset buffer full, shift it. Might be more efficient
|
||||||
// to have a circular one, but things are complicated
|
// to have a circular one, but things are complicated
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user