textsplit: discard '-' in front of words. Handle CJK punctuation characters

This commit is contained in:
"Jean-Francois Dockes ext:(%22) 2011-07-16 11:51:38 +02:00
parent 0e37f64a3c
commit 36516b091b

View File

@ -228,6 +228,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
case '-': case '-':
case ',': case ',':
case '@': case '@':
case '_':
case '\'': case '\'':
m_span.resize(m_span.length()-1); m_span.resize(m_span.length()-1);
if (--bp < 0) if (--bp < 0)
@ -357,21 +358,15 @@ bool TextSplit::text_to_words(const string &in)
case '+': case '+':
curspanglue = cc; curspanglue = cc;
if (m_wordLen == 0) { if (m_wordLen == 0) {
if (cc == '-') { // + or - don't start a term except if this looks like
if (whatcc(it[it.getCpos()+1]) == DIGIT) { // it's going to be a number
// -10 if (whatcc(it[it.getCpos()+1]) == DIGIT) {
m_inNumber = true; // -10
m_wordLen += it.appendchartostring(m_span); m_inNumber = true;
} else { m_wordLen += it.appendchartostring(m_span);
goto SPACE;
}
} else { } else {
if (nonalnumcnt > 2) { goto SPACE;
discardspan(); }
} else {
m_wordStart += it.appendchartostring(m_span);
}
}
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' || } else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
m_span[m_span.length() - 1] == 'E')) { m_span[m_span.length() - 1] == 'E')) {
if (whatcc(it[it.getCpos()+1]) == DIGIT) { if (whatcc(it[it.getCpos()+1]) == DIGIT) {
@ -580,7 +575,11 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
// Return to normal handler // Return to normal handler
break; break;
} }
if (whatcc(c) == SPACE) {
// Flush the ngram buffer and go on
nchars = 0;
continue;
}
if (nchars == o_CJKNgramLen) { if (nchars == o_CJKNgramLen) {
// Offset buffer full, shift it. Might be more efficient // Offset buffer full, shift it. Might be more efficient
// to have a circular one, but things are complicated // to have a circular one, but things are complicated