diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 081e05d4..28f91c13 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -468,8 +468,8 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp) void TextSplit::discardspan() { + m_span.clear(); m_words_in_span.clear(); - m_span.erase(); m_spanpos = m_wordpos; m_wordStart = 0; m_wordLen = m_wordChars = 0; @@ -513,10 +513,9 @@ bool TextSplit::text_to_words(const string &in) if (in.empty()) return true; - m_span.erase(); - m_inNumber = false; - m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos - = m_spanpos = 0; + // Reset the data members related to the splitting state + clearsplitstate(); + bool pagepending = false; bool softhyphenpending = false; @@ -935,10 +934,12 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) } } - m_span.erase(); - m_inNumber = false; - m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0; - m_spanpos = m_wordpos; + // Reset the splitting state while preserving the term position, and pass the + // non-CJK Unicode character that was found back through cp. The current input + // byte offset is kept in the Utf8Iter + int pos = m_wordpos; + clearsplitstate(); + m_spanpos = m_wordpos = pos; *cp = c; return true; } diff --git a/src/common/textsplit.h b/src/common/textsplit.h index b68d9430..9f66d13c 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -213,6 +213,14 @@ private: // Word length in characters. Declared but not updated if !TEXTSPLIT_STATS unsigned int m_wordChars; + void clearsplitstate() { + m_span.clear(); + m_words_in_span.clear(); + m_inNumber = false; + m_wordStart = m_wordLen = m_wordpos = m_spanpos = m_prevpos = + m_prevlen = m_wordChars = 0; + } + // This processes cjk text: bool cjk_to_words(Utf8Iter *it, unsigned int *cp);