korean splitter: break on digits
This commit is contained in:
parent
fc981e3733
commit
a5bab94ae3
@ -110,6 +110,11 @@ static bool initCmd()
|
|||||||
|
|
||||||
#define STRSZT std::string::size_type
|
#define STRSZT std::string::size_type
|
||||||
|
|
||||||
|
// True when c is a 7-bit ASCII code point (<= 0x7f) that is neither an ASCII
// letter nor an ASCII digit, i.e. ASCII punctuation, whitespace or a control
// character. Any code point above 0x7f yields false.
// Every use of the argument is parenthesized so that an expression argument
// (e.g. a | b) is not regrouped by operator precedence. The argument is still
// evaluated several times, so do not pass expressions with side effects.
#define ISASCIIPUNCTORCTL(c) ((c) <= 0x7f &&                    \
                              !(((c) >= 'A' && (c) <= 'Z') ||   \
                                ((c) >= 'a' && (c) <= 'z') ||   \
                                ((c) >= '0' && (c) <= '9')))
|
||||||
|
|
||||||
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||||
{
|
{
|
||||||
LOGDEB1("ko_to_words\n");
|
LOGDEB1("ko_to_words\n");
|
||||||
@ -145,9 +150,12 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
std::vector<std::pair<STRSZT, STRSZT>> spans;
|
std::vector<std::pair<STRSZT, STRSZT>> spans;
|
||||||
for (; !it.eof() && !it.error(); it++) {
|
for (; !it.eof() && !it.error(); it++) {
|
||||||
c = *it;
|
c = *it;
|
||||||
if (!isHANGUL(c) && isalpha(c)) {
|
if (!isHANGUL(c) && !ISASCIIPUNCTORCTL(c)) {
|
||||||
// Done with Korean stretch. Process to next step.
|
// Non-Korean: we keep on if encountering space and other
|
||||||
LOGDEB1("ko_to_words: broke on " << (std::string)it << endl);
|
// ASCII punctuation. Allows sending longer pieces of text
|
||||||
|
// to the splitter (perf). Else break, process this piece,
|
||||||
|
// and return to the main splitter
|
||||||
|
LOGINF("ko_to_words: broke on " << (std::string)it << endl);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
if (c == '\f') {
|
if (c == '\f') {
|
||||||
@ -160,7 +168,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
} else {
|
} else {
|
||||||
// Alpha was taken care of above. Keep only ascii
|
// Alpha was taken care of above. Keep only ascii
|
||||||
// numbers, replace all punctuation with spaces.
|
// numbers, replace all punctuation with spaces.
|
||||||
if (c <= 0x7f && (c < 0x30 || c > 0x39)) {
|
if (ISASCIIPUNCTORCTL(c)) {
|
||||||
if (!wasspace) {
|
if (!wasspace) {
|
||||||
// End of span
|
// End of span
|
||||||
spans.push_back({spanstart, inputdata.size()});
|
spans.push_back({spanstart, inputdata.size()});
|
||||||
@ -183,8 +191,8 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
spans.push_back({spanstart, inputdata.size()});
|
spans.push_back({spanstart, inputdata.size()});
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
|
LOGINF("TextSplit::k_to_words: sending out " << inputdata.size() <<
|
||||||
" bytes " << inputdata << endl);
|
" bytes " << inputdata << endl);
|
||||||
|
|
||||||
// Overall data counter for slave restarts
|
// Overall data counter for slave restarts
|
||||||
restartcount += inputdata.size();
|
restartcount += inputdata.size();
|
||||||
@ -271,7 +279,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
});
|
});
|
||||||
if (it != spans.end()) {
|
if (it != spans.end()) {
|
||||||
span = inputdata.substr(it->first, it->second-it->first);
|
span = inputdata.substr(it->first, it->second-it->first);
|
||||||
LOGDEB1("KO: SPAN: [" << span << "] pos " << m_wordpos <<
|
LOGINF("KO: SPAN: [" << span << "] pos " << m_wordpos <<
|
||||||
" bytepos " << bytepos << "\n");
|
" bytepos " << bytepos << "\n");
|
||||||
if (!takeword(span, m_wordpos, abspos, abspos + span.size())) {
|
if (!takeword(span, m_wordpos, abspos, abspos + span.size())) {
|
||||||
return false;
|
return false;
|
||||||
@ -279,7 +287,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Possibly emit a part of span word.
|
// Possibly emit a part of span word.
|
||||||
LOGDEB1("KO: WORD: [" << word << "] pos " << m_wordpos <<
|
LOGINF("KO: WORD: [" << word << "] pos " << m_wordpos <<
|
||||||
" bytepos " << bytepos << "\n");
|
" bytepos " << bytepos << "\n");
|
||||||
// Emit words only if not in onlyspans mode, and different
|
// Emit words only if not in onlyspans mode, and different
|
||||||
// from span. Else, just increase the position
|
// from span. Else, just increase the position
|
||||||
@ -289,7 +297,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOGDEB1("KO: WORD: SKIP\n");
|
LOGINF("KO: WORD: SKIP\n");
|
||||||
}
|
}
|
||||||
m_wordpos++;
|
m_wordpos++;
|
||||||
bytepos += word.size();
|
bytepos += word.size();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user