From 48d4678770cd64a4086a2de37f7aa5e2b30c7896 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 25 Apr 2020 14:19:54 +0200 Subject: [PATCH] experiment: Korean when Noun then JX emit both Noun and Noun+JX --- src/common/textsplitko.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/common/textsplitko.cpp b/src/common/textsplitko.cpp index ca05585d..ae7f2622 100644 --- a/src/common/textsplitko.cpp +++ b/src/common/textsplitko.cpp @@ -184,6 +184,9 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) // not in the whole text which is orgbytepos + bytepos string::size_type bytepos{0}; string::size_type pagefix{0}; + string lastNoun; + string::size_type lastNounBytePos{0}; + int lastNounWordPos{0}; for (unsigned int i = 0; i < words.size(); i++) { // The POS tagger strips characters from the input (e.g. multiple // spaces, sometimes new lines, possibly other stuff). This @@ -214,13 +217,27 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) " absbytepos " << orgbytepos + bytepos << " bytepos " << bytepos << " word from text: " << inputdata.substr(bytepos, word.size()) << endl); - if (tags[i] == "Noun" || tags[i] == "Verb" || + bool isNoun = (tags[i] == "Noun"); + if (isNoun) { + lastNoun = word; + lastNounWordPos = m_wordpos; + lastNounBytePos = orgbytepos + bytepos - pagefix; + } else { + if (tags[i] == "JX" && !lastNoun.empty()) { + if (!takeword(lastNoun+word, lastNounWordPos, lastNounBytePos, + lastNounBytePos + word.size())) { + return false; + } + } + lastNoun.clear(); + } + if (isNoun || tags[i] == "Verb" || tags[i] == "Adjective" || tags[i] == "Adverb") { string::size_type abspos = orgbytepos + bytepos - pagefix; if (!takeword(word, m_wordpos++, abspos, abspos + word.size())) { return false; } - } + } bytepos += word.size(); }