From 48d4678770cd64a4086a2de37f7aa5e2b30c7896 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Sat, 25 Apr 2020 14:19:54 +0200
Subject: [PATCH] experiment: Korean: when a Noun is followed by JX, emit both
 the Noun and Noun+JX

---
 src/common/textsplitko.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/common/textsplitko.cpp b/src/common/textsplitko.cpp
index ca05585d..ae7f2622 100644
--- a/src/common/textsplitko.cpp
+++ b/src/common/textsplitko.cpp
@@ -184,6 +184,9 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
     // not in the whole text which is orgbytepos + bytepos
     string::size_type bytepos{0};
     string::size_type pagefix{0};
+    string lastNoun;
+    string::size_type lastNounBytePos{0};
+    int lastNounWordPos{0};
     for (unsigned int i = 0; i < words.size(); i++) {
         // The POS tagger strips characters from the input (e.g. multiple
         // spaces, sometimes new lines, possibly other stuff). This
@@ -214,13 +217,27 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
                 " absbytepos " << orgbytepos + bytepos << 
                 " bytepos " << bytepos << " word from text: " <<
                 inputdata.substr(bytepos, word.size()) << endl);
-        if (tags[i] == "Noun" || tags[i] == "Verb" ||
+        bool isNoun = (tags[i] == "Noun");
+        if (isNoun) {
+            lastNoun = word;
+            lastNounWordPos = m_wordpos;
+            lastNounBytePos = orgbytepos + bytepos - pagefix;
+        } else {
+            if (tags[i] == "JX" && !lastNoun.empty()) {
+                if (!takeword(lastNoun+word, lastNounWordPos, lastNounBytePos,
+                              lastNounBytePos + word.size())) {
+                    return false;
+                }
+            }
+            lastNoun.clear();
+        }
+        if (isNoun || tags[i] == "Verb" ||
             tags[i] == "Adjective" || tags[i] == "Adverb") {
             string::size_type abspos = orgbytepos + bytepos - pagefix;
             if (!takeword(word, m_wordpos++, abspos, abspos + word.size())) {
                 return false;
             }
-        }
+        } 
         bytepos += word.size();
     }