From 97f3212f80e021ce82a76b55ab450da6ab93ee11 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 14 May 2020 09:23:09 +0200 Subject: [PATCH] korean splitter: disable the noun+jx emitting thing --- src/common/textsplitko.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/common/textsplitko.cpp b/src/common/textsplitko.cpp index eba7d4d9..ec6c7662 100644 --- a/src/common/textsplitko.cpp +++ b/src/common/textsplitko.cpp @@ -218,6 +218,10 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) " bytepos " << bytepos << " word from text: " << inputdata.substr(bytepos, word.size()) << endl); bool isNoun = (tags[i] == "Noun"); +#if 0 + // When Noun followed by JX, emit both Noun and Noun+JX at the + // same pos. Experimental: it seems that this is sometimes + // problematic, so turned off for now. if (isNoun) { lastNoun = word; lastNounWordPos = m_wordpos; @@ -231,6 +235,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) } lastNoun.clear(); } +#endif // 11/05/2020 For now index everything until more precise // verification of what should be pruned if (true || (isNoun || tags[i] == "Verb" ||