diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index bb61696e..59bbfbbe 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -44,8 +44,10 @@ // ngrams #undef KATAKANA_AS_WORDS -// Same for Korean syllabic, and same problem, not used. -#undef HANGUL_AS_WORDS +// Same for Korean syllabic, and same problem. However we have a +// runtime option to use an external text analyser for hangul, so this +// is defined at compile time. +#define HANGUL_AS_WORDS using namespace std; @@ -246,7 +248,6 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr) #define UNICODE_IS_KATAKANA(p) false #endif -#define HANGUL_AS_WORDS #ifdef HANGUL_AS_WORDS #define UNICODE_IS_HANGUL(p) ( \ ((p) >= 0x1100 && (p) <= 0x11FF) || \ @@ -290,6 +291,7 @@ bool TextSplit::o_noNumbers{false}; bool TextSplit::o_deHyphenate{false}; int TextSplit::o_maxWordLength{40}; static const int o_CJKMaxNgramLen{5}; +bool o_exthangultagger{false}; void TextSplit::staticConfInit(RclConfig *config) { @@ -324,7 +326,13 @@ void TextSplit::staticConfInit(RclConfig *config) charclasses[int('\\')] = SPACE; } } - koStaticConfInit(config); + + string kotagger; + config->getConfParam("hangultagger", kotagger); + if (!kotagger.empty()) { + o_exthangultagger = true; + koStaticConfInit(config, kotagger); + } } // Final term checkpoint: do some checking (the kind which is simpler @@ -627,7 +635,11 @@ bool TextSplit::text_to_words(const string &in) if (UNICODE_IS_KATAKANA(c)) { csc = CSC_KATAKANA; } else if (UNICODE_IS_HANGUL(c)) { - csc = CSC_HANGUL; + if (o_exthangultagger) { + csc = CSC_HANGUL; + } else { + csc = CSC_CJK; + } } else if (UNICODE_IS_CJK(c)) { csc = CSC_CJK; } else { @@ -635,15 +647,13 @@ bool TextSplit::text_to_words(const string &in) } if (o_processCJK && (csc == CSC_CJK || csc == CSC_HANGUL)) { - // CJK character hit. Hangul processing may be special or - // not depending on how we were built. + // CJK character hit. Hangul processing may be special. 
// Do like at EOF with the current non-cjk data. if (m_wordLen || m_span.length()) { if (!doemit(true, it.getBpos())) return false; } - // Hand off situation to the appropriate routine. if (csc == CSC_HANGUL) { if (!ko_to_words(&it, &c)) { diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 8f8f19d3..3cf7adf3 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -54,7 +54,7 @@ public: /** Call at program initialization to read non default values from the configuration */ static void staticConfInit(RclConfig *config); - static void koStaticConfInit(RclConfig *config); + static void koStaticConfInit(RclConfig *config, const std::string& tagger); /** Split text, emit words and positions. */ virtual bool text_to_words(const std::string &in); diff --git a/src/common/textsplitko.cpp b/src/common/textsplitko.cpp index e4c624b4..7d26e0a6 100644 --- a/src/common/textsplitko.cpp +++ b/src/common/textsplitko.cpp @@ -15,6 +15,13 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ +// Specialized Korean text splitter using konlpy running in a Python +// subprocess. konlpy can use several different backends. We support +// Okt (Twitter) and Mecab at this point. Unfortunately the different +// backends have different POS TAG names, so that things are not +// completely transparent when using another (need to translate the tag +// names in the Python program). + #include "autoconfig.h" #include @@ -33,16 +40,27 @@ using namespace std; +// Separator char used in words and tags lists.
+static const string sepchars("\t"); + static CmdTalk *o_talker; static bool o_starterror{false}; static string o_cmdpath; std::mutex o_mutex; +static string o_taggername{"Okt"}; -void TextSplit::koStaticConfInit(RclConfig *config) +void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger) { o_cmdpath = config->findFilter("kosplitter.py"); + if (tagger == "Okt" || tagger == "Mecab") { // the two backends kosplitter.py accepts + o_taggername = tagger; + } else { + LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger << + "], using Okt\n"); + } } +// Start the Python subprocess static bool initCmd() { if (o_starterror) { @@ -68,8 +86,6 @@ static bool initCmd() return true; } -static const string sepchars("\t"); - bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) { std::unique_lock mylock(o_mutex); @@ -78,18 +94,28 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) return false; } } + LOGDEB1("k_to_words: m_wordpos " << m_wordpos << "\n"); Utf8Iter &it = *itp; unsigned int c = 0; + unordered_map args; + args.insert(pair{"data", string()}); string& inputdata{args.begin()->second}; - string::size_type orgbytepos = it.getBpos(); + + // We send the tagger name every time but it's only used the first + // one: can't change it after init.
We could avoid sending it + // every time, but I don't think that the performance hit is + // significant + args.insert(pair{"tagger", o_taggername}); - // Gather all Korean characters and send the text to the analyser + // Walk the Korean characters section and send the text to the + // analyser + string::size_type orgbytepos = it.getBpos(); for (; !it.eof(); it++) { c = *it; - if (!isHANGUL(c) && !(isascii(c) && (isspace(c) || ispunct(c)))) { + if (!isHANGUL(c) && !(c < 0x80 && (isspace(c) || ispunct(c)))) { // Done with Korean stretch, process and go back to main routine //std::cerr << "Broke on char " << int(c) << endl; break; @@ -97,10 +123,6 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) it.appendchartostring(inputdata); } } - // Need to convert white text spans to single space otherwise the - // byte offsets will be wrong - - string::size_type textsize = inputdata.size(); LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() << " bytes " << inputdata << endl); unordered_map result; @@ -161,11 +183,11 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp) } #if DO_CHECK_THINGS - int sizediff = textsize - (bytepos - orgbytepos); + int sizediff = inputdata.size() - (bytepos - orgbytepos); if (sizediff < 0) sizediff = -sizediff; if (sizediff > 1) { - LOGERR("ORIGINAL TEXT SIZE: " << textsize << + LOGERR("ORIGINAL TEXT SIZE: " << inputdata.size() << " FINAL BYTE POS " << bytepos - orgbytepos << " TEXT [" << inputdata << "]\n"); } diff --git a/src/filters/kosplitter.py b/src/filters/kosplitter.py index 4037acb5..c586cfff 100755 --- a/src/filters/kosplitter.py +++ b/src/filters/kosplitter.py @@ -28,17 +28,32 @@ import sys import cmdtalk -from konlpy.tag import Okt,Kkma +from konlpy.tag import Okt,Mecab class Processor(object): def __init__(self, proto): self.proto = proto - self.tagger = Okt() - #self.tagger = Kkma() + self.tagsOkt = False + self.tagsMecab = False + def _init_tagger(self, taggername): + if taggername == "Okt": + self.tagger
= Okt() + self.tagsOkt = True + elif taggername == "Mecab": + self.tagger = Mecab() + self.tagsMecab = True + else: + raise Exception("Bad tagger name " + taggername) + def process(self, params): if 'data' not in params: return {'error':'No data field in parameters'} + if not (self.tagsOkt or self.tagsMecab): + if 'tagger' not in params: + return {'error':'No "tagger" field in parameters'} + self._init_tagger(params['tagger']) + pos = self.tagger.pos(params['data']) #proto.log("%s" % pos) text = "" @@ -47,10 +62,25 @@ class Processor(object): word = e[0] word = word.replace('\t', ' ') text += word + "\t" - tags += e[1] + "\t" + tag = e[1] + if self.tagsOkt: + pass + elif self.tagsMecab: + tb = tag[0:2] + if tb.startswith("N"): + tag = "Noun" + elif tb == "VV": + tag = "Verb" + elif tb == "VA": + tag = "Adjective" + elif tag == "MAG": + tag = "Adverb" + else: + pass + tags += tag + "\t" return {'text': text, 'tags': tags} + proto = cmdtalk.CmdTalk() processor = Processor(proto) cmdtalk.main(proto, processor) - diff --git a/src/utils/cmdtalk.h b/src/utils/cmdtalk.h index b7a55cb1..9f937b97 100644 --- a/src/utils/cmdtalk.h +++ b/src/utils/cmdtalk.h @@ -74,6 +74,10 @@ class CmdTalk { // @param env each entry should be of the form name=value. They // augment the subprocess environnement. // @param path replaces the PATH variable when looking for the command. + // + // Note that cmdtalk.py:main() method is a test routine which + // expects data pairs on the command line. If actual parameters + // need to be passed, it can't be used by the processor. virtual bool startCmd(const std::string& cmdname, const std::vector& args = std::vector(),