recoll/src/filters/kosplitter.py

#!/usr/bin/python3
#################################
# Copyright (C) 2020 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################

#
# Interface to the konlpy Korean text analyser: we receive text from
# our parent process and have it segmented by the analyser, then
# return the results. The analyser startup is very expensive (several
# seconds), which is why we can't just execute it from the main
# process.
#

import sys
import cmdtalk

# We can either use konlpy, which supports several analysers, or
# python-mecab-ko, a direct interface to mecab which exposes the same
# API as konlpy: https://pypi.org/project/python-mecab-ko/
# python-mecab-ko is tried first; konlpy is the fallback.
try:
    import mecab
    usingkonlpy = False
except Exception:
    # Catch Exception rather than using a bare except: any failure to
    # load python-mecab-ko falls back to konlpy, while KeyboardInterrupt
    # and SystemExit still propagate.
    import konlpy.tag
    usingkonlpy = True

class Processor(object):
    """Segment Korean text with a part-of-speech tagger.

    The tagger is created lazily, on the first request that reaches
    process() with a 'tagger' field, and is then reused for all later
    requests. Exactly one of the tagsOkt / tagsMecab / tagsKomoran
    flags is set once a tagger has been created successfully.
    """

    # Names accepted by _init_tagger().
    _TAGGER_NAMES = ("Okt", "Mecab", "Komoran")

    # When true, the input is split on whitespace and every piece is
    # tagged separately instead of tagging the whole text in one call.
    spliteojeol = False

    def __init__(self, proto):
        """Store the protocol object; no tagger is created yet."""
        self.proto = proto
        self.tagger = None
        self.tagsOkt = False
        self.tagsMecab = False
        self.tagsKomoran = False

    def _init_tagger(self, taggername):
        """Create self.tagger for taggername ("Okt", "Mecab" or "Komoran").

        "Mecab" uses python-mecab-ko when that module was importable at
        startup (usingkonlpy is false); every other case goes through
        konlpy.

        Raises Exception("Bad tagger name ...") for an unknown name, and
        lets any import or constructor error from the backend propagate.
        """
        global usingkonlpy
        # Validate first so that a bad name is reported as such, without
        # attempting any (possibly failing) import.
        if taggername not in self._TAGGER_NAMES:
            raise Exception("Bad tagger name " + str(taggername))

        if taggername == "Mecab" and not usingkonlpy:
            self.tagger = mecab.MeCab()
        else:
            # Import here: when python-mecab-ko was found at startup the
            # module-level name 'konlpy' was never bound. The import is
            # a cheap cache lookup when konlpy is already loaded.
            import konlpy.tag
            usingkonlpy = True
            if taggername == "Okt":
                self.tagger = konlpy.tag.Okt()
            elif taggername == "Mecab":
                # Use Mecab(dicpath="c:/some/path/mecab-ko-dic") for a
                # non-default dictionary location. (Note: mecab itself
                # uses rcfile and dicdir rather than dicpath.)
                self.tagger = konlpy.tag.Mecab()
            else:
                self.tagger = konlpy.tag.Komoran()

        # Only flag the tagger as ready once its constructor succeeded.
        self.tagsOkt = taggername == "Okt"
        self.tagsMecab = taggername == "Mecab"
        self.tagsKomoran = taggername == "Komoran"

    @staticmethod
    def _simplify_tag(tag):
        """Map a Mecab/Komoran tag to Noun/Verb/Adjective/Adverb.

        Tags which match none of these classes are returned unchanged.
        """
        if tag.startswith("N"):
            return "Noun"
        if tag.startswith("VV"):
            return "Verb"
        if tag.startswith("VA"):
            return "Adjective"
        if tag == "MAG":
            return "Adverb"
        return tag

    def process(self, params):
        """Tag params['data'] and return the words and their tags.

        params must hold a 'data' field, and also a 'tagger' field as
        long as no tagger has been created yet.

        Returns {'text': ..., 'tags': ...} where both values are the
        concatenation of the entries, each one followed by a tab, or
        {'error': message} if a required field is missing. Tabs inside
        a word are replaced with spaces so that they can't be mistaken
        for separators. Okt tags are passed through as is; Mecab and
        Komoran tags are simplified by _simplify_tag().
        """
        if 'data' not in params:
            return {'error':'No data field in parameters'}
        if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran):
            if 'tagger' not in params:
                return {'error':'No "tagger" field in parameters'}
            self._init_tagger(params['tagger'])

        if self.spliteojeol:
            pos = []
            for piece in params['data'].split():
                pos += self.tagger.pos(piece)
        else:
            pos = self.tagger.pos(params['data'])

        simplify = self.tagsMecab or self.tagsKomoran
        words = []
        tags = []
        for entry in pos:
            words.append(entry[0].replace('\t', ' '))
            tag = entry[1]
            tags.append(self._simplify_tag(tag) if simplify else tag)

        # Every entry is tab-terminated, including the last one.
        return {'text': "".join(w + "\t" for w in words),
                'tags': "".join(t + "\t" for t in tags)}


# Script entry: create the cmdtalk protocol object and the processor
# which uses it, then hand both to cmdtalk.main().
# NOTE(review): cmdtalk.main() presumably runs the request loop with the
# parent process and calls processor.process() for each request --
# confirm against cmdtalk.py.
proto = cmdtalk.CmdTalk()
processor = Processor(proto)
cmdtalk.main(proto, processor)