#!/usr/bin/python3
# Source path: recoll/src/filters/kosplitter.py (executable Python script)
#################################
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################
#
# Interface to the konlpy Korean text analyser: we receive text from
# our parent process and have it segmented by the analyser, then
# return the results. The analyser startup is very expensive (several
# seconds), which is why we can't just execute it from the main
# process.
#
import sys
import cmdtalk
# We can either use konlpy, which supports several analysers, or
# python-mecab-ko, a direct interface to mecab which exposes the same
# API as konlpy: https://pypi.org/project/python-mecab-ko/
try:
import mecab
usingkonlpy = False
except:
import konlpy.tag
usingkonlpy = True
class Processor(object):
    """Segment Korean text and tag each segment for the parent process.

    The analyser is created lazily, on the first ``process()`` call, from
    the name found in the request's ``tagger`` field. It is then reused
    for every later request handled by this instance.
    """

    # Names accepted in the "tagger" request field.
    _TAGGER_NAMES = ("Okt", "Mecab", "Komoran")

    # Development switch: when true, the input is split on whitespace and
    # each piece is analysed separately instead of analysing the whole
    # text in one call.
    _split_eojeol = False

    def __init__(self, proto):
        """Remember the protocol object; no analyser is loaded yet."""
        self.proto = proto
        self.tagger = None
        # Exactly one of these becomes True once a tagger is initialised.
        self.tagsOkt = False
        self.tagsMecab = False
        self.tagsKomoran = False

    def _init_tagger(self, taggername):
        """Create the analyser called *taggername* and record its kind.

        Raises:
            ValueError: if *taggername* is not "Okt", "Mecab" or "Komoran".
        """
        # 'konlpy' must be declared global: when the module-level import
        # picked mecab, the name is not bound yet and the import below has
        # to bind it at module level (a 'from konlpy.tag import ...' would
        # not bind 'konlpy' at all, and a plain local import would shadow
        # the module-level name in the case where it already exists).
        global usingkonlpy, konlpy

        # Validate first so that a bad name does not trigger the
        # konlpy import.
        if taggername not in self._TAGGER_NAMES:
            raise ValueError("Bad tagger name %s" % (taggername,))

        if not usingkonlpy and taggername != "Mecab":
            # Only mecab was imported at startup, but another analyser is
            # requested: switch to konlpy.
            import konlpy.tag
            usingkonlpy = True

        if taggername == "Okt":
            self.tagger = konlpy.tag.Okt()
            self.tagsOkt = True
        elif taggername == "Mecab":
            if usingkonlpy:
                # Use Mecab(dicpath="c:/some/path/mecab-ko-dic") for a
                # non-default location. (?? mecab uses rcfile and dicdir
                # not dicpath)
                self.tagger = konlpy.tag.Mecab()
            else:
                self.tagger = mecab.MeCab()
            self.tagsMecab = True
        else:
            self.tagger = konlpy.tag.Komoran()
            self.tagsKomoran = True

    @staticmethod
    def _simplify_tag(tag):
        """Map a Mecab/Komoran tag to the coarse name used in the output.

        Tags starting with "N" become "Noun", "VV..." becomes "Verb",
        "VA..." becomes "Adjective" and exactly "MAG" becomes "Adverb".
        Any other tag (including an empty one) is returned unchanged.
        """
        if tag.startswith("N"):
            return "Noun"
        if tag.startswith("VV"):
            return "Verb"
        if tag.startswith("VA"):
            return "Adjective"
        if tag == "MAG":
            return "Adverb"
        return tag

    def process(self, params):
        """Analyse ``params['data']`` and return the words and their tags.

        Args:
            params: request dictionary. 'data' holds the text to analyse.
                'tagger' names the analyser and is only needed until a
                tagger has been initialised.

        Returns:
            ``{'text': ..., 'tags': ...}`` where both values are strings in
            which every item is followed by a tab, in matching order, or
            ``{'error': message}`` when a required field is missing.

        Raises:
            ValueError: if the requested tagger name is unknown.
        """
        if 'data' not in params:
            return {'error': 'No data field in parameters'}

        if not (self.tagsOkt or self.tagsMecab or self.tagsKomoran):
            if 'tagger' not in params:
                return {'error': 'No "tagger" field in parameters'}
            self._init_tagger(params['tagger'])

        data = params['data']
        if self._split_eojeol:
            pos = []
            for piece in data.split():
                pos.extend(self.tagger.pos(piece))
        else:
            pos = self.tagger.pos(data)
        #proto.log("POS: %s" % pos)

        # Okt tags are passed through as they are; Mecab and Komoran tags
        # are reduced to the coarse names.
        simplify = (not self.tagsOkt) and (self.tagsMecab or self.tagsKomoran)

        words = []
        tags = []
        for e in pos:
            # Tab is the item separator in the output, so it must not
            # appear inside a word.
            words.append(e[0].replace('\t', ' '))
            tags.append(self._simplify_tag(e[1]) if simplify else e[1])

        # Every item is followed by a tab (including the last one), and an
        # empty analysis yields empty strings.
        return {
            'text': "".join(w + "\t" for w in words),
            'tags': "".join(t + "\t" for t in tags),
        }
# Script start-up: create the cmdtalk protocol object, build the Processor
# around it, and pass both to cmdtalk.main(). These statements run as soon
# as the module is loaded; there is no __main__ guard.
proto = cmdtalk.CmdTalk()
processor = Processor(proto)
# NOTE(review): presumably cmdtalk.main() reads requests from the parent
# process and calls processor.process() for each one -- confirm in cmdtalk.
cmdtalk.main(proto, processor)