Korean external splitter: add some support for Mecab
This commit is contained in:
parent
c9667b5ba7
commit
9719177c82
@ -44,8 +44,10 @@
|
|||||||
// ngrams
|
// ngrams
|
||||||
#undef KATAKANA_AS_WORDS
|
#undef KATAKANA_AS_WORDS
|
||||||
|
|
||||||
// Same for Korean syllabic, and same problem, not used.
|
// Same for Korean syllabic, and same problem. However we have a
|
||||||
#undef HANGUL_AS_WORDS
|
// runtime option to use an external text analyser for hangul, so this
|
||||||
|
// is defined at compile time.
|
||||||
|
#define HANGUL_AS_WORDS
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -246,7 +248,6 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
|
|||||||
#define UNICODE_IS_KATAKANA(p) false
|
#define UNICODE_IS_KATAKANA(p) false
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define HANGUL_AS_WORDS
|
|
||||||
#ifdef HANGUL_AS_WORDS
|
#ifdef HANGUL_AS_WORDS
|
||||||
#define UNICODE_IS_HANGUL(p) ( \
|
#define UNICODE_IS_HANGUL(p) ( \
|
||||||
((p) >= 0x1100 && (p) <= 0x11FF) || \
|
((p) >= 0x1100 && (p) <= 0x11FF) || \
|
||||||
@ -290,6 +291,7 @@ bool TextSplit::o_noNumbers{false};
|
|||||||
bool TextSplit::o_deHyphenate{false};
|
bool TextSplit::o_deHyphenate{false};
|
||||||
int TextSplit::o_maxWordLength{40};
|
int TextSplit::o_maxWordLength{40};
|
||||||
static const int o_CJKMaxNgramLen{5};
|
static const int o_CJKMaxNgramLen{5};
|
||||||
|
bool o_exthangultagger{false};
|
||||||
|
|
||||||
void TextSplit::staticConfInit(RclConfig *config)
|
void TextSplit::staticConfInit(RclConfig *config)
|
||||||
{
|
{
|
||||||
@ -324,7 +326,13 @@ void TextSplit::staticConfInit(RclConfig *config)
|
|||||||
charclasses[int('\\')] = SPACE;
|
charclasses[int('\\')] = SPACE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
koStaticConfInit(config);
|
|
||||||
|
string kotagger;
|
||||||
|
config->getConfParam("hangultagger", kotagger);
|
||||||
|
if (!kotagger.empty()) {
|
||||||
|
o_exthangultagger = true;
|
||||||
|
koStaticConfInit(config, kotagger);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Final term checkpoint: do some checking (the kind which is simpler
|
// Final term checkpoint: do some checking (the kind which is simpler
|
||||||
@ -627,7 +635,11 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (UNICODE_IS_KATAKANA(c)) {
|
if (UNICODE_IS_KATAKANA(c)) {
|
||||||
csc = CSC_KATAKANA;
|
csc = CSC_KATAKANA;
|
||||||
} else if (UNICODE_IS_HANGUL(c)) {
|
} else if (UNICODE_IS_HANGUL(c)) {
|
||||||
csc = CSC_HANGUL;
|
if (o_exthangultagger) {
|
||||||
|
csc = CSC_HANGUL;
|
||||||
|
} else {
|
||||||
|
csc = CSC_CJK;
|
||||||
|
}
|
||||||
} else if (UNICODE_IS_CJK(c)) {
|
} else if (UNICODE_IS_CJK(c)) {
|
||||||
csc = CSC_CJK;
|
csc = CSC_CJK;
|
||||||
} else {
|
} else {
|
||||||
@ -635,15 +647,13 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (o_processCJK && (csc == CSC_CJK || csc == CSC_HANGUL)) {
|
if (o_processCJK && (csc == CSC_CJK || csc == CSC_HANGUL)) {
|
||||||
// CJK character hit. Hangul processing may be special or
|
// CJK character hit. Hangul processing may be special.
|
||||||
// not depending on how we were built.
|
|
||||||
|
|
||||||
// Do like at EOF with the current non-cjk data.
|
// Do like at EOF with the current non-cjk data.
|
||||||
if (m_wordLen || m_span.length()) {
|
if (m_wordLen || m_span.length()) {
|
||||||
if (!doemit(true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Hand off situation to the appropriate routine.
|
// Hand off situation to the appropriate routine.
|
||||||
if (csc == CSC_HANGUL) {
|
if (csc == CSC_HANGUL) {
|
||||||
if (!ko_to_words(&it, &c)) {
|
if (!ko_to_words(&it, &c)) {
|
||||||
|
|||||||
@ -54,7 +54,7 @@ public:
|
|||||||
/** Call at program initialization to read non default values from the
|
/** Call at program initialization to read non default values from the
|
||||||
configuration */
|
configuration */
|
||||||
static void staticConfInit(RclConfig *config);
|
static void staticConfInit(RclConfig *config);
|
||||||
static void koStaticConfInit(RclConfig *config);
|
static void koStaticConfInit(RclConfig *config, const std::string& tagger);
|
||||||
|
|
||||||
/** Split text, emit words and positions. */
|
/** Split text, emit words and positions. */
|
||||||
virtual bool text_to_words(const std::string &in);
|
virtual bool text_to_words(const std::string &in);
|
||||||
|
|||||||
@ -15,6 +15,13 @@
|
|||||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// Specialized Korean text splitter using konlpy running in a Python
|
||||||
|
// subprocess. konlpy can use several different backends. We support
|
||||||
|
// Okt (Twitter) and Mecab at this point. Unfortunately the different
|
||||||
|
// backends have different POS TAG names, so that things are not
|
||||||
|
// completely transparent when using another (need to translate the tag
|
||||||
|
// names in the Python program).
|
||||||
|
|
||||||
#include "autoconfig.h"
|
#include "autoconfig.h"
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
@ -33,16 +40,27 @@
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
// Separator char used in words and tags lists.
|
||||||
|
static const string sepchars("\t");
|
||||||
|
|
||||||
static CmdTalk *o_talker;
|
static CmdTalk *o_talker;
|
||||||
static bool o_starterror{false};
|
static bool o_starterror{false};
|
||||||
static string o_cmdpath;
|
static string o_cmdpath;
|
||||||
std::mutex o_mutex;
|
std::mutex o_mutex;
|
||||||
|
static string o_taggername{"Okt"};
|
||||||
|
|
||||||
void TextSplit::koStaticConfInit(RclConfig *config)
|
void TextSplit::koStaticConfInit(RclConfig *config, const string& tagger)
|
||||||
{
|
{
|
||||||
o_cmdpath = config->findFilter("kosplitter.py");
|
o_cmdpath = config->findFilter("kosplitter.py");
|
||||||
|
if (tagger == "Okt" && tagger == "Mecab") {
|
||||||
|
o_taggername = tagger;
|
||||||
|
} else {
|
||||||
|
LOGERR("TextSplit::koStaticConfInit: unknown tagger [" << tagger <<
|
||||||
|
"], using Okt\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Start the Python subprocess
|
||||||
static bool initCmd()
|
static bool initCmd()
|
||||||
{
|
{
|
||||||
if (o_starterror) {
|
if (o_starterror) {
|
||||||
@ -68,8 +86,6 @@ static bool initCmd()
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const string sepchars("\t");
|
|
||||||
|
|
||||||
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||||
{
|
{
|
||||||
std::unique_lock<std::mutex> mylock(o_mutex);
|
std::unique_lock<std::mutex> mylock(o_mutex);
|
||||||
@ -78,18 +94,28 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGDEB1("k_to_words: m_wordpos " << m_wordpos << "\n");
|
LOGDEB1("k_to_words: m_wordpos " << m_wordpos << "\n");
|
||||||
Utf8Iter &it = *itp;
|
Utf8Iter &it = *itp;
|
||||||
unsigned int c = 0;
|
unsigned int c = 0;
|
||||||
|
|
||||||
unordered_map<string, string> args;
|
unordered_map<string, string> args;
|
||||||
|
|
||||||
args.insert(pair<string,string>{"data", string()});
|
args.insert(pair<string,string>{"data", string()});
|
||||||
string& inputdata{args.begin()->second};
|
string& inputdata{args.begin()->second};
|
||||||
string::size_type orgbytepos = it.getBpos();
|
|
||||||
|
|
||||||
// Gather all Korean characters and send the text to the analyser
|
// We send the tagger name every time but it's only used the first
|
||||||
|
// time: can't change it after init. We could avoid sending it
|
||||||
|
// every time, but I don't think that the performance hit is
|
||||||
|
// significant
|
||||||
|
args.insert(pair<string,string>{"tagger", o_taggername});
|
||||||
|
|
||||||
|
// Walk the Korean characters section and send the text to the
|
||||||
|
// analyser
|
||||||
|
string::size_type orgbytepos = it.getBpos();
|
||||||
for (; !it.eof(); it++) {
|
for (; !it.eof(); it++) {
|
||||||
c = *it;
|
c = *it;
|
||||||
if (!isHANGUL(c) && !(isascii(c) && (isspace(c) || ispunct(c)))) {
|
if (!isHANGUL(c) && !(isspace(c) || ispunct(c))) {
|
||||||
// Done with Korean stretch, process and go back to main routine
|
// Done with Korean stretch, process and go back to main routine
|
||||||
//std::cerr << "Broke on char " << int(c) << endl;
|
//std::cerr << "Broke on char " << int(c) << endl;
|
||||||
break;
|
break;
|
||||||
@ -97,10 +123,6 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
it.appendchartostring(inputdata);
|
it.appendchartostring(inputdata);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Need to convert white text spans to single space otherwise the
|
|
||||||
// byte offsets will be wrong
|
|
||||||
|
|
||||||
string::size_type textsize = inputdata.size();
|
|
||||||
LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
|
LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
|
||||||
" bytes " << inputdata << endl);
|
" bytes " << inputdata << endl);
|
||||||
unordered_map<string,string> result;
|
unordered_map<string,string> result;
|
||||||
@ -161,11 +183,11 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if DO_CHECK_THINGS
|
#if DO_CHECK_THINGS
|
||||||
int sizediff = textsize - (bytepos - orgbytepos);
|
int sizediff = inputdata.size() - (bytepos - orgbytepos);
|
||||||
if (sizediff < 0)
|
if (sizediff < 0)
|
||||||
sizediff = -sizediff;
|
sizediff = -sizediff;
|
||||||
if (sizediff > 1) {
|
if (sizediff > 1) {
|
||||||
LOGERR("ORIGINAL TEXT SIZE: " << textsize <<
|
LOGERR("ORIGINAL TEXT SIZE: " << inputdata.size() <<
|
||||||
" FINAL BYTE POS " << bytepos - orgbytepos <<
|
" FINAL BYTE POS " << bytepos - orgbytepos <<
|
||||||
" TEXT [" << inputdata << "]\n");
|
" TEXT [" << inputdata << "]\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -28,17 +28,32 @@
|
|||||||
import sys
|
import sys
|
||||||
import cmdtalk
|
import cmdtalk
|
||||||
|
|
||||||
from konlpy.tag import Okt,Kkma
|
from konlpy.tag import Okt,Mecab
|
||||||
|
|
||||||
class Processor(object):
|
class Processor(object):
|
||||||
def __init__(self, proto):
|
def __init__(self, proto):
|
||||||
self.proto = proto
|
self.proto = proto
|
||||||
self.tagger = Okt()
|
self.tagsOkt = False
|
||||||
#self.tagger = Kkma()
|
self.tagsMecab = False
|
||||||
|
|
||||||
|
def _init_tagger(self, taggername):
|
||||||
|
if taggername == "Okt":
|
||||||
|
self.tagger = Okt()
|
||||||
|
self.tagsOkt = True
|
||||||
|
elif taggername == "Mecab":
|
||||||
|
self.tagger = Mecab()
|
||||||
|
self.tagsMecab = True
|
||||||
|
else:
|
||||||
|
raise Exception("Bad tagger name " + taggername)
|
||||||
|
|
||||||
def process(self, params):
|
def process(self, params):
|
||||||
if 'data' not in params:
|
if 'data' not in params:
|
||||||
return {'error':'No data field in parameters'}
|
return {'error':'No data field in parameters'}
|
||||||
|
if not (self.tagsOkt or self.tagsMecab):
|
||||||
|
if 'tagger' not in params:
|
||||||
|
return {'error':'No "tagger" field in parameters'}
|
||||||
|
self._init_tagger(params['tagger']);
|
||||||
|
|
||||||
pos = self.tagger.pos(params['data'])
|
pos = self.tagger.pos(params['data'])
|
||||||
#proto.log("%s" % pos)
|
#proto.log("%s" % pos)
|
||||||
text = ""
|
text = ""
|
||||||
@ -47,10 +62,25 @@ class Processor(object):
|
|||||||
word = e[0]
|
word = e[0]
|
||||||
word = word.replace('\t', ' ')
|
word = word.replace('\t', ' ')
|
||||||
text += word + "\t"
|
text += word + "\t"
|
||||||
tags += e[1] + "\t"
|
tag = e[1]
|
||||||
|
if self.tagsOkt:
|
||||||
|
pass
|
||||||
|
elif self.tagsMecab:
|
||||||
|
tb = tag[0:2]
|
||||||
|
if tb[0] == "N":
|
||||||
|
tag = "Noun"
|
||||||
|
elif tb == "VV":
|
||||||
|
tag = "Verb"
|
||||||
|
elif tb == "VA":
|
||||||
|
tag = "Adjective"
|
||||||
|
elif tag == "MAG":
|
||||||
|
tag = "Adverb"
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
tags += tag + "\t"
|
||||||
return {'text': text, 'tags': tags}
|
return {'text': text, 'tags': tags}
|
||||||
|
|
||||||
|
|
||||||
proto = cmdtalk.CmdTalk()
|
proto = cmdtalk.CmdTalk()
|
||||||
processor = Processor(proto)
|
processor = Processor(proto)
|
||||||
cmdtalk.main(proto, processor)
|
cmdtalk.main(proto, processor)
|
||||||
|
|
||||||
|
|||||||
@ -74,6 +74,10 @@ class CmdTalk {
|
|||||||
// @param env each entry should be of the form name=value. They
|
// @param env each entry should be of the form name=value. They
|
||||||
// augment the subprocess environnement.
|
// augment the subprocess environnement.
|
||||||
// @param path replaces the PATH variable when looking for the command.
|
// @param path replaces the PATH variable when looking for the command.
|
||||||
|
//
|
||||||
|
// Note that cmdtalk.py:main() method is a test routine which
|
||||||
|
// expects data pairs on the command line. If actual parameters
|
||||||
|
// need to be passed, it can't be used by the processor.
|
||||||
virtual bool startCmd(const std::string& cmdname,
|
virtual bool startCmd(const std::string& cmdname,
|
||||||
const std::vector<std::string>& args =
|
const std::vector<std::string>& args =
|
||||||
std::vector<std::string>(),
|
std::vector<std::string>(),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user