Korean text: sort-of-working version, in need of validation
This commit is contained in:
parent
384e3a1087
commit
c9667b5ba7
@ -68,6 +68,8 @@ static bool initCmd()
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const string sepchars("\t");
|
||||||
|
|
||||||
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||||
{
|
{
|
||||||
std::unique_lock<std::mutex> mylock(o_mutex);
|
std::unique_lock<std::mutex> mylock(o_mutex);
|
||||||
@ -82,6 +84,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
unordered_map<string, string> args;
|
unordered_map<string, string> args;
|
||||||
args.insert(pair<string,string>{"data", string()});
|
args.insert(pair<string,string>{"data", string()});
|
||||||
string& inputdata{args.begin()->second};
|
string& inputdata{args.begin()->second};
|
||||||
|
string::size_type orgbytepos = it.getBpos();
|
||||||
|
|
||||||
// Gather all Korean characters and send the text to the analyser
|
// Gather all Korean characters and send the text to the analyser
|
||||||
for (; !it.eof(); it++) {
|
for (; !it.eof(); it++) {
|
||||||
@ -94,6 +97,10 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
it.appendchartostring(inputdata);
|
it.appendchartostring(inputdata);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Need to convert whitespace spans to a single space, otherwise the
|
||||||
|
// byte offsets will be wrong
|
||||||
|
|
||||||
|
string::size_type textsize = inputdata.size();
|
||||||
LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
|
LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
|
||||||
" bytes " << inputdata << endl);
|
" bytes " << inputdata << endl);
|
||||||
unordered_map<string,string> result;
|
unordered_map<string,string> result;
|
||||||
@ -101,33 +108,68 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
LOGERR("Python splitter for Korean failed\n");
|
LOGERR("Python splitter for Korean failed\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
auto resit = result.find("data");
|
|
||||||
|
auto resit = result.find("text");
|
||||||
if (resit == result.end()) {
|
if (resit == result.end()) {
|
||||||
LOGERR("No data in Python splitter for Korean\n");
|
LOGERR("No text in Python splitter for Korean\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
string& outdata = resit->second;
|
string& outtext = resit->second;
|
||||||
char sepchar = '^';
|
vector<string> words;
|
||||||
//std::cerr << "GOT FROM SPLITTER: " << outdata << endl;
|
stringToTokens(outtext, words, sepchars);
|
||||||
string::size_type wordstart = 0;
|
|
||||||
string::size_type wordend = outdata.find(sepchar);
|
resit = result.find("tags");
|
||||||
for (;;) {
|
if (resit == result.end()) {
|
||||||
//cerr << "start " << wordstart << " end " << wordend << endl;
|
LOGERR("No tags in Python splitter for Korean\n");
|
||||||
if (wordend != wordstart) {
|
return false;
|
||||||
string::size_type len = (wordend == string::npos) ?
|
}
|
||||||
wordend : wordend-wordstart;
|
string& outtags = resit->second;
|
||||||
string word = outdata.substr(wordstart, len);
|
vector<string> tags;
|
||||||
//cerr << " WORD[" << word << "]\n";
|
stringToTokens(outtags, tags, sepchars);
|
||||||
if (!takeword(word, m_wordpos++, 0, 0)) {
|
|
||||||
|
// This is the position in the whole text, not the local fragment,
|
||||||
|
// which is bytepos-orgbytepos
|
||||||
|
string::size_type bytepos(orgbytepos);
|
||||||
|
for (unsigned int i = 0; i < words.size(); i++) {
|
||||||
|
// The POS tagger strips characters from the input (e.g. multiple
|
||||||
|
// spaces, sometimes new lines, possibly other stuff). This
|
||||||
|
// means that we can't easily reconstruct the byte position
|
||||||
|
// from the concatenated terms. The output seems to be always
|
||||||
|
// shorter than the input, so we try to look ahead for the
|
||||||
|
// term. Can't be too sure that this works though, depending
|
||||||
|
// on exactly what transformation may have been applied from
|
||||||
|
// the original input to the term.
|
||||||
|
string word = words[i];
|
||||||
|
trimstring(word);
|
||||||
|
string::size_type newpos = bytepos - orgbytepos;
|
||||||
|
newpos = inputdata.find(word, newpos);
|
||||||
|
if (newpos != string::npos) {
|
||||||
|
bytepos = orgbytepos + newpos;
|
||||||
|
}
|
||||||
|
LOGDEB1("WORD OPOS " << bytepos-orgbytepos <<
|
||||||
|
" FOUND POS " << newpos << endl);
|
||||||
|
if (tags[i] == "Noun" || tags[i] == "Verb" ||
|
||||||
|
tags[i] == "Adjective" || tags[i] == "Adverb") {
|
||||||
|
if (!takeword(
|
||||||
|
word, m_wordpos++, bytepos, bytepos + words[i].size())) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (wordend == string::npos)
|
LOGDEB1("WORD [" << words[i] << "] size " << words[i].size() <<
|
||||||
break;
|
" TAG " << tags[i] << endl);
|
||||||
wordstart = wordend + 1;
|
bytepos += words[i].size();
|
||||||
wordend = outdata.find(sepchar, wordstart);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if DO_CHECK_THINGS
|
||||||
|
int sizediff = textsize - (bytepos - orgbytepos);
|
||||||
|
if (sizediff < 0)
|
||||||
|
sizediff = -sizediff;
|
||||||
|
if (sizediff > 1) {
|
||||||
|
LOGERR("ORIGINAL TEXT SIZE: " << textsize <<
|
||||||
|
" FINAL BYTE POS " << bytepos - orgbytepos <<
|
||||||
|
" TEXT [" << inputdata << "]\n");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Reset state, saving term position, and return the found non-cjk
|
// Reset state, saving term position, and return the found non-cjk
|
||||||
// Unicode character value. The current input byte offset is kept
|
// Unicode character value. The current input byte offset is kept
|
||||||
|
|||||||
@ -201,7 +201,9 @@ class CmdTalk:
|
|||||||
|
|
||||||
|
|
||||||
# Common main routine for testing: either run the normal protocol
|
# Common main routine for testing: either run the normal protocol
|
||||||
# engine or a local loop.
|
# engine or a local loop. This means that you can call
|
||||||
|
# cmdtalk.main(proto,processor) instead of proto.mainloop(processor)
|
||||||
|
# from your module, and get the benefits of command line testing
|
||||||
def main(proto, processor):
|
def main(proto, processor):
|
||||||
if len(sys.argv) == 1:
|
if len(sys.argv) == 1:
|
||||||
proto.mainloop(processor)
|
proto.mainloop(processor)
|
||||||
@ -220,7 +222,7 @@ def main(proto, processor):
|
|||||||
if len(args) == 0 or len(args) % 2 != 0:
|
if len(args) == 0 or len(args) % 2 != 0:
|
||||||
usage()
|
usage()
|
||||||
params = dict()
|
params = dict()
|
||||||
for i in range(len(args)/2):
|
for i in range(int(len(args)/2)):
|
||||||
params[args[2*i]] = args[2*i+1]
|
params[args[2*i]] = args[2*i+1]
|
||||||
res = processor.process(params)
|
res = processor.process(params)
|
||||||
|
|
||||||
|
|||||||
@ -28,25 +28,29 @@
|
|||||||
import sys
|
import sys
|
||||||
import cmdtalk
|
import cmdtalk
|
||||||
|
|
||||||
from konlpy.tag import Okt
|
from konlpy.tag import Okt,Kkma
|
||||||
|
|
||||||
class Processor(object):
|
class Processor(object):
|
||||||
def __init__(self, proto):
|
def __init__(self, proto):
|
||||||
self.proto = proto
|
self.proto = proto
|
||||||
self.okt = Okt()
|
self.tagger = Okt()
|
||||||
|
#self.tagger = Kkma()
|
||||||
|
|
||||||
def process(self, params):
|
def process(self, params):
|
||||||
if 'data' not in params:
|
if 'data' not in params:
|
||||||
return {'error':'No data field in parameters'}
|
return {'error':'No data field in parameters'}
|
||||||
pos = self.okt.pos(params['data'])
|
pos = self.tagger.pos(params['data'])
|
||||||
#proto.log("%s" % pos)
|
#proto.log("%s" % pos)
|
||||||
output = ""
|
text = ""
|
||||||
|
tags = ""
|
||||||
for e in pos:
|
for e in pos:
|
||||||
if e[1] == 'Noun' or e[1] == 'Verb' or e[1] == 'Adjective' or \
|
word = e[0]
|
||||||
e[1] == 'Adverb':
|
word = word.replace('\t', ' ')
|
||||||
output += e[0] + '^'
|
text += word + "\t"
|
||||||
return {'data': output}
|
tags += e[1] + "\t"
|
||||||
|
return {'text': text, 'tags': tags}
|
||||||
|
|
||||||
proto = cmdtalk.CmdTalk()
|
proto = cmdtalk.CmdTalk()
|
||||||
processor = Processor(proto)
|
processor = Processor(proto)
|
||||||
proto.mainloop(processor)
|
cmdtalk.main(proto, processor)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user