Korean text: sort-of-working version, in need of validation
This commit is contained in:
parent
384e3a1087
commit
c9667b5ba7
@ -68,6 +68,8 @@ static bool initCmd()
|
||||
return true;
|
||||
}
|
||||
|
||||
static const string sepchars("\t");
|
||||
|
||||
bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
{
|
||||
std::unique_lock<std::mutex> mylock(o_mutex);
|
||||
@ -82,6 +84,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
unordered_map<string, string> args;
|
||||
args.insert(pair<string,string>{"data", string()});
|
||||
string& inputdata{args.begin()->second};
|
||||
string::size_type orgbytepos = it.getBpos();
|
||||
|
||||
// Gather all Korean characters and send the text to the analyser
|
||||
for (; !it.eof(); it++) {
|
||||
@ -94,6 +97,10 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
it.appendchartostring(inputdata);
|
||||
}
|
||||
}
|
||||
// Need to convert whitespace spans to a single space, otherwise the
|
||||
// byte offsets will be wrong
|
||||
|
||||
string::size_type textsize = inputdata.size();
|
||||
LOGDEB1("TextSplit::k_to_words: sending out " << inputdata.size() <<
|
||||
" bytes " << inputdata << endl);
|
||||
unordered_map<string,string> result;
|
||||
@ -101,34 +108,69 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
LOGERR("Python splitter for Korean failed\n");
|
||||
return false;
|
||||
}
|
||||
auto resit = result.find("data");
|
||||
|
||||
auto resit = result.find("text");
|
||||
if (resit == result.end()) {
|
||||
LOGERR("No data in Python splitter for Korean\n");
|
||||
LOGERR("No text in Python splitter for Korean\n");
|
||||
return false;
|
||||
}
|
||||
string& outdata = resit->second;
|
||||
char sepchar = '^';
|
||||
//std::cerr << "GOT FROM SPLITTER: " << outdata << endl;
|
||||
string::size_type wordstart = 0;
|
||||
string::size_type wordend = outdata.find(sepchar);
|
||||
for (;;) {
|
||||
//cerr << "start " << wordstart << " end " << wordend << endl;
|
||||
if (wordend != wordstart) {
|
||||
string::size_type len = (wordend == string::npos) ?
|
||||
wordend : wordend-wordstart;
|
||||
string word = outdata.substr(wordstart, len);
|
||||
//cerr << " WORD[" << word << "]\n";
|
||||
if (!takeword(word, m_wordpos++, 0, 0)) {
|
||||
string& outtext = resit->second;
|
||||
vector<string> words;
|
||||
stringToTokens(outtext, words, sepchars);
|
||||
|
||||
resit = result.find("tags");
|
||||
if (resit == result.end()) {
|
||||
LOGERR("No tags in Python splitter for Korean\n");
|
||||
return false;
|
||||
}
|
||||
string& outtags = resit->second;
|
||||
vector<string> tags;
|
||||
stringToTokens(outtags, tags, sepchars);
|
||||
|
||||
// This is the position in the whole text, not the local fragment,
|
||||
// which is bytepos-orgbytepos
|
||||
string::size_type bytepos(orgbytepos);
|
||||
for (unsigned int i = 0; i < words.size(); i++) {
|
||||
// The POS tagger strips characters from the input (e.g. multiple
|
||||
// spaces, sometimes new lines, possibly other stuff). This
|
||||
// means that we can't easily reconstruct the byte position
|
||||
// from the concatenated terms. The output seems to be always
|
||||
// shorter than the input, so we try to look ahead for the
|
||||
// term. Can't be too sure that this works though, depending
|
||||
// on exactly what transformation may have been applied from
|
||||
// the original input to the term.
|
||||
string word = words[i];
|
||||
trimstring(word);
|
||||
string::size_type newpos = bytepos - orgbytepos;
|
||||
newpos = inputdata.find(word, newpos);
|
||||
if (newpos != string::npos) {
|
||||
bytepos = orgbytepos + newpos;
|
||||
}
|
||||
LOGDEB1("WORD OPOS " << bytepos-orgbytepos <<
|
||||
" FOUND POS " << newpos << endl);
|
||||
if (tags[i] == "Noun" || tags[i] == "Verb" ||
|
||||
tags[i] == "Adjective" || tags[i] == "Adverb") {
|
||||
if (!takeword(
|
||||
word, m_wordpos++, bytepos, bytepos + words[i].size())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (wordend == string::npos)
|
||||
break;
|
||||
wordstart = wordend + 1;
|
||||
wordend = outdata.find(sepchar, wordstart);
|
||||
LOGDEB1("WORD [" << words[i] << "] size " << words[i].size() <<
|
||||
" TAG " << tags[i] << endl);
|
||||
bytepos += words[i].size();
|
||||
}
|
||||
|
||||
|
||||
#if DO_CHECK_THINGS
|
||||
int sizediff = textsize - (bytepos - orgbytepos);
|
||||
if (sizediff < 0)
|
||||
sizediff = -sizediff;
|
||||
if (sizediff > 1) {
|
||||
LOGERR("ORIGINAL TEXT SIZE: " << textsize <<
|
||||
" FINAL BYTE POS " << bytepos - orgbytepos <<
|
||||
" TEXT [" << inputdata << "]\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
// Reset state, saving term position, and return the found non-cjk
|
||||
// Unicode character value. The current input byte offset is kept
|
||||
// in the utf8Iter
|
||||
|
||||
@ -201,7 +201,9 @@ class CmdTalk:
|
||||
|
||||
|
||||
# Common main routine for testing: either run the normal protocol
|
||||
# engine or a local loop.
|
||||
# engine or a local loop. This means that you can call
|
||||
# cmdtalk.main(proto,processor) instead of proto.mainloop(processor)
|
||||
# from your module, and get the benefits of command line testing
|
||||
def main(proto, processor):
|
||||
if len(sys.argv) == 1:
|
||||
proto.mainloop(processor)
|
||||
@ -220,7 +222,7 @@ def main(proto, processor):
|
||||
if len(args) == 0 or len(args) % 2 != 0:
|
||||
usage()
|
||||
params = dict()
|
||||
for i in range(len(args)/2):
|
||||
for i in range(int(len(args)/2)):
|
||||
params[args[2*i]] = args[2*i+1]
|
||||
res = processor.process(params)
|
||||
|
||||
|
||||
@ -28,25 +28,29 @@
|
||||
import sys
|
||||
import cmdtalk
|
||||
|
||||
from konlpy.tag import Okt
|
||||
from konlpy.tag import Okt,Kkma
|
||||
|
||||
class Processor(object):
|
||||
def __init__(self, proto):
|
||||
self.proto = proto
|
||||
self.okt = Okt()
|
||||
self.tagger = Okt()
|
||||
#self.tagger = Kkma()
|
||||
|
||||
def process(self, params):
|
||||
if 'data' not in params:
|
||||
return {'error':'No data field in parameters'}
|
||||
pos = self.okt.pos(params['data'])
|
||||
pos = self.tagger.pos(params['data'])
|
||||
#proto.log("%s" % pos)
|
||||
output = ""
|
||||
text = ""
|
||||
tags = ""
|
||||
for e in pos:
|
||||
if e[1] == 'Noun' or e[1] == 'Verb' or e[1] == 'Adjective' or \
|
||||
e[1] == 'Adverb':
|
||||
output += e[0] + '^'
|
||||
return {'data': output}
|
||||
word = e[0]
|
||||
word = word.replace('\t', ' ')
|
||||
text += word + "\t"
|
||||
tags += e[1] + "\t"
|
||||
return {'text': text, 'tags': tags}
|
||||
|
||||
proto = cmdtalk.CmdTalk()
|
||||
processor = Processor(proto)
|
||||
proto.mainloop(processor)
|
||||
cmdtalk.main(proto, processor)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user