experiment: Korean: when a Noun is followed by a JX, emit both Noun and Noun+JX
This commit is contained in:
parent
2f794be314
commit
48d4678770
@ -184,6 +184,9 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
// not in the whole text which is orgbytepos + bytepos
|
// not in the whole text which is orgbytepos + bytepos
|
||||||
string::size_type bytepos{0};
|
string::size_type bytepos{0};
|
||||||
string::size_type pagefix{0};
|
string::size_type pagefix{0};
|
||||||
|
string lastNoun;
|
||||||
|
string::size_type lastNounBytePos{0};
|
||||||
|
int lastNounWordPos{0};
|
||||||
for (unsigned int i = 0; i < words.size(); i++) {
|
for (unsigned int i = 0; i < words.size(); i++) {
|
||||||
// The POS tagger strips characters from the input (e.g. multiple
|
// The POS tagger strips characters from the input (e.g. multiple
|
||||||
// spaces, sometimes new lines, possibly other stuff). This
|
// spaces, sometimes new lines, possibly other stuff). This
|
||||||
@ -214,7 +217,21 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
" absbytepos " << orgbytepos + bytepos <<
|
" absbytepos " << orgbytepos + bytepos <<
|
||||||
" bytepos " << bytepos << " word from text: " <<
|
" bytepos " << bytepos << " word from text: " <<
|
||||||
inputdata.substr(bytepos, word.size()) << endl);
|
inputdata.substr(bytepos, word.size()) << endl);
|
||||||
if (tags[i] == "Noun" || tags[i] == "Verb" ||
|
bool isNoun = (tags[i] == "Noun");
|
||||||
|
if (isNoun) {
|
||||||
|
lastNoun = word;
|
||||||
|
lastNounWordPos = m_wordpos;
|
||||||
|
lastNounBytePos = orgbytepos + bytepos - pagefix;
|
||||||
|
} else {
|
||||||
|
if (tags[i] == "JX" && !lastNoun.empty()) {
|
||||||
|
if (!takeword(lastNoun+word, lastNounWordPos, lastNounBytePos,
|
||||||
|
lastNounBytePos + word.size())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lastNoun.clear();
|
||||||
|
}
|
||||||
|
if (isNoun || tags[i] == "Verb" ||
|
||||||
tags[i] == "Adjective" || tags[i] == "Adverb") {
|
tags[i] == "Adjective" || tags[i] == "Adverb") {
|
||||||
string::size_type abspos = orgbytepos + bytepos - pagefix;
|
string::size_type abspos = orgbytepos + bytepos - pagefix;
|
||||||
if (!takeword(word, m_wordpos++, abspos, abspos + word.size())) {
|
if (!takeword(word, m_wordpos++, abspos, abspos + word.size())) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user