textsplit: use a more regular test for ISHANGUL. CJK: do not ignore whitespace; break on alphabetic non-CJK characters

This commit is contained in:
Jean-Francois Dockes 2020-04-10 14:28:14 +02:00
parent b63cc1b712
commit de246349da

View File

@ -34,6 +34,17 @@
#include "smallut.h" #include "smallut.h"
#include "rclconfig.h" #include "rclconfig.h"
using namespace std;
/**
* Splitting a text into words. The code in this file works with utf-8
* in a semi-clean way (see uproplist.h). Ascii still gets special
* treatment in the sense that many special characters can only be
* ascii (e.g. @, _,...). However, this compromise works quite well
* while being much more light-weight than a full-blown Unicode
* approach (ICU...)
*/
// Decide if we treat katakana as western scripts, splitting into // Decide if we treat katakana as western scripts, splitting into
// words instead of n-grams. This is not absurd (katakana is a kind of // words instead of n-grams. This is not absurd (katakana is a kind of
// alphabet, albeit phonetic and syllabic and is mostly used to // alphabet, albeit phonetic and syllabic and is mostly used to
@ -49,16 +60,6 @@
// is defined at compile time. // is defined at compile time.
#define HANGUL_AS_WORDS #define HANGUL_AS_WORDS
using namespace std;
/**
* Splitting a text into words. The code in this file works with utf-8
* in a semi-clean way (see uproplist.h). Ascii still gets special
* treatment in the sense that many special characters can only be
* ascii (e.g. @, _,...). However, this compromise works quite well
* while being much more light-weight than a full-blown Unicode
* approach (ICU...)
*/
// Ascii character classes: we have three main groups, and then some chars // Ascii character classes: we have three main groups, and then some chars
// are their own class because they want special handling. // are their own class because they want special handling.
@ -74,6 +75,58 @@ enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
A_ULETTER=260, A_LLETTER=261, SKIP=262}; A_ULETTER=260, A_LLETTER=261, SKIP=262};
static int charclasses[charclasses_size]; static int charclasses[charclasses_size];
// Static splitting parameters. The initializers are the built-in
// defaults; TextSplit::staticConfInit() may override them from the
// configuration.

// Set to false when the "nocjk" configuration parameter is true.
bool TextSplit::o_processCJK{true};
// Set from the "cjkngramlen" parameter, capped at o_CJKMaxNgramLen.
unsigned int TextSplit::o_CJKNgramLen{2};
// Set from the "nonumbers" parameter.
bool TextSplit::o_noNumbers{false};
// Set from the "dehyphenate" parameter.
bool TextSplit::o_deHyphenate{false};
// Set from the "maxtermlength" parameter.
int TextSplit::o_maxWordLength{40};
// Upper bound applied to the configured "cjkngramlen" value.
static const int o_CJKMaxNgramLen{5};
// Set to true when a non-empty "hangultagger" parameter is configured.
bool o_exthangultagger{false};
/**
 * Initialize the static text splitting parameters from the configuration.
 *
 * Reads the "maxtermlength", "nocjk", "cjkngramlen", "nonumbers",
 * "dehyphenate", "backslashasletter" and "hangultagger" parameters and
 * updates the corresponding static settings. Settings whose parameter
 * lookup does not succeed keep their current value (except o_processCJK,
 * which is always assigned).
 *
 * @param config configuration object, queried through getConfParam().
 */
void TextSplit::staticConfInit(RclConfig *config)
{
    config->getConfParam("maxtermlength", &o_maxWordLength);

    bool bvalue{false};
    if (config->getConfParam("nocjk", &bvalue) && bvalue) {
        o_processCJK = false;
    } else {
        o_processCJK = true;
        int ngramlen{0};
        if (config->getConfParam("cjkngramlen", &ngramlen)) {
            // Clamp to [1, o_CJKMaxNgramLen]. Without the lower bound, a
            // zero or negative configured value would pass the upper
            // bound test and be converted to a zero or huge unsigned
            // n-gram length.
            if (ngramlen < 1) {
                ngramlen = 1;
            } else if (ngramlen > o_CJKMaxNgramLen) {
                ngramlen = o_CJKMaxNgramLen;
            }
            o_CJKNgramLen = static_cast<unsigned int>(ngramlen);
        }
    }

    bvalue = false;
    if (config->getConfParam("nonumbers", &bvalue)) {
        o_noNumbers = bvalue;
    }

    bvalue = false;
    if (config->getConfParam("dehyphenate", &bvalue)) {
        o_deHyphenate = bvalue;
    }

    // The backslash class is only changed when the parameter is
    // explicitly set to false: it then becomes a SPACE character.
    bvalue = false;
    if (config->getConfParam("backslashasletter", &bvalue)) {
        if (!bvalue) {
            charclasses[int('\\')] = SPACE;
        }
    }

    // A non-empty "hangultagger" value enables the external Hangul
    // tagger and hands its name to koStaticConfInit().
    string kotagger;
    config->getConfParam("hangultagger", kotagger);
    if (!kotagger.empty()) {
        o_exthangultagger = true;
        koStaticConfInit(config, kotagger);
    }
}
// Non-ascii UTF-8 characters are handled with sets holding all // Non-ascii UTF-8 characters are handled with sets holding all
// characters with interesting properties. This is far from full-blown // characters with interesting properties. This is far from full-blown
// management of Unicode properties, but seems to do the job well // management of Unicode properties, but seems to do the job well
@ -249,13 +302,16 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
#endif #endif
#ifdef HANGUL_AS_WORDS #ifdef HANGUL_AS_WORDS
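// UNICODE_IS_HANGUL() only matches when an external Hangul tagger is
// configured (o_exthangultagger is set). If no external tagger is
// configured, we process HANGUL as generic cjk (n-grams)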
#define UNICODE_IS_HANGUL(p) ( \ #define UNICODE_IS_HANGUL(p) ( \
((p) >= 0x1100 && (p) <= 0x11FF) || \ o_exthangultagger && \
((p) >= 0x3130 && (p) <= 0x318F) || \ (((p) >= 0x1100 && (p) <= 0x11FF) || \
((p) >= 0x3200 && (p) <= 0x321e) || \ ((p) >= 0x3130 && (p) <= 0x318F) || \
((p) >= 0x3248 && (p) <= 0x327F) || \ ((p) >= 0x3200 && (p) <= 0x321e) || \
((p) >= 0x3281 && (p) <= 0x32BF) || \ ((p) >= 0x3248 && (p) <= 0x327F) || \
((p) >= 0xAC00 && (p) <= 0xD7AF) \ ((p) >= 0x3281 && (p) <= 0x32BF) || \
((p) >= 0xAC00 && (p) <= 0xD7AF)) \
) )
#else #else
#define UNICODE_IS_HANGUL(p) false #define UNICODE_IS_HANGUL(p) false
@ -285,56 +341,6 @@ std::vector<CharFlags> csc_names {CHARFLAGENTRY(CSC_HANGUL),
CHARFLAGENTRY(CSC_CJK), CHARFLAGENTRY(CSC_KATAKANA), CHARFLAGENTRY(CSC_CJK), CHARFLAGENTRY(CSC_KATAKANA),
CHARFLAGENTRY(CSC_OTHER)}; CHARFLAGENTRY(CSC_OTHER)};
// Static splitting parameters. The initializers are the built-in
// defaults; TextSplit::staticConfInit() may override them from the
// configuration.

// Set to false when the "nocjk" configuration parameter is true.
bool TextSplit::o_processCJK{true};
// Set from the "cjkngramlen" parameter, capped at o_CJKMaxNgramLen.
unsigned int TextSplit::o_CJKNgramLen{2};
// Set from the "nonumbers" parameter.
bool TextSplit::o_noNumbers{false};
// Set from the "dehyphenate" parameter.
bool TextSplit::o_deHyphenate{false};
// Set from the "maxtermlength" parameter.
int TextSplit::o_maxWordLength{40};
// Upper bound applied to the configured "cjkngramlen" value.
static const int o_CJKMaxNgramLen{5};
// Set to true when a non-empty "hangultagger" parameter is configured.
bool o_exthangultagger{false};
/**
 * Initialize the static text splitting parameters from the configuration.
 *
 * Reads the "maxtermlength", "nocjk", "cjkngramlen", "nonumbers",
 * "dehyphenate", "backslashasletter" and "hangultagger" parameters and
 * updates the corresponding static settings. Settings whose parameter
 * lookup does not succeed keep their current value (except o_processCJK,
 * which is always assigned).
 *
 * @param config configuration object, queried through getConfParam().
 */
void TextSplit::staticConfInit(RclConfig *config)
{
    config->getConfParam("maxtermlength", &o_maxWordLength);

    bool bvalue{false};
    if (config->getConfParam("nocjk", &bvalue) && bvalue) {
        o_processCJK = false;
    } else {
        o_processCJK = true;
        int ngramlen{0};
        if (config->getConfParam("cjkngramlen", &ngramlen)) {
            // Clamp to [1, o_CJKMaxNgramLen]. Without the lower bound, a
            // zero or negative configured value would pass the upper
            // bound test and be converted to a zero or huge unsigned
            // n-gram length.
            if (ngramlen < 1) {
                ngramlen = 1;
            } else if (ngramlen > o_CJKMaxNgramLen) {
                ngramlen = o_CJKMaxNgramLen;
            }
            o_CJKNgramLen = static_cast<unsigned int>(ngramlen);
        }
    }

    bvalue = false;
    if (config->getConfParam("nonumbers", &bvalue)) {
        o_noNumbers = bvalue;
    }

    bvalue = false;
    if (config->getConfParam("dehyphenate", &bvalue)) {
        o_deHyphenate = bvalue;
    }

    // The backslash class is only changed when the parameter is
    // explicitly set to false: it then becomes a SPACE character.
    bvalue = false;
    if (config->getConfParam("backslashasletter", &bvalue)) {
        if (!bvalue) {
            charclasses[int('\\')] = SPACE;
        }
    }

    // A non-empty "hangultagger" value enables the external Hangul
    // tagger and hands its name to koStaticConfInit().
    string kotagger;
    config->getConfParam("hangultagger", kotagger);
    if (!kotagger.empty()) {
        o_exthangultagger = true;
        koStaticConfInit(config, kotagger);
    }
}
// Final term checkpoint: do some checking (the kind which is simpler // Final term checkpoint: do some checking (the kind which is simpler
// to do here than in the main loop), then send term to our client. // to do here than in the main loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
@ -635,11 +641,7 @@ bool TextSplit::text_to_words(const string &in)
if (UNICODE_IS_KATAKANA(c)) { if (UNICODE_IS_KATAKANA(c)) {
csc = CSC_KATAKANA; csc = CSC_KATAKANA;
} else if (UNICODE_IS_HANGUL(c)) { } else if (UNICODE_IS_HANGUL(c)) {
if (o_exthangultagger) { csc = CSC_HANGUL;
csc = CSC_HANGUL;
} else {
csc = CSC_CJK;
}
} else if (UNICODE_IS_CJK(c)) { } else if (UNICODE_IS_CJK(c)) {
csc = CSC_CJK; csc = CSC_CJK;
} else { } else {
@ -998,10 +1000,10 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
unsigned int c = 0; unsigned int c = 0;
for (; !it.eof() && !it.error(); it++) { for (; !it.eof() && !it.error(); it++) {
c = *it; c = *it;
if (c == ' ' || c == '\t' || c == '\n') { // We had a version which ignored whitespace for some time,
continue; // but this was a bad idea. Only break on a non-cjk
} // alphabetic character.
if (!UNICODE_IS_CJK(c)) { if (!UNICODE_IS_CJK(c) && isalpha(c)) {
// Return to normal handler // Return to normal handler
break; break;
} }