Regularise processing of Hangul characters (CJK and regular processing were mixed up), and add a build-time option to use either CJK n-gram or regular term splitting for them
This commit is contained in:
parent
7457633b79
commit
6b058e9758
@ -44,6 +44,9 @@
|
|||||||
// ngrams
|
// ngrams
|
||||||
#undef KATAKANA_AS_WORDS
|
#undef KATAKANA_AS_WORDS
|
||||||
|
|
||||||
|
// Same for Korean syllabic
|
||||||
|
#define HANGUL_AS_WORDS
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -190,12 +193,13 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
|
|||||||
|
|
||||||
// CJK Unicode character detection:
|
// CJK Unicode character detection:
|
||||||
//
|
//
|
||||||
|
// 1100..11FF; Hangul Jamo (optional: see UNICODE_IS_HANGUL)
|
||||||
// 2E80..2EFF; CJK Radicals Supplement
|
// 2E80..2EFF; CJK Radicals Supplement
|
||||||
// 3000..303F; CJK Symbols and Punctuation
|
// 3000..303F; CJK Symbols and Punctuation
|
||||||
// 3040..309F; Hiragana
|
// 3040..309F; Hiragana
|
||||||
// 30A0..30FF; Katakana
|
// 30A0..30FF; Katakana
|
||||||
// 3100..312F; Bopomofo
|
// 3100..312F; Bopomofo
|
||||||
// 3130..318F; Hangul Compatibility Jamo
|
// 3130..318F; Hangul Compatibility Jamo (optional: see UNICODE_IS_HANGUL)
|
||||||
// 3190..319F; Kanbun
|
// 3190..319F; Kanbun
|
||||||
// 31A0..31BF; Bopomofo Extended
|
// 31A0..31BF; Bopomofo Extended
|
||||||
// 31C0..31EF; CJK Strokes
|
// 31C0..31EF; CJK Strokes
|
||||||
@ -206,14 +210,15 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
|
|||||||
// 4DC0..4DFF; Yijing Hexagram Symbols
|
// 4DC0..4DFF; Yijing Hexagram Symbols
|
||||||
// 4E00..9FFF; CJK Unified Ideographs
|
// 4E00..9FFF; CJK Unified Ideographs
|
||||||
// A700..A71F; Modifier Tone Letters
|
// A700..A71F; Modifier Tone Letters
|
||||||
// AC00..D7AF; Hangul Syllables
|
// AC00..D7AF; Hangul Syllables (optional: see UNICODE_IS_HANGUL)
|
||||||
// F900..FAFF; CJK Compatibility Ideographs
|
// F900..FAFF; CJK Compatibility Ideographs
|
||||||
// FE30..FE4F; CJK Compatibility Forms
|
// FE30..FE4F; CJK Compatibility Forms
|
||||||
// FF00..FFEF; Halfwidth and Fullwidth Forms
|
// FF00..FFEF; Halfwidth and Fullwidth Forms
|
||||||
// 20000..2A6DF; CJK Unified Ideographs Extension B
|
// 20000..2A6DF; CJK Unified Ideographs Extension B
|
||||||
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
||||||
#define UNICODE_IS_CJK(p) \
|
#define UNICODE_IS_CJK(p) \
|
||||||
(((p) >= 0x2E80 && (p) <= 0x2EFF) || \
|
(((p) >= 0x1100 && (p) <= 0x11FF) || \
|
||||||
|
((p) >= 0x2E80 && (p) <= 0x2EFF) || \
|
||||||
((p) >= 0x3000 && (p) <= 0x9FFF) || \
|
((p) >= 0x3000 && (p) <= 0x9FFF) || \
|
||||||
((p) >= 0xA700 && (p) <= 0xA71F) || \
|
((p) >= 0xA700 && (p) <= 0xA71F) || \
|
||||||
((p) >= 0xAC00 && (p) <= 0xD7AF) || \
|
((p) >= 0xAC00 && (p) <= 0xD7AF) || \
|
||||||
@ -236,20 +241,42 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
|
|||||||
#define UNICODE_IS_KATAKANA(p) false
|
#define UNICODE_IS_KATAKANA(p) false
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef HANGUL_AS_WORDS
|
||||||
|
#define UNICODE_IS_HANGUL(p) ( \
|
||||||
|
((p) >= 0x1100 && (p) <= 0x11FF) || \
|
||||||
|
((p) >= 0x3130 && (p) <= 0x318F) || \
|
||||||
|
((p) >= 0x3200 && (p) <= 0x321e) || \
|
||||||
|
((p) >= 0x3248 && (p) <= 0x327F) || \
|
||||||
|
((p) >= 0x3281 && (p) <= 0x32BF) || \
|
||||||
|
((p) >= 0xAC00 && (p) <= 0xD7AF) \
|
||||||
|
)
|
||||||
|
#else
|
||||||
|
#define UNICODE_IS_HANGUL(p) false
|
||||||
|
#endif
|
||||||
|
|
||||||
bool TextSplit::isCJK(int c)
|
bool TextSplit::isCJK(int c)
|
||||||
{
|
{
|
||||||
return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c);
|
return UNICODE_IS_CJK(c) && !UNICODE_IS_KATAKANA(c) &&
|
||||||
|
!UNICODE_IS_HANGUL(c);
|
||||||
}
|
}
|
||||||
bool TextSplit::isKATAKANA(int c)
|
bool TextSplit::isKATAKANA(int c)
|
||||||
{
|
{
|
||||||
return UNICODE_IS_KATAKANA(c);
|
return UNICODE_IS_KATAKANA(c);
|
||||||
}
|
}
|
||||||
|
bool TextSplit::isHANGUL(int c)
|
||||||
|
{
|
||||||
|
return UNICODE_IS_HANGUL(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// This is used to detect katakana/other transitions, which must
|
// This is used to detect katakana/other transitions, which must
|
||||||
// trigger a word split (there is not always a separator, and katakana
|
// trigger a word split (there is not always a separator, and katakana
|
||||||
// is otherwise treated like other, in the same routine, unless cjk
|
// is otherwise treated like other, in the same routine, unless cjk
|
||||||
// which has its span reader causing a word break)
|
// which has its span reader causing a word break)
|
||||||
enum CharSpanClass {CSC_CJK, CSC_KATAKANA, CSC_OTHER};
|
enum CharSpanClass {CSC_HANGUL, CSC_CJK, CSC_KATAKANA, CSC_OTHER};
|
||||||
|
std::vector<CharFlags> csc_names {CHARFLAGENTRY(CSC_HANGUL),
|
||||||
|
CHARFLAGENTRY(CSC_CJK), CHARFLAGENTRY(CSC_KATAKANA),
|
||||||
|
CHARFLAGENTRY(CSC_OTHER)};
|
||||||
|
|
||||||
bool TextSplit::o_processCJK{true};
|
bool TextSplit::o_processCJK{true};
|
||||||
unsigned int TextSplit::o_CJKNgramLen{2};
|
unsigned int TextSplit::o_CJKNgramLen{2};
|
||||||
@ -577,7 +604,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
int nonalnumcnt = 0;
|
int nonalnumcnt = 0;
|
||||||
|
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
#ifdef KATAKANA_AS_WORDS
|
#if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
|
||||||
int prev_csc = -1;
|
int prev_csc = -1;
|
||||||
#endif
|
#endif
|
||||||
for (; !it.eof(); it++) {
|
for (; !it.eof(); it++) {
|
||||||
@ -592,6 +619,8 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
CharSpanClass csc;
|
CharSpanClass csc;
|
||||||
if (UNICODE_IS_KATAKANA(c)) {
|
if (UNICODE_IS_KATAKANA(c)) {
|
||||||
csc = CSC_KATAKANA;
|
csc = CSC_KATAKANA;
|
||||||
|
} else if (UNICODE_IS_HANGUL(c)) {
|
||||||
|
csc = CSC_HANGUL;
|
||||||
} else if (UNICODE_IS_CJK(c)) {
|
} else if (UNICODE_IS_CJK(c)) {
|
||||||
csc = CSC_CJK;
|
csc = CSC_CJK;
|
||||||
} else {
|
} else {
|
||||||
@ -618,12 +647,17 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef KATAKANA_AS_WORDS
|
#if defined(KATAKANA_AS_WORDS) || defined(HANGUL_AS_WORDS)
|
||||||
// Only needed if we have script transitions inside this
|
// Only needed if we have script transitions inside this
|
||||||
// routine, else the call to cjk_to_words does the job.
|
// routine, else the call to cjk_to_words does the job (so do
|
||||||
if (csc != prev_csc && (m_wordLen || m_span.length())) {
|
// nothing right after a CJK section). This is needed because
|
||||||
LOGDEB("csc " << csc << " pcsc " << prev_csc << " wl " <<
|
// katakana-western transitions sometimes have no whitespace
|
||||||
m_wordLen << " spl " << m_span.length() << endl);
|
// (and maybe hangul too, but probably not).
|
||||||
|
if (prev_csc != CSC_CJK && csc != prev_csc &&
|
||||||
|
(m_wordLen || m_span.length())) {
|
||||||
|
LOGDEB2("csc " << valToString(csc_names, csc) << " prev_csc " <<
|
||||||
|
valToString(csc_names, prev_csc) << " wl " <<
|
||||||
|
m_wordLen << " spl " << m_span.length() << endl);
|
||||||
if (!doemit(true, it.getBpos())) {
|
if (!doemit(true, it.getBpos())) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -91,6 +91,7 @@ public:
|
|||||||
/** Is char CJK ? (excluding Katakana) */
|
/** Is char CJK ? (excluding Katakana, and Hangul when built with HANGUL_AS_WORDS) */
|
||||||
static bool isCJK(int c);
|
static bool isCJK(int c);
|
||||||
static bool isKATAKANA(int c);
|
static bool isKATAKANA(int c);
|
||||||
|
static bool isHANGUL(int c);
|
||||||
|
|
||||||
/** Statistics about word length (average and dispersion) can
|
/** Statistics about word length (average and dispersion) can
|
||||||
* detect bad data like undecoded base64 or other mis-identified
|
* detect bad data like undecoded base64 or other mis-identified
|
||||||
|
|||||||
@ -196,7 +196,7 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
RclConfig *config = new RclConfig(&cffn);
|
RclConfig *config = new RclConfig(&cffn);
|
||||||
TextSplit::staticConfInit(config);
|
TextSplit::staticConfInit(config);
|
||||||
|
Logger::getTheLog("stderr")->setLogLevel(Logger::LLDEB0);
|
||||||
|
|
||||||
Rcl::StopList stoplist;
|
Rcl::StopList stoplist;
|
||||||
if (op_flags & OPT_S) {
|
if (op_flags & OPT_S) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user