From 069d71ea8f7fc7c30a8baa072b293a84f82f6465 Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 20 Sep 2007 08:45:05 +0000 Subject: [PATCH] initial cjk support --- src/common/textsplit.cpp | 193 +++++++++++++++++++++++++++++++++++---- src/common/textsplit.h | 21 +++-- src/utils/utf8iter.h | 44 +++++---- 3 files changed, 218 insertions(+), 40 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 5c53a501..1b35f5a8 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.30 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.31 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -95,6 +95,51 @@ static void setcharclasses() init = 1; } +static inline int whatcc(unsigned int c) +{ + if (c <= 127) { + return charclasses[c]; + } else { + if (unicign.find(c) != unicign.end()) + return SPACE; + else + return LETTER; + } +} + +// 2E80..2EFF; CJK Radicals Supplement +// 3000..303F; CJK Symbols and Punctuation +// 3040..309F; Hiragana +// 30A0..30FF; Katakana +// 3100..312F; Bopomofo +// 3130..318F; Hangul Compatibility Jamo +// 3190..319F; Kanbun +// 31A0..31BF; Bopomofo Extended +// 31C0..31EF; CJK Strokes +// 31F0..31FF; Katakana Phonetic Extensions +// 3200..32FF; Enclosed CJK Letters and Months +// 3300..33FF; CJK Compatibility +// 3400..4DBF; CJK Unified Ideographs Extension A +// 4DC0..4DFF; Yijing Hexagram Symbols +// 4E00..9FFF; CJK Unified Ideographs +// A700..A71F; Modifier Tone Letters +// AC00..D7AF; Hangul Syllables +// F900..FAFF; CJK Compatibility Ideographs +// FE30..FE4F; CJK Compatibility Forms +// FF00..FFEF; Halfwidth and Fullwidth Forms +// 20000..2A6DF; CJK Unified Ideographs Extension B +// 2F800..2FA1F; CJK Compatibility Ideographs Supplement +#define UNICODE_IS_CJK(p) \ + (((p) >= 0x2E80 && (p) <= 0x2EFF) \ + || ((p) >= 0x3000 && (p) <= 0x9FFF) \ + || ((p) >= 0xA700 && (p) <= 0xA71F) \ + || ((p) >= 0xAC00 && (p) <= 0xD7AF) \ + || ((p) >= 0xF900 && (p) <= 0xFAFF) \ + || ((p) >= 0xFE30 && (p) <= 0xFE4F) \ + || ((p) >= 0xFF00 && (p) <= 0xFFEF) \ + || ((p) >= 0x20000 && (p) <= 0x2A6DF) \ + || ((p) >= 0x2F800 && (p) <= 0x2FA1F)) + // Do some checking (the kind which is simpler to do here than in the // main loop), then send term to our client. inline bool TextSplit::emitterm(bool isspan, string &w, int pos, @@ -190,18 +235,6 @@ inline bool TextSplit::doemit(bool spanerase, int bp) return true; } -static inline int whatcc(unsigned int c) -{ - if (c <= 127) { - return charclasses[c]; - } else { - if (unicign.find(c) != unicign.end()) - return SPACE; - else - return LETTER; - } -} - /** * Splitting a text into terms to be indexed. * We basically emit a word every time we see a separator, but some chars are @@ -210,7 +243,11 @@ static inline int whatcc(unsigned int c) */ bool TextSplit::text_to_words(const string &in) { - LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb, + LOGDEB(("TextSplit::text_to_words:%s%s%s%s [%s]\n", + m_flags & TXTS_NOSPANS ? " nospans" : "", + m_flags & TXTS_ONLYSPANS ? " onlyspans" : "", + m_flags & TXTS_KEEPWILD ? " keepwild" : "", + m_flags & TXTS_NOCJK ? " nocjk" : "", in.substr(0,50).c_str())); setcharclasses(); @@ -228,6 +265,27 @@ bool TextSplit::text_to_words(const string &in) LOGERR(("Textsplit: error occured while scanning UTF-8 string\n")); return false; } + + if (!m_nocjk && UNICODE_IS_CJK(c)) { + // CJK character hit. + // Do like at EOF with the current non-cjk data. + if (m_wordLen || m_span.length()) { + if (!doemit(true, it.getBpos())) + return false; + } + + // Hand off situation to the cjk routine. + if (!cjk_to_words(&it, &c)) { + LOGERR(("Textsplit: scan error in cjk handler\n")); + return false; + } + + // Check for eof, else c contains the first non-cjk + // character after the cjk sequence, just go on. + if (it.eof()) + break; + } + int cc = whatcc(c); switch (cc) { case LETTER: @@ -360,7 +418,101 @@ bool TextSplit::text_to_words(const string &in) return true; } -// Callback class for utility function usage +const unsigned int ngramlen = 2; +#define MAXNGRAMLEN 5 + +// Using an utf8iter pointer just to avoid needing its definition in +// textsplit.h +// +// We output ngrams for exemple for char input a b c and ngramlen== 2, +// we generate: a ab b bc c as words +// +// This is very different from the normal behaviour, so we don't use +// the doemit() and emitterm() routines +// +// The routine is sort of a mess and goes to show that we'd probably +// be better off converting the whole buffer to utf32 on entry... +bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) +{ + LOGDEB(("cjk_to_words: m_wordpos %d\n", m_wordpos)); + Utf8Iter &it = *itp; + + // We use an offset buffer to remember the starts of the utf-8 + // characters which we still need to use. + // Fixed size array. ngramlen over 3 doesn't make sense. + assert(ngramlen < MAXNGRAMLEN); + unsigned int boffs[MAXNGRAMLEN]; + + // Current number of valid offsets; + unsigned int nchars = 0; + unsigned int c = 0; + for (; !it.eof(); it++) { + c = *it; + if (!UNICODE_IS_CJK(c)) { + // Return to normal handler + break; + } + + if (nchars == ngramlen) { + // Offset buffer full, shift it. Might be more efficient + // to have a circular one, but things are complicated + // enough already... + for (unsigned int i = 0; i < nchars-1; i++) { + boffs[i] = boffs[i+1]; + } + } else { + nchars++; + } + + // Take note of byte offset for this character. + boffs[nchars-1] = it.getBpos(); + + // Output all new ngrams: they begin at each existing position + // and end after the new character. onlyspans->only output + // maximum words, nospans=> single chars + if (!(m_flags & TXTS_ONLYSPANS) || nchars == ngramlen) { + unsigned int btend = it.getBpos() + it.getBlen(); + unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0; + unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars; + for (unsigned int i = loopbeg; i < loopend; i++) { + if (!m_cb->takeword(it.buffer().substr(boffs[i], + btend-boffs[i]), + m_wordpos - (nchars-i-1), boffs[i], btend)) { + return false; + } + } + + if ((m_flags & TXTS_ONLYSPANS)) { + // Only spans: don't overlap: flush buffer + nchars = 0; + } + } + // Increase word position by one, other words are at an + // existing position. This could be subject to discussion... + m_wordpos++; + } + + // If onlyspans is set, there may be things to flush in the buffer + // first + if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != ngramlen) { + unsigned int btend = it.getBpos(); // Current char is out + if (!m_cb->takeword(it.buffer().substr(boffs[0], + btend-boffs[0]), + m_wordpos - nchars, + boffs[0], btend)) { + return false; + } + } + + m_span.erase(); + m_inNumber = false; + m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0; + m_spanpos = m_wordpos; + *cp = c; + return true; +} + +// Callback class for countWords class utSplitterCB : public TextSplitCB { public: int wcnt; @@ -404,11 +556,12 @@ class mySplitterCB : public TextSplitCB { bool takeword(const string &term, int pos, int bs, int be) { if (nooutput) return true; + FILE *fp = stdout; if (first) { - printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); + fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); first = 0; } - printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be); + fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be); return true; } }; @@ -438,6 +591,7 @@ static string usage = " -s: only spans\n" " -w: only words\n" " -k: preserve wildcards (?*)\n" + " -C: desactivate CJK processing\n" " -c: just count words\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n" " \n\n" @@ -456,6 +610,7 @@ static int op_flags; #define OPT_S 0x4 #define OPT_c 0x8 #define OPT_k 0x10 +#define OPT_C 0x20 int main(int argc, char **argv) { @@ -470,6 +625,7 @@ int main(int argc, char **argv) while (**argv) switch (*(*argv)++) { case 'c': op_flags |= OPT_c; break; + case 'C': op_flags |= OPT_C; break; case 'k': op_flags |= OPT_k; break; case 's': op_flags |= OPT_s; break; case 'S': op_flags |= OPT_S; break; @@ -494,6 +650,9 @@ int main(int argc, char **argv) if (op_flags & OPT_k) flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); + if (op_flags & OPT_C) + flags = (TextSplit::Flags)(flags | TextSplit::TXTS_NOCJK); + string data; if (argc == 1) { const char *filename = *argv++; argc--; diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 754ea1c4..4e74d386 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -16,7 +16,7 @@ */ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.17 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.18 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes */ #include #ifndef NO_NAMESPACES @@ -36,6 +36,8 @@ public: ) = 0; }; +class Utf8Iter; + /** * Split text into words. * See comments at top of .cpp for more explanations. @@ -47,14 +49,19 @@ public: enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com) TXTS_NOSPANS = 2, // Only return atomic words (a, b, com) - TXTS_KEEPWILD = 4 // Handle wildcards as letters + TXTS_KEEPWILD = 4, // Handle wildcards as letters + TXTS_NOCJK = 8 // CJK special processing }; /** * Constructor: just store callback object */ - TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE) - : m_flags(flags), m_cb(t), m_maxWordLength(40), m_prevpos(-1) {} + TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE)) + : m_flags(flags), m_cb(t), m_maxWordLength(40), + m_nocjk((m_flags & TXTS_NOCJK) != 0), + m_prevpos(-1) + { + } /** * Split text, emit words and positions. @@ -69,11 +76,13 @@ private: Flags m_flags; TextSplitCB *m_cb; int m_maxWordLength; + int m_nocjk; // Current span. Might be jf.dockes@wanadoo.f string m_span; - // Current word: no punctuation at all in there + // Current word: no punctuation at all in there. Byte offset + // relative to the current span and byte length int m_wordStart; unsigned int m_wordLen; @@ -90,7 +99,7 @@ private: unsigned int m_prevlen; // This processes cjk text: - // bool cjk_to_words(); + bool cjk_to_words(Utf8Iter *it, unsigned int *cp); bool emitterm(bool isspan, string &term, int pos, int bs, int be); bool doemit(bool spanerase, int bp); diff --git a/src/utils/utf8iter.h b/src/utils/utf8iter.h index 1935c2e5..c59b459c 100644 --- a/src/utils/utf8iter.h +++ b/src/utils/utf8iter.h @@ -16,7 +16,7 @@ */ #ifndef _UTF8ITER_H_INCLUDED_ #define _UTF8ITER_H_INCLUDED_ -/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: utf8iter.h,v 1.9 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes */ /** * A small helper class to iterate over utf8 strings. This is not an @@ -30,16 +30,18 @@ public: Utf8Iter(const string &in) : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false) { - compute_cl(); + update_cl(); } + const string& buffer() const {return m_s;} + void rewind() { m_cl = 0; m_pos = 0; m_charpos = 0; m_error = false; - compute_cl(); + update_cl(); } /** "Direct" access. Awfully inefficient as we skip from start or current @@ -56,7 +58,7 @@ public: int l; while (mypos < m_s.length() && mycp != charpos) { l = get_cl(mypos); - if (l < 0) + if (l <= 0) return (unsigned int)-1; mypos += l; ++mycp; @@ -77,12 +79,12 @@ public: #ifdef UTF8ITER_CHECK assert(m_cl != 0); #endif - if (m_cl == 0) + if (m_cl <= 0) return string::npos; m_pos += m_cl; m_charpos++; - compute_cl(); + update_cl(); return m_pos; } @@ -121,10 +123,17 @@ public: return m_error; } + /** Return current byte offset in input string */ string::size_type getBpos() const { return m_pos; } + /** Return current character length */ + string::size_type getBlen() const { + return m_cl; + } + + /** Return current unicode character offset in input string */ string::size_type getCpos() const { return m_charpos; } @@ -133,12 +142,13 @@ private: // String we're working with const string& m_s; // Character length at current position. A value of zero indicates - // unknown or error. + // an error. unsigned int m_cl; // Current byte offset in string. string::size_type m_pos; // Current character position unsigned int m_charpos; + // Am I ok ? mutable bool m_error; // Check position and cl against string length @@ -149,24 +159,24 @@ private: return p != string::npos && l > 0 && p + l <= m_s.length(); } - // Update current char length in object state, minimum checking for - // errors - inline int compute_cl() + // Update current char length in object state, minimum checking + // for errors + inline void update_cl() { m_cl = 0; - if (m_pos == m_s.length()) - return -1; + if (m_pos >= m_s.length()) + return; m_cl = get_cl(m_pos); if (!poslok(m_pos, m_cl)) { - m_pos = m_s.length(); + // Used to set eof here for safety, but this is bad because it + // basically prevents the caller to discriminate error and eof. + // m_pos = m_s.length(); m_cl = 0; m_error = true; - return -1; } - return 0; } - // Get character byte length at specified position + // Get character byte length at specified position. Returns 0 for error. inline int get_cl(string::size_type p) const { unsigned int z = (unsigned char)m_s[p]; @@ -183,7 +193,7 @@ private: assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 || (z & 248) == 240); #endif - return -1; + return 0; } // Compute value at given position. No error checking.