initial cjk support

2007-09-20 08:45:05 +00:00 · 2007-09-20 08:45:05 +00:00 · 069d71ea8f
commit 069d71ea8f
parent 844f4f831a
3 changed files with 218 additions and 40 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.30 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.31 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -95,6 +95,51 @@ static void setcharclasses()
    init = 1;
 }
 // Return the character class for code point c.
 // Values up to 127 are looked up in the charclasses table. Any other
 // value is classified as SPACE if it is found in unicign, else as LETTER.
 static inline int whatcc(unsigned int c)
 {
    // 7-bit range: direct table lookup
    if (c < 128)
	return charclasses[c];
    // Beyond the table: only the members of unicign are not letters
    const bool ignored = unicign.find(c) != unicign.end();
    if (ignored)
	return SPACE;
    return LETTER;
 }
 // Unicode blocks which are handled as CJK text:
 // 2E80..2EFF; CJK Radicals Supplement
 // 3000..303F; CJK Symbols and Punctuation
 // 3040..309F; Hiragana
 // 30A0..30FF; Katakana
 // 3100..312F; Bopomofo
 // 3130..318F; Hangul Compatibility Jamo
 // 3190..319F; Kanbun
 // 31A0..31BF; Bopomofo Extended
 // 31C0..31EF; CJK Strokes
 // 31F0..31FF; Katakana Phonetic Extensions
 // 3200..32FF; Enclosed CJK Letters and Months
 // 3300..33FF; CJK Compatibility
 // 3400..4DBF; CJK Unified Ideographs Extension A
 // 4DC0..4DFF; Yijing Hexagram Symbols
 // 4E00..9FFF; CJK Unified Ideographs
 // A700..A71F; Modifier Tone Letters
 // AC00..D7AF; Hangul Syllables
 // F900..FAFF; CJK Compatibility Ideographs
 // FE30..FE4F; CJK Compatibility Forms
 // FF00..FFEF; Halfwidth and Fullwidth Forms
 // 20000..2A6DF; CJK Unified Ideographs Extension B
 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
 //
 // Tell if code point p lies inside one of the blocks listed above. The
 // adjacent blocks from 3000 to 9FFF are tested as one single range.
 // This is an inline function and not a function-like macro, so that the
 // argument is evaluated exactly once (an expression with side effects or
 // an expensive one is safe to pass) and is type-checked. The upper case
 // name is kept so that it can be called exactly like a macro.
 static inline bool UNICODE_IS_CJK(unsigned int p)
 {
    return (p >= 0x2E80 && p <= 0x2EFF)
	|| (p >= 0x3000 && p <= 0x9FFF)
	|| (p >= 0xA700 && p <= 0xA71F)
	|| (p >= 0xAC00 && p <= 0xD7AF)
	|| (p >= 0xF900 && p <= 0xFAFF)
	|| (p >= 0xFE30 && p <= 0xFE4F)
	|| (p >= 0xFF00 && p <= 0xFFEF)
	|| (p >= 0x20000 && p <= 0x2A6DF)
	|| (p >= 0x2F800 && p <= 0x2FA1F);
 }
 // Do some checking (the kind which is simpler to do here than in the
 // main loop), then send term to our client.
 inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
@ -190,18 +235,6 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
    return true;
 }
 // Return the character class for code point c: the charclasses table
 // entry for values up to 127, else SPACE if c is found in unicign and
 // LETTER otherwise.
 static inline int whatcc(unsigned int c)
 {
    if (c <= 127) {
 	return charclasses[c]; 
    } else {
 	if (unicign.find(c) != unicign.end())
 	    return SPACE;
 	else
 	    return LETTER;
    }
 }
 /** 
 * Splitting a text into terms to be indexed.
 * We basically emit a word every time we see a separator, but some chars are
@ -210,7 +243,11 @@ static inline int whatcc(unsigned int c)
 */
 bool TextSplit::text_to_words(const string &in)
 {
-    LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb, 
+    LOGDEB(("TextSplit::text_to_words:%s%s%s%s [%s]\n", 
 	    m_flags & TXTS_NOSPANS ? " nospans" : "",
 	    m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
 	    m_flags & TXTS_KEEPWILD ? " keepwild" : "",
 	    m_flags & TXTS_NOCJK ? " nocjk" : "",
 	    in.substr(0,50).c_str()));
    setcharclasses();
@ -228,6 +265,27 @@ bool TextSplit::text_to_words(const string &in)
 	    LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
 	    return false;
 	}
 	if (!m_nocjk && UNICODE_IS_CJK(c)) {
 	    // CJK character hit. 
 	    // Do like at EOF with the current non-cjk data.
 	    if (m_wordLen || m_span.length()) {
 		if (!doemit(true, it.getBpos()))
 		    return false;
 	    }
 	    // Hand off situation to the cjk routine.
 	    if (!cjk_to_words(&it, &c)) {
 		LOGERR(("Textsplit: scan error in cjk handler\n"));
 		return false;
 	    }
 	    // Check for eof, else c contains the first non-cjk
 	    // character after the cjk sequence, just go on.
 	    if (it.eof())
 		break;
 	}
 	int cc = whatcc(c);
 	switch (cc) {
 	case LETTER:
@ -360,7 +418,101 @@ bool TextSplit::text_to_words(const string &in)
    return true;
 }
-// Callback class for utility function usage
+const unsigned int ngramlen = 2;
 #define MAXNGRAMLEN 5
 // The iterator is passed as a pointer only so that textsplit.h does not
 // need the definition of Utf8Iter.
 //
 // CJK text is output as overlapping n-grams. For example, for input
 // characters a b c and ngramlen == 2, the words generated are:
 // a ab b bc c
 //
 // This is very different from the normal word/span behaviour, which is
 // why the doemit() and emitterm() routines are not used here.
 //
 // Characters are consumed until the first one which is not CJK, or
 // until the end of the input. On successful return, *cp holds the last
 // value read from the iterator, and the span/word state is reset.
 // Returns false as soon as the callback's takeword() returns false.
 //
 // The routine is sort of a mess and goes to show that we'd probably
 // be better off converting the whole buffer to utf32 on entry...
 bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
 {
    LOGDEB(("cjk_to_words: m_wordpos %d\n", m_wordpos));
    Utf8Iter &it = *itp;
    // The flags do not change while we run: test them once
    const bool onlyspans = (m_flags & TXTS_ONLYSPANS) != 0;
    const bool nospans = (m_flags & TXTS_NOSPANS) != 0;
    // Byte offsets of the starts of the characters which may still
    // begin an ngram. Fixed size array: an ngramlen over 3 would not
    // make sense anyway.
    assert(ngramlen < MAXNGRAMLEN);
    unsigned int starts[MAXNGRAMLEN];
    // Count of valid entries in starts[]
    unsigned int held = 0;
    unsigned int c = 0;
    while (!it.eof()) {
	// NOTE(review): the value read here is not tested for a decoding
	// error marker before use -- confirm that the caller copes.
	c = *it;
	if (!UNICODE_IS_CJK(c)) {
	    // Not ours: back to the normal handler
	    break;
	}
	if (held < ngramlen) {
	    held++;
	} else {
	    // Window full: drop the oldest offset by shifting down. A
	    // circular buffer might be more efficient, but things are
	    // complicated enough already...
	    for (unsigned int k = 1; k < held; k++) {
		starts[k-1] = starts[k];
	    }
	}
	// The newest character always goes in the last used slot
	starts[held-1] = it.getBpos();
	// Output the new ngrams: they begin at each remembered offset and
	// all end after the new character. onlyspans: only output the
	// maximum length ones. nospans: only output single characters.
	if (!onlyspans || held == ngramlen) {
	    const unsigned int endoff = it.getBpos() + it.getBlen();
	    const unsigned int kbeg = nospans ? held-1 : 0;
	    const unsigned int kend = onlyspans ? 1 : held;
	    for (unsigned int k = kbeg; k < kend; k++) {
		const unsigned int begoff = starts[k];
		// An ngram is positioned at the word position of its
		// first character.
		if (!m_cb->takeword(it.buffer().substr(begoff, endoff-begoff),
				    m_wordpos - (held-k-1), begoff, endoff)) {
		    return false;
		}
	    }
	    if (onlyspans) {
		// Only spans: these must not overlap, start afresh
		held = 0;
	    }
	}
	// One more word position for each character, the longer ngrams
	// share existing positions. This could be subject to discussion...
	m_wordpos++;
	it++;
    }
    // With onlyspans, a partial ngram may remain to be output
    if (onlyspans && held > 0 && held != ngramlen) {
	// The current character is not a part of it
	const unsigned int endoff = it.getBpos();
	const unsigned int begoff = starts[0];
	if (!m_cb->takeword(it.buffer().substr(begoff, endoff-begoff),
			    m_wordpos - held, begoff, endoff)) {
	    return false;
	}
    }
    // Leave a clean state for the processing of the non-CJK text
    m_span.erase();
    m_inNumber = false;
    m_prevlen = 0;
    m_prevpos = 0;
    m_wordLen = 0;
    m_wordStart = 0;
    m_spanpos = m_wordpos;
    *cp = c;
    return true;
 }
 // Callback class for countWords 
 class utSplitterCB : public TextSplitCB {
 public:
    int wcnt;
@ -404,11 +556,12 @@ class mySplitterCB : public TextSplitCB {
    bool takeword(const string &term, int pos, int bs, int be) {
 	if (nooutput)
 	    return true;
 	FILE *fp = stdout;
 	if (first) {
-	    printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
+	    fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
 	    first = 0;
 	}
-	printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
+	fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
 	return true;
    }
 };
@ -438,6 +591,7 @@ static string usage =
    "   -s:  only spans\n"
    "   -w:  only words\n"
    "   -k:  preserve wildcards (?*)\n"
    "   -C:  desactivate CJK processing\n"
    "   -c: just count words\n"
    " if filename is 'stdin', will read stdin for data (end with ^D)\n"
    "  \n\n"
@ -456,6 +610,7 @@ static int        op_flags;
 #define OPT_S	  0x4
 #define OPT_c     0x8
 #define OPT_k     0x10
 #define OPT_C     0x20
 int main(int argc, char **argv)
 {
@ -470,6 +625,7 @@ int main(int argc, char **argv)
 	while (**argv)
 	    switch (*(*argv)++) {
 	    case 'c':	op_flags |= OPT_c; break;
 	    case 'C':	op_flags |= OPT_C; break;
 	    case 'k':	op_flags |= OPT_k; break;
 	    case 's':	op_flags |= OPT_s; break;
 	    case 'S':	op_flags |= OPT_S; break;
@ -494,6 +650,9 @@ int main(int argc, char **argv)
    if (op_flags & OPT_k) 
 	flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); 
    if (op_flags & OPT_C) 
 	flags = (TextSplit::Flags)(flags | TextSplit::TXTS_NOCJK); 
    string data;
    if (argc == 1) {
 	const char *filename = *argv++;	argc--;
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -16,7 +16,7 @@
 */
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
-/* @(#$Id: textsplit.h,v 1.17 2007-09-18 20:35:31 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: textsplit.h,v 1.18 2007-09-20 08:45:05 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #ifndef NO_NAMESPACES
@ -36,6 +36,8 @@ public:
 			  ) = 0; 
 };
 class Utf8Iter;
 /** 
 * Split text into words. 
 * See comments at top of .cpp for more explanations.
@ -47,14 +49,19 @@ public:
    enum Flags {TXTS_NONE = 0, 
 		TXTS_ONLYSPANS = 1,  // Only return maximum spans (a@b.com) 
 		TXTS_NOSPANS = 2,  // Only return atomic words (a, b, com)
-		TXTS_KEEPWILD = 4 // Handle wildcards as letters
+		TXTS_KEEPWILD = 4, // Handle wildcards as letters
 		TXTS_NOCJK = 8     // CJK special processing
    };
    /**
     * Constructor: just store callback object
     */
-    TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE) 
+    TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
-	: m_flags(flags), m_cb(t), m_maxWordLength(40), m_prevpos(-1) {}
+	: m_flags(flags), m_cb(t), m_maxWordLength(40), 
 	  m_nocjk((m_flags & TXTS_NOCJK) != 0),
 	  m_prevpos(-1)
    {
    }
    /**
     * Split text, emit words and positions.
@ -69,11 +76,13 @@ private:
    Flags         m_flags;
    TextSplitCB  *m_cb;
    int           m_maxWordLength;
    int           m_nocjk;
    // Current span. Might be jf.dockes@wanadoo.f
    string        m_span; 
-    // Current word: no punctuation at all in there
+    // Current word: no punctuation at all in there. Byte offset
    // relative to the current span and byte length
    int           m_wordStart;
    unsigned int  m_wordLen;
@ -90,7 +99,7 @@ private:
    unsigned int  m_prevlen;
    // This processes cjk text:
-    // bool cjk_to_words();
+    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
    bool doemit(bool spanerase, int bp);
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -16,7 +16,7 @@
 */
 #ifndef _UTF8ITER_H_INCLUDED_
 #define _UTF8ITER_H_INCLUDED_
-/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: utf8iter.h,v 1.9 2007-09-20 08:45:05 dockes Exp $  (C) 2004 J.F.Dockes */
 /** 
 * A small helper class to iterate over utf8 strings. This is not an
@ -30,16 +30,18 @@ public:
    Utf8Iter(const string &in) 
 	: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
    {
-	compute_cl();
+	update_cl();
    }
    const string& buffer() const {return m_s;}
    void rewind() 
    {
 	m_cl = 0; 
 	m_pos = 0; 
 	m_charpos = 0; 
 	m_error = false;
-	compute_cl();
+	update_cl();
    }
    /** "Direct" access. Awfully inefficient as we skip from start or current
@ -56,7 +58,7 @@ public:
 	int l;
 	while (mypos < m_s.length() && mycp != charpos) {
 	    l = get_cl(mypos);
-	    if (l < 0)
+	    if (l <= 0)
 		return (unsigned int)-1;
 	    mypos += l;
 	    ++mycp;
@ -77,12 +79,12 @@ public:
 #ifdef UTF8ITER_CHECK
 	assert(m_cl != 0);
 #endif
-	if (m_cl == 0) 
+	if (m_cl <= 0) 
 	    return string::npos;
 	m_pos += m_cl;
 	m_charpos++;
-	compute_cl();
+	update_cl();
 	return m_pos;
    }
@ -121,10 +123,17 @@ public:
 	return m_error;
    }
    /** Return the current byte offset in the input string (m_pos: this 
     *  is advanced by the current character length on each increment) */
    string::size_type getBpos() const {
 	return m_pos;
    }
    /** Return the length in bytes of the character at the current
     *  position (m_cl). A value of zero indicates an error. */
    string::size_type getBlen() const {
 	return m_cl;
    }
    /** Return the current offset in the input string, counted in unicode 
     *  characters and not in bytes (m_charpos) */
    string::size_type getCpos() const {
 	return m_charpos;
    }
@ -133,12 +142,13 @@ private:
    // String we're working with
    const string&     m_s; 
    // Character length at current position. A value of zero indicates
-    // unknown or error.
+    // an error.
    unsigned int      m_cl; 
    // Current byte offset in string.
    string::size_type m_pos; 
    // Current character position
    unsigned int      m_charpos; 
    // Am I ok ?
    mutable bool      m_error;
    // Check position and cl against string length
@ -149,24 +159,24 @@ private:
 	return p != string::npos && l > 0 && p + l <= m_s.length();
    }
-    // Update current char length in object state, minimum checking for 
+    // Update current char length in object state, minimum checking
-    // errors
+    // for errors
-    inline int compute_cl() 
+    inline void update_cl() 
    {
 	m_cl = 0;
-	if (m_pos == m_s.length())
+	if (m_pos >= m_s.length())
-	    return -1;
+	    return;
 	m_cl = get_cl(m_pos);
 	if (!poslok(m_pos, m_cl)) {
-	    m_pos = m_s.length();
+	    // Used to set eof here for safety, but this is bad because it
 	    // basically prevents the caller to discriminate error and eof.
 	    //	    m_pos = m_s.length();
 	    m_cl = 0;
 	    m_error = true;
 	    return -1;
 	}
 	return 0;
    }
-    // Get character byte length at specified position
+    // Get character byte length at specified position. Returns 0 for error.
    inline int get_cl(string::size_type p) const 
    {
 	unsigned int z = (unsigned char)m_s[p];
@ -183,7 +193,7 @@ private:
 	assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
 	       (z & 248) == 240);
 #endif
-	return -1;
+	return 0;
    }
    // Compute value at given position. No error checking.