initial cjk support
This commit is contained in:
parent
844f4f831a
commit
069d71ea8f
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.30 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.31 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -95,6 +95,51 @@ static void setcharclasses()
|
||||
init = 1;
|
||||
}
|
||||
|
||||
static inline int whatcc(unsigned int c)
|
||||
{
|
||||
if (c <= 127) {
|
||||
return charclasses[c];
|
||||
} else {
|
||||
if (unicign.find(c) != unicign.end())
|
||||
return SPACE;
|
||||
else
|
||||
return LETTER;
|
||||
}
|
||||
}
|
||||
|
||||
// 2E80..2EFF; CJK Radicals Supplement
|
||||
// 3000..303F; CJK Symbols and Punctuation
|
||||
// 3040..309F; Hiragana
|
||||
// 30A0..30FF; Katakana
|
||||
// 3100..312F; Bopomofo
|
||||
// 3130..318F; Hangul Compatibility Jamo
|
||||
// 3190..319F; Kanbun
|
||||
// 31A0..31BF; Bopomofo Extended
|
||||
// 31C0..31EF; CJK Strokes
|
||||
// 31F0..31FF; Katakana Phonetic Extensions
|
||||
// 3200..32FF; Enclosed CJK Letters and Months
|
||||
// 3300..33FF; CJK Compatibility
|
||||
// 3400..4DBF; CJK Unified Ideographs Extension A
|
||||
// 4DC0..4DFF; Yijing Hexagram Symbols
|
||||
// 4E00..9FFF; CJK Unified Ideographs
|
||||
// A700..A71F; Modifier Tone Letters
|
||||
// AC00..D7AF; Hangul Syllables
|
||||
// F900..FAFF; CJK Compatibility Ideographs
|
||||
// FE30..FE4F; CJK Compatibility Forms
|
||||
// FF00..FFEF; Halfwidth and Fullwidth Forms
|
||||
// 20000..2A6DF; CJK Unified Ideographs Extension B
|
||||
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
||||
// Tell whether code point p belongs to one of the CJK blocks listed
// above. The adjacent blocks from 3000 up to 9FFF are tested as one
// single range.
//
// This is an inline function rather than a macro so that the argument
// is evaluated exactly once (the macro form could evaluate it up to 18
// times, which breaks with arguments having side effects).
static inline bool UNICODE_IS_CJK(unsigned int p)
{
    return (p >= 0x2E80 && p <= 0x2EFF)
        || (p >= 0x3000 && p <= 0x9FFF)
        || (p >= 0xA700 && p <= 0xA71F)
        || (p >= 0xAC00 && p <= 0xD7AF)
        || (p >= 0xF900 && p <= 0xFAFF)
        || (p >= 0xFE30 && p <= 0xFE4F)
        || (p >= 0xFF00 && p <= 0xFFEF)
        || (p >= 0x20000 && p <= 0x2A6DF)
        || (p >= 0x2F800 && p <= 0x2FA1F);
}
|
||||
|
||||
// Do some checking (the kind which is simpler to do here than in the
|
||||
// main loop), then send term to our client.
|
||||
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||
@ -190,18 +235,6 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline int whatcc(unsigned int c)
|
||||
{
|
||||
if (c <= 127) {
|
||||
return charclasses[c];
|
||||
} else {
|
||||
if (unicign.find(c) != unicign.end())
|
||||
return SPACE;
|
||||
else
|
||||
return LETTER;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Splitting a text into terms to be indexed.
|
||||
* We basically emit a word every time we see a separator, but some chars are
|
||||
@ -210,7 +243,11 @@ static inline int whatcc(unsigned int c)
|
||||
*/
|
||||
bool TextSplit::text_to_words(const string &in)
|
||||
{
|
||||
LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb,
|
||||
LOGDEB(("TextSplit::text_to_words:%s%s%s%s [%s]\n",
|
||||
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
||||
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
||||
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
||||
m_flags & TXTS_NOCJK ? " nocjk" : "",
|
||||
in.substr(0,50).c_str()));
|
||||
|
||||
setcharclasses();
|
||||
@ -228,6 +265,27 @@ bool TextSplit::text_to_words(const string &in)
|
||||
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_nocjk && UNICODE_IS_CJK(c)) {
|
||||
// CJK character hit.
|
||||
// Do like at EOF with the current non-cjk data.
|
||||
if (m_wordLen || m_span.length()) {
|
||||
if (!doemit(true, it.getBpos()))
|
||||
return false;
|
||||
}
|
||||
|
||||
// Hand off situation to the cjk routine.
|
||||
if (!cjk_to_words(&it, &c)) {
|
||||
LOGERR(("Textsplit: scan error in cjk handler\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for eof, else c contains the first non-cjk
|
||||
// character after the cjk sequence, just go on.
|
||||
if (it.eof())
|
||||
break;
|
||||
}
|
||||
|
||||
int cc = whatcc(c);
|
||||
switch (cc) {
|
||||
case LETTER:
|
||||
@ -360,7 +418,101 @@ bool TextSplit::text_to_words(const string &in)
|
||||
return true;
|
||||
}
|
||||
|
||||
// Callback class for utility function usage
|
||||
const unsigned int ngramlen = 2;
|
||||
#define MAXNGRAMLEN 5
|
||||
|
||||
// Using an utf8iter pointer just to avoid needing its definition in
|
||||
// textsplit.h
|
||||
//
|
||||
// We output ngrams for exemple for char input a b c and ngramlen== 2,
|
||||
// we generate: a ab b bc c as words
|
||||
//
|
||||
// This is very different from the normal behaviour, so we don't use
|
||||
// the doemit() and emitterm() routines
|
||||
//
|
||||
// The routine is sort of a mess and goes to show that we'd probably
|
||||
// be better off converting the whole buffer to utf32 on entry...
|
||||
bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
{
|
||||
LOGDEB(("cjk_to_words: m_wordpos %d\n", m_wordpos));
|
||||
Utf8Iter &it = *itp;
|
||||
|
||||
// We use an offset buffer to remember the starts of the utf-8
|
||||
// characters which we still need to use.
|
||||
// Fixed size array. ngramlen over 3 doesn't make sense.
|
||||
assert(ngramlen < MAXNGRAMLEN);
|
||||
unsigned int boffs[MAXNGRAMLEN];
|
||||
|
||||
// Current number of valid offsets;
|
||||
unsigned int nchars = 0;
|
||||
unsigned int c = 0;
|
||||
for (; !it.eof(); it++) {
|
||||
c = *it;
|
||||
if (!UNICODE_IS_CJK(c)) {
|
||||
// Return to normal handler
|
||||
break;
|
||||
}
|
||||
|
||||
if (nchars == ngramlen) {
|
||||
// Offset buffer full, shift it. Might be more efficient
|
||||
// to have a circular one, but things are complicated
|
||||
// enough already...
|
||||
for (unsigned int i = 0; i < nchars-1; i++) {
|
||||
boffs[i] = boffs[i+1];
|
||||
}
|
||||
} else {
|
||||
nchars++;
|
||||
}
|
||||
|
||||
// Take note of byte offset for this character.
|
||||
boffs[nchars-1] = it.getBpos();
|
||||
|
||||
// Output all new ngrams: they begin at each existing position
|
||||
// and end after the new character. onlyspans->only output
|
||||
// maximum words, nospans=> single chars
|
||||
if (!(m_flags & TXTS_ONLYSPANS) || nchars == ngramlen) {
|
||||
unsigned int btend = it.getBpos() + it.getBlen();
|
||||
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
||||
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
||||
for (unsigned int i = loopbeg; i < loopend; i++) {
|
||||
if (!m_cb->takeword(it.buffer().substr(boffs[i],
|
||||
btend-boffs[i]),
|
||||
m_wordpos - (nchars-i-1), boffs[i], btend)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if ((m_flags & TXTS_ONLYSPANS)) {
|
||||
// Only spans: don't overlap: flush buffer
|
||||
nchars = 0;
|
||||
}
|
||||
}
|
||||
// Increase word position by one, other words are at an
|
||||
// existing position. This could be subject to discussion...
|
||||
m_wordpos++;
|
||||
}
|
||||
|
||||
// If onlyspans is set, there may be things to flush in the buffer
|
||||
// first
|
||||
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != ngramlen) {
|
||||
unsigned int btend = it.getBpos(); // Current char is out
|
||||
if (!m_cb->takeword(it.buffer().substr(boffs[0],
|
||||
btend-boffs[0]),
|
||||
m_wordpos - nchars,
|
||||
boffs[0], btend)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
m_span.erase();
|
||||
m_inNumber = false;
|
||||
m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;
|
||||
m_spanpos = m_wordpos;
|
||||
*cp = c;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Callback class for countWords
|
||||
class utSplitterCB : public TextSplitCB {
|
||||
public:
|
||||
int wcnt;
|
||||
@ -404,11 +556,12 @@ class mySplitterCB : public TextSplitCB {
|
||||
bool takeword(const string &term, int pos, int bs, int be) {
|
||||
if (nooutput)
|
||||
return true;
|
||||
FILE *fp = stdout;
|
||||
if (first) {
|
||||
printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
||||
fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
||||
first = 0;
|
||||
}
|
||||
printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
||||
fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
@ -438,6 +591,7 @@ static string usage =
|
||||
" -s: only spans\n"
|
||||
" -w: only words\n"
|
||||
" -k: preserve wildcards (?*)\n"
|
||||
" -C: desactivate CJK processing\n"
|
||||
" -c: just count words\n"
|
||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
||||
" \n\n"
|
||||
@ -456,6 +610,7 @@ static int op_flags;
|
||||
#define OPT_S 0x4
|
||||
#define OPT_c 0x8
|
||||
#define OPT_k 0x10
|
||||
#define OPT_C 0x20
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
@ -470,6 +625,7 @@ int main(int argc, char **argv)
|
||||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
case 'c': op_flags |= OPT_c; break;
|
||||
case 'C': op_flags |= OPT_C; break;
|
||||
case 'k': op_flags |= OPT_k; break;
|
||||
case 's': op_flags |= OPT_s; break;
|
||||
case 'S': op_flags |= OPT_S; break;
|
||||
@ -494,6 +650,9 @@ int main(int argc, char **argv)
|
||||
if (op_flags & OPT_k)
|
||||
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
|
||||
|
||||
if (op_flags & OPT_C)
|
||||
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_NOCJK);
|
||||
|
||||
string data;
|
||||
if (argc == 1) {
|
||||
const char *filename = *argv++; argc--;
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||
#define _TEXTSPLIT_H_INCLUDED_
|
||||
/* @(#$Id: textsplit.h,v 1.17 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: textsplit.h,v 1.18 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#ifndef NO_NAMESPACES
|
||||
@ -36,6 +36,8 @@ public:
|
||||
) = 0;
|
||||
};
|
||||
|
||||
class Utf8Iter;
|
||||
|
||||
/**
|
||||
* Split text into words.
|
||||
* See comments at top of .cpp for more explanations.
|
||||
@ -47,14 +49,19 @@ public:
|
||||
// Splitter option bits. Each value is a distinct bit so that options
// can be or'ed together.
enum Flags {TXTS_NONE = 0,
            TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
            TXTS_NOSPANS = 2,   // Only return atomic words (a, b, com)
            TXTS_KEEPWILD = 4,  // Handle wildcards as letters
            TXTS_NOCJK = 8      // Disable the CJK special processing
};
|
||||
|
||||
/**
|
||||
* Constructor: just store callback object
|
||||
*/
|
||||
// Single constructor definition. Stores the callback and the flags,
// and caches the TXTS_NOCJK bit in m_nocjk. m_nocjk is computed from
// the m_flags member, which is declared, hence initialized, before it.
TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
    : m_flags(flags), m_cb(t), m_maxWordLength(40),
      m_nocjk((m_flags & TXTS_NOCJK) != 0),
      m_prevpos(-1)
{
}
|
||||
|
||||
/**
|
||||
* Split text, emit words and positions.
|
||||
@ -69,11 +76,13 @@ private:
|
||||
Flags m_flags;
|
||||
TextSplitCB *m_cb;
|
||||
int m_maxWordLength;
|
||||
int m_nocjk;
|
||||
|
||||
// Current span. Might be jf.dockes@wanadoo.f
|
||||
string m_span;
|
||||
|
||||
// Current word: no punctuation at all in there. Byte offset
// relative to the current span and byte length
|
||||
int m_wordStart;
|
||||
unsigned int m_wordLen;
|
||||
|
||||
@ -90,7 +99,7 @@ private:
|
||||
unsigned int m_prevlen;
|
||||
|
||||
// This processes cjk text:
|
||||
// bool cjk_to_words();
|
||||
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
||||
|
||||
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
||||
bool doemit(bool spanerase, int bp);
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _UTF8ITER_H_INCLUDED_
|
||||
#define _UTF8ITER_H_INCLUDED_
|
||||
/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: utf8iter.h,v 1.9 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
/**
|
||||
* A small helper class to iterate over utf8 strings. This is not an
|
||||
@ -30,16 +30,18 @@ public:
|
||||
Utf8Iter(const string &in)
|
||||
: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
|
||||
{
|
||||
compute_cl();
|
||||
update_cl();
|
||||
}
|
||||
|
||||
const string& buffer() const {return m_s;}
|
||||
|
||||
void rewind()
|
||||
{
|
||||
m_cl = 0;
|
||||
m_pos = 0;
|
||||
m_charpos = 0;
|
||||
m_error = false;
|
||||
compute_cl();
|
||||
update_cl();
|
||||
}
|
||||
|
||||
/** "Direct" access. Awfully inefficient as we skip from start or current
|
||||
@ -56,7 +58,7 @@ public:
|
||||
int l;
|
||||
while (mypos < m_s.length() && mycp != charpos) {
|
||||
l = get_cl(mypos);
|
||||
if (l < 0)
|
||||
if (l <= 0)
|
||||
return (unsigned int)-1;
|
||||
mypos += l;
|
||||
++mycp;
|
||||
@ -77,12 +79,12 @@ public:
|
||||
#ifdef UTF8ITER_CHECK
|
||||
assert(m_cl != 0);
|
||||
#endif
|
||||
if (m_cl == 0)
|
||||
if (m_cl <= 0)
|
||||
return string::npos;
|
||||
|
||||
m_pos += m_cl;
|
||||
m_charpos++;
|
||||
compute_cl();
|
||||
update_cl();
|
||||
return m_pos;
|
||||
}
|
||||
|
||||
@ -121,10 +123,17 @@ public:
|
||||
return m_error;
|
||||
}
|
||||
|
||||
/** Return current byte offset in input string */
|
||||
string::size_type getBpos() const {
|
||||
return m_pos;
|
||||
}
|
||||
|
||||
/** Return current character length */
|
||||
string::size_type getBlen() const {
|
||||
return m_cl;
|
||||
}
|
||||
|
||||
/** Return current unicode character offset in input string */
|
||||
string::size_type getCpos() const {
|
||||
return m_charpos;
|
||||
}
|
||||
@ -133,12 +142,13 @@ private:
|
||||
// String we're working with
|
||||
const string& m_s;
|
||||
// Character length at current position. A value of zero indicates
// an error or the end of the string.
|
||||
unsigned int m_cl;
|
||||
// Current byte offset in string.
|
||||
string::size_type m_pos;
|
||||
// Current character position
|
||||
unsigned int m_charpos;
|
||||
// Am I ok ?
|
||||
mutable bool m_error;
|
||||
|
||||
// Check position and cl against string length
|
||||
@ -149,24 +159,24 @@ private:
|
||||
return p != string::npos && l > 0 && p + l <= m_s.length();
|
||||
}
|
||||
|
||||
// Update current char length in object state, minimum checking for
|
||||
// errors
|
||||
inline int compute_cl()
|
||||
// Update current char length in object state, minimum checking
|
||||
// for errors
|
||||
inline void update_cl()
|
||||
{
|
||||
m_cl = 0;
|
||||
if (m_pos == m_s.length())
|
||||
return -1;
|
||||
if (m_pos >= m_s.length())
|
||||
return;
|
||||
m_cl = get_cl(m_pos);
|
||||
if (!poslok(m_pos, m_cl)) {
|
||||
m_pos = m_s.length();
|
||||
// Used to set eof here for safety, but this is bad because it
|
||||
// basically prevents the caller to discriminate error and eof.
|
||||
// m_pos = m_s.length();
|
||||
m_cl = 0;
|
||||
m_error = true;
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Get character byte length at specified position. Returns 0 for error.
|
||||
inline int get_cl(string::size_type p) const
|
||||
{
|
||||
unsigned int z = (unsigned char)m_s[p];
|
||||
@ -183,7 +193,7 @@ private:
|
||||
assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
|
||||
(z & 248) == 240);
|
||||
#endif
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Compute value at given position. No error checking.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user