make cjk ngramlen configurable
This commit is contained in:
parent
75d251a6b5
commit
90e378333e
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.51 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.52 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -136,7 +136,12 @@ RclConfig::RclConfig(const string *argcnf)
|
|||||||
if (getConfParam("nocjk", &nocjk) && nocjk == true) {
|
if (getConfParam("nocjk", &nocjk) && nocjk == true) {
|
||||||
TextSplit::cjkProcessing(false);
|
TextSplit::cjkProcessing(false);
|
||||||
} else {
|
} else {
|
||||||
TextSplit::cjkProcessing(true);
|
int ngramlen;
|
||||||
|
if (getConfParam("cjkngramlen", &ngramlen)) {
|
||||||
|
TextSplit::cjkProcessing(true, (unsigned int)ngramlen);
|
||||||
|
} else {
|
||||||
|
TextSplit::cjkProcessing(true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
m_ok = true;
|
m_ok = true;
|
||||||
return;
|
return;
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.34 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.35 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -141,7 +141,8 @@ static inline int whatcc(unsigned int c)
|
|||||||
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|
||||||
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
|
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
|
||||||
|
|
||||||
bool TextSplit::t_processCJK = true;
|
bool TextSplit::o_processCJK = true;
|
||||||
|
unsigned int TextSplit::o_CJKNgramLen = 2;
|
||||||
|
|
||||||
// Do some checking (the kind which is simpler to do here than in the
|
// Do some checking (the kind which is simpler to do here than in the
|
||||||
// main loop), then send term to our client.
|
// main loop), then send term to our client.
|
||||||
@ -246,12 +247,12 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
|
|||||||
*/
|
*/
|
||||||
bool TextSplit::text_to_words(const string &in)
|
bool TextSplit::text_to_words(const string &in)
|
||||||
{
|
{
|
||||||
LOGDEB1(("TextSplit::text_to_words: docjk %d %s%s%s [%s]\n",
|
LOGDEB1(("TextSplit::text_to_words: docjk %d (%d) %s%s%s [%s]\n",
|
||||||
t_processCJK,
|
o_processCJK, o_CJKNgramLen,
|
||||||
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
||||||
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
||||||
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
||||||
in.substr(0,50).c_str()));
|
in.substr(0,50).c_str()));
|
||||||
|
|
||||||
setcharclasses();
|
setcharclasses();
|
||||||
|
|
||||||
@ -269,7 +270,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (t_processCJK && UNICODE_IS_CJK(c)) {
|
if (o_processCJK && UNICODE_IS_CJK(c)) {
|
||||||
// CJK character hit.
|
// CJK character hit.
|
||||||
// Do like at EOF with the current non-cjk data.
|
// Do like at EOF with the current non-cjk data.
|
||||||
if (m_wordLen || m_span.length()) {
|
if (m_wordLen || m_span.length()) {
|
||||||
@ -421,9 +422,6 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
const unsigned int ngramlen = 2;
|
|
||||||
#define MAXNGRAMLEN 5
|
|
||||||
|
|
||||||
// Using an utf8iter pointer just to avoid needing its definition in
|
// Using an utf8iter pointer just to avoid needing its definition in
|
||||||
// textsplit.h
|
// textsplit.h
|
||||||
//
|
//
|
||||||
@ -442,9 +440,8 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
|
|
||||||
// We use an offset buffer to remember the starts of the utf-8
|
// We use an offset buffer to remember the starts of the utf-8
|
||||||
// characters which we still need to use.
|
// characters which we still need to use.
|
||||||
// Fixed size array. ngramlen over 3 doesn't make sense.
|
assert(o_CJKNgramLen < o_CJKMaxNgramLen);
|
||||||
assert(ngramlen < MAXNGRAMLEN);
|
unsigned int boffs[o_CJKMaxNgramLen+1];
|
||||||
unsigned int boffs[MAXNGRAMLEN];
|
|
||||||
|
|
||||||
// Current number of valid offsets;
|
// Current number of valid offsets;
|
||||||
unsigned int nchars = 0;
|
unsigned int nchars = 0;
|
||||||
@ -456,7 +453,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nchars == ngramlen) {
|
if (nchars == o_CJKNgramLen) {
|
||||||
// Offset buffer full, shift it. Might be more efficient
|
// Offset buffer full, shift it. Might be more efficient
|
||||||
// to have a circular one, but things are complicated
|
// to have a circular one, but things are complicated
|
||||||
// enough already...
|
// enough already...
|
||||||
@ -473,7 +470,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
// Output all new ngrams: they begin at each existing position
|
// Output all new ngrams: they begin at each existing position
|
||||||
// and end after the new character. onlyspans->only output
|
// and end after the new character. onlyspans->only output
|
||||||
// maximum words, nospans=> single chars
|
// maximum words, nospans=> single chars
|
||||||
if (!(m_flags & TXTS_ONLYSPANS) || nchars == ngramlen) {
|
if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
|
||||||
unsigned int btend = it.getBpos() + it.getBlen();
|
unsigned int btend = it.getBpos() + it.getBlen();
|
||||||
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
||||||
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
||||||
@ -497,7 +494,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
|||||||
|
|
||||||
// If onlyspans is set, there may be things to flush in the buffer
|
// If onlyspans is set, there may be things to flush in the buffer
|
||||||
// first
|
// first
|
||||||
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != ngramlen) {
|
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
||||||
unsigned int btend = it.getBpos(); // Current char is out
|
unsigned int btend = it.getBpos(); // Current char is out
|
||||||
if (!m_cb->takeword(it.buffer().substr(boffs[0],
|
if (!m_cb->takeword(it.buffer().substr(boffs[0],
|
||||||
btend-boffs[0]),
|
btend-boffs[0]),
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.19 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.20 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -38,6 +38,7 @@ public:
|
|||||||
|
|
||||||
class Utf8Iter;
|
class Utf8Iter;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Split text into words.
|
* Split text into words.
|
||||||
* See comments at top of .cpp for more explanations.
|
* See comments at top of .cpp for more explanations.
|
||||||
@ -48,8 +49,15 @@ class TextSplit {
|
|||||||
public:
|
public:
|
||||||
// Should we activate special processing of Chinese characters ? This
|
// Should we activate special processing of Chinese characters ? This
|
||||||
// needs a little more cpu, so it can be turned off globally.
|
// needs a little more cpu, so it can be turned off globally.
|
||||||
static bool t_processCJK;
|
static bool o_processCJK;
|
||||||
static void cjkProcessing(bool onoff) {t_processCJK = onoff;}
|
static unsigned int o_CJKNgramLen;
|
||||||
|
static const unsigned int o_CJKMaxNgramLen = 5;
|
||||||
|
static void cjkProcessing(bool onoff, unsigned int ngramlen = 2)
|
||||||
|
{
|
||||||
|
o_processCJK = onoff;
|
||||||
|
o_CJKNgramLen = ngramlen <= o_CJKMaxNgramLen ?
|
||||||
|
ngramlen : o_CJKMaxNgramLen;
|
||||||
|
}
|
||||||
|
|
||||||
enum Flags {TXTS_NONE = 0,
|
enum Flags {TXTS_NONE = 0,
|
||||||
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user