Add a flag to disable CJK processing
This commit is contained in:
parent
ad04604255
commit
4adb351ca4
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.50 2007-10-01 06:19:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.51 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -39,6 +39,7 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.50 2007-10-01 06:19:21 dockes E
|
|||||||
#include "conftree.h"
|
#include "conftree.h"
|
||||||
#include "debuglog.h"
|
#include "debuglog.h"
|
||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
|
#include "textsplit.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using namespace std;
|
using namespace std;
|
||||||
@ -131,7 +132,12 @@ RclConfig::RclConfig(const string *argcnf)
|
|||||||
}
|
}
|
||||||
|
|
||||||
setKeyDir("");
|
setKeyDir("");
|
||||||
|
bool nocjk = false;
|
||||||
|
if (getConfParam("nocjk", &nocjk) && nocjk == true) {
|
||||||
|
TextSplit::cjkProcessing(false);
|
||||||
|
} else {
|
||||||
|
TextSplit::cjkProcessing(true);
|
||||||
|
}
|
||||||
m_ok = true;
|
m_ok = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.33 2007-09-22 08:51:29 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.34 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -141,6 +141,8 @@ static inline int whatcc(unsigned int c)
|
|||||||
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|
||||||
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
|
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
|
||||||
|
|
||||||
|
// Storage for the class-wide CJK switch. It starts out true, so the
// CJK-specific branch in text_to_words() is taken unless the flag is
// cleared through TextSplit::cjkProcessing(false).
bool TextSplit::t_processCJK = true;
|
||||||
|
|
||||||
// Do some checking (the kind which is simpler to do here than in the
|
// Do some checking (the kind which is simpler to do here than in the
|
||||||
// main loop), then send term to our client.
|
// main loop), then send term to our client.
|
||||||
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||||
@ -244,11 +246,11 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
|
|||||||
*/
|
*/
|
||||||
bool TextSplit::text_to_words(const string &in)
|
bool TextSplit::text_to_words(const string &in)
|
||||||
{
|
{
|
||||||
LOGDEB1(("TextSplit::text_to_words:%s%s%s%s [%s]\n",
|
LOGDEB1(("TextSplit::text_to_words: docjk %d %s%s%s [%s]\n",
|
||||||
|
t_processCJK,
|
||||||
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
||||||
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
||||||
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
||||||
m_flags & TXTS_NOCJK ? " nocjk" : "",
|
|
||||||
in.substr(0,50).c_str()));
|
in.substr(0,50).c_str()));
|
||||||
|
|
||||||
setcharclasses();
|
setcharclasses();
|
||||||
@ -267,7 +269,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!m_nocjk && UNICODE_IS_CJK(c)) {
|
if (t_processCJK && UNICODE_IS_CJK(c)) {
|
||||||
// CJK character hit.
|
// CJK character hit.
|
||||||
// Do like at EOF with the current non-cjk data.
|
// Do like at EOF with the current non-cjk data.
|
||||||
if (m_wordLen || m_span.length()) {
|
if (m_wordLen || m_span.length()) {
|
||||||
@ -592,7 +594,6 @@ static string usage =
|
|||||||
" -s: only spans\n"
|
" -s: only spans\n"
|
||||||
" -w: only words\n"
|
" -w: only words\n"
|
||||||
" -k: preserve wildcards (?*)\n"
|
" -k: preserve wildcards (?*)\n"
|
||||||
" -C: desactivate CJK processing\n"
|
|
||||||
" -c: just count words\n"
|
" -c: just count words\n"
|
||||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
||||||
" \n\n"
|
" \n\n"
|
||||||
@ -651,9 +652,6 @@ int main(int argc, char **argv)
|
|||||||
if (op_flags & OPT_k)
|
if (op_flags & OPT_k)
|
||||||
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
|
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD);
|
||||||
|
|
||||||
if (op_flags & OPT_C)
|
|
||||||
flags = (TextSplit::Flags)(flags | TextSplit::TXTS_NOCJK);
|
|
||||||
|
|
||||||
string data;
|
string data;
|
||||||
if (argc == 1) {
|
if (argc == 1) {
|
||||||
const char *filename = *argv++; argc--;
|
const char *filename = *argv++; argc--;
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.18 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.19 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -46,11 +46,15 @@ class Utf8Iter;
|
|||||||
*/
|
*/
|
||||||
class TextSplit {
|
class TextSplit {
|
||||||
public:
|
public:
|
||||||
|
// Should we activate special processing of CJK characters? This
|
||||||
|
// needs a little more CPU, so it can be turned off globally.
|
||||||
|
static bool t_processCJK;
|
||||||
|
// Turn the special handling of CJK characters on (true) or off (false).
// The value is stored in the static t_processCJK flag, so the setting is
// shared by every TextSplit object.
static void cjkProcessing(bool onoff)
{
    t_processCJK = onoff;
}
|
||||||
|
|
||||||
enum Flags {TXTS_NONE = 0,
|
enum Flags {TXTS_NONE = 0,
|
||||||
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
||||||
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
||||||
TXTS_KEEPWILD = 4, // Handle wildcards as letters
|
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
||||||
TXTS_NOCJK = 8 // CJK special processing
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -58,7 +62,6 @@ public:
|
|||||||
*/
|
*/
|
||||||
TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
|
TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
|
||||||
: m_flags(flags), m_cb(t), m_maxWordLength(40),
|
: m_flags(flags), m_cb(t), m_maxWordLength(40),
|
||||||
m_nocjk((m_flags & TXTS_NOCJK) != 0),
|
|
||||||
m_prevpos(-1)
|
m_prevpos(-1)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@ -76,7 +79,6 @@ private:
|
|||||||
Flags m_flags;
|
Flags m_flags;
|
||||||
TextSplitCB *m_cb;
|
TextSplitCB *m_cb;
|
||||||
int m_maxWordLength;
|
int m_maxWordLength;
|
||||||
int m_nocjk;
|
|
||||||
|
|
||||||
// Current span. Might be jf.dockes@wanadoo.f
|
// Current span. Might be jf.dockes@wanadoo.f
|
||||||
string m_span;
|
string m_span;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user