From 4adb351ca4082733217a84593401655797a064a2 Mon Sep 17 00:00:00 2001 From: dockes Date: Tue, 2 Oct 2007 11:39:08 +0000 Subject: [PATCH] add flag to disable cjk processing --- src/common/rclconfig.cpp | 10 ++++++++-- src/common/textsplit.cpp | 14 ++++++-------- src/common/textsplit.h | 12 +++++++----- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index 322ff4c0..fe0442e6 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.50 2007-10-01 06:19:21 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.51 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -39,6 +39,7 @@ static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.50 2007-10-01 06:19:21 dockes E #include "conftree.h" #include "debuglog.h" #include "smallut.h" +#include "textsplit.h" #ifndef NO_NAMESPACES using namespace std; @@ -131,7 +132,12 @@ RclConfig::RclConfig(const string *argcnf) } setKeyDir(""); - + bool nocjk = false; + if (getConfParam("nocjk", &nocjk) && nocjk == true) { + TextSplit::cjkProcessing(false); + } else { + TextSplit::cjkProcessing(true); + } m_ok = true; return; } diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 7e3735b4..7ab77c42 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.33 2007-09-22 08:51:29 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.34 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -141,6 +141,8 @@ static inline int whatcc(unsigned int c) || ((p) >= 0x20000 && (p) <= 0x2A6DF) \ || ((p) >= 0x2F800 && (p) <= 0x2FA1F)) +bool TextSplit::t_processCJK = 
true; + // Do some checking (the kind which is simpler to do here than in the // main loop), then send term to our client. inline bool TextSplit::emitterm(bool isspan, string &w, int pos, @@ -244,11 +246,11 @@ inline bool TextSplit::doemit(bool spanerase, int bp) */ bool TextSplit::text_to_words(const string &in) { - LOGDEB1(("TextSplit::text_to_words:%s%s%s%s [%s]\n", + LOGDEB1(("TextSplit::text_to_words: docjk %d %s%s%s [%s]\n", + t_processCJK, m_flags & TXTS_NOSPANS ? " nospans" : "", m_flags & TXTS_ONLYSPANS ? " onlyspans" : "", m_flags & TXTS_KEEPWILD ? " keepwild" : "", - m_flags & TXTS_NOCJK ? " nocjk" : "", in.substr(0,50).c_str())); setcharclasses(); @@ -267,7 +269,7 @@ bool TextSplit::text_to_words(const string &in) return false; } - if (!m_nocjk && UNICODE_IS_CJK(c)) { + if (t_processCJK && UNICODE_IS_CJK(c)) { // CJK character hit. // Do like at EOF with the current non-cjk data. if (m_wordLen || m_span.length()) { @@ -592,7 +594,6 @@ static string usage = " -s: only spans\n" " -w: only words\n" " -k: preserve wildcards (?*)\n" - " -C: desactivate CJK processing\n" " -c: just count words\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n" " \n\n" @@ -651,9 +652,6 @@ int main(int argc, char **argv) if (op_flags & OPT_k) flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); - if (op_flags & OPT_C) - flags = (TextSplit::Flags)(flags | TextSplit::TXTS_NOCJK); - string data; if (argc == 1) { const char *filename = *argv++; argc--; diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 4e74d386..2981fd67 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -16,7 +16,7 @@ */ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.18 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.19 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes */ #include <string> #ifndef NO_NAMESPACES @@ -46,11 +46,15 @@ class Utf8Iter; */ class TextSplit { public: + // 
Should we activate special processing of CJK characters? This + // needs a little more CPU, so it can be turned off globally. + static bool t_processCJK; + static void cjkProcessing(bool onoff) {t_processCJK = onoff;} + enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com) TXTS_NOSPANS = 2, // Only return atomic words (a, b, com) - TXTS_KEEPWILD = 4, // Handle wildcards as letters - TXTS_NOCJK = 8 // CJK special processing + TXTS_KEEPWILD = 4 // Handle wildcards as letters }; /** @@ -58,7 +62,6 @@ public: */ TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE)) : m_flags(flags), m_cb(t), m_maxWordLength(40), - m_nocjk((m_flags & TXTS_NOCJK) != 0), m_prevpos(-1) { } @@ -76,7 +79,6 @@ private: Flags m_flags; TextSplitCB *m_cb; int m_maxWordLength; - int m_nocjk; // Current span. Might be jf.dockes@wanadoo.f string m_span;