From b3ab39522b088c324f1a5e63d0133a0b19aa6b3d Mon Sep 17 00:00:00 2001 From: dockes Date: Sun, 19 Nov 2006 18:37:37 +0000 Subject: [PATCH] optim ckpt --- src/common/textsplit.cpp | 76 ++++++++++++++++++++++------------------ src/common/textsplit.h | 4 +-- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 7cf3cd97..cc44df84 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -93,7 +93,7 @@ static void setcharclasses() inline bool TextSplit::emitterm(bool isspan, string &w, int pos, int btstart, int btend) { - LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos)); + LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos)); unsigned int l = w.length(); if (l > 0 && l < (unsigned)maxWordLength) { @@ -107,12 +107,13 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, return true; } } - if (pos != prevpos || l != prevterm.length() || w != prevterm) { + if (pos != prevpos || l != prevlen) { bool ret = cb->takeword(w, pos, btstart, btend); - prevterm = w; + prevlen = w.length(); prevpos = pos; return ret; } + LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos)); } return true; } @@ -137,11 +138,8 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, */ inline bool TextSplit::doemit(bool spanerase, int bp) { -#if 0 - cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" << - span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp - << endl; -#endif + LOGDEB3(("TextSplit::doemit: wrd [%s] wp %d spn [%s] sp %d spe %d bp %d\n", + word.c_str(), wordpos, span.c_str(), spanpos, spanerase, bp)); // Emit span. When splitting for query, we only emit final spans bool spanemitted = false; @@ -214,8 +212,7 @@ bool TextSplit::text_to_words(const string &in) span.erase(); word.erase(); // Current word: no punctuation at all in there number = false; - prevpos = wordpos = spanpos = charpos = 0; - prevterm.erase(); + prevpos = prevlen = wordpos = spanpos = charpos = 0; Utf8Iter it(in); @@ -228,15 +225,15 @@ bool TextSplit::text_to_words(const string &in) int cc = whatcc(c); switch (cc) { case LETTER: - word += it; - span += it; + it.appendchartostring(word); + it.appendchartostring(span); break; case DIGIT: if (word.length() == 0) number = true; - word += it; - span += it; + it.appendchartostring(word); + it.appendchartostring(span); break; case SPACE: @@ -252,15 +249,15 @@ bool TextSplit::text_to_words(const string &in) if (word.length() == 0) { if (whatcc(it[charpos+1]) == DIGIT) { number = true; - word += it; - span += it; + it.appendchartostring(word); + it.appendchartostring(span); } else - span += it; + it.appendchartostring(span); } else { if (!doemit(false, it.getBpos())) return false; number = false; - span += it; + it.appendchartostring(span); } break; case '.': @@ -269,8 +266,8 @@ bool TextSplit::text_to_words(const string &in) // 132.jpg ? if (whatcc(it[charpos+1]) != DIGIT) goto SPACE; - word += it; - span += it; + it.appendchartostring(word); + it.appendchartostring(span); break; } else { // If . inside a word, keep it, else, this is whitespace. @@ -286,10 +283,10 @@ bool TextSplit::text_to_words(const string &in) // span length could have been adjusted by trimming // inside doemit if (span.length()) - span += it; + it.appendchartostring(span); break; } else { - span += it; + it.appendchartostring(span); break; } } @@ -302,7 +299,7 @@ bool TextSplit::text_to_words(const string &in) return false; number = false; } - span += it; + it.appendchartostring(span); break; case '\'': // If in word, potential span: o'brien, else, this is more @@ -311,7 +308,7 @@ bool TextSplit::text_to_words(const string &in) if (!doemit(false, it.getBpos())) return false; number = false; - span += it; + it.appendchartostring(span); } break; case '#': @@ -319,8 +316,8 @@ bool TextSplit::text_to_words(const string &in) if (word.length() > 0) { int w = whatcc(it[charpos+1]); if (w == SPACE || w == '\n' || w == '\r') { - word += it; - span += it; + it.appendchartostring(word); + it.appendchartostring(span); break; } } @@ -343,8 +340,8 @@ bool TextSplit::text_to_words(const string &in) break; default: - word += it; - span += it; + it.appendchartostring(word); + it.appendchartostring(span); break; } } @@ -373,10 +370,13 @@ using namespace std; // A small class to hold state while splitting text class mySplitterCB : public TextSplitCB { int first; + bool nooutput; public: - mySplitterCB() : first(1) {} - + mySplitterCB() : first(1), nooutput(false) {} + void setNoOut(bool val) {nooutput = val;} bool takeword(const std::string &term, int pos, int bs, int be) { + if (nooutput) + return true; if (first) { printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); first = 0; @@ -406,9 +406,10 @@ static string teststring1 = " 124, "; static string thisprog; static string usage = - " textsplit [opts] [filename]\n" - " -s: only spans\n" - " -w: only words\n" + " textsplit [opts] [filename]\n" + " -S: no output\n" + " -s: only spans\n" + " -w: only words\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n" " \n\n" ; @@ -423,6 +424,7 @@ Usage(void) static int op_flags; #define OPT_s 0x1 #define OPT_w 0x2 +#define OPT_S 0x4 int main(int argc, char **argv) { @@ -437,6 +439,7 @@ int main(int argc, char **argv) while (**argv) switch (*(*argv)++) { case 's': op_flags |= OPT_s; break; + case 'S': op_flags |= OPT_S; break; case 'w': op_flags |= OPT_w; break; default: Usage(); break; } @@ -444,8 +447,13 @@ int main(int argc, char **argv) } DebugLog::getdbl()->setloglevel(DEBDEB1); DebugLog::setfilename("stderr"); + mySplitterCB cb; TextSplit::Flags flags = TextSplit::TXTS_NONE; + + if (op_flags&OPT_S) + cb.setNoOut(true); + if (op_flags&OPT_s) flags = TextSplit::TXTS_ONLYSPANS; else if (op_flags&OPT_w) diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 05012ed4..ae46c7de 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -16,7 +16,7 @@ */ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.12 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.13 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes */ #include #ifndef NO_NAMESPACES @@ -70,7 +70,7 @@ class TextSplit { // It may happen that our cleanup would result in emitting the // same term twice. We try to avoid this int prevpos; - string prevterm; + unsigned int prevlen; bool emitterm(bool isspan, std::string &term, int pos, int bs, int be); bool doemit(bool spanerase, int bp);