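Optimization checkpoint (textsplit: cheaper duplicate-term check, append characters via appendchartostring, add -S "no output" test option)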

This commit is contained in:
dockes 2006-11-19 18:37:37 +00:00
parent a83eab29ae
commit b3ab39522b
2 changed files with 44 additions and 36 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -93,7 +93,7 @@ static void setcharclasses()
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
int btstart, int btend) int btstart, int btend)
{ {
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos)); LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
unsigned int l = w.length(); unsigned int l = w.length();
if (l > 0 && l < (unsigned)maxWordLength) { if (l > 0 && l < (unsigned)maxWordLength) {
@ -107,12 +107,13 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
return true; return true;
} }
} }
if (pos != prevpos || l != prevterm.length() || w != prevterm) { if (pos != prevpos || l != prevlen) {
bool ret = cb->takeword(w, pos, btstart, btend); bool ret = cb->takeword(w, pos, btstart, btend);
prevterm = w; prevlen = w.length();
prevpos = pos; prevpos = pos;
return ret; return ret;
} }
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
} }
return true; return true;
} }
@ -137,11 +138,8 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
*/ */
inline bool TextSplit::doemit(bool spanerase, int bp) inline bool TextSplit::doemit(bool spanerase, int bp)
{ {
#if 0 LOGDEB3(("TextSplit::doemit: wrd [%s] wp %d spn [%s] sp %d spe %d bp %d\n",
cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" << word.c_str(), wordpos, span.c_str(), spanpos, spanerase, bp));
span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp
<< endl;
#endif
// Emit span. When splitting for query, we only emit final spans // Emit span. When splitting for query, we only emit final spans
bool spanemitted = false; bool spanemitted = false;
@ -214,8 +212,7 @@ bool TextSplit::text_to_words(const string &in)
span.erase(); span.erase();
word.erase(); // Current word: no punctuation at all in there word.erase(); // Current word: no punctuation at all in there
number = false; number = false;
prevpos = wordpos = spanpos = charpos = 0; prevpos = prevlen = wordpos = spanpos = charpos = 0;
prevterm.erase();
Utf8Iter it(in); Utf8Iter it(in);
@ -228,15 +225,15 @@ bool TextSplit::text_to_words(const string &in)
int cc = whatcc(c); int cc = whatcc(c);
switch (cc) { switch (cc) {
case LETTER: case LETTER:
word += it; it.appendchartostring(word);
span += it; it.appendchartostring(span);
break; break;
case DIGIT: case DIGIT:
if (word.length() == 0) if (word.length() == 0)
number = true; number = true;
word += it; it.appendchartostring(word);
span += it; it.appendchartostring(span);
break; break;
case SPACE: case SPACE:
@ -252,15 +249,15 @@ bool TextSplit::text_to_words(const string &in)
if (word.length() == 0) { if (word.length() == 0) {
if (whatcc(it[charpos+1]) == DIGIT) { if (whatcc(it[charpos+1]) == DIGIT) {
number = true; number = true;
word += it; it.appendchartostring(word);
span += it; it.appendchartostring(span);
} else } else
span += it; it.appendchartostring(span);
} else { } else {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; number = false;
span += it; it.appendchartostring(span);
} }
break; break;
case '.': case '.':
@ -269,8 +266,8 @@ bool TextSplit::text_to_words(const string &in)
// 132.jpg ? // 132.jpg ?
if (whatcc(it[charpos+1]) != DIGIT) if (whatcc(it[charpos+1]) != DIGIT)
goto SPACE; goto SPACE;
word += it; it.appendchartostring(word);
span += it; it.appendchartostring(span);
break; break;
} else { } else {
// If . inside a word, keep it, else, this is whitespace. // If . inside a word, keep it, else, this is whitespace.
@ -286,10 +283,10 @@ bool TextSplit::text_to_words(const string &in)
// span length could have been adjusted by trimming // span length could have been adjusted by trimming
// inside doemit // inside doemit
if (span.length()) if (span.length())
span += it; it.appendchartostring(span);
break; break;
} else { } else {
span += it; it.appendchartostring(span);
break; break;
} }
} }
@ -302,7 +299,7 @@ bool TextSplit::text_to_words(const string &in)
return false; return false;
number = false; number = false;
} }
span += it; it.appendchartostring(span);
break; break;
case '\'': case '\'':
// If in word, potential span: o'brien, else, this is more // If in word, potential span: o'brien, else, this is more
@ -311,7 +308,7 @@ bool TextSplit::text_to_words(const string &in)
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; number = false;
span += it; it.appendchartostring(span);
} }
break; break;
case '#': case '#':
@ -319,8 +316,8 @@ bool TextSplit::text_to_words(const string &in)
if (word.length() > 0) { if (word.length() > 0) {
int w = whatcc(it[charpos+1]); int w = whatcc(it[charpos+1]);
if (w == SPACE || w == '\n' || w == '\r') { if (w == SPACE || w == '\n' || w == '\r') {
word += it; it.appendchartostring(word);
span += it; it.appendchartostring(span);
break; break;
} }
} }
@ -343,8 +340,8 @@ bool TextSplit::text_to_words(const string &in)
break; break;
default: default:
word += it; it.appendchartostring(word);
span += it; it.appendchartostring(span);
break; break;
} }
} }
@ -373,10 +370,13 @@ using namespace std;
// A small class to hold state while splitting text // A small class to hold state while splitting text
class mySplitterCB : public TextSplitCB { class mySplitterCB : public TextSplitCB {
int first; int first;
bool nooutput;
public: public:
mySplitterCB() : first(1) {} mySplitterCB() : first(1), nooutput(false) {}
void setNoOut(bool val) {nooutput = val;}
bool takeword(const std::string &term, int pos, int bs, int be) { bool takeword(const std::string &term, int pos, int bs, int be) {
if (nooutput)
return true;
if (first) { if (first) {
printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
first = 0; first = 0;
@ -406,9 +406,10 @@ static string teststring1 = " 124, ";
static string thisprog; static string thisprog;
static string usage = static string usage =
" textsplit [opts] [filename]\n" " textsplit [opts] [filename]\n"
" -s: only spans\n" " -S: no output\n"
" -w: only words\n" " -s: only spans\n"
" -w: only words\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n" " \n\n"
; ;
@ -423,6 +424,7 @@ Usage(void)
static int op_flags; static int op_flags;
#define OPT_s 0x1 #define OPT_s 0x1
#define OPT_w 0x2 #define OPT_w 0x2
#define OPT_S 0x4
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
@ -437,6 +439,7 @@ int main(int argc, char **argv)
while (**argv) while (**argv)
switch (*(*argv)++) { switch (*(*argv)++) {
case 's': op_flags |= OPT_s; break; case 's': op_flags |= OPT_s; break;
case 'S': op_flags |= OPT_S; break;
case 'w': op_flags |= OPT_w; break; case 'w': op_flags |= OPT_w; break;
default: Usage(); break; default: Usage(); break;
} }
@ -444,8 +447,13 @@ int main(int argc, char **argv)
} }
DebugLog::getdbl()->setloglevel(DEBDEB1); DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::setfilename("stderr"); DebugLog::setfilename("stderr");
mySplitterCB cb; mySplitterCB cb;
TextSplit::Flags flags = TextSplit::TXTS_NONE; TextSplit::Flags flags = TextSplit::TXTS_NONE;
if (op_flags&OPT_S)
cb.setNoOut(true);
if (op_flags&OPT_s) if (op_flags&OPT_s)
flags = TextSplit::TXTS_ONLYSPANS; flags = TextSplit::TXTS_ONLYSPANS;
else if (op_flags&OPT_w) else if (op_flags&OPT_w)

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.12 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.13 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
@ -70,7 +70,7 @@ class TextSplit {
// It may happen that our cleanup would result in emitting the // It may happen that our cleanup would result in emitting the
// same term twice. We try to avoid this // same term twice. We try to avoid this
int prevpos; int prevpos;
string prevterm; unsigned int prevlen;
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be); bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
bool doemit(bool spanerase, int bp); bool doemit(bool spanerase, int bp);