optim ckpt
This commit is contained in:
parent
a83eab29ae
commit
b3ab39522b
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -93,7 +93,7 @@ static void setcharclasses()
|
|||||||
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||||
int btstart, int btend)
|
int btstart, int btend)
|
||||||
{
|
{
|
||||||
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
|
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
||||||
|
|
||||||
unsigned int l = w.length();
|
unsigned int l = w.length();
|
||||||
if (l > 0 && l < (unsigned)maxWordLength) {
|
if (l > 0 && l < (unsigned)maxWordLength) {
|
||||||
@ -107,12 +107,13 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (pos != prevpos || l != prevterm.length() || w != prevterm) {
|
if (pos != prevpos || l != prevlen) {
|
||||||
bool ret = cb->takeword(w, pos, btstart, btend);
|
bool ret = cb->takeword(w, pos, btstart, btend);
|
||||||
prevterm = w;
|
prevlen = w.length();
|
||||||
prevpos = pos;
|
prevpos = pos;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -137,11 +138,8 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
*/
|
*/
|
||||||
inline bool TextSplit::doemit(bool spanerase, int bp)
|
inline bool TextSplit::doemit(bool spanerase, int bp)
|
||||||
{
|
{
|
||||||
#if 0
|
LOGDEB3(("TextSplit::doemit: wrd [%s] wp %d spn [%s] sp %d spe %d bp %d\n",
|
||||||
cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<
|
word.c_str(), wordpos, span.c_str(), spanpos, spanerase, bp));
|
||||||
span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp
|
|
||||||
<< endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Emit span. When splitting for query, we only emit final spans
|
// Emit span. When splitting for query, we only emit final spans
|
||||||
bool spanemitted = false;
|
bool spanemitted = false;
|
||||||
@ -214,8 +212,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
span.erase();
|
span.erase();
|
||||||
word.erase(); // Current word: no punctuation at all in there
|
word.erase(); // Current word: no punctuation at all in there
|
||||||
number = false;
|
number = false;
|
||||||
prevpos = wordpos = spanpos = charpos = 0;
|
prevpos = prevlen = wordpos = spanpos = charpos = 0;
|
||||||
prevterm.erase();
|
|
||||||
|
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
|
|
||||||
@ -228,15 +225,15 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
int cc = whatcc(c);
|
int cc = whatcc(c);
|
||||||
switch (cc) {
|
switch (cc) {
|
||||||
case LETTER:
|
case LETTER:
|
||||||
word += it;
|
it.appendchartostring(word);
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case DIGIT:
|
case DIGIT:
|
||||||
if (word.length() == 0)
|
if (word.length() == 0)
|
||||||
number = true;
|
number = true;
|
||||||
word += it;
|
it.appendchartostring(word);
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SPACE:
|
case SPACE:
|
||||||
@ -252,15 +249,15 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (word.length() == 0) {
|
if (word.length() == 0) {
|
||||||
if (whatcc(it[charpos+1]) == DIGIT) {
|
if (whatcc(it[charpos+1]) == DIGIT) {
|
||||||
number = true;
|
number = true;
|
||||||
word += it;
|
it.appendchartostring(word);
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
} else
|
} else
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
} else {
|
} else {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '.':
|
case '.':
|
||||||
@ -269,8 +266,8 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
// 132.jpg ?
|
// 132.jpg ?
|
||||||
if (whatcc(it[charpos+1]) != DIGIT)
|
if (whatcc(it[charpos+1]) != DIGIT)
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
word += it;
|
it.appendchartostring(word);
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// If . inside a word, keep it, else, this is whitespace.
|
// If . inside a word, keep it, else, this is whitespace.
|
||||||
@ -286,10 +283,10 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
// span length could have been adjusted by trimming
|
// span length could have been adjusted by trimming
|
||||||
// inside doemit
|
// inside doemit
|
||||||
if (span.length())
|
if (span.length())
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -302,7 +299,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
}
|
}
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
case '\'':
|
case '\'':
|
||||||
// If in word, potential span: o'brien, else, this is more
|
// If in word, potential span: o'brien, else, this is more
|
||||||
@ -311,7 +308,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
number = false;
|
number = false;
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '#':
|
case '#':
|
||||||
@ -319,8 +316,8 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (word.length() > 0) {
|
if (word.length() > 0) {
|
||||||
int w = whatcc(it[charpos+1]);
|
int w = whatcc(it[charpos+1]);
|
||||||
if (w == SPACE || w == '\n' || w == '\r') {
|
if (w == SPACE || w == '\n' || w == '\r') {
|
||||||
word += it;
|
it.appendchartostring(word);
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -343,8 +340,8 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
word += it;
|
it.appendchartostring(word);
|
||||||
span += it;
|
it.appendchartostring(span);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -373,10 +370,13 @@ using namespace std;
|
|||||||
// A small class to hold state while splitting text
|
// A small class to hold state while splitting text
|
||||||
class mySplitterCB : public TextSplitCB {
|
class mySplitterCB : public TextSplitCB {
|
||||||
int first;
|
int first;
|
||||||
|
bool nooutput;
|
||||||
public:
|
public:
|
||||||
mySplitterCB() : first(1) {}
|
mySplitterCB() : first(1), nooutput(false) {}
|
||||||
|
void setNoOut(bool val) {nooutput = val;}
|
||||||
bool takeword(const std::string &term, int pos, int bs, int be) {
|
bool takeword(const std::string &term, int pos, int bs, int be) {
|
||||||
|
if (nooutput)
|
||||||
|
return true;
|
||||||
if (first) {
|
if (first) {
|
||||||
printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
||||||
first = 0;
|
first = 0;
|
||||||
@ -406,9 +406,10 @@ static string teststring1 = " 124, ";
|
|||||||
static string thisprog;
|
static string thisprog;
|
||||||
|
|
||||||
static string usage =
|
static string usage =
|
||||||
" textsplit [opts] [filename]\n"
|
" textsplit [opts] [filename]\n"
|
||||||
" -s: only spans\n"
|
" -S: no output\n"
|
||||||
" -w: only words\n"
|
" -s: only spans\n"
|
||||||
|
" -w: only words\n"
|
||||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
||||||
" \n\n"
|
" \n\n"
|
||||||
;
|
;
|
||||||
@ -423,6 +424,7 @@ Usage(void)
|
|||||||
static int op_flags;
|
static int op_flags;
|
||||||
#define OPT_s 0x1
|
#define OPT_s 0x1
|
||||||
#define OPT_w 0x2
|
#define OPT_w 0x2
|
||||||
|
#define OPT_S 0x4
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
@ -437,6 +439,7 @@ int main(int argc, char **argv)
|
|||||||
while (**argv)
|
while (**argv)
|
||||||
switch (*(*argv)++) {
|
switch (*(*argv)++) {
|
||||||
case 's': op_flags |= OPT_s; break;
|
case 's': op_flags |= OPT_s; break;
|
||||||
|
case 'S': op_flags |= OPT_S; break;
|
||||||
case 'w': op_flags |= OPT_w; break;
|
case 'w': op_flags |= OPT_w; break;
|
||||||
default: Usage(); break;
|
default: Usage(); break;
|
||||||
}
|
}
|
||||||
@ -444,8 +447,13 @@ int main(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
||||||
DebugLog::setfilename("stderr");
|
DebugLog::setfilename("stderr");
|
||||||
|
|
||||||
mySplitterCB cb;
|
mySplitterCB cb;
|
||||||
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
||||||
|
|
||||||
|
if (op_flags&OPT_S)
|
||||||
|
cb.setNoOut(true);
|
||||||
|
|
||||||
if (op_flags&OPT_s)
|
if (op_flags&OPT_s)
|
||||||
flags = TextSplit::TXTS_ONLYSPANS;
|
flags = TextSplit::TXTS_ONLYSPANS;
|
||||||
else if (op_flags&OPT_w)
|
else if (op_flags&OPT_w)
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||||
#define _TEXTSPLIT_H_INCLUDED_
|
#define _TEXTSPLIT_H_INCLUDED_
|
||||||
/* @(#$Id: textsplit.h,v 1.12 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: textsplit.h,v 1.13 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
@ -70,7 +70,7 @@ class TextSplit {
|
|||||||
// It may happen that our cleanup would result in emitting the
|
// It may happen that our cleanup would result in emitting the
|
||||||
// same term twice. We try to avoid this
|
// same term twice. We try to avoid this
|
||||||
int prevpos;
|
int prevpos;
|
||||||
string prevterm;
|
unsigned int prevlen;
|
||||||
|
|
||||||
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
|
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
|
||||||
bool doemit(bool spanerase, int bp);
|
bool doemit(bool spanerase, int bp);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user