*** empty log message ***

This commit is contained in:
dockes 2006-01-28 15:36:59 +00:00
parent 8c9eb8c6d3
commit 3c78938565
2 changed files with 53 additions and 38 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.17 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.18 2006-01-28 15:36:59 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -163,8 +163,7 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
* @param spanerase Set if the current span is at its end. Reset it. * @param spanerase Set if the current span is at its end. Reset it.
* @param bp The current BYTE position in the stream * @param bp The current BYTE position in the stream
*/ */
inline bool TextSplit::doemit(string &word, int &wordpos, string &span, inline bool TextSplit::doemit(bool spanerase, int bp)
int &spanpos, bool spanerase, int bp)
{ {
#if 0 #if 0
cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" << cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<
@ -216,12 +215,10 @@ bool TextSplit::text_to_words(const string &in)
setcharclasses(); setcharclasses();
string span; // Current span. Might be jf.dockes@wanadoo.fr span.clear();
string word; // Current word: no punctuation at all in there word.clear(); // Current word: no punctuation at all in there
bool number = false; number = false;
int wordpos = 0; // Term position of current word wordpos = spanpos = charpos = 0;
int spanpos = 0; // Term position of current span
int charpos = 0; // Character position
Utf8Iter it(in); Utf8Iter it(in);
@ -236,7 +233,7 @@ bool TextSplit::text_to_words(const string &in)
case SPACE: case SPACE:
SPACE: SPACE:
if (word.length() || span.length()) { if (word.length() || span.length()) {
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos())) if (!doemit(true, it.getBpos()))
return false; return false;
number = false; number = false;
} }
@ -251,7 +248,7 @@ bool TextSplit::text_to_words(const string &in)
} else } else
span += it; span += it;
} else { } else {
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; number = false;
span += it; span += it;
@ -265,34 +262,41 @@ bool TextSplit::text_to_words(const string &in)
break; break;
} else { } else {
// If . inside a word, keep it, else, this is whitespace. // If . inside a word, keep it, else, this is whitespace.
// We also keep an initial '.' for catching .net, but this adds
// quite a few spurious terms !
// Another problem is that something like .x-errs
// will be split as .x-errs, x, errs but not x-errs
// A final comma in a word will be removed by doemit // A final comma in a word will be removed by doemit
if (cc == '.' && word.length()) { if (cc == '.') {
if (!doemit(word, wordpos, span, spanpos, false, if (word.length()) {
it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
// span length could have been adjusted by trimming // span length could have been adjusted by trimming
// inside doemit // inside doemit
if (span.length()) if (span.length())
span += it;
break;
} else {
span += it; span += it;
break; break;
}
} }
} }
goto SPACE; goto SPACE;
break; break;
case '@': case '@':
if (word.length()) { if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; number = false;
} else }
word += it;
span += it; span += it;
break; break;
case '\'': case '\'':
// If in word, potential span: o'brien, else, this is more // If in word, potential span: o'brien, else, this is more
// whitespace // whitespace
if (word.length()) { if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
number = false; number = false;
span += it; span += it;
@ -337,7 +341,7 @@ bool TextSplit::text_to_words(const string &in)
} }
} }
if (word.length() || span.length()) { if (word.length() || span.length()) {
if (!doemit(word, wordpos, span, spanpos, true, it.getBpos())) if (!doemit(true, it.getBpos()))
return false; return false;
} }
return true; return true;
@ -374,7 +378,7 @@ class mySplitterCB : public TextSplitCB {
} }
}; };
static string teststring1 = static string teststring =
"Un bout de texte \nnormal. jfd@okyz.com \n" "Un bout de texte \nnormal. jfd@okyz.com \n"
"Ceci. Est;Oui n@d @net .net t@v@c c# c++ -10 o'brien l'ami \n" "Ceci. Est;Oui n@d @net .net t@v@c c# c++ -10 o'brien l'ami \n"
"a 134 +134 -14 -1.5 +1.5 1.54e10 a @^#$(#$(*) 1,2 1,2e30\n" "a 134 +134 -14 -1.5 +1.5 1.54e10 a @^#$(#$(*) 1,2 1,2e30\n"
@ -384,8 +388,9 @@ static string teststring1 =
"M9R F($AA;F1L:6YG\"0D)\"0D@(\" @(#4P, T)0W)A=&4)\"0D)\"2 @,C4P#0E3" "M9R F($AA;F1L:6YG\"0D)\"0D@(\" @(#4P, T)0W)A=&4)\"0D)\"2 @,C4P#0E3"
" ,able,test-domain " " ,able,test-domain "
" -wl,--export-dynamic " " -wl,--export-dynamic "
" ~/.xsession-errors "
; ;
static string teststring = " -wl,--export-dynamic "; static string teststring1 = " ~/.xsession-errors ";
static string thisprog; static string thisprog;

View File

@ -1,6 +1,6 @@
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.9 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.10 2006-01-28 15:36:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
@ -27,26 +27,36 @@ class TextSplitCB {
* but it's much simpler this way... * but it's much simpler this way...
*/ */
class TextSplit { class TextSplit {
bool fq; // for query: Are we splitting for query or index ?
// It may happen that our cleanup would result in emitting the
// same term twice. We try to avoid this
string prevterm;
int prevpos;
TextSplitCB *cb;
int maxWordLength;
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
bool doemit(string &word, int &wordpos, string &span, int &spanpos,
bool spanerase, int bp);
public: public:
/** /**
* Constructor: just store callback object * Constructor: just store callback object
*/ */
TextSplit(TextSplitCB *t, bool forquery = false) TextSplit(TextSplitCB *t, bool forquery = false)
: fq(forquery), prevpos(-1), cb(t), maxWordLength(40) {} : fq(forquery), cb(t), maxWordLength(40), prevpos(-1) {}
/** /**
* Split text, emit words and positions. * Split text, emit words and positions.
*/ */
bool text_to_words(const std::string &in); bool text_to_words(const std::string &in);
private:
bool fq; // for query: Are we splitting for query or index ?
TextSplitCB *cb;
int maxWordLength;
string span; // Current span. Might be jf.dockes@wanadoo.fr
string word; // Current word: no punctuation at all in there
bool number;
int wordpos; // Term position of current word
int spanpos; // Term position of current span
int charpos; // Character position
// It may happen that our cleanup would result in emitting the
// same term twice. We try to avoid this
int prevpos;
string prevterm;
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
bool doemit(bool spanerase, int bp);
}; };
#endif /* _TEXTSPLIT_H_INCLUDED_ */ #endif /* _TEXTSPLIT_H_INCLUDED_ */