From d8297680b1b85a9944d46bf320c66b1cec519436 Mon Sep 17 00:00:00 2001 From: dockes Date: Thu, 22 Sep 2005 11:10:11 +0000 Subject: [PATCH] fix problems with word followed by . --- src/common/textsplit.cpp | 72 ++++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index f825571b..1687a6cd 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.10 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.11 2005-09-22 11:10:11 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_TEXTSPLIT @@ -19,7 +19,7 @@ using namespace std; * (ok for UTF-8, ascii, iso8859* and quite a few others). * * We work in a way which would make it quite difficult to handle non-ascii - * separator chars (en-dash,etc.). We would then need to actually parse the + * separator chars (en-dash, etc.). We would then need to actually parse the * utf-8 stream, and use a different way to classify the characters (instead * of a 256 slot array). * @@ -27,9 +27,9 @@ using namespace std; * * How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first. * Then specialcase all 'real' utf chars, by checking for the few - punctuation ones we're interested in (put them in a map). Then - classify all other non-ascii as letter, and use the current method - for chars < 127. + * punctuation ones we're interested in (put them in a map). Then + * classify all other non-ascii as letter, and use the current method + * for chars < 127. */ // Character classes: we have three main groups, and then some chars @@ -75,13 +75,18 @@ static void setcharclasses() unicign.insert(uniign[i]); } -// Do some cleanup (the kind which is simpler to do here than in the main loop, -// then send term to our client. 
+// Do some cleanup (the kind which is simpler to do here than in the +// main loop), then send term to our client. bool TextSplit::emitterm(bool isspan, string &w, int pos, int btstart, int btend) { LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos)); + // It may happen that our cleanup would result in emitting the + // same term twice. We try to avoid this + static string prevterm; + static int prevpos = -1; + if (!cb) return false; @@ -123,8 +128,12 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos, } } if (w.length() > 0 && w.length() < (unsigned)maxWordLength) { - bool ret = cb->takeword(w, pos, btstart, btend); - return ret; + if (w != prevterm || pos != prevpos) { + bool ret = cb->takeword(w, pos, btstart, btend); + prevterm = w; + prevpos = pos; + return ret; + } } return true; } @@ -135,6 +144,12 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos, bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos, bool spanerase, int bp) { +#if 0 + cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" << + span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp + << endl; +#endif + // When splitting for query, we only emit final spans if (fq && !spanerase) { wordpos++; @@ -183,13 +198,16 @@ static inline int whatcc(unsigned int c) bool TextSplit::text_to_words(const string &in) { LOGDEB2(("TextSplit::text_to_words: cb %p\n", cb)); + setcharclasses(); - string span; - string word; + + string span; // Current span. 
Might be jf.dockes@wanadoo.f + string word; // Current word: no punctuation at all in there bool number = false; - int wordpos = 0; - int spanpos = 0; - int charpos = 0; + int wordpos = 0; // Term position of current word + int spanpos = 0; // Term position of current span + int charpos = 0; // Character position + Utf8Iter it(in); for (; !it.eof(); it++, charpos++) { @@ -202,7 +220,7 @@ bool TextSplit::text_to_words(const string &in) switch (cc) { case SPACE: SPACE: - if (word.length()) { + if (word.length() || span.length()) { if (!doemit(word, wordpos, span, spanpos, true, it.getBpos())) return false; number = false; @@ -217,7 +235,8 @@ bool TextSplit::text_to_words(const string &in) number = true; word += it; span += it; - } + } else + span += it; } else { if (!doemit(word, wordpos, span, spanpos, false, it.getBpos())) return false; @@ -248,7 +267,8 @@ bool TextSplit::text_to_words(const string &in) } else { //cerr<<"Got . span: '"<