From d8297680b1b85a9944d46bf320c66b1cec519436 Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Thu, 22 Sep 2005 11:10:11 +0000
Subject: [PATCH] fix problems with word followed by .

---
 src/common/textsplit.cpp | 72 ++++++++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 21 deletions(-)

diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index f825571b..1687a6cd 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.10 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.11 2005-09-22 11:10:11 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_TEXTSPLIT
 
@@ -19,7 +19,7 @@ using namespace std;
  * (ok for UTF-8, ascii, iso8859* and quite a few others).
  *
  * We work in a way which would make it quite difficult to handle non-ascii
- * separator chars (en-dash,etc.). We would then need to actually parse the 
+ * separator chars (en-dash, etc.). We would then need to actually parse the 
  * utf-8 stream, and use a different way to classify the characters (instead 
  * of a 256 slot array).
  *
@@ -27,9 +27,9 @@ using namespace std;
  *
  * How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
  * Then specialcase all 'real' utf chars, by checking for the few
-   punctuation ones we're interested in (put them in a map). Then
-   classify all other non-ascii as letter, and use the current method
-   for chars < 127.
+ * punctuation ones we're interested in (put them in a map). Then
+ * classify all other non-ascii as letter, and use the current method
+ * for chars < 127.
  */
 
 // Character classes: we have three main groups, and then some chars
@@ -75,13 +75,18 @@ static void setcharclasses()
 	unicign.insert(uniign[i]);
 }
 
-// Do some cleanup (the kind which is simpler to do here than in the main loop,
-// then send term to our client.
+// Do some cleanup (the kind which is simpler to do here than in the
+// main loop, then send term to our client.
 bool TextSplit::emitterm(bool isspan, string &w, int pos, 
 			 int btstart, int btend)
 {
     LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
 
+    // It may happen that our cleanup would result in emitting the
+    // same term twice. We try to avoid this
+    static string prevterm;
+    static int prevpos = -1;
+
     if (!cb)
 	return false;
 
@@ -123,8 +128,12 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
 	}
     }
     if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
-	bool ret = cb->takeword(w, pos, btstart, btend);
-	return ret;
+	if (w != prevterm || pos != prevpos) {
+	    bool ret = cb->takeword(w, pos, btstart, btend);
+	    prevterm = w;
+	    prevpos = pos;
+	    return ret;
+	}
     }
     return true;
 }
@@ -135,6 +144,12 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
 bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
 		       bool spanerase, int bp)
 {
+#if 0
+    cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<
+	span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp 
+	 << endl;
+#endif
+
     // When splitting for query, we only emit final spans
     if (fq && !spanerase) {
 	wordpos++;
@@ -183,13 +198,16 @@ static inline int whatcc(unsigned int c)
 bool TextSplit::text_to_words(const string &in)
 {
     LOGDEB2(("TextSplit::text_to_words: cb %p\n", cb));
+
     setcharclasses();
-    string span;
-    string word;
+
+    string span; // Current span. Might be jf.dockes@wanadoo.f
+    string word; // Current word: no punctuation at all in there
     bool number = false;
-    int wordpos = 0;
-    int spanpos = 0;
-    int charpos = 0;
+    int wordpos = 0; // Term position of current word
+    int spanpos = 0; // Term position of current span
+    int charpos = 0; // Character position
+
     Utf8Iter it(in);
 
     for (; !it.eof(); it++, charpos++) {
@@ -202,7 +220,7 @@ bool TextSplit::text_to_words(const string &in)
 	switch (cc) {
 	case SPACE:
 	SPACE:
-	    if (word.length()) {
+	    if (word.length() || span.length()) {
 		if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
 		    return false;
 		number = false;
@@ -217,7 +235,8 @@ bool TextSplit::text_to_words(const string &in)
 		    number = true;
 		    word += it;
 		    span += it;
-		}
+		} else
+		    span += it;
 	    } else {
 		if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
 		    return false;
@@ -248,7 +267,8 @@ bool TextSplit::text_to_words(const string &in)
 	    } else {
 		//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
 		if (word.length()) {
-		    if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
+		    if (!doemit(word, wordpos, span, spanpos, false, 
+				it.getBpos()))
 			return false;
 		    number = false;
 		} else 
@@ -294,7 +314,7 @@ bool TextSplit::text_to_words(const string &in)
 	    break;
 	}
     }
-    if (span.length()) {
+    if (word.length() || span.length()) {
 	if (!doemit(word, wordpos, span, spanpos, true, it.getBpos()))
 	    return false;
     }
@@ -318,15 +338,23 @@ using namespace std;
 
 // A small class to hold state while splitting text
 class mySplitterCB : public TextSplitCB {
+    int first;
  public:
+    mySplitterCB() : first(0) {}
+
     bool takeword(const std::string &term, int pos, int bs, int be) {
-	printf("%3d %-20s %d %d\n", pos, term.c_str(), bs, be);
+	if (first) {
+	    printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
+	    first = 0;
+	}
+	printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
 	return true;
     }
 };
 
 static string teststring = 
-    "le ta " 
+    "Un bout de texte \n" 
+    "normal. "
     "jfd@okyz.com "
     "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
     "a 134 +134 -14 -1.5 +1.5 1.54e10 a "
@@ -338,6 +366,7 @@ static string teststring =
     "utf-8 ucs-4©"
     "\n"							      
 ;
+static string teststring1 = "c++ ";
 
 static string thisprog;
 
@@ -380,8 +409,9 @@ int main(int argc, char **argv)
     TextSplit splitter(&cb, (op_flags&OPT_q) ? true: false);
     if (argc == 1) {
 	string data;
-	if (!file_to_string(argv[1], data)) 
+	if (!file_to_string(*argv++, data)) 
 	    exit(1);
+	argc--;
 	splitter.text_to_words(data);
     } else {
 	cout << endl << teststring << endl << endl;