From 4588803281d888a5ac0e849446e5bd917d25e175 Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Tue, 8 Feb 2005 10:56:13 +0000
Subject: [PATCH] phrases ok except for preview position

---
 src/common/textsplit.cpp | 69 +++++++++++++++++++++-------------------
 src/common/textsplit.h   | 10 ++++--
 src/rcldb/rcldb.cpp      | 53 +++++++++++++++++++++++-------
 3 files changed, 84 insertions(+), 48 deletions(-)

diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 3999a9e1..c64cb86f 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.6 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.7 2005-02-08 10:56:13 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_TEXTSPLIT
 
@@ -65,12 +65,11 @@ static void setcharclasses()
 
 // Do some cleanup (the kind which is simpler to do here than in the main loop,
 // then send term to our client.
-bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase,
+bool TextSplit::emitterm(bool isspan, string &w, int pos, 
 			 int btstart, int btend)
 {
     LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
-    if (fq && !isspan)
-	return true;
+
     if (!cb)
 	return false;
 
@@ -113,13 +112,32 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase,
     }
     if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
 	bool ret = cb->takeword(w, pos, btstart, btend);
-	if (doerase)
-	    w.erase();
 	return ret;
     }
     return true;
 }
 
+bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
+		       bool spanerase, int bp) // spanerase: erase span once emitted (final span); bp: end offset, forwarded as btend
+{ // Sends span, and word when it differs from span, through emitterm(); returns false as soon as emitterm() fails
+    // Query mode (fq): only final spans are emitted; a non-final one is skipped but the word position still advances
+    if (fq && !spanerase) {
+	wordpos++;
+	word.erase(); // span is left untouched so the caller can keep appending to it
+	return true;
+    }
+    if (!emitterm(true, span, spanpos, bp-span.length(), bp)) // btstart = bp - span length, btend = bp
+	return false;
+    if (word.length() != span.length() && !fq) // index mode only: also emit the word when it is not the whole span
+	if (!emitterm(false, word, wordpos, bp-word.length(), bp))
+	    return false;
+    wordpos++; // one position consumed per call, whatever was emitted
+    if (spanerase) // final span: drop it so the next term starts a new one
+	span.erase();
+    word.erase();
+    return true;
+}
+
 /* 
  * We basically emit a word every time we see a separator, but some chars are
  * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, 
@@ -143,11 +161,7 @@ bool TextSplit::text_to_words(const string &in)
 	case SPACE:
 	SPACE:
 	    if (word.length()) {
-		if (span.length() != word.length()) {
-		    if (!emitterm(true, span, spanpos, true, i-span.length(), i)) 
-			return false;
-		}
-		if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
+		if (!doemit(word, wordpos, span, spanpos, true, i))
 		    return false;
 		number = false;
 	    }
@@ -163,11 +177,7 @@ bool TextSplit::text_to_words(const string &in)
 		    span += c;
 		}
 	    } else {
-		if (span.length() != word.length()) {
-		    if (!emitterm(true, span, spanpos, false, i-span.length(), i))
-			return false;
-		}
-		if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
+		if (!doemit(word, wordpos, span, spanpos, false, i))
 		    return false;
 		number = false;
 		span += c;
@@ -175,11 +185,7 @@ bool TextSplit::text_to_words(const string &in)
 	    break;
 	case '@':
 	    if (word.length()) {
-		if (span.length() != word.length()) {
-		    if (!emitterm(true, span, spanpos, false, i-span.length(), i))
-			return false;
-		}
-		if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
+		if (!doemit(word, wordpos, span, spanpos, false, i))
 		    return false;
 		number = false;
 	    } else
@@ -188,11 +194,7 @@ bool TextSplit::text_to_words(const string &in)
 	    break;
 	case '\'':
 	    if (word.length()) {
-		if (span.length() != word.length()) {
-		    if (!emitterm(true, span, spanpos, false, i-span.length(), i))
-			return false;
-		}
-		if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
+		if (!doemit(word, wordpos, span, spanpos, false, i))
 		    return false;
 		number = false;
 		span += c;
@@ -202,8 +204,9 @@ bool TextSplit::text_to_words(const string &in)
 	    if (number) {
 		word += c;
 	    } else {
+		//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
 		if (word.length()) {
-		    if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
+		    if (!doemit(word, wordpos, span, spanpos, false, i))
 			return false;
 		    number = false;
 		} else 
@@ -249,10 +252,8 @@ bool TextSplit::text_to_words(const string &in)
 	}
     }
     if (word.length()) {
-	if (span.length() != word.length())
-	    if (!emitterm(true, span, spanpos, true, i-span.length(), i))
-		return false;
-	return emitterm(false, word, wordpos, true, i-word.length(), i);
+	if (!doemit(word, wordpos, span, spanpos, true, i))
+	    return false;
     }
     return true;
 }
@@ -282,10 +283,12 @@ class mySplitterCB : public TextSplitCB {
 };
 
 static string teststring = 
+    "le ta " 
     "jfd@okyz.com "
     "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
     "a 134 +134 -14 -1.5 +1.5 1.54e10 a "
     "@^#$(#$(*) "
+    "192.168.4.1 "
     "one\n\rtwo\nthree-\nfour "
     "[olala][ululu] "
     "'o'brien' "						
@@ -297,14 +300,14 @@ int main(int argc, char **argv)
     DebugLog::getdbl()->setloglevel(DEBDEB1);
     DebugLog::setfilename("stderr");
     mySplitterCB cb;
-    TextSplit splitter(&cb);
+    TextSplit splitter(&cb, true);
     if (argc == 2) {
 	string data;
 	if (!file_to_string(argv[1], data)) 
 	    exit(1);
 	splitter.text_to_words(data);
     } else {
-	cout << teststring << endl;  
+	cout << endl << teststring << endl << endl;  
 	splitter.text_to_words(teststring);
     }
     
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index adb9d4b0..762fe405 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -1,8 +1,11 @@
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
-/* @(#$Id: textsplit.h,v 1.5 2005-02-08 09:34:46 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: textsplit.h,v 1.6 2005-02-08 10:56:13 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include <string>
+#ifndef NO_NAMESPACES
+using std::string;
+#endif
 
 // Function class whose called for every detected word
 class TextSplitCB {
@@ -25,8 +28,9 @@ class TextSplit {
     bool fq;        // Are we splitting for query or index ?
     TextSplitCB *cb;
     int maxWordLength;
-    bool emitterm(bool isspan, std::string &term, int pos, bool doerase, 
-		  int bs, int be);
+    bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
+    bool doemit(string &word, int &wordpos, string &span, int spanpos,
+		bool spanerase, int bp);
  public:
     /**
      * Constructor: just store callback and client data
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index bbe0583e..2474d641 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.20 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.21 2005-02-08 10:56:12 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@@ -438,33 +438,62 @@ bool Rcl::Db::purge()
 class wsQData : public TextSplitCB {
  public:
     vector<string> terms;
-
+    string catterms() { // Joins all collected terms as [t1] [t2] ... (used for log output)
+	string s;
+	for (unsigned int i=0;i<terms.size();i++) {
+	    s += "[" + terms[i] + "] "; // brackets keep term boundaries visible, each entry ends with a space
+	}
+	return s;
+    }
     bool takeword(const std::string &term, int , int, int) {
+	LOGDEB(("Takeword: %s\n", term.c_str())); // trace each term received through the TextSplitCB callback
 	terms.push_back(term);
 	return true;
     }
 };
 
 
-bool Rcl::Db::setQuery(const std::string &querystring)
+bool Rcl::Db::setQuery(const std::string &iqstring)
 {
-    LOGDEB(("Rcl::Db::setQuery: %s\n", querystring.c_str()));
+    LOGDEB(("Rcl::Db::setQuery: %s\n", iqstring.c_str()));
     Native *ndb = (Native *)pdata;
     if (!ndb)
 	return false;
 
-    wsQData splitData;
-    TextSplit splitter(&splitData, true);
-
-    string noacc;
-    if (!dumb_string(querystring, noacc)) {
+    string qstring;;
+    if (!dumb_string(iqstring, qstring)) {
 	return false;
     }
-    splitter.text_to_words(noacc);
 
+    // First extract phrases:
+    list<string> phrases;
+    ConfTree::stringToStrings(qstring, phrases);
+    for (list<string>::const_iterator i=phrases.begin();
+	 i != phrases.end();i++) {
+	LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str()));
+    }
+    list<Xapian::Query> pqueries;
+    for (list<string>::const_iterator it = phrases.begin(); 
+	 it != phrases.end(); it++) {
 
-    ndb->query = Xapian::Query(Xapian::Query::OP_OR, splitData.terms.begin(), 
-			       splitData.terms.end());
+	wsQData splitData;
+	TextSplit splitter(&splitData, true);
+	splitter.text_to_words(*it);
+	LOGDEB(("Splitter term count: %d\n", splitData.terms.size()));
+	switch(splitData.terms.size()) {
+	case 0: continue;// ??
+	case 1:
+	    pqueries.push_back(Xapian::Query(splitData.terms.front()));
+	    break;
+	default:
+	    LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str()));
+	    pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
+					     splitData.terms.begin(),
+					     splitData.terms.end()));
+	}
+    }
+    ndb->query = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), 
+			       pqueries.end());
     delete ndb->enquire;
     ndb->enquire = new Xapian::Enquire(ndb->db);
     ndb->enquire->set_query(ndb->query);