From 7dcc7c61c8f7094eb0e9316a90d059d1c760bd0b Mon Sep 17 00:00:00 2001
From: dockes
Date: Mon, 26 Jan 2009 18:30:48 +0000
Subject: [PATCH] Modified the point at which we unaccent terms so that the
 Capitalized->nostemming test can be done on single words (this had been
 broken by the change of noac/split order done earlier to get Japanese
 indexing to work)
---
 src/query/plaintorich.cpp |   6 ++-
 src/rcldb/rcldb.cpp       | 108 ++++++++++++++++----------------
 src/rcldb/rcldb.h         |   3 --
 src/rcldb/searchdata.cpp  |  87 ++++++++++++++++--------------
 4 files changed, 96 insertions(+), 108 deletions(-)

diff --git a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp
index c8b90aaa..9e635688 100644
--- a/src/query/plaintorich.cpp
+++ b/src/query/plaintorich.cpp
@@ -42,6 +42,7 @@ using std::set;
 #include "smallut.h"
 #include "plaintorich.h"
 #include "cancelcheck.h"
+#include "unacpp.h"
 
 const string PlainToRich::snull = "";
 
@@ -84,7 +85,10 @@ class myTextSplitCB : public TextSplitCB {
     // Callback called by the text-to-words breaker for each word
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         string dumb;
-        Rcl::dumb_string(term, dumb);
+        if (!unacmaybefold(term, dumb, "UTF-8", true)) {
+            LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
+            return true;
+        }
         //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
         //        pos, bts, bte));
 
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index fe7be130..2a442f67 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -784,27 +784,33 @@ private:
 };
 
 // Callback for the document to word splitting class during indexation
-bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
+bool mySplitterCB::takeword(const std::string &_term, int pos, int, int)
 {
 #if 0
-    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
-    string printable;
-    if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
-        LOGDEB((" [%s]\n", printable.c_str()));
-    }
+    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str()));
 #endif
+    string term;
+    if (!unacmaybefold(_term, term, "UTF-8", true)) {
+        LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str()));
+        term.clear();
+        // We don't generate a fatal error because of a bad term
+        return true;
+    }
+    if (stops.hasStops() && stops.isStop(term)) {
+        LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
+        return true;
+    }
+
+    // Compute absolute position (pos is relative to current segment),
+    // and remember relative.
+    curpos = pos;
+    pos += basepos;
     string ermsg;
     try {
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
-            return true;
-        }
         // Note: 1 is the within document frequency increment. It would
         // be possible to assign different weigths to doc parts (ie title)
         // by using a higher value
-        curpos = pos;
-        pos += basepos;
         doc.add_posting(term, pos, 1);
         if (!prefix.empty()) {
             doc.add_posting(prefix + term, pos, 1);
@@ -815,28 +821,6 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
         return false;
     }
 
-// Unaccent and lowercase data, replace \n\r with spaces
-// Removing crlfs is so that we can use the text in the document data fields.
-// Use unac (with folding extension) for removing accents and casefolding
-//
-// Note that we always return true (but set out to "" on error). We don't
-// want to stop indexation because of a bad string
-bool dumb_string(const string &in, string &out)
-{
-    out.clear();
-    if (in.empty())
-        return true;
-
-    string s1 = neutchars(in, "\n\r");
-    if (!unacmaybefold(s1, out, "UTF-8", true)) {
-        LOGINFO(("dumb_string: unac failed for [%s]\n", in.c_str()));
-        out.clear();
-        // See comment at start of func
-        return true;
-    }
-    return true;
-}
-
 // Let our user set the parameters for abstract processing
 void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
 {
@@ -891,14 +875,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
     Xapian::Document newdocument;
     mySplitterCB splitData(newdocument, m_stops);
     TextSplit splitter(&splitData);
-    string noacc;
 
     // Split and index file name as document term(s)
     LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
-    if (dumb_string(doc.utf8fn, noacc)) {
-        splitter.text_to_words(noacc);
-        splitData.basepos += splitData.curpos + 100;
-    }
+    splitter.text_to_words(doc.utf8fn);
+    splitData.basepos += splitData.curpos + 100;
 
     // Index textual metadata. These are all indexed as text with
     // positions, as we may want to do phrase searches with them (this
@@ -918,12 +899,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
             LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
                      meta_it->first.c_str(), pfx.c_str(),
                      meta_it->second.c_str()));
-            if (!dumb_string(meta_it->second, noacc)) {
-                LOGERR(("Db::add: dumb_string failed\n"));
-                return false;
-            }
             splitData.setprefix(pfx); // Subject
-            splitter.text_to_words(noacc);
+            splitter.text_to_words(meta_it->second);
             splitData.setprefix(string());
             splitData.basepos += splitData.curpos + 100;
         }
@@ -936,31 +913,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 
     // Split and index body text
     LOGDEB2(("Db::add: split body\n"));
-    if (!dumb_string(doc.text, noacc)) {
-        LOGERR(("Db::add: dumb_string failed\n"));
-        return false;
-    }
-    splitter.text_to_words(noacc);
+    splitter.text_to_words(doc.text);
 
     ////// Special terms for other metadata. No positions for these.
     // Mime type
     newdocument.add_term("T" + doc.mimetype);
 
-    // Simple file name. This is used for file name searches only. We index
-    // it with a term prefix. utf8fn used to be the full path, but it's now
-    // the simple file name.
+    // Simple file name indexed for file name searches with a term prefix
     // We also add a term for the filename extension if any.
-    if (dumb_string(doc.utf8fn, noacc) && !noacc.empty()) {
-        // We should truncate after extracting the extension, but this is
-        // a pathological case anyway
-        if (noacc.size() > 230)
-            utf8truncate(noacc, 230);
-        string::size_type pos = noacc.rfind('.');
-        if (pos != string::npos && pos != noacc.length() -1) {
-            newdocument.add_term(string("XE") + noacc.substr(pos+1));
+    if (!doc.utf8fn.empty()) {
+        string fn;
+        if (unacmaybefold(doc.utf8fn, fn, "UTF-8", true)) {
+            // We should truncate after extracting the extension, but this is
+            // a pathological case anyway
+            if (fn.size() > 230)
+                utf8truncate(fn, 230);
+            string::size_type pos = fn.rfind('.');
+            if (pos != string::npos && pos != fn.length() - 1) {
+                newdocument.add_term(string("XE") + fn.substr(pos + 1));
+            }
+            fn = string("XSFN") + fn;
+            newdocument.add_term(fn);
         }
-        noacc = string("XSFN") + noacc;
-        newdocument.add_term(noacc);
     }
 
     // Udi unique term: this is used for file existence/uptodate
@@ -1329,7 +1303,10 @@ bool Db::purgeFile(const string &udi)
 bool Db::filenameWildExp(const string& fnexp, list<string>& names)
 {
     string pattern;
-    dumb_string(fnexp, pattern);
+    if (!unacmaybefold(fnexp, pattern, "UTF-8", true)) {
+        LOGERR(("Db::filenameWildExp: unac error for [%s]\n", fnexp.c_str()));
+        return false;
+    }
     names.clear();
 
     // If pattern is not quoted, and has no wildcards, we add * at
@@ -1415,7 +1392,10 @@ bool Db::termMatch(MatchType typ, const string &lang,
 
     // Get rid of capitals and accents
     string droot;
-    dumb_string(root, droot);
+    if (!unacmaybefold(root, droot, "UTF-8", true)) {
+        LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
+        return false;
+    }
     string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
 
     string prefix;
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index a1c86415..c4ad70b6 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -243,9 +243,6 @@ private:
     Db& operator=(const Db &) {return *this;};
 };
 
-// Unaccent and lowercase data.
-extern bool dumb_string(const string &in, string &out);
-
 #ifndef NO_NAMESPACES
 }
 #endif // NO_NAMESPACES
diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
index 14117a1b..c354fcf9 100644
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -183,17 +183,39 @@ class wsQData : public TextSplitCB {
     wsQData(const StopList &_stops)
         : stops(_stops), alltermcount(0)
     {}
-    vector<string> terms;
-    bool takeword(const std::string &term, int , int, int) {
+    bool takeword(const std::string &interm, int , int, int) {
         alltermcount++;
-        LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
+        LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
+
+        // Check if the first letter is a majuscule in which
+        // case we do not want to do stem expansion. Note that
+        // the test is convoluted and possibly problematic
+        string noacterm, noaclowterm;
+        if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
+            return true;
+        }
+        if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
             return true;
         }
-        terms.push_back(term);
+        bool nostemexp = false;
+        Utf8Iter it1(noacterm);
+        Utf8Iter it2(noaclowterm);
+        if (*it1 != *it2)
+            nostemexp = true;
+
+        if (stops.hasStops() && stops.isStop(noaclowterm)) {
+            LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
+            return true;
+        }
+        terms.push_back(noaclowterm);
+        nostemexps.push_back(nostemexp);
         return true;
     }
+
+    vector<string> terms;
+    vector<bool> nostemexps;
     const StopList &stops;
     // Count of terms including stopwords: this is for adjusting
     // phrase/near slack
@@ -232,7 +254,7 @@ private:
     void expandTerm(bool dont, const string& term, list<string>& exp, string& sterm);
     // After splitting entry on whitespace: process non-phrase element
-    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
+    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
     // Process phrase/near element
     void processPhraseOrNear(wsQData *splitData, list<Xapian::Query> &pqueries,
@@ -279,18 +301,6 @@ void StringToXapianQ::expandTerm(bool nostemexp,
         nostemexp = true;
 
     if (!nostemexp) {
-        // Check if the first letter is a majuscule in which
-        // case we do not want to do stem expansion. Note that
-        // the test is convoluted and possibly problematic
-
-        string noacterm, noaclowterm;
-        if (unacmaybefold(term, noacterm, "UTF-8", false) &&
-            unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
-            Utf8Iter it1(noacterm);
-            Utf8Iter it2(noaclowterm);
-            if (*it1 != *it2)
-                nostemexp = true;
-        }
     }
 
     if (nostemexp && !haswild) {
@@ -356,12 +366,12 @@ static void addPrefix(list<string>& terms, const string& prefix)
         it->insert(0, prefix);
 }
 
-void StringToXapianQ::processSimpleSpan(const string& span,
+void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
                                         list<Xapian::Query> &pqueries)
 {
     list<string> exp;
     string sterm; // dumb version of user term
-    expandTerm(false, span, exp, sterm);
+    expandTerm(nostemexp, span, exp, sterm);
     m_terms.insert(m_terms.end(), exp.begin(), exp.end());
     addPrefix(exp, m_prefix);
     // Push either term or OR of stem-expanded set
@@ -396,12 +406,13 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
     vector<vector<string> > groups;
 
     // Go through the list and perform stem/wildcard expansion for each element
+    vector<bool>::iterator nxit = splitData->nostemexps.begin();
     for (vector<string>::iterator it = splitData->terms.begin();
-         it != splitData->terms.end(); it++) {
+         it != splitData->terms.end(); it++, nxit++) {
         // Adjust when we do stem expansion. Not inside phrases, and
         // some versions of xapian will accept only one OR clause
         // inside NEAR, all others must be leafs.
-        bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
+        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
 
         string sterm;
         list<string> exp;
@@ -434,7 +445,10 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
 
 /**
  * Turn user entry string (NOT query language) into a list of xapian queries.
- * We just separate words and phrases, and do wildcard and stemp expansion,
+ * We just separate words and phrases, and do wildcard and stem expansion,
+ *
+ * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
+ * the GUI.
 *
 * The final list contains one query for each term or phrase
 *  - Elements corresponding to a stem-expanded part are an OP_OR
 ...
 * @return the subquery count (either or'd stem-expanded terms or phrase word
 *   count)
 */
-bool StringToXapianQ::processUserString(const string &_iq,
+bool StringToXapianQ::processUserString(const string &iq,
                                         string &ermsg,
                                         list<Xapian::Query> &pqueries,
                                         const StopList& stops,
@@ -452,25 +466,18 @@
                                         bool useNear
                                         )
 {
-    LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
+    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
     ermsg.erase();
     m_terms.clear();
     m_groups.clear();
 
-    // First unaccent/normalize the input: do it first so that it
-    // happens in the same order as when indexing: unac then split. As
-    // the character count can change during normalisation, this is
-    // specially important for cjk because the artificial cjk split is
-    // based on character counts
-    string iq;
-    dumb_string(_iq, iq);
-
     // Simple whitespace-split input into user-level words and
-    // double-quoted phrases: word1 word2 "this is a phrase". The text
-    // splitter may further still decide that the resulting "words"
-    // are really phrases, this depends on separators: [paul@dom.net]
-    // would still be a word (span), but [about:me] will probably be
-    // handled as a phrase.
+    // double-quoted phrases: word1 word2 "this is a phrase".
+    //
+    // The text splitter may further still decide that the resulting
+    // "words" are really phrases, this depends on separators:
+    // [paul@dom.net] would still be a word (span), but [about:me]
+    // will probably be handled as a phrase.
     list<string> phrases;
     TextSplit::stringToStrings(iq, phrases);
@@ -516,7 +523,7 @@
             case 0:
                 continue;// ??
             case 1:
-                processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
+                processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
                 break;
            default:
                processPhraseOrNear(splitData, pqueries, useNear, slack);
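
Note (not part of the patch): a minimal standalone sketch of the Capitalized->no-stem-expansion test that this commit moves into wsQData::takeword(). Recoll's unacmaybefold() (unac with the case-folding extension) and Utf8Iter are not reproduced here; toy_unacmaybefold() is a hypothetical ASCII-only stand-in, so this only illustrates the logic: unaccent without folding, then fold the result, and if the first character changed the user typed a capitalized word, which should be searched as entered rather than stem-expanded.

#include <cctype>
#include <iostream>
#include <string>

// Hypothetical stand-in for Recoll's unacmaybefold(in, out, "UTF-8", dofold).
// ASCII-only, so it just case-folds; the real function also strips accents.
static bool toy_unacmaybefold(const std::string& in, std::string& out, bool dofold)
{
    out = in;
    if (dofold) {
        for (std::string::size_type i = 0; i < out.size(); i++)
            out[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(out[i])));
    }
    return true;
}

int main()
{
    const char* words[] = { "Linux", "linux" };
    for (int i = 0; i < 2; i++) {
        std::string term(words[i]), noacterm, noaclowterm;
        toy_unacmaybefold(term, noacterm, false);       // accents stripped, case kept
        toy_unacmaybefold(noacterm, noaclowterm, true); // then case-folded
        // Same test as in wsQData::takeword(): if folding changed the first
        // character, the word was capitalized, so no stem expansion.
        bool nostemexp = !noacterm.empty() && noacterm[0] != noaclowterm[0];
        std::cout << words[i] << " -> nostemexp="
                  << (nostemexp ? "true" : "false") << "\n";
    }
    return 0;
}

In the patch itself the first-character comparison is done with Utf8Iter (*it1 != *it2), which compares whole UTF-8 characters, so accented capitals are caught as well.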