Modified the time at which we unaccent, so that the Capitalized->nostemming test can be done on single words (this had been broken by the change of noac/split order done earlier to get Japanese to work).

dockes 2009-01-26 18:30:48 +00:00
parent 30c46709ba
commit 7dcc7c61c8
4 changed files with 96 additions and 108 deletions
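
At the heart of the commit is the capitalization test: a term is unaccented twice, once without and once with case folding, and the first characters of the two results are compared. If they differ, the word started with an uppercase letter and stem expansion is skipped. A minimal sketch of the idea, using the unacmaybefold() and Utf8Iter interfaces as they appear in the diffs below (the header names are assumptions):

    #include <string>
    #include "unacpp.h"   // unacmaybefold(); header name assumed
    #include "utf8iter.h" // Utf8Iter; header name assumed

    // Return true if a UTF-8 term should skip stem expansion because its
    // first letter is uppercase (accents included: "École" -> true).
    static bool capitalized(const std::string& term)
    {
        std::string noac, noaclow;
        // Unaccent without case folding, then unaccent and fold
        if (!unacmaybefold(term, noac, "UTF-8", false) ||
            !unacmaybefold(noac, noaclow, "UTF-8", true))
            return false; // on unac failure, allow normal stem expansion
        // If folding changed the first character, the term was capitalized
        Utf8Iter it1(noac);
        Utf8Iter it2(noaclow);
        return *it1 != *it2;
    }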

View File

@@ -42,6 +42,7 @@ using std::set;
 #include "smallut.h"
 #include "plaintorich.h"
 #include "cancelcheck.h"
+#include "unacpp.h"

 const string PlainToRich::snull = "";
@@ -84,7 +85,10 @@ class myTextSplitCB : public TextSplitCB {
     // Callback called by the text-to-words breaker for each word
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         string dumb;
-        Rcl::dumb_string(term, dumb);
+        if (!unacmaybefold(term, dumb, "UTF-8", true)) {
+            LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
+            return true;
+        }
         //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
         //        pos, bts, bte));

View File

@@ -784,27 +784,33 @@ private:
 };

 // Callback for the document to word splitting class during indexation
-bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
+bool mySplitterCB::takeword(const std::string &_term, int pos, int, int)
 {
 #if 0
-    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
-    string printable;
-    if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
-        LOGDEB((" [%s]\n", printable.c_str()));
-    }
+    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str()));
 #endif
+    string term;
+    if (!unacmaybefold(_term, term, "UTF-8", true)) {
+        LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str()));
+        term.clear();
+        // We don't generate a fatal error because of a bad term
+        return true;
+    }
+    if (stops.hasStops() && stops.isStop(term)) {
+        LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
+        return true;
+    }
+    // Compute absolute position (pos is relative to current segment),
+    // and remember relative.
+    curpos = pos;
+    pos += basepos;
     string ermsg;
     try {
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
-            return true;
-        }
         // Note: 1 is the within document frequency increment. It would
         // be possible to assign different weights to doc parts (ie title)
         // by using a higher value
-        curpos = pos;
-        pos += basepos;
         doc.add_posting(term, pos, 1);
         if (!prefix.empty()) {
             doc.add_posting(prefix + term, pos, 1);
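
The note above about the frequency increment refers to the third argument of Xapian::Document::add_posting(), the wdf increase. A hypothetical sketch of how document parts could be weighted differently (the boost value of 2 is illustrative, not something this commit does):

    #include <xapian.h>
    #include <string>

    // Hypothetical helper: give title terms a higher within-document
    // frequency increment than body terms, increasing their weight.
    static void add_weighted_posting(Xapian::Document& doc,
                                     const std::string& term,
                                     Xapian::termpos pos, bool intitle)
    {
        doc.add_posting(term, pos, intitle ? 2 : 1);
    }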
@@ -815,28 +821,6 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
     return false;
 }

-// Unaccent and lowercase data, replace \n\r with spaces
-// Removing crlfs is so that we can use the text in the document data fields.
-// Use unac (with folding extension) for removing accents and casefolding
-//
-// Note that we always return true (but set out to "" on error). We don't
-// want to stop indexation because of a bad string
-bool dumb_string(const string &in, string &out)
-{
-    out.clear();
-    if (in.empty())
-        return true;
-    string s1 = neutchars(in, "\n\r");
-    if (!unacmaybefold(s1, out, "UTF-8", true)) {
-        LOGINFO(("dumb_string: unac failed for [%s]\n", in.c_str()));
-        out.clear();
-        // See comment at start of func
-        return true;
-    }
-    return true;
-}

 // Let our user set the parameters for abstract processing
 void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
 {
@@ -891,14 +875,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
     Xapian::Document newdocument;
     mySplitterCB splitData(newdocument, m_stops);
     TextSplit splitter(&splitData);
-    string noacc;

     // Split and index file name as document term(s)
     LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
-    if (dumb_string(doc.utf8fn, noacc)) {
-        splitter.text_to_words(noacc);
-        splitData.basepos += splitData.curpos + 100;
-    }
+    splitter.text_to_words(doc.utf8fn);
+    splitData.basepos += splitData.curpos + 100;

     // Index textual metadata. These are all indexed as text with
     // positions, as we may want to do phrase searches with them (this
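
The basepos/curpos arithmetic keeps a gap of 100 positions between successive sections (file name, metadata fields, body), so that phrase and proximity searches cannot match across a section boundary. Schematically (values invented):

    // Positions assigned to terms are basepos + relative position.
    int basepos = 1;  // absolute base of the current section
    int curpos = 0;   // last relative position seen by the splitter

    // After the file name is split into, say, 3 words (relative 0..2):
    curpos = 2;
    basepos += curpos + 100;  // next section starts at 103, gap of 100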
@@ -918,12 +899,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
             LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
                      meta_it->first.c_str(), pfx.c_str(),
                      meta_it->second.c_str()));
-            if (!dumb_string(meta_it->second, noacc)) {
-                LOGERR(("Db::add: dumb_string failed\n"));
-                return false;
-            }
             splitData.setprefix(pfx); // Subject
-            splitter.text_to_words(noacc);
+            splitter.text_to_words(meta_it->second);
             splitData.setprefix(string());
             splitData.basepos += splitData.curpos + 100;
         }
@@ -936,31 +913,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
     // Split and index body text
     LOGDEB2(("Db::add: split body\n"));
-    if (!dumb_string(doc.text, noacc)) {
-        LOGERR(("Db::add: dumb_string failed\n"));
-        return false;
-    }
-    splitter.text_to_words(noacc);
+    splitter.text_to_words(doc.text);

     ////// Special terms for other metadata. No positions for these.
     // Mime type
     newdocument.add_term("T" + doc.mimetype);

-    // Simple file name. This is used for file name searches only. We index
-    // it with a term prefix. utf8fn used to be the full path, but it's now
-    // the simple file name.
+    // Simple file name indexed for file name searches with a term prefix
     // We also add a term for the filename extension if any.
-    if (dumb_string(doc.utf8fn, noacc) && !noacc.empty()) {
-        // We should truncate after extracting the extension, but this is
-        // a pathological case anyway
-        if (noacc.size() > 230)
-            utf8truncate(noacc, 230);
-        string::size_type pos = noacc.rfind('.');
-        if (pos != string::npos && pos != noacc.length() - 1) {
-            newdocument.add_term(string("XE") + noacc.substr(pos + 1));
+    if (!doc.utf8fn.empty()) {
+        string fn;
+        if (unacmaybefold(doc.utf8fn, fn, "UTF-8", true)) {
+            // We should truncate after extracting the extension, but this is
+            // a pathological case anyway
+            if (fn.size() > 230)
+                utf8truncate(fn, 230);
+            string::size_type pos = fn.rfind('.');
+            if (pos != string::npos && pos != fn.length() - 1) {
+                newdocument.add_term(string("XE") + fn.substr(pos + 1));
+            }
+            fn = string("XSFN") + fn;
+            newdocument.add_term(fn);
         }
-        noacc = string("XSFN") + noacc;
-        newdocument.add_term(noacc);
     }

     // Udi unique term: this is used for file existence/uptodate
@@ -1329,7 +1303,10 @@ bool Db::purgeFile(const string &udi)
 bool Db::filenameWildExp(const string& fnexp, list<string>& names)
 {
     string pattern;
-    dumb_string(fnexp, pattern);
+    if (!unacmaybefold(fnexp, pattern, "UTF-8", true)) {
+        LOGERR(("Db::filenameWildExp: unac error for [%s]\n", fnexp.c_str()));
+        return false;
+    }
     names.clear();

     // If pattern is not quoted, and has no wildcards, we add * at
@@ -1415,7 +1392,10 @@ bool Db::termMatch(MatchType typ, const string &lang,
     // Get rid of capitals and accents
     string droot;
-    dumb_string(root, droot);
+    if (!unacmaybefold(root, droot, "UTF-8", true)) {
+        LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
+        return false;
+    }
     string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
     string prefix;

View File

@@ -243,9 +243,6 @@ private:
     Db& operator=(const Db &) {return *this;};
 };

-// Unaccent and lowercase data.
-extern bool dumb_string(const string &in, string &out);

 #ifndef NO_NAMESPACES
 }
 #endif // NO_NAMESPACES

View File

@@ -183,17 +183,39 @@ class wsQData : public TextSplitCB {
     wsQData(const StopList &_stops)
         : stops(_stops), alltermcount(0)
     {}
-    vector<string> terms;
-    bool takeword(const std::string &term, int , int, int) {
+    bool takeword(const std::string &interm, int , int, int) {
         alltermcount++;
-        LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
+        LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
+        // Check if the first letter is a majuscule in which
+        // case we do not want to do stem expansion. Note that
+        // the test is convoluted and possibly problematic
+        string noacterm, noaclowterm;
+        if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
+            return true;
+        }
+        if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
             return true;
         }
-        terms.push_back(term);
+        bool nostemexp = false;
+        Utf8Iter it1(noacterm);
+        Utf8Iter it2(noaclowterm);
+        if (*it1 != *it2)
+            nostemexp = true;
+        if (stops.hasStops() && stops.isStop(noaclowterm)) {
+            LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
+            return true;
+        }
+        terms.push_back(noaclowterm);
+        nostemexps.push_back(nostemexp);
         return true;
     }
+    vector<string> terms;
+    vector<bool> nostemexps;
     const StopList &stops;

     // Count of terms including stopwords: this is for adjusting
     // phrase/near slack
@@ -232,7 +254,7 @@ private:
     void expandTerm(bool dont, const string& term, list<string>& exp,
                     string& sterm);
     // After splitting entry on whitespace: process non-phrase element
-    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
+    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
     // Process phrase/near element
     void processPhraseOrNear(wsQData *splitData,
                              list<Xapian::Query> &pqueries,
@@ -279,18 +301,6 @@ void StringToXapianQ::expandTerm(bool nostemexp,
         nostemexp = true;

-    if (!nostemexp) {
-        // Check if the first letter is a majuscule in which
-        // case we do not want to do stem expansion. Note that
-        // the test is convoluted and possibly problematic
-        string noacterm, noaclowterm;
-        if (unacmaybefold(term, noacterm, "UTF-8", false) &&
-            unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
-            Utf8Iter it1(noacterm);
-            Utf8Iter it2(noaclowterm);
-            if (*it1 != *it2)
-                nostemexp = true;
-        }
-    }

     if (nostemexp && !haswild) {
@@ -356,12 +366,12 @@ static void addPrefix(list<string>& terms, const string& prefix)
         it->insert(0, prefix);
 }

-void StringToXapianQ::processSimpleSpan(const string& span,
+void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
                                         list<Xapian::Query> &pqueries)
 {
     list<string> exp;
     string sterm; // dumb version of user term
-    expandTerm(false, span, exp, sterm);
+    expandTerm(nostemexp, span, exp, sterm);
     m_terms.insert(m_terms.end(), exp.begin(), exp.end());
     addPrefix(exp, m_prefix);
     // Push either term or OR of stem-expanded set
@@ -396,12 +406,13 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
     vector<vector<string> > groups;

     // Go through the list and perform stem/wildcard expansion for each element
+    vector<bool>::iterator nxit = splitData->nostemexps.begin();
     for (vector<string>::iterator it = splitData->terms.begin();
-         it != splitData->terms.end(); it++) {
+         it != splitData->terms.end(); it++, nxit++) {
         // Adjust when we do stem expansion. Not inside phrases, and
         // some versions of xapian will accept only one OR clause
         // inside NEAR, all others must be leafs.
-        bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
+        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;

         string sterm;
         list<string> exp;
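
The remark about some Xapian versions accepting only one OR clause inside NEAR concerns composite queries such as OP_NEAR, whose other subqueries must be plain term leaves. A generic sketch of building a NEAR query from a list of terms, not code from this commit (the window computation is an assumption):

    #include <xapian.h>
    #include <list>
    #include <string>

    // Build an OP_NEAR query over plain term leaves; the window is the
    // term count plus the slack allowed between them.
    static Xapian::Query near_query(const std::list<std::string>& terms,
                                    unsigned slack)
    {
        std::list<Xapian::Query> leaves;
        for (std::list<std::string>::const_iterator it = terms.begin();
             it != terms.end(); it++)
            leaves.push_back(Xapian::Query(*it));
        return Xapian::Query(Xapian::Query::OP_NEAR,
                             leaves.begin(), leaves.end(),
                             leaves.size() + slack);
    }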
@@ -434,7 +445,10 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
 /**
  * Turn user entry string (NOT query language) into a list of xapian queries.
- * We just separate words and phrases, and do wildcard and stemp expansion,
+ * We just separate words and phrases, and do wildcard and stem expansion,
+ *
+ * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
+ * the GUI.
  *
  * The final list contains one query for each term or phrase
  *    - Elements corresponding to a stem-expanded part are an OP_OR
@@ -444,7 +458,7 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
  * @return the subquery count (either or'd stem-expanded terms or phrase word
  *         count)
  */
-bool StringToXapianQ::processUserString(const string &_iq,
+bool StringToXapianQ::processUserString(const string &iq,
                                         string &ermsg,
                                         list<Xapian::Query> &pqueries,
                                         const StopList& stops,
@@ -452,25 +466,18 @@ bool StringToXapianQ::processUserString(const string &_iq,
                                         bool useNear
     )
 {
-    LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
+    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
     ermsg.erase();
     m_terms.clear();
     m_groups.clear();

-    // First unaccent/normalize the input: do it first so that it
-    // happens in the same order as when indexing: unac then split. As
-    // the character count can change during normalisation, this is
-    // specially important for cjk because the artificial cjk split is
-    // based on character counts
-    string iq;
-    dumb_string(_iq, iq);
-
     // Simple whitespace-split input into user-level words and
-    // double-quoted phrases: word1 word2 "this is a phrase". The text
-    // splitter may further still decide that the resulting "words"
-    // are really phrases, this depends on separators: [paul@dom.net]
-    // would still be a word (span), but [about:me] will probably be
-    // handled as a phrase.
+    // double-quoted phrases: word1 word2 "this is a phrase".
+    //
+    // The text splitter may further still decide that the resulting
+    // "words" are really phrases, this depends on separators:
+    // [paul@dom.net] would still be a word (span), but [about:me]
+    // will probably be handled as a phrase.

     list<string> phrases;
     TextSplit::stringToStrings(iq, phrases);
@@ -516,7 +523,7 @@ bool StringToXapianQ::processUserString(const string &_iq,
         case 0:
             continue; // ??
         case 1:
-            processSimpleSpan(splitData->terms.front(), pqueries);
+            processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
             break;
         default:
             processPhraseOrNear(splitData, pqueries, useNear, slack);