From 7dcc7c61c8f7094eb0e9316a90d059d1c760bd0b Mon Sep 17 00:00:00 2001
From: dockes
Date: Mon, 26 Jan 2009 18:30:48 +0000
Subject: [PATCH] Modified the point at which we unaccent terms so that the
 Capitalized->nostemming test can be done on single words (this had been
 broken by the change of noac/split order done earlier to get Japanese
 indexing to work)
---
 src/query/plaintorich.cpp |   6 ++-
 src/rcldb/rcldb.cpp       | 108 ++++++++++++++++----------------
 src/rcldb/rcldb.h         |   3 --
 src/rcldb/searchdata.cpp  |  87 ++++++++++++++++--------------
 4 files changed, 96 insertions(+), 108 deletions(-)

diff --git a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp
index c8b90aaa..9e635688 100644
--- a/src/query/plaintorich.cpp
+++ b/src/query/plaintorich.cpp
@@ -42,6 +42,7 @@ using std::set;
 #include "smallut.h"
 #include "plaintorich.h"
 #include "cancelcheck.h"
+#include "unacpp.h"
 
 const string PlainToRich::snull = "";
 
@@ -84,7 +85,10 @@ class myTextSplitCB : public TextSplitCB {
     // Callback called by the text-to-words breaker for each word
     virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
         string dumb;
-        Rcl::dumb_string(term, dumb);
+        if (!unacmaybefold(term, dumb, "UTF-8", true)) {
+            LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
+            return true;
+        }
         //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
         //        pos, bts, bte));
 
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index fe7be130..2a442f67 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -784,27 +784,33 @@ private:
 };
 
 // Callback for the document to word splitting class during indexation
-bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
+bool mySplitterCB::takeword(const std::string &_term, int pos, int, int)
 {
 #if 0
-    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
-    string printable;
-    if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
-        LOGDEB((" [%s]\n", printable.c_str()));
-    }
+    LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", _term.c_str()));
 #endif
+    string term;
+    if (!unacmaybefold(_term, term, "UTF-8", true)) {
+        LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", _term.c_str()));
+        term.clear();
+        // We don't generate a fatal error because of a bad term
+        return true;
+    }
+    if (stops.hasStops() && stops.isStop(term)) {
+        LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
+        return true;
+    }
+
+    // Compute absolute position (pos is relative to current segment),
+    // and remember relative.
+    curpos = pos;
+    pos += basepos;
     string ermsg;
     try {
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
-            return true;
-        }
         // Note: 1 is the within document frequency increment. It would
         // be possible to assign different weigths to doc parts (ie title)
         // by using a higher value
-        curpos = pos;
-        pos += basepos;
         doc.add_posting(term, pos, 1);
         if (!prefix.empty()) {
             doc.add_posting(prefix + term, pos, 1);
@@ -815,28 +821,6 @@ bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
         return false;
     }
 
-// Unaccent and lowercase data, replace \n\r with spaces
-// Removing crlfs is so that we can use the text in the document data fields.
-// Use unac (with folding extension) for removing accents and casefolding
-//
-// Note that we always return true (but set out to "" on error). We don't
-// want to stop indexation because of a bad string
-bool dumb_string(const string &in, string &out)
-{
-    out.clear();
-    if (in.empty())
-        return true;
-
-    string s1 = neutchars(in, "\n\r");
-    if (!unacmaybefold(s1, out, "UTF-8", true)) {
-        LOGINFO(("dumb_string: unac failed for [%s]\n", in.c_str()));
-        out.clear();
-        // See comment at start of func
-        return true;
-    }
-    return true;
-}
-
 // Let our user set the parameters for abstract processing
 void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
 {
@@ -891,14 +875,11 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
     Xapian::Document newdocument;
     mySplitterCB splitData(newdocument, m_stops);
     TextSplit splitter(&splitData);
-    string noacc;
 
     // Split and index file name as document term(s)
     LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
-    if (dumb_string(doc.utf8fn, noacc)) {
-        splitter.text_to_words(noacc);
-        splitData.basepos += splitData.curpos + 100;
-    }
+    splitter.text_to_words(doc.utf8fn);
+    splitData.basepos += splitData.curpos + 100;
 
     // Index textual metadata. These are all indexed as text with
     // positions, as we may want to do phrase searches with them (this
@@ -918,12 +899,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
             LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
                      meta_it->first.c_str(), pfx.c_str(),
                      meta_it->second.c_str()));
-            if (!dumb_string(meta_it->second, noacc)) {
-                LOGERR(("Db::add: dumb_string failed\n"));
-                return false;
-            }
             splitData.setprefix(pfx); // Subject
-            splitter.text_to_words(noacc);
+            splitter.text_to_words(meta_it->second);
             splitData.setprefix(string());
             splitData.basepos += splitData.curpos + 100;
         }
@@ -936,31 +913,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 
     // Split and index body text
     LOGDEB2(("Db::add: split body\n"));
-    if (!dumb_string(doc.text, noacc)) {
-        LOGERR(("Db::add: dumb_string failed\n"));
-        return false;
-    }
-    splitter.text_to_words(noacc);
+    splitter.text_to_words(doc.text);
 
     ////// Special terms for other metadata. No positions for these.
     // Mime type
     newdocument.add_term("T" + doc.mimetype);
 
-    // Simple file name. This is used for file name searches only. We index
-    // it with a term prefix. utf8fn used to be the full path, but it's now
-    // the simple file name.
+    // Simple file name indexed for file name searches with a term prefix
     // We also add a term for the filename extension if any.
-    if (dumb_string(doc.utf8fn, noacc) && !noacc.empty()) {
-        // We should truncate after extracting the extension, but this is
-        // a pathological case anyway
-        if (noacc.size() > 230)
-            utf8truncate(noacc, 230);
-        string::size_type pos = noacc.rfind('.');
-        if (pos != string::npos && pos != noacc.length() -1) {
-            newdocument.add_term(string("XE") + noacc.substr(pos+1));
+    if (!doc.utf8fn.empty()) {
+        string fn;
+        if (unacmaybefold(doc.utf8fn, fn, "UTF-8", true)) {
+            // We should truncate after extracting the extension, but this is
+            // a pathological case anyway
+            if (fn.size() > 230)
+                utf8truncate(fn, 230);
+            string::size_type pos = fn.rfind('.');
+            if (pos != string::npos && pos != fn.length() - 1) {
+                newdocument.add_term(string("XE") + fn.substr(pos + 1));
+            }
+            fn = string("XSFN") + fn;
+            newdocument.add_term(fn);
         }
-        noacc = string("XSFN") + noacc;
-        newdocument.add_term(noacc);
     }
 
     // Udi unique term: this is used for file existence/uptodate
@@ -1329,7 +1303,10 @@ bool Db::purgeFile(const string &udi)
 bool Db::filenameWildExp(const string& fnexp, list<string>& names)
 {
     string pattern;
-    dumb_string(fnexp, pattern);
+    if (!unacmaybefold(fnexp, pattern, "UTF-8", true)) {
+        LOGERR(("Db::filenameWildExp: unac error for [%s]\n", fnexp.c_str()));
+        return false;
+    }
     names.clear();
 
     // If pattern is not quoted, and has no wildcards, we add * at
@@ -1415,7 +1392,10 @@ bool Db::termMatch(MatchType typ, const string &lang,
 
     // Get rid of capitals and accents
     string droot;
-    dumb_string(root, droot);
+    if (!unacmaybefold(root, droot, "UTF-8", true)) {
+        LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
+        return false;
+    }
     string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
 
     string prefix;
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index a1c86415..c4ad70b6 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -243,9 +243,6 @@ private:
     Db& operator=(const Db &) {return *this;};
 };
 
-// Unaccent and lowercase data.
-extern bool dumb_string(const string &in, string &out);
-
 #ifndef NO_NAMESPACES
 }
 #endif // NO_NAMESPACES
diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
index 14117a1b..c354fcf9 100644
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -183,17 +183,39 @@ class wsQData : public TextSplitCB {
     wsQData(const StopList &_stops)
         : stops(_stops), alltermcount(0)
     {}
-    vector<string> terms;
-    bool takeword(const std::string &term, int , int, int) {
+    bool takeword(const std::string &interm, int , int, int) {
         alltermcount++;
-        LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
-        if (stops.hasStops() && stops.isStop(term)) {
-            LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
+        LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
+
+        // Check if the first letter is a majuscule in which
+        // case we do not want to do stem expansion. Note that
+        // the test is convoluted and possibly problematic
+        string noacterm, noaclowterm;
+        if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
+            return true;
+        }
+        if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
+            LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
             return true;
         }
-        terms.push_back(term);
+        bool nostemexp = false;
+        Utf8Iter it1(noacterm);
+        Utf8Iter it2(noaclowterm);
+        if (*it1 != *it2)
+            nostemexp = true;
+
+        if (stops.hasStops() && stops.isStop(noaclowterm)) {
+            LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
+            return true;
+        }
+        terms.push_back(noaclowterm);
+        nostemexps.push_back(nostemexp);
         return true;
     }
+
+    vector<string> terms;
+    vector<bool> nostemexps;
     const StopList &stops;
     // Count of terms including stopwords: this is for adjusting
     // phrase/near slack
@@ -232,7 +254,7 @@ private:
     void expandTerm(bool dont, const string& term, list<string>& exp, string& sterm);
     // After splitting entry on whitespace: process non-phrase element
-    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
+    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
     // Process phrase/near element
     void processPhraseOrNear(wsQData *splitData, list<Xapian::Query> &pqueries,
@@ -279,18 +301,6 @@ void StringToXapianQ::expandTerm(bool nostemexp,
         nostemexp = true;
 
     if (!nostemexp) {
-        // Check if the first letter is a majuscule in which
-        // case we do not want to do stem expansion. Note that
-        // the test is convoluted and possibly problematic
-
-        string noacterm, noaclowterm;
-        if (unacmaybefold(term, noacterm, "UTF-8", false) &&
-            unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
-            Utf8Iter it1(noacterm);
-            Utf8Iter it2(noaclowterm);
-            if (*it1 != *it2)
-                nostemexp = true;
-        }
     }
 
     if (nostemexp && !haswild) {
@@ -356,12 +366,12 @@ static void addPrefix(list<string>& terms, const string& prefix)
         it->insert(0, prefix);
 }
 
-void StringToXapianQ::processSimpleSpan(const string& span,
+void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
                                         list<Xapian::Query> &pqueries)
 {
     list<string> exp;
     string sterm; // dumb version of user term
-    expandTerm(false, span, exp, sterm);
+    expandTerm(nostemexp, span, exp, sterm);
     m_terms.insert(m_terms.end(), exp.begin(), exp.end());
     addPrefix(exp, m_prefix);
     // Push either term or OR of stem-expanded set
@@ -396,12 +406,13 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
     vector<vector<string> > groups;
 
     // Go through the list and perform stem/wildcard expansion for each element
+    vector<bool>::iterator nxit = splitData->nostemexps.begin();
     for (vector<string>::iterator it = splitData->terms.begin();
-         it != splitData->terms.end(); it++) {
+         it != splitData->terms.end(); it++, nxit++) {
         // Adjust when we do stem expansion. Not inside phrases, and
         // some versions of xapian will accept only one OR clause
         // inside NEAR, all others must be leafs.
-        bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
+        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
 
         string sterm;
         list<string> exp;
@@ -434,7 +445,10 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
 
 /**
  * Turn user entry string (NOT query language) into a list of xapian queries.
- * We just separate words and phrases, and do wildcard and stemp expansion,
+ * We just separate words and phrases, and do wildcard and stem expansion,
+ *
+ * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
+ * the GUI.
 *
 * The final list contains one query for each term or phrase
 *  - Elements corresponding to a stem-expanded part are an OP_OR
 ...
 * @return the subquery count (either or'd stem-expanded terms or phrase word
 *   count)
 */
-bool StringToXapianQ::processUserString(const string &_iq,
+bool StringToXapianQ::processUserString(const string &iq,
                                         string &ermsg,
                                         list<Xapian::Query> &pqueries,
                                         const StopList& stops,
@@ -452,25 +466,18 @@
                                         bool useNear
                                         )
 {
-    LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
+    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
     ermsg.erase();
     m_terms.clear();
     m_groups.clear();
 
-    // First unaccent/normalize the input: do it first so that it
-    // happens in the same order as when indexing: unac then split. As
-    // the character count can change during normalisation, this is
-    // specially important for cjk because the artificial cjk split is
-    // based on character counts
-    string iq;
-    dumb_string(_iq, iq);
-
     // Simple whitespace-split input into user-level words and
-    // double-quoted phrases: word1 word2 "this is a phrase". The text
-    // splitter may further still decide that the resulting "words"
-    // are really phrases, this depends on separators: [paul@dom.net]
-    // would still be a word (span), but [about:me] will probably be
-    // handled as a phrase.
+    // double-quoted phrases: word1 word2 "this is a phrase".
+    //
+    // The text splitter may further still decide that the resulting
+    // "words" are really phrases, this depends on separators:
+    // [paul@dom.net] would still be a word (span), but [about:me]
+    // will probably be handled as a phrase.
     list<string> phrases;
     TextSplit::stringToStrings(iq, phrases);
@@ -516,7 +523,7 @@
             case 0:
                 continue;// ??
             case 1:
-                processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
+                processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
                 break;
            default:
                processPhraseOrNear(splitData, pqueries, useNear, slack);
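
Note (not part of the patch): a minimal standalone sketch of the Capitalized->no-stem-expansion test that this commit moves into wsQData::takeword(). Recoll's unacmaybefold() (unac with the case-folding extension) and Utf8Iter are not reproduced here; toy_unacmaybefold() is a hypothetical ASCII-only stand-in, so this only illustrates the logic: unaccent without folding, then fold the result, and if the first character changed the user typed a capitalized word, which should be searched as entered rather than stem-expanded.

#include <cctype>
#include <iostream>
#include <string>

// Hypothetical stand-in for Recoll's unacmaybefold(in, out, "UTF-8", dofold).
// ASCII-only, so it just case-folds; the real function also strips accents.
static bool toy_unacmaybefold(const std::string& in, std::string& out, bool dofold)
{
    out = in;
    if (dofold) {
        for (std::string::size_type i = 0; i < out.size(); i++)
            out[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(out[i])));
    }
    return true;
}

int main()
{
    const char* words[] = { "Linux", "linux" };
    for (int i = 0; i < 2; i++) {
        std::string term(words[i]), noacterm, noaclowterm;
        toy_unacmaybefold(term, noacterm, false);       // accents stripped, case kept
        toy_unacmaybefold(noacterm, noaclowterm, true); // then case-folded
        // Same test as in wsQData::takeword(): if folding changed the first
        // character, the word was capitalized, so no stem expansion.
        bool nostemexp = !noacterm.empty() && noacterm[0] != noaclowterm[0];
        std::cout << words[i] << " -> nostemexp="
                  << (nostemexp ? "true" : "false") << "\n";
    }
    return 0;
}

In the patch itself the first-character comparison is done with Utf8Iter (*it1 != *it2), which compares whole UTF-8 characters, so accented capitals are caught as well.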