diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
index 4f5605bd..b1090158 100644
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.27 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.28 2008-12-15 09:24:24 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
  * This program is free software; you can redistribute it and/or modify
@@ -43,6 +43,8 @@ namespace Rcl {
 typedef vector::iterator qlist_it_t;
 typedef vector::const_iterator qlist_cit_t;
 
+static const int original_term_wqf_booster = 10;
+
 bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
 {
     Xapian::Query xq;
@@ -172,8 +174,10 @@ bool SearchData::getTerms(vector& terms,
     return true;
 }
 
-// Splitter callback for breaking a user query string into simple
-// terms and phrases.
+// Splitter callback for breaking a user string into simple terms and
+// phrases. This is for parts of the user entry which would appear as
+// a single word because there is no white space inside, but are
+// actually multiple terms to rcldb (e.g. term1,term2)
 class wsQData : public TextSplitCB {
  public:
     wsQData(const StopList &_stops)
@@ -191,32 +195,33 @@ class wsQData : public TextSplitCB {
         return true;
     }
     const StopList &stops;
-    int alltermcount; // Count of terms including stopwords: this is
-                      // for adjusting phrase/near slack
+    // Count of terms including stopwords: this is for adjusting
+    // phrase/near slack
+    int alltermcount;
 };
 
-/**
- * Translate a user compound string as may be entered in recoll's
- * search entry fields, ex: [term1 "a phrase" term3] into a xapian
- * query tree.
- * The object keeps track of the query terms and term groups while
- * translating.
- */
+// A class used to translate a user compound string (*not* a query
+// language string) as may be entered in any_terms/all_terms search
+// entry fields, ex: [term1 "a phrase" term3] into a xapian query
+// tree.
+// The object keeps track of the query terms and term groups while
+// translating.
 class StringToXapianQ {
 public:
-    StringToXapianQ(Db& db, const string &stmlng, bool boostUser)
-        : m_db(db), m_stemlang(stmlng), m_doBoostUserTerms(boostUser)
+    StringToXapianQ(Db& db, const string& prefix,
+                    const string &stmlng, bool boostUser)
+        : m_db(db), m_prefix(prefix), m_stemlang(stmlng),
+          m_doBoostUserTerms(boostUser)
     { }
 
     bool processUserString(const string &iq,
-                           const string &prefix,
                            string &ermsg,
                            list<Xapian::Query> &pqueries,
                            const StopList &stops,
                            int slack = 0, bool useNear = false);
-
-    bool getTerms(vector<string>& terms,
-                  vector<vector<string> >& groups)
+    // After processing the string: return search terms and term
+    // groups (ie: for highlighting)
+    bool getTerms(vector<string>& terms, vector<vector<string> >& groups)
     {
         terms.insert(terms.end(), m_terms.begin(), m_terms.end());
         groups.insert(groups.end(), m_groups.begin(), m_groups.end());
@@ -226,8 +231,15 @@ public:
 private:
     void stripExpandTerm(bool dont, const string& term, list<string>& exp,
                          string& sterm);
+    // After splitting entry on whitespace: process non-phrase element
+    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
+    // Process phrase/near element
+    void processPhraseOrNear(wsQData *splitData,
+                             list<Xapian::Query> &pqueries,
+                             bool useNear, int slack);
 
     Db& m_db;
+    const string& m_prefix; // Reference, not a copy: the caller's string must outlive this object
     const string& m_stemlang;
     bool m_doBoostUserTerms;
     // Single terms and phrases resulting from breaking up text;
@@ -348,25 +360,100 @@ static void addPrefix(list<string>& terms, const string& prefix)
         it->insert(0, prefix);
 }
 
+void StringToXapianQ::processSimpleSpan(const string& span,
+                                        list<Xapian::Query> &pqueries)
+{
+    list<string> exp;
+    string sterm; // dumb version of user term
+    stripExpandTerm(false, span, exp, sterm);
+    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
+    addPrefix(exp, m_prefix);
+    // Push either term or OR of stem-expanded set
+    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
+
+    // If sterm (simplified original user term) is not null, give it a
+    // relevance boost. We do this even if no expansion occurred (else
+    // the non-expanded terms in a term list would end-up with even
+    // less wqf). This does not happen if there are wildcards anywhere
+    // in the search.
+    if (m_doBoostUserTerms && !sterm.empty()) {
+        xq = Xapian::Query(Xapian::Query::OP_OR,
+                           xq,
+                           Xapian::Query(m_prefix+sterm,
+                                         original_term_wqf_booster));
+    }
+    pqueries.push_back(xq);
+}
+
+// User entry element had several terms: transform into a PHRASE or
+// NEAR xapian query, the elements of which can themselves be OR
+// queries if the terms get expanded by stemming or wildcards (we
+// don't do stemming for PHRASE though)
+void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
+                                          list<Xapian::Query> &pqueries,
+                                          bool useNear, int slack)
+{
+    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
+        Xapian::Query::OP_PHRASE;
+    list<Xapian::Query> orqueries;
+    bool hadmultiple = false;
+    vector<vector<string> >groups;
+
+    // Go through the list and perform stem/wildcard expansion for each element
+    for (vector<string>::iterator it = splitData->terms.begin();
+         it != splitData->terms.end(); it++) {
+        // Adjust when we do stem expansion. Not inside phrases, and
+        // some versions of xapian will accept only one OR clause
+        // inside NEAR, all others must be leafs.
+        bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
+
+        string sterm;
+        list<string>exp;
+        stripExpandTerm(nostemexp, *it, exp, sterm);
+        groups.push_back(vector<string>(exp.begin(), exp.end()));
+        addPrefix(exp, m_prefix);
+        orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
+                                          exp.begin(), exp.end()));
+#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
+        if (exp.size() > 1)
+            hadmultiple = true;
+#endif
+    }
+
+    // Generate an appropriate PHRASE/NEAR query with adjusted slack
+    // For phrases, give a relevance boost like we do for original terms
+    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
+                     splitData->alltermcount + slack);
+    if (op == Xapian::Query::OP_PHRASE)
+        xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
+                           original_term_wqf_booster);
+    pqueries.push_back(xq);
+
+    // Add all combinations of NEAR/PHRASE groups to the highlighting data.
+    vector<vector<string> > allcombs;
+    vector<string> comb;
+    multiply_groups(groups.begin(), groups.end(), comb, allcombs);
+    m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
+}
+
 /**
- * Turn string into list of xapian queries. There is little
- * interpretation done on the string (no +term -term or filename:term
- * stuff). We just separate words and phrases, and interpret
- * capitalized terms as wanting no stem expansion.
+ * Turn user entry string (NOT query language) into a list of xapian queries.
+ * We just separate words and phrases, and do wildcard and stem expansion.
+ *
  * The final list contains one query for each term or phrase
  *   - Elements corresponding to a stem-expanded part are an OP_OR
  *     composition of the stem-expanded terms (or a single term query).
- *   - Elements corresponding to a phrase are an OP_PHRASE composition of the
- *     phrase terms (no stem expansion in this case)
+ *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
+ *     composition of the phrase terms (no stem expansion in this case)
  * @return the subquery count (either or'd stem-expanded terms or phrase word
  *   count)
  */
 bool StringToXapianQ::processUserString(const string &iq,
-                                        const string &prefix,
                                         string &ermsg,
                                         list<Xapian::Query> &pqueries,
                                         const StopList& stops,
-                                        int slack, bool useNear
+                                        int slack,
+                                        bool useNear
                                         )
 {
     LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
@@ -374,11 +461,12 @@ bool StringToXapianQ::processUserString(const string &iq,
     m_terms.clear();
     m_groups.clear();
 
-    // Split input into user-level words and double-quoted phrases:
-    // word1 word2 "this is a phrase". The text splitter may still
-    // decide that the resulting "words" are really phrases, this
-    // depends on separators: [paul@dom.net] would still be a word
-    // (span), but [about:me] will probably be handled as a phrase.
+    // Simple whitespace-split input into user-level words and
+    // double-quoted phrases: word1 word2 "this is a phrase". The text
+    // splitter may further still decide that the resulting "words"
+    // are really phrases, this depends on separators: [paul@dom.net]
+    // would still be a word (span), but [about:me] will probably be
+    // handled as a phrase.
     list<string> phrases;
     TextSplit::stringToStrings(iq, phrases);
 
@@ -387,10 +475,10 @@ bool StringToXapianQ::processUserString(const string &iq,
     try {
         for (list<string>::iterator it = phrases.begin();
              it != phrases.end(); it++) {
-            LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
+            LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
 
             // If there are multiple spans in this element, including
-            // at least one composite, we have to do something
+            // at least one composite, we have to increase the slack
             // else a phrase query including a span would fail.
             // Ex: "term0@term1 term2" is onlyspans-split as:
             // 0 term0@term1 0 12
@@ -419,81 +507,15 @@ bool StringToXapianQ::processUserString(const string &iq,
                 // used to:
                 splitData = &splitDataW;
             }
-            LOGDEB(("strToXapianQ: splitter term count: %d\n",
-                    splitData->terms.size()));
+            LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
             switch (splitData->terms.size()) {
-            case 0: continue;// ??
+            case 0:
+                continue;// ??
             case 1:
-                // Just a term. Still may be expanded (by stem or
-                // wildcard) to an OR list.
-                {
-                    string term = splitData->terms.front();
-                    list<string> exp;
-                    string sterm; // dumb version of user term
-                    stripExpandTerm(false, term, exp, sterm);
-                    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
-                    // Push either term or OR of stem-expanded set
-                    addPrefix(exp, prefix);
-                    Xapian::Query xq(Xapian::Query::OP_OR,
-                                     exp.begin(), exp.end());
-
-                    // If sterm is not null, give a relevance boost to
-                    // the original term. We do this even if no
-                    // expansion occurred (else the non-expanded terms
-                    // in a term list would end-up with even less
-                    // wqf). This does not happen if there are
-                    // wildcards anywhere in the search.
-                    if (m_doBoostUserTerms && !sterm.empty()) {
-                        xq = Xapian::Query(Xapian::Query::OP_OR, xq,
-                                           Xapian::Query(prefix+sterm, 10));
-                    }
-                    pqueries.push_back(xq);
-                }
+                processSimpleSpan(splitData->terms.front(), pqueries);
                 break;
             default:
-                // Element had several terms: transform into a PHRASE
-                // or NEAR xapian query, the elements of which can
-                // themselves be OR queries if the terms get expanded
-                // by stemming or wildcards (we don't do stemming for
-                // PHRASE though)
-                Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
-                    Xapian::Query::OP_PHRASE;
-                list<Xapian::Query> orqueries;
-                bool hadmultiple = false;
-                vector<vector<string> >groups;
-                for (vector<string>::iterator it = splitData->terms.begin();
-                     it != splitData->terms.end(); it++) {
-                    // Some version of xapian will accept only one OR clause
-                    // inside NEAR, all others must be leafs
-                    bool nostemexp =
-                        op == Xapian::Query::OP_PHRASE || hadmultiple;
-
-                    string sterm;
-                    list<string>exp;
-                    stripExpandTerm(nostemexp, *it, exp, sterm);
-                    groups.push_back(vector<string>(exp.begin(), exp.end()));
-                    addPrefix(exp, prefix);
-                    orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
-                                                      exp.begin(), exp.end()));
-#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
-                    if (exp.size() > 1)
-                        hadmultiple = true;
-#endif
-                }
-
-                pqueries.push_back(Xapian::Query(op,
-                                                 orqueries.begin(),
-                                                 orqueries.end(),
-                                                 splitData->alltermcount
-                                                 + slack));
-                // Add NEAR/PHRASE groups to the highlighting data. Must
-                // push all combinations
-                vector<vector<string> > allcombs;
-                vector<string> comb;
-                multiply_groups(groups.begin(), groups.end(), comb, allcombs);
-                m_groups.insert(m_groups.end(), allcombs.begin(),
-                                allcombs.end());
+                processPhraseOrNear(splitData, pqueries, useNear, slack);
             }
         }
     } catch (const Xapian::Error &e) {
@@ -547,8 +569,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
         (m_parentSearch && !m_parentSearch->haveWildCards()) ||
         (m_parentSearch == 0 && !m_haveWildCards);
 
-    StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
-    if (!tr.processUserString(m_text, prefix, m_reason, pqueries,
+    StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm);
+    if (!tr.processUserString(m_text, m_reason, pqueries,
                               db.getStopList()))
         return false;
     if (pqueries.empty()) {
@@ -617,8 +639,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
     }
     string s = string("\"") + m_text + string("\"");
     bool useNear = (m_tp == SCLT_NEAR);
-    StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
-    if (!tr.processUserString(s, prefix, m_reason, pqueries, db.getStopList(),
+    StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm);
+    if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(),
                               m_slack, useNear))
         return false;
     if (pqueries.empty()) {