From 48bb4a0dd170289c5494bd03d1667b4ba4ed1c51 Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 17 Nov 2006 10:06:34 +0000 Subject: [PATCH] added code to remember search terms and term groups in searchdata --- src/rcldb/rcldb.cpp | 14 ++-- src/rcldb/searchdata.cpp | 175 +++++++++++++++++++++++++-------------- src/rcldb/searchdata.h | 147 +++++++++++++++++++++++++------- 3 files changed, 237 insertions(+), 99 deletions(-) diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index e2007157..ed4d905a 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.95 2006-11-15 14:57:53 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.96 2006-11-17 10:06:33 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -1252,7 +1252,7 @@ bool Db::setQuery(RefCntr sdata, int opts, m_reason.erase(); LOGDEB(("Db::setQuery:\n")); - m_filterTopDir = sdata->m_topdir; + m_filterTopDir = sdata->getTopdir(); m_dbindices.clear(); m_qOpts = opts; m_ndb->m_termfreqs.clear(); @@ -1270,11 +1270,11 @@ bool Db::setQuery(RefCntr sdata, int opts, m_ndb->enquire->set_query(m_ndb->query); m_ndb->mset = Xapian::MSet(); // Get the query description and trim the "Xapian::Query" - sdata->m_description = m_ndb->query.get_description(); - if (sdata->m_description.find("Xapian::Query") == 0) - sdata->m_description = - sdata->m_description.substr(strlen("Xapian::Query")); - LOGDEB(("Db::SetQuery: Q: %s\n", sdata->m_description.c_str())); + string d = m_ndb->query.get_description(); + if (d.find("Xapian::Query") == 0) + d.erase(0, strlen("Xapian::Query")); + sdata->setDescription(d); + LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str())); return true; } diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 210e5aa8..a471b23a 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -1,5 +1,5 @@ 
#ifndef lint -static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.3 2006-11-14 17:41:12 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.4 2006-11-17 10:06:34 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -21,10 +21,7 @@ static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.3 2006-11-14 17:41:12 dockes E // Handle translation from rcl's SearchData structures to Xapian Queries #include -#include -#ifndef NO_NAMESPACES -using namespace std; -#endif +#include #include "xapian.h" @@ -36,9 +33,13 @@ using namespace std; #include "unacpp.h" #include "utf8iter.h" +#ifndef NO_NAMESPACES +using namespace std; namespace Rcl { +#endif -typedef list::iterator qlist_it_t; +typedef vector::iterator qlist_it_t; +typedef vector::const_iterator qlist_cit_t; bool SearchData::toNativeQuery(Rcl::Db &db, void *d, const string& stemlang) { @@ -71,7 +72,7 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d, const string& stemlang) if (!m_filetypes.empty()) { list pqueries; Xapian::Query tq; - for (list::iterator it = m_filetypes.begin(); + for (vector::iterator it = m_filetypes.begin(); it != m_filetypes.end(); it++) { string term = "T" + *it; LOGDEB(("Adding file type term: [%s]\n", term.c_str())); @@ -90,6 +91,7 @@ bool SearchData::addClause(SearchDataClause* cl) { if (m_tp == SCLT_OR && (cl->m_tp == SCLT_EXCL)) { LOGERR(("SearchData::addClause: cant add EXCL to OR list\n")); + m_reason = "No Negative (AND_NOT) clauses allowed in OR queries"; return false; } m_query.push_back(cl); @@ -98,33 +100,46 @@ bool SearchData::addClause(SearchDataClause* cl) // Make me all new void SearchData::erase() { + LOGDEB(("SearchData::erase\n")); + m_tp = SCLT_AND; for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) delete *it; m_query.clear(); m_filetypes.clear(); m_topdir.erase(); m_description.erase(); + m_reason.erase(); } // Am I a file name only search ? 
This is to turn off term highlighting -bool SearchData::fileNameOnly() { +bool SearchData::fileNameOnly() +{ for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) if (!(*it)->isFileName()) return false; return true; } +// Extract all terms and term groups +bool SearchData::getTerms(vector& terms, + vector >& groups, + vector& gslks) const +{ + for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++) + (*it)->getTerms(terms, groups, gslks); + return true; +} + // Splitter callback for breaking a user query string into simple -// terms and phrases +// terms and phrases. class wsQData : public TextSplitCB { public: vector terms; // Debug string catterms() { string s; - for (unsigned int i = 0; i < terms.size(); i++) { + for (unsigned int i = 0; i < terms.size(); i++) s += "[" + terms[i] + "] "; - } return s; } bool takeword(const std::string &term, int , int, int) { @@ -132,71 +147,97 @@ class wsQData : public TextSplitCB { terms.push_back(term); return true; } - // Decapital + deaccent all terms - void dumball() { - for (vector::iterator it=terms.begin(); it !=terms.end();it++){ - string dumb; - dumb_string(*it, dumb); - *it = dumb; - } - } }; -/** Possibly expand term into its stem siblings, make them dumb strings */ -static void maybeStemExp(Db& db, const string& stemlang, const string& term, - list& exp) +// This used to be a static function, but we couldn't just keep adding +// parameters to the interface! 
+class StringToXapianQ { +public: + StringToXapianQ(Db& db) : m_db(db) { } + bool translate(const string &iq, + const string& stemlang, + string &ermsg, + list &pqueries, + int slack = 0, bool useNear = false); + bool getTerms(vector& terms, + vector >& groups) + { + terms.insert(terms.end(), m_terms.begin(), m_terms.end()); + groups.insert(groups.end(), m_groups.begin(), m_groups.end()); + return true; + } +private: + void maybeStemExp(const string& stemlang, const string& term, + list& exp); + + Db& m_db; + // Single terms and phrases resulting from breaking up text; + vector m_terms; + vector > m_groups; +}; + +/** Make term dumb and possibly expand it into its stem siblings */ +void StringToXapianQ::maybeStemExp(const string& stemlang, + const string& term, + list& exp) { - LOGDEB(("maybeStemExp: [%s]\n", term.c_str())); + LOGDEB2(("maybeStemExp: [%s]\n", term.c_str())); + if (term.empty()) { + exp.clear(); + return; + } + string term1; dumb_string(term, term1); - if (!stemlang.empty()) { - bool nostemexp = false; + + bool nostemexp = stemlang.empty() ? true : false; + if (!nostemexp) { // Check if the first letter is a majuscule in which // case we do not want to do stem expansion. 
Note that // the test is convoluted and possibly problematic - if (term.length() > 0) { - string noacterm,noaclowterm; - if (unacmaybefold(term, noacterm, "UTF-8", false) && - unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) { - Utf8Iter it1(noacterm); - Utf8Iter it2(noaclowterm); - if (*it1 != *it2) - nostemexp = true; - } - } - LOGDEB1(("Term: %s stem expansion: %s\n", - term.c_str(), nostemexp?"no":"yes")); - if (!nostemexp) { - exp = db.stemExpand(stemlang, term1); - return; + + string noacterm,noaclowterm; + if (unacmaybefold(term, noacterm, "UTF-8", false) && + unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) { + Utf8Iter it1(noacterm); + Utf8Iter it2(noaclowterm); + if (*it1 != *it2) + nostemexp = true; } + LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str(), nostemexp?"no":"yes")); } - exp.push_back(term1); + if (nostemexp) { + exp = list(1, term1); + } else { + exp = m_db.stemExpand(stemlang, term1); + } } -/** Turn string into list of xapian queries. There is little +/** + * Turn string into list of xapian queries. There is little * interpretation done on the string (no +term -term or filename:term * stuff). We just separate words and phrases, and interpret * capitalized terms as wanting no stem expansion. * The final list contains one query for each term or phrase * - Elements corresponding to a stem-expanded part are an OP_OR - * composition of the stem-expanded terms (or a single term query). + * composition of the stem-expanded terms (or a single term query). 
* - Elements corresponding to a phrase are an OP_PHRASE composition of the * phrase terms (no stem expansion in this case) * @return the subquery count (either or'd stem-expanded terms or phrase word * count) */ -static bool stringToXapianQueries(const string &iq, - const string& stemlang, - Db& db, - string &ermsg, - list &pqueries, - int slack = 0, bool useNear = false) +bool StringToXapianQ::translate(const string &iq, + const string& stemlang, + string &ermsg, + list &pqueries, + int slack, bool useNear) { string qstring = iq; bool opt_stemexp = !stemlang.empty(); ermsg.erase(); + m_terms.clear(); + m_groups.clear(); // Split into words and phrases (word1 word2 "this is a phrase"): list phrases; @@ -231,10 +272,11 @@ static bool stringToXapianQueries(const string &iq, { string term = splitData.terms.front(); list exp; - maybeStemExp(db, stemlang, term, exp); + maybeStemExp(stemlang, term, exp); // Push either term or OR of stem-expanded set pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, exp.begin(), exp.end())); + m_terms.insert(m_terms.end(), exp.begin(), exp.end()); } break; @@ -245,14 +287,18 @@ static bool stringToXapianQueries(const string &iq, list orqueries; bool hadmultiple = false; string nolang, lang; + vector dumbterms; for (vector::iterator it = splitData.terms.begin(); it != splitData.terms.end(); it++) { listexp; lang = (op == Xapian::Query::OP_PHRASE || hadmultiple) ? 
nolang : stemlang; - maybeStemExp(db, lang, *it, exp); - if (exp.size() > 1) + maybeStemExp(lang, *it, exp); + dumbterms.insert(dumbterms.end(), exp.begin(), exp.end()); +#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF + if (exp.size() > 1) hadmultiple = true; +#endif orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, exp.begin(), exp.end())); } @@ -260,6 +306,7 @@ static bool stringToXapianQueries(const string &iq, orqueries.begin(), orqueries.end(), splitData.terms.size() + slack)); + m_groups.push_back(dumbterms); } } } catch (const Xapian::Error &e) { @@ -282,12 +329,15 @@ static bool stringToXapianQueries(const string &iq, bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, const string& stemlang) { + m_terms.clear(); + m_groups.clear(); Xapian::Query *qp = (Xapian::Query *)p; *qp = Xapian::Query(); Xapian::Query::op op; switch (m_tp) { case SCLT_AND: op = Xapian::Query::OP_AND; break; + // EXCL will be set with AND_NOT in the list. So it's an OR list here case SCLT_OR: case SCLT_EXCL: op = Xapian::Query::OP_OR; break; default: @@ -295,12 +345,14 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, return false; } list pqueries; - if (!stringToXapianQueries(m_text, stemlang, db, m_reason, pqueries)) + StringToXapianQ tr(db); + if (!tr.translate(m_text, stemlang, m_reason, pqueries)) return false; if (pqueries.empty()) { LOGERR(("SearchDataClauseSimple: resolved to null query\n")); return true; } + tr.getTerms(m_terms, m_groups); *qp = Xapian::Query(op, pqueries.begin(), pqueries.end()); return true; } @@ -319,28 +371,31 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, return true; } -// Translate NEAR or PHRASE clause. We're not handling the distance parameter -// yet. +// Translate NEAR or PHRASE clause. 
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, const string& stemlang) { + m_terms.clear(); + m_groups.clear(); + Xapian::Query *qp = (Xapian::Query *)p; *qp = Xapian::Query(); list pqueries; Xapian::Query nq; + + // Use stringToXapianQueries to lowercase and simplify the phrase + // terms etc. The result should be a single element list string s = string("\"") + m_text + string("\""); bool useNear = m_tp == SCLT_NEAR; - - // Use stringToXapianQueries anyway to lowercase and simplify the - // phrase terms etc. The result should be a single element list - if (!stringToXapianQueries(s, stemlang, db, m_reason, pqueries, - m_slack, useNear)) + StringToXapianQ tr(db); + if (!tr.translate(s, stemlang, m_reason, pqueries, m_slack, useNear)) return false; if (pqueries.empty()) { LOGERR(("SearchDataClauseDist: resolved to null query\n")); return true; } + tr.getTerms(m_terms, m_groups); *qp = *pqueries.begin(); return true; } diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index 0fb989ee..79800c2c 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -1,16 +1,38 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ #ifndef _SEARCHDATA_H_INCLUDED_ #define _SEARCHDATA_H_INCLUDED_ -/* @(#$Id: searchdata.h,v 1.5 2006-11-15 14:57:53 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: searchdata.h,v 1.6 2006-11-17 10:06:34 dockes Exp $ (C) 2004 J.F.Dockes */ +/** + * Structures to hold data coming almost directly from the gui + * search fields and handle its translation to Xapian queries. + * This is not generic code, it reflects the choices made for the user + * interface, and it also knows some specifics of recoll's usage of Xapian + * (ie: term prefixes) + */ #include -#include +#include #include "rcldb.h" #ifndef NO_NAMESPACES -using std::list; +using std::vector; using std::string; - namespace Rcl { #endif // NO_NAMESPACES @@ -24,20 +46,14 @@ enum SClType { class SearchDataClause; /** - * Holder for a list of search clauses. Some of the clauses can be comples - * subqueries. + * Holder for a list of search clauses. Some of the clauses may be references + * to other subqueries in the future. For now, they just reflect user entry in + * a query field: type, some text and possibly a distance. Each clause may + * hold several queries in the Xapian sense, for example several terms + * and phrases as would result from ["this is a phrase" term1 term2] */ class SearchData { - public: - SClType m_tp; // Only SCLT_AND or SCLT_OR here - list m_query; - list m_filetypes; // Restrict to filetypes if set. - string m_topdir; // Restrict to subtree. - // Printable expanded version of the complete query, obtained from Xapian - // valid after setQuery() call - string m_description; - string m_reason; - +public: SearchData(SClType tp) : m_tp(tp) {} ~SearchData() {erase();} @@ -53,58 +69,124 @@ class SearchData { /** We become the owner of cl and will delete it */ bool addClause(SearchDataClause *cl); + /** Retrieve error description */ string getReason() {return m_reason;} - private: + /** Get terms and phrase/near groups. 
Used in the GUI for highlighting + * The groups and gslks vectors are parallel and hold the phrases/near + * string groups and their associated slacks (distance in excess of group + * size) + */ + bool getTerms(vector& terms, + vector >& groups, vector& gslks) const; + /** + * Get/set the description field which is retrieved from xapian after + * initializing the query. It is stored here for usage in the GUI. + */ + string getDescription() {return m_description;} + void setDescription(const string& d) {m_description = d;} + string getTopdir() {return m_topdir;} + void setTopdir(const string& t) {m_topdir = t;} + void addFiletype(const string& ft) {m_filetypes.push_back(ft);} +private: + SClType m_tp; // Only SCLT_AND or SCLT_OR here + vector m_query; + vector m_filetypes; // Restrict to filetypes if set. + string m_topdir; // Restrict to subtree. + // Printable expanded version of the complete query, retrieved/set + // from rcldb after the Xapian::setQuery() call + string m_description; + string m_reason; + /* Copyconst and assignment private and forbidden */ SearchData(const SearchData &) {} SearchData& operator=(const SearchData&) {return *this;}; }; class SearchDataClause { - public: - SClType m_tp; - +public: SearchDataClause(SClType tp) : m_tp(tp) {} virtual ~SearchDataClause() {} - virtual bool toNativeQuery(Rcl::Db &db, void *, const string&) = 0; - virtual bool isFileName() {return m_tp == SCLT_FILENAME ? true : false;} - string getReason() {return m_reason;} - protected: - string m_reason; -}; + virtual bool toNativeQuery(Rcl::Db &db, void *, const string&) = 0; + + virtual bool isFileName() const {return m_tp==SCLT_FILENAME ? true: false;} + + virtual string getReason() const {return m_reason;} + + virtual bool getTerms(vector&, vector >&, + vector&) const + {return true;} + virtual SClType getTp() {return m_tp;} + + friend class SearchData; + +protected: + string m_reason; + SClType m_tp; +}; + +/** + * "Simple" data clause with user-entered query text. 
This can include + * multiple phrases and words, but no specified distance. + */ class SearchDataClauseSimple : public SearchDataClause { public: SearchDataClauseSimple(SClType tp, string txt) - : SearchDataClause(tp), m_text(txt) {} + : SearchDataClause(tp), m_text(txt), m_slack(0) {} virtual ~SearchDataClauseSimple() {} + virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang); + + virtual bool getTerms(vector& terms, + vector >& groups, + vector& gslks) const + { + terms.insert(terms.end(), m_terms.begin(), m_terms.end()); + groups.insert(groups.end(), m_groups.begin(), m_groups.end()); + gslks.insert(gslks.end(), m_groups.size(), m_slack); + return true; + } + protected: string m_text; + // Single terms and phrases resulting from breaking up m_text; + // valid after toNativeQuery() call + vector m_terms; + vector > m_groups; + // Declare m_slack here. Always 0, but allows getTerms to work for + // SearchDataClauseDist + int m_slack; }; +/** Filename search. */ class SearchDataClauseFilename : public SearchDataClauseSimple { - public: +public: SearchDataClauseFilename(string txt) : SearchDataClauseSimple(SCLT_FILENAME, txt) {} virtual ~SearchDataClauseFilename() {} virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang); }; +/** + * A clause coming from a NEAR or PHRASE entry field. There is only one + * string group, and a specified distance, which applies to it. + */ class SearchDataClauseDist : public SearchDataClauseSimple { public: SearchDataClauseDist(SClType tp, string txt, int slack) - : SearchDataClauseSimple(tp, txt), m_slack(slack) {} + : SearchDataClauseSimple(tp, txt) {m_slack = slack;} virtual ~SearchDataClauseDist() {} + virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang); -protected: - int m_slack; + // m_slack is declared in SearchDataClauseSimple }; +#ifdef NOTNOW +/** Future pointer to subquery ? 
*/ class SearchDataClauseSub : public SearchDataClause { - public: +public: SearchDataClauseSub(SClType tp, SClType stp) : SearchDataClause(tp), m_sub(stp) {} virtual ~SearchDataClauseSub() {} @@ -113,6 +195,7 @@ class SearchDataClauseSub : public SearchDataClause { protected: SearchData m_sub; }; +#endif // NOTNOW } // Namespace Rcl #endif /* _SEARCHDATA_H_INCLUDED_ */