From e5c320ca5146899519e5c445b9a036ec6a88add5 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 24 Apr 2021 13:48:16 +0200 Subject: [PATCH] Add support for "issub" special field specifying that the results should be standalone (issub:0) or embedded (issub:1) --- src/query/wasaparseaux.cpp | 45 +++++++++++++++--------------------- src/query/wasaparserdriver.h | 27 +++++++++++----------- src/rcldb/rcldb.cpp | 4 ++-- src/rcldb/rcldb.h | 14 +++++++++++ src/rcldb/rclquery.cpp | 39 +++++++++++++++++++++++++------ src/rcldb/rclquery.h | 10 ++++---- src/rcldb/rclquery_p.h | 12 ++++++---- src/rcldb/searchdata.h | 17 ++++++++++++-- 8 files changed, 107 insertions(+), 61 deletions(-) diff --git a/src/query/wasaparseaux.cpp b/src/query/wasaparseaux.cpp index 7aa0fb1e..db702c5d 100644 --- a/src/query/wasaparseaux.cpp +++ b/src/query/wasaparseaux.cpp @@ -50,19 +50,6 @@ std::shared_ptr wasaStringToRcl( return sd; } -WasaParserDriver::WasaParserDriver(const RclConfig *c, const std::string sl, - const std::string& as) - : m_stemlang(sl), m_autosuffs(as), m_config(c), - m_index(0), m_result(0), m_haveDates(false), - m_maxSize(-1), m_minSize(-1) -{ - -} - -WasaParserDriver::~WasaParserDriver() -{ -} - SearchData *WasaParserDriver::parse(const std::string& in) { m_input = in; @@ -83,13 +70,11 @@ SearchData *WasaParserDriver::parse(const std::string& in) return m_result; // Set the top level filters (types, dates, size) - for (vector::const_iterator it = m_filetypes.begin(); - it != m_filetypes.end(); it++) { - m_result->addFiletype(*it); + for (const auto& ft : m_filetypes) { + m_result->addFiletype(ft); } - for (vector::const_iterator it = m_nfiletypes.begin(); - it != m_nfiletypes.end(); it++) { - m_result->remFiletype(*it); + for (const auto& ft : m_nfiletypes) { + m_result->remFiletype(ft); } if (m_haveDates) { m_result->setDateSpan(&m_dates); @@ -100,6 +85,10 @@ SearchData *WasaParserDriver::parse(const std::string& in) if (m_maxSize != -1) { m_result->setMaxSize(m_maxSize); } + if (m_subSpec != Rcl::SearchData::SUBDOC_ANY) { + m_result->setSubSpec(m_subSpec); + } + //if (m_result) m_result->dump(cout); return m_result; } @@ -122,8 +111,7 @@ void WasaParserDriver::UNGETCHAR(int c) // Add clause to query, handling special pseudo-clauses for size/date // etc. (mostly determined on field name). -bool WasaParserDriver::addClause(SearchData *sd, - SearchDataClauseSimple* cl) +bool WasaParserDriver::addClause(SearchData *sd, SearchDataClauseSimple* cl) { if (cl->getfield().empty()) { // Simple clause with empty field spec. @@ -132,7 +120,7 @@ bool WasaParserDriver::addClause(SearchData *sd, if (!m_autosuffs.empty()) { vector asfv; if (stringToStrings(m_autosuffs, asfv)) { - if (find_if(asfv.begin(), asfv.end(), + if (find_if(asfv.begin(), asfv.end(), StringIcmpPred(cl->gettext())) != asfv.end()) { cl->setfield("ext"); cl->addModifier(SearchDataClause::SDCM_NOSTEMMING); @@ -156,6 +144,13 @@ bool WasaParserDriver::addClause(SearchData *sd, return false; } + // Filtering for standalone- or sub-documents + if (!fld.compare("issub")) { + m_subSpec = atoi(cl->gettext().c_str()); + delete cl; + return false; + } + if (!fld.compare("rclcat") || !fld.compare("type")) { vector mtypes; if (m_config && m_config->getMimeCatTypes(cl->gettext(), mtypes)) { @@ -231,8 +226,7 @@ bool WasaParserDriver::addClause(SearchData *sd, if (!fld.compare("dir")) { // dir filtering special case - SearchDataClausePath *nclause = - new SearchDataClausePath(cl->gettext(), cl->getexclude()); + SearchDataClausePath *nclause = new SearchDataClausePath(cl->gettext(), cl->getexclude()); delete cl; return sd->addClause(nclause); } @@ -258,8 +252,7 @@ bool WasaParserDriver::addClause(SearchData *sd, } if (tp != SCLT_FILENAME) { - SearchDataClauseSimple *ncl = - new SearchDataClauseSimple(tp, ns, ofld); + SearchDataClauseSimple *ncl = new SearchDataClauseSimple(tp, ns, ofld); delete cl; return sd->addClause(ncl); } diff --git a/src/query/wasaparserdriver.h b/src/query/wasaparserdriver.h index 7c75a921..a76c2e54 100644 --- a/src/query/wasaparserdriver.h +++ b/src/query/wasaparserdriver.h @@ -22,14 +22,12 @@ #include #include "smallut.h" +#include "searchdata.h" class WasaParserDriver; -namespace Rcl { - class SearchData; - class SearchDataClauseSimple; -} + namespace yy { - class parser; +class parser; } class RclConfig; @@ -37,9 +35,10 @@ class RclConfig; class WasaParserDriver { public: - WasaParserDriver(const RclConfig *c, const std::string sl, - const std::string& as); - ~WasaParserDriver(); + WasaParserDriver(const RclConfig *c, const std::string sl, const std::string& as) + : m_stemlang(sl), m_autosuffs(as), m_config(c) {} + + ~WasaParserDriver() {} Rcl::SearchData *parse(const std::string&); bool addClause(Rcl::SearchData *sd, Rcl::SearchDataClauseSimple* cl); @@ -67,20 +66,20 @@ private: // input string. std::string m_input; // Current position in m_input - unsigned int m_index; + unsigned int m_index{0}; // Characters pushed-back, ready for next getchar. std::stack m_returns; // Result, set by parser. - Rcl::SearchData *m_result; + Rcl::SearchData *m_result{nullptr}; // Storage for top level filters std::vector m_filetypes; std::vector m_nfiletypes; - bool m_haveDates; + bool m_haveDates{false}; DateInterval m_dates; // Restrict to date interval - int64_t m_maxSize; - int64_t m_minSize; - + int64_t m_maxSize{-1}; + int64_t m_minSize{-1}; + int m_subSpec{Rcl::SearchData::SUBDOC_ANY}; std::string m_reason; // Let the quoted string reader store qualifiers in there, simpler diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index e367cb08..af0882fe 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -2557,7 +2557,7 @@ bool Db::getSubDocs(const Doc &idoc, vector& subdocs) LOGERR("Db::getSubDocs: xapian error: " << m_reason << "\n"); return false; } - if (xit == xdoc.termlist_end()) { + if (xit == xdoc.termlist_end() || get_prefix(*xit) != parent_prefix) { LOGERR("Db::getSubDocs: parent term not found\n"); return false; } @@ -2642,7 +2642,7 @@ bool Db::getContainerDoc(const Doc &idoc, Doc& ctdoc) LOGERR("Db::getContainerDoc: xapian error: " << m_reason << "\n"); return false; } - if (xit == xdoc.termlist_end()) { + if (xit == xdoc.termlist_end() || get_prefix(*xit) != parent_prefix) { LOGERR("Db::getContainerDoc: parent term not found\n"); return false; } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 7728874b..88a92cae 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -151,6 +151,20 @@ inline string strip_prefix(const string& trm) return trm.substr(st); } +inline string get_prefix(const string& trm) +{ + if (!has_prefix(trm)) + return trm; + string::size_type st = 0; + if (o_index_stripchars) { + st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ"); + return trm.substr(0, st); + } else { + st = trm.find_last_of(":") + 1; + return trm.substr(1, st-2); + } +} + inline string wrap_prefix(const string& pfx) { if (o_index_stripchars) { diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index 795d14a9..19b88f79 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -152,8 +152,7 @@ private: }; Query::Query(Db *db) - : m_nq(new Native(this)), m_db(db), m_sorter(0), m_sortAscending(true), - m_collapseDuplicates(false), m_resCnt(-1), m_snipMaxPosWalk(1000000) + : m_nq(new Native(this)), m_db(db) { if (db) db->getConf()->getConfParam("snippetMaxPosWalk", &m_snipMaxPosWalk); @@ -179,6 +178,27 @@ void Query::setSortBy(const string& fld, bool ascending) { (m_sortAscending ? "ascending" : "descending") << "\n"); } +static const string parent_prefix{"F"}; + +class SubdocDecider : public Xapian::MatchDecider { +public: + SubdocDecider(bool sel) : MatchDecider(), m_select(sel) {} + virtual ~SubdocDecider() {} + + virtual bool operator()(const Xapian::Document &doc) const { + bool hasparent{false}; + try { + Xapian::TermIterator xit = doc.termlist_begin(); + xit.skip_to(wrap_prefix(parent_prefix)); + hasparent = (xit != doc.termlist_end()) && (get_prefix(*xit) == parent_prefix); + } catch (...) { + } + return hasparent == m_select; + } + + bool m_select; +}; + // Prepare query out of user search data bool Query::setQuery(std::shared_ptr sdata) { @@ -199,8 +219,13 @@ bool Query::setQuery(std::shared_ptr sdata) m_reason += sdata->getReason(); return false; } - m_nq->xquery = xq; + + if (sdata->getSubSpec() == SearchData::SUBDOC_NO) { + m_nq->subdecider = new SubdocDecider(false); + } else if (sdata->getSubSpec() == SearchData::SUBDOC_YES) { + m_nq->subdecider = new SubdocDecider(true); + } string d; for (int tries = 0; tries < 2; tries++) { @@ -361,7 +386,8 @@ int Query::getResCnt(int checkatleast, bool useestimate) Chrono chron; XAPTRY(if (checkatleast == -1) checkatleast = m_db->docCnt(); - m_nq->xmset = m_nq->xenquire->get_mset(0, qquantum, checkatleast), + m_nq->xmset = m_nq->xenquire->get_mset( + 0, qquantum, checkatleast, 0, m_nq->subdecider), m_db->m_ndb->xrdb, m_reason); if (!m_reason.empty()) { LOGERR("xenquire->get_mset: exception: " << m_reason << "\n"); @@ -401,10 +427,9 @@ bool Query::getDoc(int xapi, Doc &doc, bool fetchtext) if (!(xapi >= first && xapi <= last)) { LOGDEB("Fetching for first " << xapi << ", count " << qquantum << "\n"); - XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(xapi, qquantum, - (const Xapian::RSet *)0), + XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset( + xapi, qquantum, nullptr, m_nq->subdecider), m_db->m_ndb->xrdb, m_reason); - if (!m_reason.empty()) { LOGERR("enquire->get_mset: exception: " << m_reason << "\n"); return false; diff --git a/src/rcldb/rclquery.h b/src/rcldb/rclquery.h index 76d04016..cade3650 100644 --- a/src/rcldb/rclquery.h +++ b/src/rcldb/rclquery.h @@ -139,13 +139,13 @@ public: private: std::string m_reason; // Error explanation Db *m_db; - void *m_sorter; + void *m_sorter{nullptr}; std::string m_sortField; - bool m_sortAscending; - bool m_collapseDuplicates; - int m_resCnt; + bool m_sortAscending{true}; + bool m_collapseDuplicates{false}; + int m_resCnt{-1}; std::shared_ptr m_sd; - int m_snipMaxPosWalk; + int m_snipMaxPosWalk{1000000}; }; #ifndef NO_NAMESPACES diff --git a/src/rcldb/rclquery_p.h b/src/rcldb/rclquery_p.h index 7d44e626..65279ea7 100644 --- a/src/rcldb/rclquery_p.h +++ b/src/rcldb/rclquery_p.h @@ -32,24 +32,26 @@ namespace Rcl { class Query::Native { public: // The query I belong to - Query *m_q; + Query *m_q{nullptr}; // query descriptor: terms and subqueries joined by operators // (or/and etc...) Xapian::Query xquery; // Open query descriptor. - Xapian::Enquire *xenquire; + Xapian::Enquire *xenquire{nullptr}; // Partial result set Xapian::MSet xmset; // Term frequencies for current query. See makeAbstract, setQuery std::map termfreqs; - + Xapian::MatchDecider *subdecider{nullptr}; + Native(Query *q) - : m_q(q), xenquire(0) { } + : m_q(q), xenquire(0) {} ~Native() { clear(); } void clear() { - delete xenquire; xenquire = 0; + deleteZ(xenquire); + deleteZ(subdecider); termfreqs.clear(); } /** Return a list of terms which matched for a specific result document */ diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index c0b891e8..dd2460ae 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -114,6 +114,17 @@ public: void setMinSize(int64_t size) {m_minSize = size;} void setMaxSize(int64_t size) {m_maxSize = size;} + enum SubdocSpec {SUBDOC_ANY = -1, SUBDOC_NO = 0, SUBDOC_YES = 1}; + void setSubSpec(int spec) { + switch (spec) { + case SUBDOC_ANY: + case SUBDOC_NO: + case SUBDOC_YES: + m_subspec = spec; + } + } + int getSubSpec() {return m_subspec;} + /** Set date span for filtering results */ void setDateSpan(DateInterval *dip) {m_dates = *dip; m_haveDates = true;} @@ -174,12 +185,14 @@ private: std::shared_ptr m_autophrase; // Special stuff produced by input which looks like a clause but means - // something else (date and size specs) + // something else (date, size specs, etc.) bool m_haveDates{false}; DateInterval m_dates; // Restrict to date interval int64_t m_maxSize{-1}; int64_t m_minSize{-1}; - + // Filtering for subdocs: -1:any, 0: only free-standing, 1: only subdocs + int m_subspec{SUBDOC_ANY}; + // Printable expanded version of the complete query, retrieved/set // from rcldb after the Xapian::setQuery() call std::string m_description;