From bf46e6ca0e1244a83f251e49938ba5824f8994c5 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sun, 21 Aug 2022 14:14:07 +0200 Subject: [PATCH] Search: change query processing a bit so that we can use OP_FILTER for path selection --- src/RECOLL-VERSION.txt | 2 +- src/common/autoconfig-mac.h | 4 +- src/common/autoconfig-win.h | 4 +- src/rcldb/searchdata.cpp | 121 ++++++++++++------------------ src/rcldb/searchdata.h | 11 ++- src/rcldb/searchdatatox.cpp | 27 ++++--- tests/langparser1/langparser1.txt | 6 +- 7 files changed, 82 insertions(+), 93 deletions(-) diff --git a/src/RECOLL-VERSION.txt b/src/RECOLL-VERSION.txt index b9488ec7..7aa332e4 100644 --- a/src/RECOLL-VERSION.txt +++ b/src/RECOLL-VERSION.txt @@ -1 +1 @@ -1.32.8 +1.33.0 diff --git a/src/common/autoconfig-mac.h b/src/common/autoconfig-mac.h index 95d3e346..7242c76e 100644 --- a/src/common/autoconfig-mac.h +++ b/src/common/autoconfig-mac.h @@ -125,7 +125,7 @@ #define PACKAGE_NAME "Recoll" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Recoll 1.32.8" +#define PACKAGE_STRING "Recoll 1.33.0" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "recoll" @@ -134,7 +134,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.32.8" +#define PACKAGE_VERSION "1.33.0" /* putenv parameter is const */ /* #undef PUTENV_ARG_CONST */ diff --git a/src/common/autoconfig-win.h b/src/common/autoconfig-win.h index b3fb34d2..66a60fed 100644 --- a/src/common/autoconfig-win.h +++ b/src/common/autoconfig-win.h @@ -118,7 +118,7 @@ #define PACKAGE_NAME "Recoll" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Recoll 1.32.8" +#define PACKAGE_STRING "Recoll 1.33.0" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "recoll" @@ -127,7 +127,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.32.8" +#define PACKAGE_VERSION "1.33.0" /* putenv parameter is const */ /* #undef PUTENV_ARG_CONST */ diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 342fc901..7b7e254f 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2006 J.F.Dockes +/* Copyright (C) 2006-2022 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -50,14 +50,11 @@ using namespace std; namespace Rcl { -typedef vector::iterator qlist_it_t; -typedef vector::const_iterator qlist_cit_t; - SearchData::~SearchData() { - LOGDEB0("SearchData::~SearchData\n" ); - for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) - delete *it; + LOGDEB0("SearchData::~SearchData\n"); + for (auto& clausep : m_query) + delete clausep; } // This is called by the GUI simple search if the option is set: add @@ -66,44 +63,42 @@ SearchData::~SearchData() // We remove very common terms from the query to avoid performance issues. bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold) { - LOGDEB0("SearchData::maybeAddAutoPhrase()\n" ); + LOGDEB0("SearchData::maybeAddAutoPhrase()\n"); // cerr << "BEFORE SIMPLIFY\n"; dump(cerr); simplify(); // cerr << "AFTER SIMPLIFY\n"; dump(cerr); - if (!m_query.size()) { - LOGDEB2("SearchData::maybeAddAutoPhrase: empty query\n" ); + if (m_query.empty()) { + LOGDEB2("SearchData::maybeAddAutoPhrase: empty query\n"); return false; } string field; + auto clp0 = dynamic_cast(*m_query.begin()); + if (clp0) + field = clp0->getfield(); vector words; // Walk the clause list. If this is not an AND list, we find any // non simple clause or different field names, bail out. - for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) { - SClType tp = (*it)->m_tp; + for (auto& clausep : m_query) { + SClType tp = clausep->m_tp; if (tp != SCLT_AND) { - LOGDEB2("SearchData::maybeAddAutoPhrase: wrong tp " << (tp) << "\n" ); + LOGDEB2("SearchData::maybeAddAutoPhrase: wrong tp " << tp << "\n"); return false; } - SearchDataClauseSimple *clp = - dynamic_cast(*it); + auto clp = dynamic_cast(clausep); if (clp == 0) { - LOGDEB2("SearchData::maybeAddAutoPhrase: dyncast failed\n" ); + LOGDEB2("SearchData::maybeAddAutoPhrase: other than clauseSimple in query.\n"); return false; } - if (it == m_query.begin()) { - field = clp->getfield(); - } else { - if (clp->getfield().compare(field)) { - LOGDEB2("SearchData::maybeAddAutoPhrase: diff. fields\n" ); - return false; - } + if (clp->getfield().compare(field)) { + LOGDEB2("SearchData::maybeAddAutoPhrase: diff. fields\n"); + return false; } // If there are wildcards or quotes in there, bail out if (clp->gettext().find_first_of("\"*[?") != string::npos) { - LOGDEB2("SearchData::maybeAddAutoPhrase: wildcards\n" ); + LOGDEB2("SearchData::maybeAddAutoPhrase: wildcards\n"); return false; } @@ -124,16 +119,15 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold) if (!doccnt) doccnt = 1; string swords; - for (vector::iterator it = words.begin(); - it != words.end(); it++) { - double freq = double(db.termDocCnt(*it)) / doccnt; + for (const auto& word : words) { + double freq = double(db.termDocCnt(word)) / doccnt; if (freq < freqThreshold) { if (!swords.empty()) swords.append(1, ' '); - swords += *it; + swords += word; } else { - LOGDEB0("SearchData::Autophrase: [" << *it << "] too frequent (" - << (100 * freq) << " %" << ")\n" ); + LOGDEB0("SearchData::Autophrase: [" << word << "] too frequent (" + << (100 * freq) << " %" << ")\n"); slack++; } } @@ -141,7 +135,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold) // We can't make a phrase with a single word :) int nwords = TextSplit::countWords(swords); if (nwords <= 1) { - LOGDEB2("SearchData::maybeAddAutoPhrase: ended with 1 word\n" ); + LOGDEB2("SearchData::maybeAddAutoPhrase: ended with 1 word\n"); return false; } @@ -149,8 +143,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold) // an actual user-entered phrase slack += 1 + nwords / 3; - m_autophrase = std::shared_ptr( - new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field)); + m_autophrase = make_shared(SCLT_PHRASE, swords, slack, field); return true; } @@ -158,7 +151,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold) bool SearchData::addClause(SearchDataClause* cl) { if (m_tp == SCLT_OR && cl->getexclude()) { - LOGERR("SearchData::addClause: cant add EXCL to OR list\n" ); + LOGERR("SearchData::addClause: cant add EXCL to OR list\n"); m_reason = "No Negative (AND_NOT) clauses allowed in OR queries"; return false; } @@ -172,27 +165,30 @@ bool SearchData::addClause(SearchDataClause* cl) // There can't be a subclause in a filename search: no possible need to recurse bool SearchData::fileNameOnly() { - for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) - if (!(*it)->isFileName()) + for (const auto& clausep : m_query) { + if (!clausep->isFileName()) return false; + } return true; } // The query language creates a lot of subqueries. See if we can merge them. void SearchData::simplify() { + LOGDEB0("SearchData::simplify()\n"); for (unsigned int i = 0; i < m_query.size(); i++) { if (m_query[i]->m_tp != SCLT_SUB) continue; - //C[est ce dyncast qui crashe?? - SearchDataClauseSub *clsubp = - dynamic_cast(m_query[i]); - if (clsubp == 0) { + + auto clsubp = dynamic_cast(m_query[i]); + if (nullptr == clsubp) { // ?? continue; } - if (clsubp->getSub()->m_tp != m_tp) + if (clsubp->getSub()->m_tp != m_tp) { + LOGDEB0("Not simplifying because sub has differing m_tp\n"); continue; + } clsubp->getSub()->simplify(); @@ -211,8 +207,8 @@ void SearchData::simplify() clsubp->getSub()->m_filetypes.begin(), clsubp->getSub()->m_filetypes.end()); m_nfiletypes.insert(m_nfiletypes.end(), - clsubp->getSub()->m_nfiletypes.begin(), - clsubp->getSub()->m_nfiletypes.end()); + clsubp->getSub()->m_nfiletypes.begin(), + clsubp->getSub()->m_nfiletypes.end()); if (clsubp->getSub()->m_haveDates && !m_haveDates) { m_dates = clsubp->getSub()->m_dates; } @@ -220,34 +216,17 @@ void SearchData::simplify() m_maxSize = clsubp->getSub()->m_maxSize; if (m_minSize == -1) m_minSize = clsubp->getSub()->m_minSize; - m_haveWildCards = m_haveWildCards || - clsubp->getSub()->m_haveWildCards; + m_haveWildCards = m_haveWildCards || clsubp->getSub()->m_haveWildCards; // And then let the clauses processing go on, there are // none anyway, we will just delete the subquery. } - bool allsametp = true; - for (qlist_it_t it1 = clsubp->getSub()->m_query.begin(); - it1 != clsubp->getSub()->m_query.end(); it1++) { - // We want all AND or OR clause, and same as our conjunction - if (((*it1)->getTp() != SCLT_AND && (*it1)->getTp() != SCLT_OR) || - (*it1)->getTp() != m_tp) { - allsametp = false; - break; - } - } - if (!allsametp) - continue; - - // All ok: delete the clause_sub, and insert the queries from - // its searchdata in its place + // Delete the clause_sub, and insert the queries from its searchdata in its place m_query.erase(m_query.begin() + i); - m_query.insert(m_query.begin() + i, - clsubp->getSub()->m_query.begin(), + m_query.insert(m_query.begin() + i, clsubp->getSub()->m_query.begin(), clsubp->getSub()->m_query.end()); - for (unsigned int j = i; - j < i + clsubp->getSub()->m_query.size(); j++) { + for (unsigned int j = i; j < i + clsubp->getSub()->m_query.size(); j++) { m_query[j]->setParent(this); } i += int(clsubp->getSub()->m_query.size()) - 1; @@ -262,11 +241,10 @@ void SearchData::simplify() // Extract terms and groups for highlighting void SearchData::getTerms(HighlightData &hld) const { - for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++) { - if (!((*it)->getmodifiers() & SearchDataClause::SDCM_NOTERMS) && - !(*it)->getexclude()) { - (*it)->getTerms(hld); - } + for (const auto& clausep : m_query) { + if (!(clausep->getModifiers() & SearchDataClause::SDCM_NOTERMS) && !clausep->getexclude()) { + clausep->getTerms(hld); + } } return; } @@ -294,10 +272,9 @@ void SearchData::dump(ostream& o) const " ft " << m_filetypes.size() << " nft " << m_nfiletypes.size() << " hd " << m_haveDates << " maxs " << m_maxSize << " mins " << m_minSize << " wc " << m_haveWildCards << "\n"; - for (std::vector::const_iterator it = - m_query.begin(); it != m_query.end(); it++) { + for (const auto& clausep : m_query) { o << dumptabs; - (*it)->dump(o); + clausep->dump(o); o << "\n"; } // o << dumptabs << "\n"; diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index 6a102d97..d3c908e9 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2004 J.F.Dockes +/* Copyright (C) 2004-2022 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -18,7 +18,7 @@ #define _SEARCHDATA_H_INCLUDED_ /** - * Structures to hold data coming almost directly from the gui + * Structures to hold data coming almost directly from the GUI * and handle its translation to Xapian queries. * This is not generic code, it reflects the choices made for the user * interface, and it also knows some specific of recoll's usage of Xapian @@ -27,9 +27,9 @@ #include #include #include +#include #include "rcldb.h" -#include #include "smallut.h" #include "cstr.h" #include "hldata.h" @@ -238,6 +238,7 @@ public: // Aargh special case. pathelts are case/diac-sensitive // even in a stripped index SDCM_PATHELT = 0x80, + SDCM_FILTER = 0x100, }; enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE}; @@ -285,7 +286,7 @@ public: virtual void addModifier(Modifier mod) { m_modifiers = m_modifiers | mod; } - virtual unsigned int getmodifiers() { + virtual unsigned int getModifiers() { return m_modifiers; } virtual void setWeight(float w) { @@ -419,6 +420,7 @@ public: : SearchDataClauseSimple(txt, SCLT_FILENAME) { // File name searches don't count when looking for wild cards. m_haveWildCards = false; + addModifier(SDCM_FILTER); } virtual ~SearchDataClauseFilename() {} @@ -454,6 +456,7 @@ public: : SearchDataClauseSimple(SCLT_PATH, txt, "dir") { m_exclude = excl; m_haveWildCards = false; + addModifier(SDCM_FILTER); } virtual ~SearchDataClausePath() {} diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp index 5eae257b..27beba0b 100644 --- a/src/rcldb/searchdatatox.cpp +++ b/src/rcldb/searchdatatox.cpp @@ -65,7 +65,6 @@ bool SearchData::expandFileTypes(Db &db, vector& tps) return false; } vector exptps; - for (const auto& mtype : tps) { if (cfg->isMimeCategory(mtype)) { vector ctps; @@ -106,6 +105,14 @@ bool SearchData::clausesToQuery( { Xapian::Query xq; for (auto& clausep : query) { +#if 0 + string txt; + auto clp = dynamic_cast(clausep); + if (clp) + txt = clp->gettext(); + LOGINF("Clause: tp: " << clausep->getTp() << " txt: [" << txt << "] mods: " << + std::hex << clausep->getModifiers() << std::dec << "\n"); +#endif Xapian::Query nq; if (!clausep->toNativeQuery(db, &nq)) { LOGERR("SearchData::clausesToQuery: toNativeQuery failed: " @@ -114,7 +121,7 @@ bool SearchData::clausesToQuery( return false; } if (nq.empty()) { - LOGDEB("SearchData::clausesToQuery: skipping empty clause\n"); + LOGDEB0("SearchData::clausesToQuery: skipping empty clause\n"); continue; } // If this structure is an AND list, must use AND_NOT for excl clauses. @@ -125,7 +132,11 @@ bool SearchData::clausesToQuery( if (clausep->getexclude()) { op = Xapian::Query::OP_AND_NOT; } else { - op = Xapian::Query::OP_AND; + if (clausep->getModifiers() & SearchDataClause::SDCM_FILTER) { + op = Xapian::Query::OP_FILTER; + } else { + op = Xapian::Query::OP_AND; + } } } else { op = Xapian::Query::OP_OR; @@ -166,12 +177,12 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d) db.getConf()->getConfParam("autocasesens", &m_autocasesens); db.getConf()->getConfParam("autodiacsens", &m_autodiacsens); + simplify(); // Walk the clause list translating each in turn and building the // Xapian query tree Xapian::Query xq; if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) { - LOGERR("SearchData::toNativeQuery: clausesToQuery failed. reason: " - << m_reason << "\n"); + LOGERR("SearchData::toNativeQuery: clausesToQuery failed. reason: " << m_reason << "\n"); return false; } @@ -231,8 +242,7 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d) leftzeropad(minvalue, 12); string maxvalue(max); leftzeropad(maxvalue, 12); - sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE, - minvalue, maxvalue); + sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE, minvalue, maxvalue); } // If no probabilistic query is provided then promote the @@ -249,8 +259,7 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d) if (m_autophrase) { Xapian::Query apq; if (m_autophrase->toNativeQuery(db, &apq)) { - xq = xq.empty() ? apq : - Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq); + xq = xq.empty() ? apq : Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq); } } diff --git a/tests/langparser1/langparser1.txt b/tests/langparser1/langparser1.txt index 255af350..e825a329 100644 --- a/tests/langparser1/langparser1.txt +++ b/tests/langparser1/langparser1.txt @@ -7,7 +7,7 @@ Query: (A AND B) OR (C AND D) -> Recoll query: Query(((a AND b) OR (c AND d))) Query: (A OR B) AND (C OR D) -> Recoll query: Query(((a OR b) AND (c OR d))) Query: -the B -> Recoll query: Query((( AND_NOT the) AND b)) - Query: A -B -> Recoll query: Query((a AND ( AND_NOT b))) + Query: A -B -> Recoll query: Query((a AND_NOT b)) Query: mime:text/plain -> Recoll query: Query(( FILTER Ttext/plain)) Query: size>10k -> Recoll query: Query(( FILTER VALUE_GE 2 000000010000)) Query: date:3000-01-01 -> Recoll query: Query(( FILTER D30000101)) @@ -23,5 +23,5 @@ Query: A OR B date:3000-01-01 -> Recoll query: Query(((a OR b) FILTER D30000101)) Query: A OR B AND date:3000-01-01 -> Recoll query: Query(((a OR b) FILTER D30000101)) Query: title:A B -> Recoll query: Query((Sa AND b)) - Query: title:A -B -> Recoll query: Query((Sa AND ( AND_NOT b))) - Query: A -title:B -> Recoll query: Query((a AND ( AND_NOT Sb))) + Query: title:A -B -> Recoll query: Query((Sa AND_NOT b)) + Query: A -title:B -> Recoll query: Query((a AND_NOT Sb))