Search: change query processing a bit so that we can use OP_FILTER for path selection

This commit is contained in:
Jean-Francois Dockes 2022-08-21 14:14:07 +02:00
parent 16467900bd
commit bf46e6ca0e
7 changed files with 82 additions and 93 deletions

View File

@ -1 +1 @@
1.32.8
1.33.0

View File

@ -125,7 +125,7 @@
#define PACKAGE_NAME "Recoll"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "Recoll 1.32.8"
#define PACKAGE_STRING "Recoll 1.33.0"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "recoll"
@ -134,7 +134,7 @@
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.32.8"
#define PACKAGE_VERSION "1.33.0"
/* putenv parameter is const */
/* #undef PUTENV_ARG_CONST */

View File

@ -118,7 +118,7 @@
#define PACKAGE_NAME "Recoll"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "Recoll 1.32.8"
#define PACKAGE_STRING "Recoll 1.33.0"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "recoll"
@ -127,7 +127,7 @@
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.32.8"
#define PACKAGE_VERSION "1.33.0"
/* putenv parameter is const */
/* #undef PUTENV_ARG_CONST */

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2006 J.F.Dockes
/* Copyright (C) 2006-2022 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -50,14 +50,11 @@ using namespace std;
namespace Rcl {
typedef vector<SearchDataClause *>::iterator qlist_it_t;
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
SearchData::~SearchData()
{
LOGDEB0("SearchData::~SearchData\n" );
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
delete *it;
LOGDEB0("SearchData::~SearchData\n");
for (auto& clausep : m_query)
delete clausep;
}
// This is called by the GUI simple search if the option is set: add
@ -66,44 +63,42 @@ SearchData::~SearchData()
// We remove very common terms from the query to avoid performance issues.
bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
{
LOGDEB0("SearchData::maybeAddAutoPhrase()\n" );
LOGDEB0("SearchData::maybeAddAutoPhrase()\n");
// cerr << "BEFORE SIMPLIFY\n"; dump(cerr);
simplify();
// cerr << "AFTER SIMPLIFY\n"; dump(cerr);
if (!m_query.size()) {
LOGDEB2("SearchData::maybeAddAutoPhrase: empty query\n" );
if (m_query.empty()) {
LOGDEB2("SearchData::maybeAddAutoPhrase: empty query\n");
return false;
}
string field;
auto clp0 = dynamic_cast<SearchDataClauseSimple*>(*m_query.begin());
if (clp0)
field = clp0->getfield();
vector<string> words;
// Walk the clause list. If this is not an AND list, or if we find any
// non-simple clause or differing field names, bail out.
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
SClType tp = (*it)->m_tp;
for (auto& clausep : m_query) {
SClType tp = clausep->m_tp;
if (tp != SCLT_AND) {
LOGDEB2("SearchData::maybeAddAutoPhrase: wrong tp " << (tp) << "\n" );
LOGDEB2("SearchData::maybeAddAutoPhrase: wrong tp " << tp << "\n");
return false;
}
SearchDataClauseSimple *clp =
dynamic_cast<SearchDataClauseSimple*>(*it);
auto clp = dynamic_cast<SearchDataClauseSimple*>(clausep);
if (clp == 0) {
LOGDEB2("SearchData::maybeAddAutoPhrase: dyncast failed\n" );
LOGDEB2("SearchData::maybeAddAutoPhrase: other than clauseSimple in query.\n");
return false;
}
if (it == m_query.begin()) {
field = clp->getfield();
} else {
if (clp->getfield().compare(field)) {
LOGDEB2("SearchData::maybeAddAutoPhrase: diff. fields\n" );
LOGDEB2("SearchData::maybeAddAutoPhrase: diff. fields\n");
return false;
}
}
// If there are wildcards or quotes in there, bail out
if (clp->gettext().find_first_of("\"*[?") != string::npos) {
LOGDEB2("SearchData::maybeAddAutoPhrase: wildcards\n" );
LOGDEB2("SearchData::maybeAddAutoPhrase: wildcards\n");
return false;
}
@ -124,16 +119,15 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
if (!doccnt)
doccnt = 1;
string swords;
for (vector<string>::iterator it = words.begin();
it != words.end(); it++) {
double freq = double(db.termDocCnt(*it)) / doccnt;
for (const auto& word : words) {
double freq = double(db.termDocCnt(word)) / doccnt;
if (freq < freqThreshold) {
if (!swords.empty())
swords.append(1, ' ');
swords += *it;
swords += word;
} else {
LOGDEB0("SearchData::Autophrase: [" << *it << "] too frequent ("
<< (100 * freq) << " %" << ")\n" );
LOGDEB0("SearchData::Autophrase: [" << word << "] too frequent ("
<< (100 * freq) << " %" << ")\n");
slack++;
}
}
@ -141,7 +135,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
// We can't make a phrase with a single word :)
int nwords = TextSplit::countWords(swords);
if (nwords <= 1) {
LOGDEB2("SearchData::maybeAddAutoPhrase: ended with 1 word\n" );
LOGDEB2("SearchData::maybeAddAutoPhrase: ended with 1 word\n");
return false;
}
@ -149,8 +143,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
// an actual user-entered phrase
slack += 1 + nwords / 3;
m_autophrase = std::shared_ptr<SearchDataClauseDist>(
new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field));
m_autophrase = make_shared<SearchDataClauseDist>(SCLT_PHRASE, swords, slack, field);
return true;
}
@ -158,7 +151,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
bool SearchData::addClause(SearchDataClause* cl)
{
if (m_tp == SCLT_OR && cl->getexclude()) {
LOGERR("SearchData::addClause: cant add EXCL to OR list\n" );
LOGERR("SearchData::addClause: cant add EXCL to OR list\n");
m_reason = "No Negative (AND_NOT) clauses allowed in OR queries";
return false;
}
@ -172,27 +165,30 @@ bool SearchData::addClause(SearchDataClause* cl)
// There can't be a subclause in a filename search: no possible need to recurse
bool SearchData::fileNameOnly()
{
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
if (!(*it)->isFileName())
for (const auto& clausep : m_query) {
if (!clausep->isFileName())
return false;
}
return true;
}
// The query language creates a lot of subqueries. See if we can merge them.
void SearchData::simplify()
{
LOGDEB0("SearchData::simplify()\n");
for (unsigned int i = 0; i < m_query.size(); i++) {
if (m_query[i]->m_tp != SCLT_SUB)
continue;
// Is it this dyncast that crashes??
SearchDataClauseSub *clsubp =
dynamic_cast<SearchDataClauseSub*>(m_query[i]);
if (clsubp == 0) {
auto clsubp = dynamic_cast<SearchDataClauseSub*>(m_query[i]);
if (nullptr == clsubp) {
// ??
continue;
}
if (clsubp->getSub()->m_tp != m_tp)
if (clsubp->getSub()->m_tp != m_tp) {
LOGDEB0("Not simplifying because sub has differing m_tp\n");
continue;
}
clsubp->getSub()->simplify();
@ -220,34 +216,17 @@ void SearchData::simplify()
m_maxSize = clsubp->getSub()->m_maxSize;
if (m_minSize == -1)
m_minSize = clsubp->getSub()->m_minSize;
m_haveWildCards = m_haveWildCards ||
clsubp->getSub()->m_haveWildCards;
m_haveWildCards = m_haveWildCards || clsubp->getSub()->m_haveWildCards;
// And then let the clauses processing go on, there are
// none anyway, we will just delete the subquery.
}
bool allsametp = true;
for (qlist_it_t it1 = clsubp->getSub()->m_query.begin();
it1 != clsubp->getSub()->m_query.end(); it1++) {
// We want all AND or OR clause, and same as our conjunction
if (((*it1)->getTp() != SCLT_AND && (*it1)->getTp() != SCLT_OR) ||
(*it1)->getTp() != m_tp) {
allsametp = false;
break;
}
}
if (!allsametp)
continue;
// All ok: delete the clause_sub, and insert the queries from
// its searchdata in its place
// Delete the clause_sub, and insert the queries from its searchdata in its place
m_query.erase(m_query.begin() + i);
m_query.insert(m_query.begin() + i,
clsubp->getSub()->m_query.begin(),
m_query.insert(m_query.begin() + i, clsubp->getSub()->m_query.begin(),
clsubp->getSub()->m_query.end());
for (unsigned int j = i;
j < i + clsubp->getSub()->m_query.size(); j++) {
for (unsigned int j = i; j < i + clsubp->getSub()->m_query.size(); j++) {
m_query[j]->setParent(this);
}
i += int(clsubp->getSub()->m_query.size()) - 1;
@ -262,10 +241,9 @@ void SearchData::simplify()
// Extract terms and groups for highlighting
void SearchData::getTerms(HighlightData &hld) const
{
for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++) {
if (!((*it)->getmodifiers() & SearchDataClause::SDCM_NOTERMS) &&
!(*it)->getexclude()) {
(*it)->getTerms(hld);
for (const auto& clausep : m_query) {
if (!(clausep->getModifiers() & SearchDataClause::SDCM_NOTERMS) && !clausep->getexclude()) {
clausep->getTerms(hld);
}
}
return;
@ -294,10 +272,9 @@ void SearchData::dump(ostream& o) const
" ft " << m_filetypes.size() << " nft " << m_nfiletypes.size() <<
" hd " << m_haveDates << " maxs " << m_maxSize << " mins " <<
m_minSize << " wc " << m_haveWildCards << "\n";
for (std::vector<SearchDataClause*>::const_iterator it =
m_query.begin(); it != m_query.end(); it++) {
for (const auto& clausep : m_query) {
o << dumptabs;
(*it)->dump(o);
clausep->dump(o);
o << "\n";
}
// o << dumptabs << "\n";

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2004 J.F.Dockes
/* Copyright (C) 2004-2022 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@ -18,7 +18,7 @@
#define _SEARCHDATA_H_INCLUDED_
/**
* Structures to hold data coming almost directly from the gui
* Structures to hold data coming almost directly from the GUI
* and handle its translation to Xapian queries.
* This is not generic code, it reflects the choices made for the user
* interface, and it also knows some specifics of recoll's usage of Xapian
@ -27,9 +27,9 @@
#include <string>
#include <vector>
#include <ostream>
#include <memory>
#include "rcldb.h"
#include <memory>
#include "smallut.h"
#include "cstr.h"
#include "hldata.h"
@ -238,6 +238,7 @@ public:
// Aargh special case. pathelts are case/diac-sensitive
// even in a stripped index
SDCM_PATHELT = 0x80,
SDCM_FILTER = 0x100,
};
enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
@ -285,7 +286,7 @@ public:
/// Turn on the given modifier flag(s) for this clause. Bits which are
/// already set in m_modifiers are left untouched (bitwise OR).
virtual void addModifier(Modifier mod) {
    m_modifiers |= mod;
}
virtual unsigned int getmodifiers() {
virtual unsigned int getModifiers() {
return m_modifiers;
}
virtual void setWeight(float w) {
@ -419,6 +420,7 @@ public:
: SearchDataClauseSimple(txt, SCLT_FILENAME) {
// File name searches don't count when looking for wild cards.
m_haveWildCards = false;
addModifier(SDCM_FILTER);
}
virtual ~SearchDataClauseFilename() {}
@ -454,6 +456,7 @@ public:
: SearchDataClauseSimple(SCLT_PATH, txt, "dir") {
m_exclude = excl;
m_haveWildCards = false;
addModifier(SDCM_FILTER);
}
virtual ~SearchDataClausePath() {}

View File

@ -65,7 +65,6 @@ bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
return false;
}
vector<string> exptps;
for (const auto& mtype : tps) {
if (cfg->isMimeCategory(mtype)) {
vector<string> ctps;
@ -106,6 +105,14 @@ bool SearchData::clausesToQuery(
{
Xapian::Query xq;
for (auto& clausep : query) {
#if 0
string txt;
auto clp = dynamic_cast<SearchDataClauseSimple*>(clausep);
if (clp)
txt = clp->gettext();
LOGINF("Clause: tp: " << clausep->getTp() << " txt: [" << txt << "] mods: " <<
std::hex << clausep->getModifiers() << std::dec << "\n");
#endif
Xapian::Query nq;
if (!clausep->toNativeQuery(db, &nq)) {
LOGERR("SearchData::clausesToQuery: toNativeQuery failed: "
@ -114,7 +121,7 @@ bool SearchData::clausesToQuery(
return false;
}
if (nq.empty()) {
LOGDEB("SearchData::clausesToQuery: skipping empty clause\n");
LOGDEB0("SearchData::clausesToQuery: skipping empty clause\n");
continue;
}
// If this structure is an AND list, must use AND_NOT for excl clauses.
@ -124,9 +131,13 @@ bool SearchData::clausesToQuery(
if (tp == SCLT_AND) {
if (clausep->getexclude()) {
op = Xapian::Query::OP_AND_NOT;
} else {
if (clausep->getModifiers() & SearchDataClause::SDCM_FILTER) {
op = Xapian::Query::OP_FILTER;
} else {
op = Xapian::Query::OP_AND;
}
}
} else {
op = Xapian::Query::OP_OR;
}
@ -166,12 +177,12 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
db.getConf()->getConfParam("autocasesens", &m_autocasesens);
db.getConf()->getConfParam("autodiacsens", &m_autodiacsens);
simplify();
// Walk the clause list translating each in turn and building the
// Xapian query tree
Xapian::Query xq;
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
LOGERR("SearchData::toNativeQuery: clausesToQuery failed. reason: "
<< m_reason << "\n");
LOGERR("SearchData::toNativeQuery: clausesToQuery failed. reason: " << m_reason << "\n");
return false;
}
@ -231,8 +242,7 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
leftzeropad(minvalue, 12);
string maxvalue(max);
leftzeropad(maxvalue, 12);
sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE,
minvalue, maxvalue);
sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE, minvalue, maxvalue);
}
// If no probabilistic query is provided then promote the
@ -249,8 +259,7 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
if (m_autophrase) {
Xapian::Query apq;
if (m_autophrase->toNativeQuery(db, &apq)) {
xq = xq.empty() ? apq :
Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
xq = xq.empty() ? apq : Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
}
}

View File

@ -7,7 +7,7 @@
Query: (A AND B) OR (C AND D) -> Recoll query: Query(((a AND b) OR (c AND d)))
Query: (A OR B) AND (C OR D) -> Recoll query: Query(((a OR b) AND (c OR d)))
Query: -the B -> Recoll query: Query(((<alldocuments> AND_NOT the) AND b))
Query: A -B -> Recoll query: Query((a AND (<alldocuments> AND_NOT b)))
Query: A -B -> Recoll query: Query((a AND_NOT b))
Query: mime:text/plain -> Recoll query: Query((<alldocuments> FILTER Ttext/plain))
Query: size>10k -> Recoll query: Query((<alldocuments> FILTER VALUE_GE 2 000000010000))
Query: date:3000-01-01 -> Recoll query: Query((<alldocuments> FILTER D30000101))
@ -23,5 +23,5 @@
Query: A OR B date:3000-01-01 -> Recoll query: Query(((a OR b) FILTER D30000101))
Query: A OR B AND date:3000-01-01 -> Recoll query: Query(((a OR b) FILTER D30000101))
Query: title:A B -> Recoll query: Query((Sa AND b))
Query: title:A -B -> Recoll query: Query((Sa AND (<alldocuments> AND_NOT b)))
Query: A -title:B -> Recoll query: Query((a AND (<alldocuments> AND_NOT Sb)))
Query: title:A -B -> Recoll query: Query((Sa AND_NOT b))
Query: A -title:B -> Recoll query: Query((a AND_NOT Sb))