Add support for an "issub" special field specifying whether the results should be standalone (issub:0) or embedded (issub:1) documents

This commit is contained in:
Jean-Francois Dockes 2021-04-24 13:48:16 +02:00
parent d1058dc676
commit e5c320ca51
8 changed files with 107 additions and 61 deletions

View File

@ -50,19 +50,6 @@ std::shared_ptr<SearchData> wasaStringToRcl(
return sd;
}
// Build a parser driver.
//  c:  configuration object, stored as-is (may be consulted later, e.g. for
//      MIME category lookups in addClause()).
//  sl: stemming language, copied into m_stemlang.
//  as: auto-suffix list, copied into m_autosuffs.
// The parse state starts out empty: input index 0, no result yet, no date
// filter, and both size limits at -1 (parse() only applies a maximum size
// when it differs from -1).
WasaParserDriver::WasaParserDriver(const RclConfig *c, const std::string sl,
                                   const std::string& as)
    : m_stemlang(sl),
      m_autosuffs(as),
      m_config(c),
      m_index(0),
      m_result(nullptr),
      m_haveDates(false),
      m_maxSize(-1),
      m_minSize(-1)
{
}
// No explicit cleanup is performed here: the members are left to their own
// destructors.
WasaParserDriver::~WasaParserDriver() = default;
SearchData *WasaParserDriver::parse(const std::string& in)
{
m_input = in;
@ -83,13 +70,11 @@ SearchData *WasaParserDriver::parse(const std::string& in)
return m_result;
// Set the top level filters (types, dates, size)
for (vector<string>::const_iterator it = m_filetypes.begin();
it != m_filetypes.end(); it++) {
m_result->addFiletype(*it);
for (const auto& ft : m_filetypes) {
m_result->addFiletype(ft);
}
for (vector<string>::const_iterator it = m_nfiletypes.begin();
it != m_nfiletypes.end(); it++) {
m_result->remFiletype(*it);
for (const auto& ft : m_nfiletypes) {
m_result->remFiletype(ft);
}
if (m_haveDates) {
m_result->setDateSpan(&m_dates);
@ -100,6 +85,10 @@ SearchData *WasaParserDriver::parse(const std::string& in)
if (m_maxSize != -1) {
m_result->setMaxSize(m_maxSize);
}
if (m_subSpec != Rcl::SearchData::SUBDOC_ANY) {
m_result->setSubSpec(m_subSpec);
}
//if (m_result) m_result->dump(cout);
return m_result;
}
@ -122,8 +111,7 @@ void WasaParserDriver::UNGETCHAR(int c)
// Add clause to query, handling special pseudo-clauses for size/date
// etc. (mostly determined on field name).
bool WasaParserDriver::addClause(SearchData *sd,
SearchDataClauseSimple* cl)
bool WasaParserDriver::addClause(SearchData *sd, SearchDataClauseSimple* cl)
{
if (cl->getfield().empty()) {
// Simple clause with empty field spec.
@ -132,7 +120,7 @@ bool WasaParserDriver::addClause(SearchData *sd,
if (!m_autosuffs.empty()) {
vector<string> asfv;
if (stringToStrings(m_autosuffs, asfv)) {
if (find_if(asfv.begin(), asfv.end(),
if (find_if(asfv.begin(), asfv.end(),
StringIcmpPred(cl->gettext())) != asfv.end()) {
cl->setfield("ext");
cl->addModifier(SearchDataClause::SDCM_NOSTEMMING);
@ -156,6 +144,13 @@ bool WasaParserDriver::addClause(SearchData *sd,
return false;
}
// Filtering for standalone- or sub-documents
if (!fld.compare("issub")) {
m_subSpec = atoi(cl->gettext().c_str());
delete cl;
return false;
}
if (!fld.compare("rclcat") || !fld.compare("type")) {
vector<string> mtypes;
if (m_config && m_config->getMimeCatTypes(cl->gettext(), mtypes)) {
@ -231,8 +226,7 @@ bool WasaParserDriver::addClause(SearchData *sd,
if (!fld.compare("dir")) {
// dir filtering special case
SearchDataClausePath *nclause =
new SearchDataClausePath(cl->gettext(), cl->getexclude());
SearchDataClausePath *nclause = new SearchDataClausePath(cl->gettext(), cl->getexclude());
delete cl;
return sd->addClause(nclause);
}
@ -258,8 +252,7 @@ bool WasaParserDriver::addClause(SearchData *sd,
}
if (tp != SCLT_FILENAME) {
SearchDataClauseSimple *ncl =
new SearchDataClauseSimple(tp, ns, ofld);
SearchDataClauseSimple *ncl = new SearchDataClauseSimple(tp, ns, ofld);
delete cl;
return sd->addClause(ncl);
}

View File

@ -22,14 +22,12 @@
#include <vector>
#include "smallut.h"
#include "searchdata.h"
class WasaParserDriver;
namespace Rcl {
class SearchData;
class SearchDataClauseSimple;
}
namespace yy {
class parser;
class parser;
}
class RclConfig;
@ -37,9 +35,10 @@ class RclConfig;
class WasaParserDriver {
public:
WasaParserDriver(const RclConfig *c, const std::string sl,
const std::string& as);
~WasaParserDriver();
WasaParserDriver(const RclConfig *c, const std::string sl, const std::string& as)
: m_stemlang(sl), m_autosuffs(as), m_config(c) {}
~WasaParserDriver() {}
Rcl::SearchData *parse(const std::string&);
bool addClause(Rcl::SearchData *sd, Rcl::SearchDataClauseSimple* cl);
@ -67,20 +66,20 @@ private:
// input string.
std::string m_input;
// Current position in m_input
unsigned int m_index;
unsigned int m_index{0};
// Characters pushed-back, ready for next getchar.
std::stack<int> m_returns;
// Result, set by parser.
Rcl::SearchData *m_result;
Rcl::SearchData *m_result{nullptr};
// Storage for top level filters
std::vector<std::string> m_filetypes;
std::vector<std::string> m_nfiletypes;
bool m_haveDates;
bool m_haveDates{false};
DateInterval m_dates; // Restrict to date interval
int64_t m_maxSize;
int64_t m_minSize;
int64_t m_maxSize{-1};
int64_t m_minSize{-1};
int m_subSpec{Rcl::SearchData::SUBDOC_ANY};
std::string m_reason;
// Let the quoted string reader store qualifiers in there, simpler

View File

@ -2557,7 +2557,7 @@ bool Db::getSubDocs(const Doc &idoc, vector<Doc>& subdocs)
LOGERR("Db::getSubDocs: xapian error: " << m_reason << "\n");
return false;
}
if (xit == xdoc.termlist_end()) {
if (xit == xdoc.termlist_end() || get_prefix(*xit) != parent_prefix) {
LOGERR("Db::getSubDocs: parent term not found\n");
return false;
}
@ -2642,7 +2642,7 @@ bool Db::getContainerDoc(const Doc &idoc, Doc& ctdoc)
LOGERR("Db::getContainerDoc: xapian error: " << m_reason << "\n");
return false;
}
if (xit == xdoc.termlist_end()) {
if (xit == xdoc.termlist_end() || get_prefix(*xit) != parent_prefix) {
LOGERR("Db::getContainerDoc: parent term not found\n");
return false;
}

View File

@ -151,6 +151,20 @@ inline string strip_prefix(const string& trm)
return trm.substr(st);
}
// Extract the field prefix from an index term.
//
// Returns the bare prefix (without any ':' wrapping), or an empty string when
// has_prefix() reports that the term carries none.
//  - o_index_stripchars set: the prefix is the leading run of characters
//    taken from the upper-case set tested below.
//  - otherwise: the term is expected to look like ":PFX:value" and the prefix
//    is the text between the opening colon and the next one.
inline string get_prefix(const string& trm)
{
    // Do not hand back the term itself as its own "prefix": callers compare
    // the result against a known prefix, and an unprefixed term which happens
    // to be spelled like that prefix would then match.
    if (!has_prefix(trm))
        return string();

    if (o_index_stripchars) {
        const string::size_type st =
            trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
        // If the whole term is made of prefix characters, st is npos and
        // substr() returns the complete term.
        return trm.substr(0, st);
    }

    // Search for the colon closing the prefix, starting after the opening
    // one. Looking for the *last* colon would be wrong whenever the value
    // part contains ':' itself (e.g. a path), as the returned text would then
    // include part of the value.
    const string::size_type closing = trm.find_first_of(':', 1);
    if (closing == string::npos)
        return string();
    return trm.substr(1, closing - 1);
}
inline string wrap_prefix(const string& pfx)
{
if (o_index_stripchars) {

View File

@ -152,8 +152,7 @@ private:
};
Query::Query(Db *db)
: m_nq(new Native(this)), m_db(db), m_sorter(0), m_sortAscending(true),
m_collapseDuplicates(false), m_resCnt(-1), m_snipMaxPosWalk(1000000)
: m_nq(new Native(this)), m_db(db)
{
if (db)
db->getConf()->getConfParam("snippetMaxPosWalk", &m_snipMaxPosWalk);
@ -179,6 +178,27 @@ void Query::setSortBy(const string& fld, bool ascending) {
(m_sortAscending ? "ascending" : "descending") << "\n");
}
static const string parent_prefix{"F"};
class SubdocDecider : public Xapian::MatchDecider {
public:
SubdocDecider(bool sel) : MatchDecider(), m_select(sel) {}
virtual ~SubdocDecider() {}
virtual bool operator()(const Xapian::Document &doc) const {
bool hasparent{false};
try {
Xapian::TermIterator xit = doc.termlist_begin();
xit.skip_to(wrap_prefix(parent_prefix));
hasparent = (xit != doc.termlist_end()) && (get_prefix(*xit) == parent_prefix);
} catch (...) {
}
return hasparent == m_select;
}
bool m_select;
};
// Prepare query out of user search data
bool Query::setQuery(std::shared_ptr<SearchData> sdata)
{
@ -199,8 +219,13 @@ bool Query::setQuery(std::shared_ptr<SearchData> sdata)
m_reason += sdata->getReason();
return false;
}
m_nq->xquery = xq;
if (sdata->getSubSpec() == SearchData::SUBDOC_NO) {
m_nq->subdecider = new SubdocDecider(false);
} else if (sdata->getSubSpec() == SearchData::SUBDOC_YES) {
m_nq->subdecider = new SubdocDecider(true);
}
string d;
for (int tries = 0; tries < 2; tries++) {
@ -361,7 +386,8 @@ int Query::getResCnt(int checkatleast, bool useestimate)
Chrono chron;
XAPTRY(if (checkatleast == -1)
checkatleast = m_db->docCnt();
m_nq->xmset = m_nq->xenquire->get_mset(0, qquantum, checkatleast),
m_nq->xmset = m_nq->xenquire->get_mset(
0, qquantum, checkatleast, 0, m_nq->subdecider),
m_db->m_ndb->xrdb, m_reason);
if (!m_reason.empty()) {
LOGERR("xenquire->get_mset: exception: " << m_reason << "\n");
@ -401,10 +427,9 @@ bool Query::getDoc(int xapi, Doc &doc, bool fetchtext)
if (!(xapi >= first && xapi <= last)) {
LOGDEB("Fetching for first " << xapi << ", count " << qquantum << "\n");
XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(xapi, qquantum,
(const Xapian::RSet *)0),
XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(
xapi, qquantum, nullptr, m_nq->subdecider),
m_db->m_ndb->xrdb, m_reason);
if (!m_reason.empty()) {
LOGERR("enquire->get_mset: exception: " << m_reason << "\n");
return false;

View File

@ -139,13 +139,13 @@ public:
private:
std::string m_reason; // Error explanation
Db *m_db;
void *m_sorter;
void *m_sorter{nullptr};
std::string m_sortField;
bool m_sortAscending;
bool m_collapseDuplicates;
int m_resCnt;
bool m_sortAscending{true};
bool m_collapseDuplicates{false};
int m_resCnt{-1};
std::shared_ptr<SearchData> m_sd;
int m_snipMaxPosWalk;
int m_snipMaxPosWalk{1000000};
};
#ifndef NO_NAMESPACES

View File

@ -32,24 +32,26 @@ namespace Rcl {
class Query::Native {
public:
// The query I belong to
Query *m_q;
Query *m_q{nullptr};
// query descriptor: terms and subqueries joined by operators
// (or/and etc...)
Xapian::Query xquery;
// Open query descriptor.
Xapian::Enquire *xenquire;
Xapian::Enquire *xenquire{nullptr};
// Partial result set
Xapian::MSet xmset;
// Term frequencies for current query. See makeAbstract, setQuery
std::map<std::string, double> termfreqs;
Xapian::MatchDecider *subdecider{nullptr};
Native(Query *q)
: m_q(q), xenquire(0) { }
: m_q(q), xenquire(0) {}
// All cleanup is delegated to clear().
~Native() { clear(); }
void clear() {
delete xenquire; xenquire = 0;
deleteZ(xenquire);
deleteZ(subdecider);
termfreqs.clear();
}
/** Return a list of terms which matched for a specific result document */

View File

@ -114,6 +114,17 @@ public:
void setMinSize(int64_t size) {m_minSize = size;}
void setMaxSize(int64_t size) {m_maxSize = size;}
// Sub-document filtering values, accepted by setSubSpec() and returned by
// getSubSpec(). SUBDOC_ANY (-1) is the initial value of the stored setting.
// NOTE(review): presumably ANY means no filtering, NO standalone documents
// only and YES embedded documents only -- confirm against the query code.
enum SubdocSpec {SUBDOC_ANY = -1, SUBDOC_NO = 0, SUBDOC_YES = 1};
// Store the sub-document filtering value. Only the three SubdocSpec values
// are accepted; any other integer is silently ignored and the current
// setting is kept.
void setSubSpec(int spec) {
    const bool known =
        spec == SUBDOC_ANY || spec == SUBDOC_NO || spec == SUBDOC_YES;
    if (known) {
        m_subspec = spec;
    }
}
int getSubSpec() {return m_subspec;}
/** Set date span for filtering results */
void setDateSpan(DateInterval *dip) {m_dates = *dip; m_haveDates = true;}
@ -174,12 +185,14 @@ private:
std::shared_ptr<SearchDataClauseDist> m_autophrase;
// Special stuff produced by input which looks like a clause but means
// something else (date and size specs)
// something else (date, size specs, etc.)
bool m_haveDates{false};
DateInterval m_dates; // Restrict to date interval
int64_t m_maxSize{-1};
int64_t m_minSize{-1};
// Filtering for subdocs: -1:any, 0: only free-standing, 1: only subdocs
int m_subspec{SUBDOC_ANY};
// Printable expanded version of the complete query, retrieved/set
// from rcldb after the Xapian::setQuery() call
std::string m_description;