Use a Xapian phrase search on the split path to filter on directory location (much faster than the current method).
This commit is contained in:
parent
6ebc4b4fad
commit
21c6025ba7
@ -74,6 +74,8 @@ static const unsigned int baseTextPosition = 100000;
|
|||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
const string pathelt_prefix = "XP";
|
||||||
|
|
||||||
string version_string(){
|
string version_string(){
|
||||||
return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
|
return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
|
||||||
string(Xapian::version_string());
|
string(Xapian::version_string());
|
||||||
@ -921,7 +923,24 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
splitter.basepos += splitter.curpos + 100;
|
splitter.basepos += splitter.curpos + 100;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Split and index the path from the url for path-based filtering
|
||||||
|
{
|
||||||
|
string path = url_gpath(doc.url);
|
||||||
|
vector<string> vpath;
|
||||||
|
stringToTokens(path, vpath, "/");
|
||||||
|
splitter.curpos = 0;
|
||||||
|
newdocument.add_posting(pathelt_prefix,
|
||||||
|
splitter.basepos + splitter.curpos++);
|
||||||
|
for (vector<string>::const_iterator it = vpath.begin();
|
||||||
|
it != vpath.end(); it++){
|
||||||
|
newdocument.add_posting(pathelt_prefix + *it,
|
||||||
|
splitter.basepos + splitter.curpos++);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
splitter.basepos += splitter.curpos + 100;
|
||||||
|
|
||||||
|
|
||||||
// Index textual metadata. These are all indexed as text with
|
// Index textual metadata. These are all indexed as text with
|
||||||
// positions, as we may want to do phrase searches with them (this
|
// positions, as we may want to do phrase searches with them (this
|
||||||
// makes no sense for keywords by the way).
|
// makes no sense for keywords by the way).
|
||||||
@ -938,8 +957,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
|
LOGDEB0(("Db::add: field [%s] pfx [%s]: [%s]\n",
|
||||||
meta_it->first.c_str(), pfx.c_str(),
|
meta_it->first.c_str(), pfx.c_str(),
|
||||||
meta_it->second.c_str()));
|
meta_it->second.c_str()));
|
||||||
splitter.setprefix(pfx); // Subject
|
splitter.setprefix(pfx); // Subject
|
||||||
if (!splitter.text_to_words(meta_it->second))
|
if (!splitter.text_to_words(meta_it->second))
|
||||||
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
|
LOGDEB(("Db::addOrUpdate: split failed for %s\n",
|
||||||
|
|||||||
@ -262,6 +262,7 @@ private:
|
|||||||
// the most reasonable place.
|
// the most reasonable place.
|
||||||
string version_string();
|
string version_string();
|
||||||
|
|
||||||
|
extern const string pathelt_prefix;
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
}
|
}
|
||||||
#endif // NO_NAMESPACES
|
#endif // NO_NAMESPACES
|
||||||
|
|||||||
@ -24,42 +24,6 @@ static char rcsid[] = "@(#$Id: rclquery.cpp,v 1.11 2008-12-19 09:55:36 dockes Ex
|
|||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
class FilterMatcher : public Xapian::MatchDecider {
|
|
||||||
public:
|
|
||||||
FilterMatcher(const string &topdir)
|
|
||||||
: m_topdir(topdir)
|
|
||||||
{}
|
|
||||||
virtual ~FilterMatcher() {}
|
|
||||||
|
|
||||||
virtual
|
|
||||||
#if XAPIAN_MAJOR_VERSION < 1
|
|
||||||
int
|
|
||||||
#else
|
|
||||||
bool
|
|
||||||
#endif
|
|
||||||
operator()(const Xapian::Document &xdoc) const
|
|
||||||
{
|
|
||||||
// Parse xapian document's data and populate doc fields
|
|
||||||
string data = xdoc.get_data();
|
|
||||||
ConfSimple parms(data);
|
|
||||||
|
|
||||||
// The only filtering for now is on file path (subtree)
|
|
||||||
string url;
|
|
||||||
parms.get(Doc::keyurl, url);
|
|
||||||
LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n",
|
|
||||||
m_topdir.c_str(), url.c_str()));
|
|
||||||
if (url.find(m_topdir, 7) == 7) {
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
string m_topdir;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Sort helper class
|
// Sort helper class
|
||||||
class QSorter : public Xapian::Sorter {
|
class QSorter : public Xapian::Sorter {
|
||||||
public:
|
public:
|
||||||
@ -134,23 +98,14 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
|
|||||||
}
|
}
|
||||||
m_reason.erase();
|
m_reason.erase();
|
||||||
|
|
||||||
m_filterTopDir = sdata->getTopdir();
|
|
||||||
m_nq->clear();
|
m_nq->clear();
|
||||||
|
|
||||||
if (!m_filterTopDir.empty()) {
|
|
||||||
#if XAPIAN_FILTERING
|
|
||||||
m_nq->decider =
|
|
||||||
#else
|
|
||||||
m_nq->postfilter =
|
|
||||||
#endif
|
|
||||||
new FilterMatcher(m_filterTopDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
Xapian::Query xq;
|
Xapian::Query xq;
|
||||||
if (!sdata->toNativeQuery(*m_db, &xq)) {
|
if (!sdata->toNativeQuery(*m_db, &xq)) {
|
||||||
m_reason += sdata->getReason();
|
m_reason += sdata->getReason();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
m_nq->xquery = xq;
|
m_nq->xquery = xq;
|
||||||
|
|
||||||
string d;
|
string d;
|
||||||
@ -162,6 +117,7 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
|
|||||||
} else {
|
} else {
|
||||||
m_nq->xenquire->set_collapse_key(Xapian::BAD_VALUENO);
|
m_nq->xenquire->set_collapse_key(Xapian::BAD_VALUENO);
|
||||||
}
|
}
|
||||||
|
m_nq->xenquire->set_docid_order(Xapian::Enquire::DONT_CARE);
|
||||||
if (!m_sortField.empty()) {
|
if (!m_sortField.empty()) {
|
||||||
if (m_sorter) {
|
if (m_sorter) {
|
||||||
delete (QSorter*)m_sorter;
|
delete (QSorter*)m_sorter;
|
||||||
@ -194,9 +150,7 @@ bool Query::setQuery(RefCntr<SearchData> sdata)
|
|||||||
|
|
||||||
if (d.find("Xapian::Query") == 0)
|
if (d.find("Xapian::Query") == 0)
|
||||||
d.erase(0, strlen("Xapian::Query"));
|
d.erase(0, strlen("Xapian::Query"));
|
||||||
if (!m_filterTopDir.empty()) {
|
|
||||||
d += string(" [dir: ") + m_filterTopDir + "]";
|
|
||||||
}
|
|
||||||
sdata->setDescription(d);
|
sdata->setDescription(d);
|
||||||
LOGDEB(("Query::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
|
LOGDEB(("Query::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
|
||||||
return true;
|
return true;
|
||||||
@ -252,8 +206,9 @@ bool Query::getMatchTerms(unsigned long xdocid, list<string>& terms)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Mset size
|
// Mset size
|
||||||
static const int qquantum = 30;
|
static const int qquantum = 50;
|
||||||
|
|
||||||
// Get estimated result count for query. Xapian actually does most of
|
// Get estimated result count for query. Xapian actually does most of
|
||||||
// the search job in there, this can be long
|
// the search job in there, this can be long
|
||||||
@ -269,7 +224,7 @@ int Query::getResCnt()
|
|||||||
Chrono chron;
|
Chrono chron;
|
||||||
|
|
||||||
XAPTRY(m_nq->xmset =
|
XAPTRY(m_nq->xmset =
|
||||||
m_nq->xenquire->get_mset(0, qquantum,0, m_nq->decider);
|
m_nq->xenquire->get_mset(0, qquantum, (const Xapian::RSet *)0);
|
||||||
ret = m_nq->xmset.get_matches_lower_bound(),
|
ret = m_nq->xmset.get_matches_lower_bound(),
|
||||||
m_db->m_ndb->xrdb, m_reason);
|
m_db->m_ndb->xrdb, m_reason);
|
||||||
|
|
||||||
@ -283,76 +238,29 @@ int Query::getResCnt()
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Get document at rank i in query (i is the index in the whole result
|
// Get document at rank xapi in query results. We check if the
|
||||||
// set, as in the enquire class. We check if the current mset has the
|
// current mset has the doc, else ask for an other one. We use msets
|
||||||
// doc, else ask for an other one. We use msets of 10 documents. Don't
|
// of qquantum documents.
|
||||||
// know if the whole thing makes sense at all but it seems to work.
|
|
||||||
//
|
//
|
||||||
// If there is a postquery filter (ie: file names), we have to
|
// Note that as stated by a Xapian developer, Enquire searches from
|
||||||
// maintain a correspondance from the sequential external index
|
// scratch each time get_mset() is called. So the better performance
|
||||||
// sequence to the internal Xapian hole-y one (the holes being the documents
|
// on subsequent calls is probably only due to disk caching.
|
||||||
// that dont match the filter).
|
bool Query::getDoc(int xapi, Doc &doc)
|
||||||
bool Query::getDoc(int exti, Doc &doc)
|
|
||||||
{
|
{
|
||||||
LOGDEB1(("Query::getDoc: exti %d\n", exti));
|
LOGDEB1(("Query::getDoc: xapian enquire index %d\n", xapi));
|
||||||
if (ISNULL(m_nq) || !m_nq->xenquire) {
|
if (ISNULL(m_nq) || !m_nq->xenquire) {
|
||||||
LOGERR(("Query::getDoc: no query opened\n"));
|
LOGERR(("Query::getDoc: no query opened\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int xapi;
|
|
||||||
if (m_nq->postfilter) {
|
|
||||||
// There is a postquery filter, does this fall in already known area ?
|
|
||||||
if (exti >= (int)m_nq->m_dbindices.size()) {
|
|
||||||
// Have to fetch xapian docs and filter until we get
|
|
||||||
// enough or fail
|
|
||||||
m_nq->m_dbindices.reserve(exti+1);
|
|
||||||
// First xapian doc we fetch is the one after last stored
|
|
||||||
int first = m_nq->m_dbindices.size() > 0 ?
|
|
||||||
m_nq->m_dbindices.back() + 1 : 0;
|
|
||||||
// Loop until we get enough docs
|
|
||||||
while (exti >= (int)m_nq->m_dbindices.size()) {
|
|
||||||
LOGDEB(("Query::getDoc: fetching %d starting at %d\n",
|
|
||||||
qquantum, first));
|
|
||||||
|
|
||||||
XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(first, qquantum),
|
|
||||||
m_db->m_ndb->xrdb, m_reason);
|
|
||||||
|
|
||||||
if (!m_reason.empty()) {
|
|
||||||
LOGERR(("enquire->get_mset: exception: %s\n",
|
|
||||||
m_reason.c_str()));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (m_nq->xmset.empty()) {
|
|
||||||
LOGDEB(("Query::getDoc: got empty mset\n"));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
first = m_nq->xmset.get_firstitem();
|
|
||||||
for (unsigned int i = 0; i < m_nq->xmset.size() ; i++) {
|
|
||||||
LOGDEB(("Query::getDoc: [%d]\n", i));
|
|
||||||
Xapian::Document xdoc = m_nq->xmset[i].get_document();
|
|
||||||
if ((*m_nq->postfilter)(xdoc)) {
|
|
||||||
m_nq->m_dbindices.push_back(first + i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
first = first + m_nq->xmset.size();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
xapi = m_nq->m_dbindices[exti];
|
|
||||||
} else {
|
|
||||||
xapi = exti;
|
|
||||||
}
|
|
||||||
|
|
||||||
// From there on, we work with a xapian enquire item number. Fetch it
|
|
||||||
int first = m_nq->xmset.get_firstitem();
|
int first = m_nq->xmset.get_firstitem();
|
||||||
int last = first + m_nq->xmset.size() -1;
|
int last = first + m_nq->xmset.size() -1;
|
||||||
|
|
||||||
if (!(xapi >= first && xapi <= last)) {
|
if (!(xapi >= first && xapi <= last)) {
|
||||||
LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));
|
LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));
|
||||||
|
|
||||||
XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(xapi, qquantum,
|
XAPTRY(m_nq->xmset = m_nq->xenquire->get_mset(xapi, qquantum,
|
||||||
0, m_nq->decider),
|
(const Xapian::RSet *)0),
|
||||||
m_db->m_ndb->xrdb, m_reason);
|
m_db->m_ndb->xrdb, m_reason);
|
||||||
|
|
||||||
if (!m_reason.empty()) {
|
if (!m_reason.empty()) {
|
||||||
@ -408,6 +316,7 @@ bool Query::getDoc(int exti, Doc &doc)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
doc.meta[Rcl::Doc::keyudi] = udi;
|
doc.meta[Rcl::Doc::keyudi] = udi;
|
||||||
|
|
||||||
// Parse xapian document's data and populate doc fields
|
// Parse xapian document's data and populate doc fields
|
||||||
return m_db->m_ndb->dbDataToRclDoc(docid, data, doc, pc);
|
return m_db->m_ndb->dbDataToRclDoc(docid, data, doc, pc);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -90,7 +90,6 @@ class Query {
|
|||||||
Native *m_nq;
|
Native *m_nq;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
string m_filterTopDir; // Current query filter on subtree top directory
|
|
||||||
string m_reason; // Error explanation
|
string m_reason; // Error explanation
|
||||||
Db *m_db;
|
Db *m_db;
|
||||||
void *m_sorter;
|
void *m_sorter;
|
||||||
|
|||||||
@ -17,51 +17,25 @@ class Query::Native {
|
|||||||
public:
|
public:
|
||||||
/** The query I belong to */
|
/** The query I belong to */
|
||||||
Query *m_q;
|
Query *m_q;
|
||||||
|
|
||||||
|
|
||||||
/** query descriptor: terms and subqueries joined by operators
|
/** query descriptor: terms and subqueries joined by operators
|
||||||
* (or/and etc...)
|
* (or/and etc...)
|
||||||
*/
|
*/
|
||||||
Xapian::Query xquery;
|
Xapian::Query xquery;
|
||||||
|
|
||||||
/** In case there is a postq filter: sequence of db indices that match */
|
|
||||||
vector<int> m_dbindices;
|
|
||||||
|
|
||||||
// Filtering results on location. There are 2 possible approaches
|
|
||||||
// for this:
|
|
||||||
// - Set a "MatchDecider" to be used by Xapian during the query
|
|
||||||
// - Filter the results out of Xapian (this also uses a
|
|
||||||
// Xapian::MatchDecider object, but applied to the results by Recoll.
|
|
||||||
//
|
|
||||||
// The result filtering approach was the first implemented.
|
|
||||||
//
|
|
||||||
// The efficiency of both methods depend on the searches, so the code
|
|
||||||
// for both has been kept. A nice point for the Xapian approach is that
|
|
||||||
// the result count estimate are correct (they are wrong with
|
|
||||||
// the postfilter approach). It is also faster in some worst case scenarios
|
|
||||||
// so this now the default (but the post-filtering is faster in many common
|
|
||||||
// cases).
|
|
||||||
//
|
|
||||||
// Which is used is decided in SetQuery(), by setting either of
|
|
||||||
// the two following members. This in turn is controlled by a
|
|
||||||
// preprocessor directive.
|
|
||||||
#define XAPIAN_FILTERING 1
|
|
||||||
Xapian::MatchDecider *decider; // Xapian does the filtering
|
|
||||||
Xapian::MatchDecider *postfilter; // Result filtering done by Recoll
|
|
||||||
|
|
||||||
Xapian::Enquire *xenquire; // Open query descriptor.
|
Xapian::Enquire *xenquire; // Open query descriptor.
|
||||||
Xapian::MSet xmset; // Partial result set
|
Xapian::MSet xmset; // Partial result set
|
||||||
// Term frequencies for current query. See makeAbstract, setQuery
|
// Term frequencies for current query. See makeAbstract, setQuery
|
||||||
map<string, double> termfreqs;
|
map<string, double> termfreqs;
|
||||||
|
|
||||||
Native(Query *q)
|
Native(Query *q)
|
||||||
: m_q(q), decider(0), postfilter(0), xenquire(0)
|
: m_q(q), xenquire(0)
|
||||||
{ }
|
{ }
|
||||||
~Native() {
|
~Native() {
|
||||||
clear();
|
clear();
|
||||||
}
|
}
|
||||||
void clear() {
|
void clear() {
|
||||||
m_dbindices.clear();
|
|
||||||
delete decider; decider = 0;
|
|
||||||
delete postfilter; postfilter = 0;
|
|
||||||
delete xenquire; xenquire = 0;
|
delete xenquire; xenquire = 0;
|
||||||
termfreqs.clear();
|
termfreqs.clear();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -249,6 +249,21 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
|||||||
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
|
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add the directory filtering clause
|
||||||
|
if (!m_topdir.empty()) {
|
||||||
|
vector<string> vpath;
|
||||||
|
stringToTokens(m_topdir, vpath, "/");
|
||||||
|
vector<string> pvpath;
|
||||||
|
pvpath.push_back(pathelt_prefix);
|
||||||
|
for (vector<string>::const_iterator it = vpath.begin();
|
||||||
|
it != vpath.end(); it++){
|
||||||
|
pvpath.push_back(pathelt_prefix + *it);
|
||||||
|
}
|
||||||
|
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq,
|
||||||
|
Xapian::Query(Xapian::Query::OP_PHRASE,
|
||||||
|
pvpath.begin(), pvpath.end()));
|
||||||
|
}
|
||||||
|
|
||||||
*((Xapian::Query *)d) = xq;
|
*((Xapian::Query *)d) = xq;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -18,14 +18,15 @@
|
|||||||
|
|
||||||
# Native fields matching omega uses, which we index without an X first
|
# Native fields matching omega uses, which we index without an X first
|
||||||
# letter. Don't change these. Caption is used for 'title' to keep a last
|
# letter. Don't change these. Caption is used for 'title' to keep a last
|
||||||
# remnant of omega compatibility inside the data record
|
# remnant of omega compatibility inside the data record. Also D,F,M,Q,T,Y
|
||||||
caption = S
|
caption = S
|
||||||
author = A
|
author = A
|
||||||
keywords = K
|
keywords = K
|
||||||
|
|
||||||
# Extension examples. These are actually used by default by Recoll, you can
|
# Extension examples. These are actually used by default by Recoll, you can
|
||||||
# add your own to search for fields produced by the filters and not handled
|
# add your own to search for fields produced by the filters and not handled
|
||||||
# by default.
|
# by default.
|
||||||
|
# Some values are reserved by recoll: XP (for path elements).
|
||||||
ext = XE
|
ext = XE
|
||||||
filename = XSFN
|
filename = XSFN
|
||||||
recipient = XTO
|
recipient = XTO
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user