From 9fb52e83ecd6179fb3da275e5db98347ef7c77cc Mon Sep 17 00:00:00 2001 From: dockes Date: Wed, 19 Oct 2005 14:14:17 +0000 Subject: [PATCH] implemented filtering on file subtree --- src/common/rclconfig.cpp | 7 +- src/qtgui/advsearch.ui.h | 2 +- src/qtgui/recollmain.ui | 3 +- src/qtgui/recollmain.ui.h | 20 ++++- src/rcldb/rcldb.cpp | 154 ++++++++++++++++++++++++++++---------- src/rcldb/rcldb.h | 27 +++++-- src/utils/idfile.cpp | 10 ++- src/utils/idfile.h | 6 +- 8 files changed, 174 insertions(+), 55 deletions(-) diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index c26f86eb..478b9d17 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.8 2005-10-17 13:36:53 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rclconfig.cpp,v 1.9 2005-10-19 14:14:17 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -81,12 +81,15 @@ static ConfSimple::WalkerCode mtypesWalker(void *l, return ConfSimple::WALK_CONTINUE; } +#include "idfile.h" std::list RclConfig::getAllMimeTypes() { - std::list lst; + std::list lst; if (mimemap == 0) return lst; mimemap->sortwalk(mtypesWalker, &lst); + std::list l1 = idFileAllTypes(); + lst.insert(lst.end(), l1.begin(), l1.end()); lst.sort(); lst.unique(); return lst; diff --git a/src/qtgui/advsearch.ui.h b/src/qtgui/advsearch.ui.h index a4757023..1f0a342a 100644 --- a/src/qtgui/advsearch.ui.h +++ b/src/qtgui/advsearch.ui.h @@ -22,13 +22,13 @@ using std::string; #include "rclconfig.h" #include "debuglog.h" - extern RclConfig *rclconfig; // Constructor/initialization void advsearch::init() { list types = rclconfig->getAllMimeTypes(); + QStringList ql; for (list::iterator it = types.begin(); it != types.end(); it++) { ql.append(it->c_str()); diff --git a/src/qtgui/recollmain.ui b/src/qtgui/recollmain.ui index 34e4030b..c0968984 100644 --- a/src/qtgui/recollmain.ui +++ b/src/qtgui/recollmain.ui @@ -304,13 +304,14 @@ clearqPB_clicked() listPrevPB_clicked() listNextPB_clicked() - advSearchPB_clicked() previewClosed( Preview * w ) + advSearchPB_clicked() startAdvSearch( Rcl::AdvSearchData sdata ) init() eventFilter( QObject * target, QEvent * event ) + close( bool alsoDelete ) diff --git a/src/qtgui/recollmain.ui.h b/src/qtgui/recollmain.ui.h index 38c78549..9d46fb72 100644 --- a/src/qtgui/recollmain.ui.h +++ b/src/qtgui/recollmain.ui.h @@ -50,6 +50,16 @@ void RecollMain::init() asearchform = 0; } +// We also want to get rid of the advanced search form when we exit +// (it's not our children so that it's not systematically created over +// the main form). +bool RecollMain::close( bool alsoDelete ) +{ + if (asearchform) + delete asearchform; + return QWidget::close(alsoDelete); +} + // We want to catch ^Q everywhere to mean quit. bool RecollMain::eventFilter( QObject * target, QEvent * event ) { @@ -65,6 +75,8 @@ bool RecollMain::eventFilter( QObject * target, QEvent * event ) void RecollMain::fileExit() { LOGDEB1(("RecollMain: fileExit\n")); + if (asearchform) + delete asearchform; exit(0); } @@ -349,8 +361,8 @@ void RecollMain::listNextPB_clicked() if (i == 0) { reslistTE->append("

"); char line[80]; - sprintf(line, "

Displaying results %d-%d out of %d
", - reslist_winfirst+1, reslist_winfirst+last, resCnt); + sprintf(line, "

Displaying results starting at index %d (maximum set size %d)
", + reslist_winfirst+1, resCnt); reslistTE->append(line); } @@ -422,7 +434,7 @@ void RecollMain::previewClosed(Preview *w) void RecollMain::advSearchPB_clicked() { if (asearchform == 0) { - asearchform = new advsearch(this, "Advanced search", FALSE, + asearchform = new advsearch(0, "Advanced search", FALSE, WStyle_Customize | WStyle_NormalBorder | WStyle_Title | WStyle_SysMenu); asearchform->setSizeGripEnabled(FALSE); @@ -455,3 +467,5 @@ void RecollMain::startAdvSearch(Rcl::AdvSearchData sdata) curPreview = 0; listNextPB_clicked(); } + + diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 174e0101..c7cc105e 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.29 2005-10-19 10:21:47 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.30 2005-10-19 14:14:17 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -42,8 +42,7 @@ class Native { Xapian::Enquire *enquire; Xapian::MSet mset; - Native() : isopen(false), iswritable(false), enquire(0) { - } + Native() : isopen(false), iswritable(false), enquire(0) { } ~Native() { delete enquire; } @@ -388,7 +387,6 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) return true; } - bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) { if (pdata == 0) @@ -611,35 +609,9 @@ bool Rcl::Db::purge() return true; } - -#include - -class wsQData : public TextSplitCB { - public: - vector terms; - string catterms() { - string s; - for (unsigned int i=0;i::iterator it=terms.begin(); it !=terms.end();it++){ - string dumb; - Rcl::dumb_string(*it, dumb); - *it = dumb; - } - } -}; - - -// Expand term to list of all terms which stem to the same term. +/** + * Expand term to list of all terms which stem to the same term. + */ static list stemexpand(Native *ndb, string term, const string& lang) { list explist; @@ -687,10 +659,38 @@ static list stemexpand(Native *ndb, string term, const string& lang) return explist; } + +class wsQData : public TextSplitCB { + public: + vector terms; + string catterms() { + string s; + for (unsigned int i=0;i::iterator it=terms.begin(); it !=terms.end();it++){ + string dumb; + Rcl::dumb_string(*it, dumb); + *it = dumb; + } + } +}; + + +/// // Turn string into possibly complex xapian query. There is little // interpretation done on the string (no +term -term or filename:term // stuff). We just separate words and phrases, and interpret // capitalized terms as wanting no stem expansion +// static void stringToXapianQueries(const string &iq, const string& stemlang, Native *ndb, @@ -762,6 +762,7 @@ static void stringToXapianQueries(const string &iq, } } +// Prepare query out of simple query string bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, const string& stemlang) { @@ -771,6 +772,8 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, if (!ndb) return false; + asdata.erase(); + dbindices.clear(); list pqueries; stringToXapianQueries(iqstring, stemlang, ndb, pqueries, opts); ndb->query = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), @@ -782,6 +785,7 @@ bool Rcl::Db::setQuery(const std::string &iqstring, QueryOpts opts, return true; } +// Prepare query out of "advanced search" data bool Rcl::Db::setQuery(AdvSearchData &sdata, const string& stemlang) { LOGDEB(("Rcl::Db::setQuery: adv:\n")); @@ -797,10 +801,12 @@ bool Rcl::Db::setQuery(AdvSearchData &sdata, const string& stemlang) if (!sdata.topdir.empty()) LOGDEB((" restricted to: %s\n", sdata.topdir.c_str())); + asdata = sdata; + dbindices.clear(); + Native *ndb = (Native *)pdata; if (!ndb) return false; - list pqueries; Xapian::Query xq; @@ -896,25 +902,91 @@ int Rcl::Db::getResCnt() return ndb->mset.get_matches_lower_bound(); } +// This class (friend to RclDb) exists so that we can have functions that +// access private RclDb data and have Xapian-specific parameters (so that we +// don't want them to appear in the public rcldb.h). +class Rcl::DbPops { + public: + static bool filterMatch(Rcl::Db *rdb, Xapian::Document &xdoc) { + // Parse xapian document's data and populate doc fields + string data = xdoc.get_data(); + ConfSimple parms(&data); + + // The only filtering for now is on file path (subtree) + string url; + parms.get(string("url"), url); + url = url.substr(7); + if (url.find(rdb->asdata.topdir) == 0) + return true; + return false; + } +}; + // Get document at rank i in query (i is the index in the whole result // set, as in the enquire class. We check if the current mset has the // doc, else ask for an other one. We use msets of 10 documents. Don't // know if the whole thing makes sense at all but it seems to work. -bool Rcl::Db::getDoc(int i, Doc &doc, int *percent) +// +// If there is a postquery filter (ie: file names), we have to +// maintain a correspondance from the sequential external index +// sequence to the internal Xapian hole-y one (the holes being the documents +// that dont match the filter). +bool Rcl::Db::getDoc(int exti, Doc &doc, int *percent) { - LOGDEB1(("Rcl::Db::getDoc: %d\n", i)); + const int qquantum = 30; + LOGDEB1(("Rcl::Db::getDoc: exti %d\n", exti)); Native *ndb = (Native *)pdata; if (!ndb || !ndb->enquire) { LOGERR(("Rcl::Db::getDoc: no query opened\n")); return false; } + // For now the only post-query filter is on dir subtree + bool postqfilter = !asdata.topdir.empty(); + LOGDEB1(("Topdir %s postqflt %d\n", asdata.topdir.c_str(), postqfilter)); + + int xapi; + if (postqfilter) { + // There is a postquery filter, does this fall in already known area ? + if (exti >= (int)dbindices.size()) { + // Have to fetch xapian docs and filter until we get + // enough or fail + dbindices.reserve(exti+1); + // First xapian doc we fetch is the one after last stored + int first = dbindices.size() > 0 ? dbindices.back() + 1 : 0; + // Loop until we get enough docs + while (exti >= (int)dbindices.size()) { + LOGDEB(("Rcl::Db::getDoc: fetching %d starting at %d\n", + qquantum, first)); + ndb->mset = ndb->enquire->get_mset(first, qquantum); + if (ndb->mset.empty()) { + LOGDEB(("Rcl::Db::getDoc: got empty mset\n")); + return false; + } + first = ndb->mset.get_firstitem(); + for (unsigned int i = 0; i < ndb->mset.size() ; i++) { + LOGDEB(("Rcl::Db::getDoc: [%d]\n", i)); + Xapian::Document xdoc = ndb->mset[i].get_document(); + if (Rcl::DbPops::filterMatch(this, xdoc)) { + dbindices.push_back(first + i); + } + } + first = first + ndb->mset.size(); + } + } + xapi = dbindices[exti]; + } else { + xapi = exti; + } + + + // From there on, we work with a xapian enquire item number. Fetch it int first = ndb->mset.get_firstitem(); int last = first + ndb->mset.size() -1; - if (!(i >= first && i <= last)) { - LOGDEB1(("Fetching for first %d, count 10\n", i)); - ndb->mset = ndb->enquire->get_mset(i, 10); + if (!(xapi >= first && xapi <= last)) { + LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum)); + ndb->mset = ndb->enquire->get_mset(xapi, qquantum); if (ndb->mset.empty()) return false; first = ndb->mset.get_firstitem(); @@ -926,9 +998,9 @@ bool Rcl::Db::getDoc(int i, Doc &doc, int *percent) first, last, ndb->mset.get_matches_lower_bound())); - Xapian::Document xdoc = ndb->mset[i-first].get_document(); + Xapian::Document xdoc = ndb->mset[xapi-first].get_document(); if (percent) - *percent = ndb->mset.convert_to_percent(ndb->mset[i-first]); + *percent = ndb->mset.convert_to_percent(ndb->mset[xapi-first]); // Parse xapian document's data and populate doc fields string data = xdoc.get_data(); diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index fda82053..5f4508e6 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -1,13 +1,14 @@ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.14 2005-10-19 10:21:47 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.15 2005-10-19 14:14:17 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include - +#include #ifndef NO_NAMESPACES using std::string; using std::list; +using std::vector; #endif // rcldb defines an interface for a 'real' text database. The current @@ -71,14 +72,24 @@ class AdvSearchData { string nowords; list filetypes; // restrict to types. Empty if inactive string topdir; // restrict to subtree. Empty if inactive + + void erase() { + allwords.erase();phrase.erase();orwords.erase();nowords.erase(); + filetypes.clear(); topdir.erase(); + } }; + + class DbPops; /** * Wrapper class for the native database. */ class Db { - void *pdata; - Doc curdoc; + AdvSearchData asdata; + vector dbindices; // In case there is a postq filter: sequence of + // db indices that match + void *pdata; // Pointer to private data. We don't want db(ie + // xapian)-specific defs to show in here public: Db(); ~Db(); @@ -104,10 +115,16 @@ class Db { // Get document at rank i. This is probably vastly inferior to the type // of interface in Xapian, but we have to start with something simple - // to experiment with the GUI + // to experiment with the GUI. i is sequential from 0 to some value bool getDoc(int i, Doc &doc, int *percent = 0); // Get results count int getResCnt(); + + friend class Rcl::DbPops; + private: + /* Copyconst and assignemt private and forbidden */ + Db(const Db &) {} + Db & operator=(const Db &) {return *this;}; }; // Unaccent and lowercase data. diff --git a/src/utils/idfile.cpp b/src/utils/idfile.cpp index e48af12d..b7a8eaf4 100644 --- a/src/utils/idfile.cpp +++ b/src/utils/idfile.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: idfile.cpp,v 1.1 2005-04-07 09:05:39 dockes Exp $ (C) 2005 J.F.Dockes"; +static char rcsid[] = "@(#$Id: idfile.cpp,v 1.2 2005-10-19 14:14:17 dockes Exp $ (C) 2005 J.F.Dockes"; #endif #ifndef TEST_IDFILE #include // for access(2) @@ -8,10 +8,18 @@ static char rcsid[] = "@(#$Id: idfile.cpp,v 1.1 2005-04-07 09:05:39 dockes Exp $ #include #include +#include "idfile.h" #include "debuglog.h" using namespace std; +std::list idFileAllTypes() +{ + std::list lst; + lst.push_back("text/x-mail"); + lst.push_back("message/rfc822"); + return lst; +} // Mail headers we compare to: static const char *mailhs[] = {"From: ", "Received: ", "Message-Id: ", "To: ", diff --git a/src/utils/idfile.h b/src/utils/idfile.h index 71d4535a..40fb8855 100644 --- a/src/utils/idfile.h +++ b/src/utils/idfile.h @@ -1,12 +1,16 @@ #ifndef _IDFILE_H_INCLUDED_ #define _IDFILE_H_INCLUDED_ -/* @(#$Id: idfile.h,v 1.1 2005-04-07 09:05:39 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: idfile.h,v 1.2 2005-10-19 14:14:17 dockes Exp $ (C) 2004 J.F.Dockes */ #include +#include // Return mime type for file or empty string. The system's file utility does // a bad job on mail folders. idFile only looks for mail file types for now, // but this may change extern std::string idFile(const char *fn); +// Return all types known to us +extern std::list idFileAllTypes(); + #endif /* _IDFILE_H_INCLUDED_ */