From 2ebb0a689de12385e358b106975ca38ad3d2ac5e Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 10 May 2022 09:17:58 +0200 Subject: [PATCH] GUI directory side filter: compute the directories from the index, not the FS tree, so that things work with external indexes --- src/qtgui/idxmodel.cpp | 73 +++++++++++++++++++-- src/rcldb/rcldb.h | 10 +++ src/rcldb/rclterms.cpp | 85 +++++++++++++++++++++++++ src/testmains/Makefile.am | 5 +- src/testmains/trrcldb.cpp | 131 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 298 insertions(+), 6 deletions(-) create mode 100644 src/testmains/trrcldb.cpp diff --git a/src/qtgui/idxmodel.cpp b/src/qtgui/idxmodel.cpp index 67b7dfe0..a136c245 100644 --- a/src/qtgui/idxmodel.cpp +++ b/src/qtgui/idxmodel.cpp @@ -9,6 +9,7 @@ #include "fstreewalk.h" #include "idxmodel.h" +#undef USE_TREEWALK class WalkerCB : public FsTreeWalkerCB { public: @@ -45,6 +46,7 @@ FsTreeWalker::Status WalkerCB::processone( return FsTreeWalker::FtwOk; } if (flg == FsTreeWalker::FtwDirEnter) { + //std::cerr << "ENTER: " << path << "\n"; if (m_model->columnCount(m_indexes.top()) == 0) { if (!m_model->insertColumn(0, m_indexes.top())) return FsTreeWalker::FtwError; @@ -70,6 +72,7 @@ FsTreeWalker::Status WalkerCB::processone( return FsTreeWalker::FtwOk; } +#ifdef USE_TREEWALK static void populateDir(RclConfig *config, const std::string& topstr, IdxTreeModel *model, const QModelIndex& index, const std::string& path, int depth) { @@ -81,19 +84,70 @@ static void populateDir(RclConfig *config, const std::string& topstr, IdxTreeMod walker.walk(path, cb); } +#else + +// Assemble a path from its components up to lst +std::string toksToPath(std::vector& path, int lst) +{ + std::string out; + for (int i = 0; i <= lst; i++) { + out += "/" + path[i]; + } + if (out.empty()) + out = "/"; + return out; +} + +// Process a sorted list of directory paths, generating a sequence of enter/exit calls equivalent to +// what would happen for a recursive tree walk of the 
original tree. +static void treelist(const std::string& top, const std::vector& lst, WalkerCB &cb) +{ + if (lst.empty()) { + return; + } + std::vector curpath; + stringToTokens(top, curpath, "/"); + std::cerr << "top " << top << " TOP len is " << curpath.size() << "\n"; + for (const auto& dir : lst) { + // std::cerr << "DIR: " << dir << "\n"; + std::vector npath; + // Compute the new directory stack + stringToTokens(dir, npath, "/"); + // Walk the stacks until we find a differing entry, and then unwind the old stack to the new + // base, and issue enter calls for new entries over the base. + int i = 0; + for (; i < int(std::min(curpath.size(), npath.size())); i++) { + if (npath[i] != curpath[i] && int(curpath.size()) > 0) { + // Differing at i, unwind old stack and break the main loop + for (int j = int(curpath.size()) - 1; j >= i; j--) { + //std::cerr << "Exiting " << toksToPath(curpath, j) << "\n"; + cb.processone(toksToPath(curpath, j), nullptr, FsTreeWalker::FtwDirReturn); + } + break; + } + } + // Callbacks for new entries above the base. 
+ for (int j = i; j < int(npath.size()); j++) { + std::cerr << "Entering " << toksToPath(npath, j) << "\n"; + cb.processone(toksToPath(npath, j), nullptr, FsTreeWalker::FtwDirEnter); + } + curpath.swap(npath); + } +} +#endif // USE_TREEWALK + void IdxTreeModel::populate() { QModelIndex index = this->index(0,0); - auto topdirs = m_config->getTopdirs(); - - auto prefix = commonprefix(topdirs); - if (this->columnCount(index) == 0) { if (!this->insertColumn(0, index)) return; } - int row = 0; + +#ifdef USE_TREEWALK + auto topdirs = m_config->getTopdirs(); + auto prefix = commonprefix(topdirs); for (const auto& topdir : topdirs) { const QModelIndex child = this->index(row, 0, index); std::string topdisp; @@ -106,4 +160,13 @@ void IdxTreeModel::populate() ++row; } sort(0, Qt::AscendingOrder); +#else + std::vector thedirs; + std::string prefix; + rcldb->dirlist(m_depth, prefix, thedirs); + const QModelIndex child = this->index(row, 0, index); + FsTreeWalker walker; + WalkerCB cb(m_config, prefix == "/" ? std::string() : prefix, walker, this, child); + treelist(path_getfather(prefix), thedirs, cb); +#endif } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index d96517eb..c468d695 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -433,6 +433,16 @@ public: usage. Inserts the types at the end of the parameter */ bool getAllDbMimeTypes(std::vector&); + /** Compute a list of all the directories containing indexed documents, down to a given depth + + @param depth depth below a possible computed common prefix, that is, if all + directories are relative to /home/you, a depth of 2 would get you /home/you/1/2 but + not /home/you/1/2/3 + @param[out] commonprefix common prefix path for the list. May be "/". + @param[out] dirs the computed list (full paths including the prefix). + */ + bool dirlist(int depth, std::string& commonprefix, std::vector& dirs); + /** Wildcard expansion specific to file names. 
Internal/sdata use only */ bool filenameWildExp(const string& exp, vector& names, int max); diff --git a/src/rcldb/rclterms.cpp b/src/rcldb/rclterms.cpp index 35dacba6..c3b7f151 100644 --- a/src/rcldb/rclterms.cpp +++ b/src/rcldb/rclterms.cpp @@ -469,6 +469,91 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root, return ret; } +// Compute list of directories in index at given depth under the common root (which we also compute) +// This is used for the GUI directory side filter tree. +// +// This is more complicated than it seems because there could be extra_dbs so we can't use topdirs +// (which we did with an fstreewalk() initially), and also inode/directory might be excluded from +// the index (for example by an onlymimetypes parameter). +// +// We look at all the paths, compute a common prefix, and truncate at the given depth under the +// prefix and insert into an std::unordered_set for deduplication. +// +// unordered_set was the (slightly) fastest of "insert all then sort and truncate", std::set, +// std::unordered_set. Other approaches may exist, for example, by skipping same prefix in the list +// (which is sorted). Did not try, as the current approach is reasonably fast. +// +// This is admittedly horrible, and might be too slow on very big indexes, or actually fail if the +// requested depth is such that we reach the max term length and the terms are +// truncated-hashed. We'd need to look at the doc data for the full URLs, but this would be much +// slower. +// +// Still I have no other idea of how to do this, other than disable the side filter if directories +// are not indexed? +// +// We could use less memory by not computing a full list and walking the index twice instead (we +// need two passes in any case because of the common root computation). +// + +bool Db::dirlist(int depth, std::string& root, std::vector& dirs) +{ + // Build a full list of filesystem paths. 
+ Xapian::Database xdb = m_ndb->xrdb; + auto prefix = wrap_prefix("Q"); + std::vector listall; + for (int tries = 0; tries < 2; tries++) { + try { + Xapian::TermIterator it = xdb.allterms_begin(); + it.skip_to(prefix.c_str()); + for (; it != xdb.allterms_end(); it++) { + string ixterm{*it}; + // If we're beyond the Q terms end + if (ixterm.find(prefix) != 0) + break; + ixterm = strip_prefix(ixterm); + // Skip non-paths like Web entries etc. + if (!path_isabsolute(ixterm)) + continue; + // Skip subdocs + auto pos = ixterm.find_first_of('|'); + if (pos < ixterm.size() - 1) + continue; + listall.push_back(ixterm); + } + break; + } catch (const Xapian::DatabaseModifiedError &e) { + m_reason = e.get_msg(); + xdb.reopen(); + continue; + } XCATCHERROR(m_reason); + break; + } + if (!m_reason.empty()) { + LOGERR("Db::dirlist: " << m_reason << "\n"); + return false; + } + + root = commonprefix(listall); + std::unordered_set unics; + for (auto& entry : listall) { + string::size_type pos = root.size(); + for (int i = 0; i < depth; i++) { + auto npos = entry.find("/", pos+1); + if (npos == std::string::npos) { + break; + } + pos = npos; + } + entry.erase(pos); + unics.insert(entry); + } + + dirs.clear(); + dirs.insert(dirs.begin(), unics.begin(), unics.end()); + sort(dirs.begin(), dirs.end()); + return true; +} + /** Term list walking. 
*/ class TermIter { public: diff --git a/src/testmains/Makefile.am b/src/testmains/Makefile.am index c2d72caa..74a6185b 100644 --- a/src/testmains/Makefile.am +++ b/src/testmains/Makefile.am @@ -39,7 +39,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \ $(DEFS) noinst_PROGRAMS = plaintorich textsplit fstreewalk rclconfig hldata unac mbox \ - circache wipedir mimetype fileudi x11mon trqrstore ecrontab + circache wipedir mimetype fileudi x11mon trqrstore ecrontab rcldb ecrontab_SOURCES = trecrontab.cpp ecrontab_LDADD = ../librecoll.la @@ -65,6 +65,9 @@ mimetype_LDADD = ../librecoll.la rclconfig_SOURCES = trrclconfig.cpp rclconfig_LDADD = ../librecoll.la +rcldb_SOURCES = trrcldb.cpp +rcldb_LDADD = ../librecoll.la + textsplit_SOURCES = trtextsplit.cpp textsplit_LDADD = ../librecoll.la diff --git a/src/testmains/trrcldb.cpp b/src/testmains/trrcldb.cpp new file mode 100644 index 00000000..9e0bebb7 --- /dev/null +++ b/src/testmains/trrcldb.cpp @@ -0,0 +1,131 @@ +/* Copyright (C) 2022-2022 J.F.Dockes + * + * License: GPL 2.1 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "rclinit.h" +#include "rclconfig.h" +#include "rcldb.h" +#include "pathut.h" +#include "smallut.h" +#include "rclutil.h" +#include "log.h" + +static std::map options { + {"dirlist", 0}, +}; + +static char *thisprog; +static void Usage(FILE *fp = stderr) +{ + string sopts; + for (const auto& opt: options) { + sopts += "--" + opt.first + "\n"; + } + fprintf(fp, "%s: usage: %s\n%s", thisprog, thisprog, sopts.c_str()); + exit(1); +} + +int main(int argc, char *argv[]) +{ + thisprog = *argv; + std::vector long_options; + + for (auto& entry : options) { + struct option opt; + opt.name = entry.first.c_str(); + opt.has_arg = 0; + opt.flag = &entry.second; + opt.val = 1; + long_options.push_back(opt); + } + long_options.push_back({0, 0, 0, 0}); + + std::string confdir; + std::string *argcnf{nullptr}; + int opt; + while ((opt = getopt_long(argc, argv, "c:", &long_options[0], nullptr)) != -1) { + switch (opt) { + case 'c': + confdir = optarg; + argcnf = &confdir; + break; + case 0: + break; + default: + Usage(); + } + } +#if 0 + for (const auto& e : options) { + std::cerr << e.first << " -> " << e.second << "\n"; + } +#endif + + if (options["dirlist"]) { + std::string reason; + RclConfig *rclconfig = recollinit(0, 0, 0, reason, argcnf); + if (!rclconfig || !rclconfig->ok()) { + std::cerr << "Recoll init failed: " << reason << "\n"; + return 1; + } + Rcl::Db rcldb(rclconfig); + if (!rcldb.open(Rcl::Db::DbRO)) { + LOGERR("db open error\n"); + return 1; + } + const char *cp; + if ((cp = getenv("RECOLL_EXTRA_DBS")) != 0) { + vector dbl; + stringToTokens(cp, dbl, ":"); + for (const auto& path : dbl) { + string dbdir = path_canon(path); + path_catslash(dbdir); + bool stripped; + if (!Rcl::Db::testDbDir(dbdir, &stripped)) { + LOGERR("Not a xapian index: [" << dbdir << "]\n"); + return 1; + } + if (!rcldb.addQueryDb(dbdir)) { + LOGERR("Can't add " << dbdir << " as extra index\n"); + return 1; + } + } + } + 
+ std::string prefix; + std::vector dirs; + rcldb.dirlist(1, prefix, dirs); + sort(dirs.begin(), dirs.end()); + std::cout << "Prefix " << prefix << " dirs :\n"; + for (const auto& dir : dirs) { + std::cout << dir << "\n"; + } + return 0; + } else { + std::cerr << "No operation set\n"; + Usage(); + } +}