GUI directory side filter: compute the directories from the index, not the FS tree, so that things work with external indexes

This commit is contained in:
Jean-Francois Dockes 2022-05-10 09:17:58 +02:00
parent be758e2c94
commit 2ebb0a689d
5 changed files with 298 additions and 6 deletions

View File

@ -9,6 +9,7 @@
#include "fstreewalk.h"
#include "idxmodel.h"
#undef USE_TREEWALK
class WalkerCB : public FsTreeWalkerCB {
public:
@ -45,6 +46,7 @@ FsTreeWalker::Status WalkerCB::processone(
return FsTreeWalker::FtwOk;
}
if (flg == FsTreeWalker::FtwDirEnter) {
//std::cerr << "ENTER: " << path << "\n";
if (m_model->columnCount(m_indexes.top()) == 0) {
if (!m_model->insertColumn(0, m_indexes.top()))
return FsTreeWalker::FtwError;
@ -70,6 +72,7 @@ FsTreeWalker::Status WalkerCB::processone(
return FsTreeWalker::FtwOk;
}
#ifdef USE_TREEWALK
static void populateDir(RclConfig *config, const std::string& topstr, IdxTreeModel *model,
const QModelIndex& index, const std::string& path, int depth)
{
@ -81,19 +84,70 @@ static void populateDir(RclConfig *config, const std::string& topstr, IdxTreeMod
walker.walk(path, cb);
}
#else
// Assemble an absolute path from the leading components of a tokenized path.
//
// @param path the path components, without separators (e.g. {"home", "me"}).
// @param lst index of the last component to include. A value beyond the end of
//     the vector is clamped to the last element, a negative value selects nothing.
// @return the selected components, each preceded by "/", or "/" if none was selected.
std::string toksToPath(std::vector<std::string>& path, int lst)
{
    // Clamp the index so that we never read past the end of the vector.
    int last = lst;
    if (last >= int(path.size())) {
        last = int(path.size()) - 1;
    }
    std::string out;
    for (int i = 0; i <= last; i++) {
        out += "/" + path[i];
    }
    // No component selected: this is the root.
    if (out.empty())
        out = "/";
    return out;
}
// Process a sorted list of directory paths, generating a sequence of enter/exit calls equivalent to
// what would happen for a recursive tree walk of the original tree.
//
// @param top starting directory. Its components form the initial directory stack.
// @param lst the directory paths, expected to be sorted.
// @param cb the callback object: processone() is called with FtwDirReturn for each directory
//     which we leave (deepest first) and with FtwDirEnter for each directory which we enter
//     (shallowest first). The stat pointer argument is always null.
static void treelist(const std::string& top, const std::vector<std::string>& lst, WalkerCB &cb)
{
    if (lst.empty()) {
        return;
    }
    // Components of the directory which we are currently in.
    std::vector<std::string> curpath;
    stringToTokens(top, curpath, "/");
    for (const auto& dir : lst) {
        // Compute the new directory stack
        std::vector<std::string> npath;
        stringToTokens(dir, npath, "/");

        // Walk the stacks until we find a differing entry: base is the count of leading
        // components common to the old and new stacks.
        const int maxcommon = int(std::min(curpath.size(), npath.size()));
        int base = 0;
        while (base < maxcommon && npath[base] == curpath[base]) {
            base++;
        }

        // The stacks differ at base: unwind the old stack down to the common part. Nothing is
        // unwound when one stack is a leading part of the other. NOTE(review): the case where
        // the new path is a leading part of the old one presumably can't happen with sorted
        // input - confirm.
        if (base < maxcommon) {
            for (int j = int(curpath.size()) - 1; j >= base; j--) {
                cb.processone(toksToPath(curpath, j), nullptr, FsTreeWalker::FtwDirReturn);
            }
        }

        // Callbacks for new entries above the base.
        for (int j = base; j < int(npath.size()); j++) {
            cb.processone(toksToPath(npath, j), nullptr, FsTreeWalker::FtwDirEnter);
        }
        curpath.swap(npath);
    }
}
#endif // USE_TREEWALK
void IdxTreeModel::populate()
{
QModelIndex index = this->index(0,0);
auto topdirs = m_config->getTopdirs();
auto prefix = commonprefix(topdirs);
if (this->columnCount(index) == 0) {
if (!this->insertColumn(0, index))
return;
}
int row = 0;
#ifdef USE_TREEWALK
auto topdirs = m_config->getTopdirs();
auto prefix = commonprefix(topdirs);
for (const auto& topdir : topdirs) {
const QModelIndex child = this->index(row, 0, index);
std::string topdisp;
@ -106,4 +160,13 @@ void IdxTreeModel::populate()
++row;
}
sort(0, Qt::AscendingOrder);
#else
std::vector<std::string> thedirs;
std::string prefix;
rcldb->dirlist(m_depth, prefix, thedirs);
const QModelIndex child = this->index(row, 0, index);
FsTreeWalker walker;
WalkerCB cb(m_config, prefix == "/" ? std::string() : prefix, walker, this, child);
treelist(path_getfather(prefix), thedirs, cb);
#endif
}

View File

@ -433,6 +433,16 @@ public:
usage. Inserts the types at the end of the parameter */
bool getAllDbMimeTypes(std::vector<std::string>&);
/** Compute a list of all the directories containing indexed documents, down to a given depth
@param depth depth below a possible computed common prefix, that is, if all
directories are relative to /home/you, a depth of 2 would get you /home/you/1/2 but
not /home/you/1/2/3
@param[out] commonprefix common prefix path for the list. May be "/".
@param[out] dirs the computed list (full paths including the prefix).
*/
bool dirlist(int depth, std::string& commonprefix, std::vector<std::string>& dirs);
/** Wildcard expansion specific to file names. Internal/sdata use only */
bool filenameWildExp(const string& exp, vector<string>& names, int max);

View File

@ -469,6 +469,91 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
return ret;
}
// Compute list of directories in index at given depth under the common root (which we also compute)
// This is used for the GUI directory side filter tree.
//
// This is more complicated than it seems because there could be extra_dbs so we can't use topdirs
// (which we did with an fstreewalk() initially), and also inode/directory might be excluded from
// the index (for example by an onlymimetypes parameter).
//
// We look at all the paths, compute a common prefix, and truncate at the given depth under the
// prefix and insert into an std::unordered_set for deduplication.
//
// unordered_set was the (slightly) fastest of "insert all then sort and truncate", std::set,
// std::unordered_set. Other approaches may exist, for example, by skipping same prefix in the list
// (which is sorted). Did not try, as the current approach is reasonably fast.
//
// This is admittedly horrible, and might be too slow on very big indexes, or actually fail if the
// requested depth is such that we reach the max term length and the terms are
// truncated-hashed. We'd need to look at the doc data for the full URLs, but this would be much
// slower.
//
// Still I have no other idea of how to do this, other than disable the side filter if directories
// are not indexed?
//
// We could use less memory by not computing a full list and walking the index twice instead (we
// need two passes in any case because of the common root computation).
//
bool Db::dirlist(int depth, std::string& root, std::vector<std::string>& dirs)
{
// Build a full list of filesystem paths.
Xapian::Database xdb = m_ndb->xrdb;
auto prefix = wrap_prefix("Q");
std::vector<std::string> listall;
for (int tries = 0; tries < 2; tries++) {
try {
Xapian::TermIterator it = xdb.allterms_begin();
it.skip_to(prefix.c_str());
for (; it != xdb.allterms_end(); it++) {
string ixterm{*it};
// If we're beyond the Q terms end
if (ixterm.find(prefix) != 0)
break;
ixterm = strip_prefix(ixterm);
// Skip non-paths like Web entries etc.
if (!path_isabsolute(ixterm))
continue;
// Skip subdocs
auto pos = ixterm.find_first_of('|');
if (pos < ixterm.size() - 1)
continue;
listall.push_back(ixterm);
}
break;
} catch (const Xapian::DatabaseModifiedError &e) {
m_reason = e.get_msg();
xdb.reopen();
continue;
} XCATCHERROR(m_reason);
break;
}
if (!m_reason.empty()) {
LOGERR("Db::dirlist: " << m_reason << "\n");
return false;
}
root = commonprefix(listall);
std::unordered_set<std::string> unics;
for (auto& entry : listall) {
string::size_type pos = root.size();
for (int i = 0; i < depth; i++) {
auto npos = entry.find("/", pos+1);
if (npos == std::string::npos) {
break;
}
pos = npos;
}
entry.erase(pos);
unics.insert(entry);
}
dirs.clear();
dirs.insert(dirs.begin(), unics.begin(), unics.end());
sort(dirs.begin(), dirs.end());
return true;
}
/** Term list walking. */
class TermIter {
public:

View File

@ -39,7 +39,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
$(DEFS)
noinst_PROGRAMS = plaintorich textsplit fstreewalk rclconfig hldata unac mbox \
circache wipedir mimetype fileudi x11mon trqrstore ecrontab
circache wipedir mimetype fileudi x11mon trqrstore ecrontab rcldb
ecrontab_SOURCES = trecrontab.cpp
ecrontab_LDADD = ../librecoll.la
@ -65,6 +65,9 @@ mimetype_LDADD = ../librecoll.la
rclconfig_SOURCES = trrclconfig.cpp
rclconfig_LDADD = ../librecoll.la
rcldb_SOURCES = trrcldb.cpp
rcldb_LDADD = ../librecoll.la
textsplit_SOURCES = trtextsplit.cpp
textsplit_LDADD = ../librecoll.la

131
src/testmains/trrcldb.cpp Normal file
View File

@ -0,0 +1,131 @@
/* Copyright (C) 2022-2022 J.F.Dockes
*
* License: GPL 2.1
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <string>
#include <iostream>
#include <set>
#include <getopt.h>
#include <malloc.h>
#include <unistd.h>
#include "rclinit.h"
#include "rclconfig.h"
#include "rcldb.h"
#include "pathut.h"
#include "smallut.h"
#include "rclutil.h"
#include "log.h"
// Long command line options. The key is the option name (without the leading "--"), the value is
// the flag which getopt_long() sets to 1 when the option is seen (see the table built in main()).
static std::map<std::string, int> options {
{"dirlist", 0},
};
// Program name (argv[0]), set at the start of main() and printed by Usage().
static char *thisprog;
// Print the usage message (program name, then one line per known long option) and terminate
// the program with exit status 1.
//
// @param fp the stream to which the message is written.
static void Usage(FILE *fp = stderr)
{
    std::string optlines;
    for (auto it = options.begin(); it != options.end(); ++it) {
        optlines.append("--").append(it->first).append("\n");
    }
    fprintf(fp, "%s: usage: %s\n%s", thisprog, thisprog, optlines.c_str());
    exit(1);
}
int main(int argc, char *argv[])
{
thisprog = *argv;
std::vector<struct option> long_options;
for (auto& entry : options) {
struct option opt;
opt.name = entry.first.c_str();
opt.has_arg = 0;
opt.flag = &entry.second;
opt.val = 1;
long_options.push_back(opt);
}
long_options.push_back({0, 0, 0, 0});
std::string confdir;
std::string *argcnf{nullptr};
int opt;
while ((opt = getopt_long(argc, argv, "c:", &long_options[0], nullptr)) != -1) {
switch (opt) {
case 'c':
confdir = optarg;
argcnf = &confdir;
break;
case 0:
break;
default:
Usage();
}
}
#if 0
for (const auto& e : options) {
std::cerr << e.first << " -> " << e.second << "\n";
}
#endif
if (options["dirlist"]) {
std::string reason;
RclConfig *rclconfig = recollinit(0, 0, 0, reason, argcnf);
if (!rclconfig || !rclconfig->ok()) {
std::cerr << "Recoll init failed: " << reason << "\n";
return 1;
}
Rcl::Db rcldb(rclconfig);
if (!rcldb.open(Rcl::Db::DbRO)) {
LOGERR("db open error\n");
return 1;
}
const char *cp;
if ((cp = getenv("RECOLL_EXTRA_DBS")) != 0) {
vector<string> dbl;
stringToTokens(cp, dbl, ":");
for (const auto& path : dbl) {
string dbdir = path_canon(path);
path_catslash(dbdir);
bool stripped;
if (!Rcl::Db::testDbDir(dbdir, &stripped)) {
LOGERR("Not a xapian index: [" << dbdir << "]\n");
return 1;
}
if (!rcldb.addQueryDb(dbdir)) {
LOGERR("Can't add " << dbdir << " as extra index\n");
return 1;
}
}
}
std::string prefix;
std::vector<std::string> dirs;
rcldb.dirlist(1, prefix, dirs);
sort(dirs.begin(), dirs.end());
std::cout << "Prefix " << prefix << " dirs :\n";
for (const auto& dir : dirs) {
std::cout << dir << "\n";
}
return 0;
} else {
std::cerr << "No operation set\n";
Usage();
}
}