GUI directory side filter: compute the directories from the index, not the FS tree, so that things work with external indexes

2022-05-10 09:17:58 +02:00 · 2022-05-10 09:17:58 +02:00 · 2ebb0a689d
commit 2ebb0a689d
parent be758e2c94
5 changed files with 298 additions and 6 deletions
--- a/src/qtgui/idxmodel.cpp
+++ b/src/qtgui/idxmodel.cpp
@ -9,6 +9,7 @@
 #include "fstreewalk.h"
 #include "idxmodel.h"

+#undef USE_TREEWALK

 class WalkerCB : public FsTreeWalkerCB {
 public:
@ -45,6 +46,7 @@ FsTreeWalker::Status WalkerCB::processone(
        return FsTreeWalker::FtwOk;
    }
    if (flg == FsTreeWalker::FtwDirEnter) {
+        //std::cerr << "ENTER: " << path << "\n";
        if (m_model->columnCount(m_indexes.top()) == 0) {
            if (!m_model->insertColumn(0, m_indexes.top()))
                return FsTreeWalker::FtwError;
@ -70,6 +72,7 @@ FsTreeWalker::Status WalkerCB::processone(
    return FsTreeWalker::FtwOk;
 }

+#ifdef USE_TREEWALK
 static void populateDir(RclConfig *config, const std::string& topstr, IdxTreeModel *model,
                        const QModelIndex& index, const std::string& path, int depth)
 {
@ -81,19 +84,70 @@ static void populateDir(RclConfig *config, const std::string& topstr, IdxTreeMod
    walker.walk(path, cb);
 }

+#else
+
+// Assemble an absolute path from the leading components of a tokenized path.
+//
+// @param path the path components, without separators.
+// @param lst index of the last component to use. A value beyond the end of path is clamped to the
+//     last component instead of reading out of bounds. A negative value selects no component.
+// @return the selected components, each preceded by "/", or "/" alone if the result would be empty.
+std::string toksToPath(std::vector<std::string>& path, int lst)
+{
+    // Never index past the end of the component list.
+    const int last = std::min(lst, int(path.size()) - 1);
+    std::string out;
+    for (int i = 0; i <= last; i++) {
+        out += "/" + path[i];
+    }
+    if (out.empty())
+        out = "/";
+    return out;
+}
+
+// Process a sorted list of directory paths, generating a sequence of enter/exit calls equivalent to
+// what would happen for a recursive tree walk of the original tree.
+//
+// @param top path whose components form the initial directory stack: no enter calls are issued
+//     for them.
+// @param lst the directory paths. They must be ordered so that all the entries of a given subtree
+//     are contiguous, else a directory would be entered several times.
+// @param cb object receiving an FtwDirReturn call for each directory which is left and an
+//     FtwDirEnter call for each directory which is entered, always with a null second argument.
+//     The directories still on the stack after the last entry get no FtwDirReturn call.
+static void treelist(const std::string& top, const std::vector<std::string>& lst, WalkerCB &cb)
+{
+    if (lst.empty()) {
+        return;
+    }
+    // The current directory stack, as path components.
+    std::vector<std::string> curpath;
+    stringToTokens(top, curpath, "/");
+    for (const auto& dir : lst) {
+        // Compute the new directory stack
+        std::vector<std::string> npath;
+        stringToTokens(dir, npath, "/");
+        // Count the leading components shared by the old and new stacks: this is the new base.
+        const int maxbase = int(std::min(curpath.size(), npath.size()));
+        int base = 0;
+        while (base < maxbase && npath[base] == curpath[base]) {
+            base++;
+        }
+        // Unwind the old stack down to the base, innermost directory first. This is also done
+        // when the new path is a strict prefix of the old one, so that the calls stay balanced.
+        for (int j = int(curpath.size()) - 1; j >= base; j--) {
+            cb.processone(toksToPath(curpath, j), nullptr, FsTreeWalker::FtwDirReturn);
+        }
+        // Callbacks for new entries above the base.
+        for (int j = base; j < int(npath.size()); j++) {
+            cb.processone(toksToPath(npath, j), nullptr, FsTreeWalker::FtwDirEnter);
+        }
+        curpath.swap(npath);
+    }
+}
+#endif // USE_TREEWALK
+
 void IdxTreeModel::populate()
 {
    QModelIndex index = this->index(0,0);
-    auto topdirs = m_config->getTopdirs();
-
-    auto prefix = commonprefix(topdirs);
-
    if (this->columnCount(index) == 0) {
        if (!this->insertColumn(0, index))
            return;
    }
-
    int row = 0;
+
+#ifdef USE_TREEWALK
+    auto topdirs = m_config->getTopdirs();
+    auto prefix = commonprefix(topdirs);
    for (const auto& topdir : topdirs) {
        const QModelIndex child = this->index(row, 0, index);
        std::string topdisp;
@ -106,4 +160,13 @@ void IdxTreeModel::populate()
        ++row;
    }
    sort(0, Qt::AscendingOrder);
+#else
+    std::vector<std::string> thedirs;
+    std::string prefix;
+    rcldb->dirlist(m_depth, prefix, thedirs);
+    const QModelIndex child = this->index(row, 0, index);
+    FsTreeWalker walker;
+    WalkerCB cb(m_config, prefix == "/" ? std::string() : prefix, walker, this, child);
+    treelist(path_getfather(prefix), thedirs, cb);
+#endif
 }
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -433,6 +433,16 @@ public:
        usage. Inserts the types at the end of the parameter */
    bool getAllDbMimeTypes(std::vector<std::string>&);

+    /** Compute a list of all the directories containing indexed documents, down to a given depth
+
+        @param depth depth below a possible computed common prefix, that is, if all
+          directories are relative to /home/you, a depth of 2 would get you /home/you/1/2 but 
+          not /home/you/1/2/3
+        @param[out] commonprefix common prefix path for the list. May be "/".
+        @param[out] dirs the computed list (full paths including the prefix).
+    */
+    bool dirlist(int depth, std::string& commonprefix, std::vector<std::string>& dirs);
+
    /** Wildcard expansion specific to file names. Internal/sdata use only */
    bool filenameWildExp(const string& exp, vector<string>& names, int max);

--- a/src/rcldb/rclterms.cpp
+++ b/src/rcldb/rclterms.cpp
@ -469,6 +469,91 @@ bool Db::idxTermMatch(int typ_sens, const string &lang, const string &root,
    return ret;
 }

+// Compute list of directories in index at given depth under the common root (which we also compute)
+// This is used for the GUI directory side filter tree.
+//
+// This is more complicated than it seems because there could be extra_dbs so we can't use topdirs
+// (which we did with an fstreewalk() initially), and also inode/directory might be excluded from
+// the index (for example by an onlymimetypes parameter).
+//
+// We look at all the paths, compute a common prefix, and truncate at the given depth under the
+// prefix and insert into an std::unordered_set for deduplication.
+//
+// unordered_set was the (slightly) fastest of "insert all then sort and truncate", std::set,
+// std::unordered_set. Other approaches may exist, for example, by skipping same prefix in the list
+// (which is sorted). Did not try, as the current approach is reasonably fast.
+//
+// This is admittedly horrible, and might be too slow on very big indexes, or actually fail if the
+// requested depth is such that we reach the max term length and the terms are
+// truncated-hashed. We'd need to look at the doc data for the full URLs, but this would be much
+// slower.
+//
+// Still I have no other idea of how to do this, other than disable the side filter if directories
+// are not indexed?
+//
+// We could use less memory by not computing a full list and walking the index twice instead (we
+// need two passes in any case because of the common root computation).
+//
+
+// Compute the list of the directories holding indexed documents, truncated at a given depth.
+//
+// @param depth maximum number of path levels kept below the computed common prefix.
+// @param[out] root the value computed by commonprefix() for all the indexed filesystem paths.
+// @param[out] dirs the unique truncated paths, sorted in tree walk order: '/' compares lower than
+//     any other character, so that the contents of a directory immediately follow it.
+// @return false if walking the index terms failed: the error is logged and root and dirs are left
+//     unchanged. true otherwise.
+bool Db::dirlist(int depth, std::string& root, std::vector<std::string>& dirs)
+{
+    // Build a full list of filesystem paths.
+    Xapian::Database xdb = m_ndb->xrdb;
+    auto prefix = wrap_prefix("Q");
+    std::vector<std::string> listall;
+    for (int tries = 0; tries < 2; tries++) {
+        try {
+            // Start from scratch: a first try may have stored a partial list before failing.
+            listall.clear();
+            Xapian::TermIterator it = xdb.allterms_begin();
+            it.skip_to(prefix.c_str());
+            for (; it != xdb.allterms_end(); ++it) {
+                string ixterm{*it};
+                // If we're beyond the Q terms end
+                if (ixterm.find(prefix) != 0)
+                    break;
+                ixterm = strip_prefix(ixterm);
+                // Skip non-paths like Web entries etc.
+                if (!path_isabsolute(ixterm))
+                    continue;
+                // Skip subdocs: their term has a '|' somewhere before the last character.
+                auto pos = ixterm.find_first_of('|');
+                if (pos < ixterm.size() - 1)
+                    continue;
+                listall.push_back(ixterm);
+            }
+            // Success: forget the error from a failed first try (or from a previous operation),
+            // else the test below would report a failure.
+            m_reason.erase();
+            break;
+        } catch (const Xapian::DatabaseModifiedError &e) {
+            m_reason = e.get_msg();
+            xdb.reopen();
+            continue;
+        } XCATCHERROR(m_reason);
+        break;
+    }
+    if (!m_reason.empty()) {
+        LOGERR("Db::dirlist: " << m_reason << "\n");
+        return false;
+    }
+
+    root = commonprefix(listall);
+    // Truncate each path at depth levels below the root, and deduplicate.
+    std::unordered_set<std::string> unics;
+    for (auto& entry : listall) {
+        string::size_type pos = root.size();
+        for (int i = 0; i < depth; i++) {
+            auto npos = entry.find('/', pos + 1);
+            if (npos == std::string::npos) {
+                break;
+            }
+            pos = npos;
+        }
+        entry.erase(pos);
+        unics.insert(entry);
+    }
+
+    dirs.assign(unics.begin(), unics.end());
+    // A plain bytewise sort would insert names like "/a/b-x" between "/a/b" and "/a/b/c", because
+    // some characters compare lower than '/'. Rank '/' below everything else to keep each
+    // directory contiguous with its subtree.
+    auto rank = [](char c) { return c == '/' ? 0 : int(static_cast<unsigned char>(c)); };
+    std::sort(dirs.begin(), dirs.end(), [&rank](const std::string& a, const std::string& b) {
+        return std::lexicographical_compare(
+            a.begin(), a.end(), b.begin(), b.end(),
+            [&rank](char l, char r) { return rank(l) < rank(r); });
+    });
+    return true;
+}
+
 /** Term list walking. */
 class TermIter {
 public:
--- a/src/testmains/Makefile.am
+++ b/src/testmains/Makefile.am
@ -39,7 +39,7 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
    $(DEFS)

 noinst_PROGRAMS = plaintorich textsplit fstreewalk rclconfig hldata unac mbox \
-    circache wipedir mimetype fileudi x11mon trqrstore ecrontab
+    circache wipedir mimetype fileudi x11mon trqrstore ecrontab rcldb

 ecrontab_SOURCES = trecrontab.cpp
 ecrontab_LDADD = ../librecoll.la
@ -65,6 +65,9 @@ mimetype_LDADD = ../librecoll.la
 rclconfig_SOURCES = trrclconfig.cpp
 rclconfig_LDADD = ../librecoll.la

+rcldb_SOURCES = trrcldb.cpp
+rcldb_LDADD = ../librecoll.la
+
 textsplit_SOURCES = trtextsplit.cpp
 textsplit_LDADD = ../librecoll.la

--- a/src/testmains/trrcldb.cpp
+++ b/src/testmains/trrcldb.cpp
@ -0,0 +1,131 @@
+/* Copyright (C) 2022-2022 J.F.Dockes
+ *
+ * License: GPL 2.1
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string>
+#include <iostream>
+#include <set>
+#include <getopt.h>
+#include <malloc.h>
+#include <unistd.h>
+
+#include "rclinit.h"
+#include "rclconfig.h"
+#include "rcldb.h"
+#include "pathut.h"
+#include "smallut.h"
+#include "rclutil.h"
+#include "log.h"
+
+static std::map<std::string, int> options {
+    {"dirlist", 0},
+};
+
+static char *thisprog;
+// Print the command synopsis, followed by one line for each entry in the options table, then
+// exit with status 1.
+//
+// @param fp the stream on which the message is printed.
+static void Usage(FILE *fp = stderr)
+{
+    string sopts;
+    for (const auto& opt: options) {
+        sopts += "--" + opt.first + "\n";
+    }
+    // The synopsis mentions -c, which is processed by main() but is not part of the options table.
+    fprintf(fp, "%s: usage: %s [-c confdir] <operation>\n%s", thisprog, thisprog, sopts.c_str());
+    exit(1);
+}
+
+// Test driver for Rcl::Db operations.
+//
+// The operation is selected with a long option (only --dirlist at the moment). -c <confdir>
+// selects the configuration directory. With --dirlist, the indexes listed in the RECOLL_EXTRA_DBS
+// environment variable (colon-separated) are added as extra query indexes, then the common prefix
+// and the directory list computed at depth 1 are printed on stdout.
+//
+// @return 0 if the operation succeeded, 1 otherwise.
+int main(int argc, char *argv[])
+{
+    thisprog = *argv;
+    std::vector<struct option> long_options;
+
+    // Each entry in the options table becomes a long option without an argument. getopt_long()
+    // stores 1 in the table value when it finds the option, and returns 0.
+    for (auto& entry : options) {
+        struct option opt;
+        opt.name = entry.first.c_str();
+        opt.has_arg = 0;
+        opt.flag = &entry.second;
+        opt.val = 1;
+        long_options.push_back(opt);
+    }
+    // All-zero terminator for the long options array.
+    long_options.push_back({0, 0, 0, 0});
+
+    std::string confdir;
+    // Stays null, unless -c is used.
+    std::string *argcnf{nullptr};
+    int opt;
+    while ((opt = getopt_long(argc, argv, "c:", &long_options[0], nullptr)) != -1) {
+        switch (opt) {
+        case 'c':
+            confdir = optarg;
+            argcnf = &confdir;
+            break;
+        case 0:
+            // Long option: the flag was set in the options table.
+            break;
+        default:
+            Usage();
+        }
+    }
+#if 0
+    for (const auto& e : options) {
+        std::cerr << e.first << " -> " << e.second << "\n";
+    }
+#endif
+
+    if (options["dirlist"]) {
+        std::string reason;
+        RclConfig *rclconfig = recollinit(0, 0, 0, reason, argcnf);
+        if (!rclconfig || !rclconfig->ok()) {
+            std::cerr << "Recoll init failed: " << reason << "\n";
+            return 1;
+        }
+        Rcl::Db rcldb(rclconfig);
+        if (!rcldb.open(Rcl::Db::DbRO)) {
+            LOGERR("db open error\n");
+            return 1;
+        }
+        const char *cp = getenv("RECOLL_EXTRA_DBS");
+        if (cp != nullptr) {
+            vector<string> dbl;
+            stringToTokens(cp, dbl, ":");
+            for (const auto& path : dbl) {
+                string dbdir = path_canon(path);
+                path_catslash(dbdir);
+                bool stripped{false};
+                if (!Rcl::Db::testDbDir(dbdir, &stripped)) {
+                    LOGERR("Not a xapian index: [" << dbdir << "]\n");
+                    return 1;
+                }
+                if (!rcldb.addQueryDb(dbdir)) {
+                    LOGERR("Can't add " << dbdir << " as extra index\n");
+                    return 1;
+                }
+            }
+        }
+
+        std::string prefix;
+        std::vector<std::string> dirs;
+        // Do not print an empty list and report success if the index could not be read.
+        if (!rcldb.dirlist(1, prefix, dirs)) {
+            std::cerr << "dirlist failed\n";
+            return 1;
+        }
+        sort(dirs.begin(), dirs.end());
+        std::cout << "Prefix " << prefix << " dirs :\n";
+        for (const auto& dir : dirs) {
+            std::cout << dir << "\n";
+        }
+        return 0;
+    } else {
+        std::cerr << "No operation set\n";
+        // Does not return.
+        Usage();
+    }
+}