From 485a0fc650218554274f49007eeab9dcc062877b Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 31 Mar 2021 16:18:11 +0200 Subject: [PATCH] add recollindex option to take a list of files and indicate which are indexed or not --- src/Makefile.am | 2 + src/index/checkindexed.cpp | 90 ++++++++++++++++++++++++++++++++++++++ src/index/checkindexed.h | 34 ++++++++++++++ src/index/recollindex.cpp | 23 +++++++++- 4 files changed, 147 insertions(+), 2 deletions(-) create mode 100644 src/index/checkindexed.cpp create mode 100644 src/index/checkindexed.h diff --git a/src/Makefile.am b/src/Makefile.am index da831b21..6e55710d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -93,6 +93,8 @@ index/webqueue.cpp \ index/webqueue.h \ index/webqueuefetcher.cpp \ index/webqueuefetcher.h \ +index/checkindexed.cpp \ +index/checkindexed.h \ index/checkretryfailed.cpp \ index/checkretryfailed.h \ index/exefetcher.cpp \ diff --git a/src/index/checkindexed.cpp b/src/index/checkindexed.cpp new file mode 100644 index 00000000..83c09209 --- /dev/null +++ b/src/index/checkindexed.cpp @@ -0,0 +1,90 @@ +/* Copyright (C) 2021 J.F.Dockes + * + * License: GPL 2.1 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */
+#include "autoconfig.h"
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "rclconfig.h"
+#include "fileudi.h"
+#include "rcldb.h"
+#include "rcldoc.h"
+#include "smallut.h"
+
+// Supplies the paths to check: the elements of the constructor argument if there
+// are any, else the lines read from stdin. Empty entries are skipped in both cases.
+class PathYielder {
+public:
+    explicit PathYielder(const std::vector<std::string>& paths)
+        : m_index(paths.empty() ? -1 : 0), m_paths(paths) {}
+    // Returns the next non-empty path, or an empty string when there are no more.
+    std::string getPath() {
+        std::string path;
+        if (m_index >= 0) {
+            while (path.empty() && m_index < int(m_paths.size())) {
+                path = m_paths[m_index++];
+            }
+        } else {
+            // std::getline() has no length limit: long paths are never split
+            while (path.empty() && std::getline(std::cin, path)) {
+                trimstring(path, "\n\r");
+            }
+        }
+        return path;
+    }
+    int m_index{-1}; // Next index in m_paths, or -1 when reading from stdin
+    const std::vector<std::string>& m_paths; // Not owned, must outlive this object
+};
+
+// Prints "ABSENT <path>" for paths with no index entry and "ERROR <path>" for
+// entries whose signature is missing, empty, or ends with '+'. Returns false if
+// the index can't be opened or a lookup fails, true otherwise.
+bool checkindexed(RclConfig *conf, const std::vector<std::string>& filepaths)
+{
+    PathYielder paths(filepaths);
+    Rcl::Db db(conf);
+    if (!db.open(Rcl::Db::DbRO)) {
+        std::cerr << "Could not open index for reading\n";
+        return false;
+    }
+    for (auto path = paths.getPath(); !path.empty(); path = paths.getPath()) {
+        std::string udi;
+        make_udi(path, std::string(), udi);
+        Rcl::Doc doc;
+        if (!db.getDoc(udi, "", doc)) {
+            std::cerr << "Unexpected error from getdoc\n";
+            return false;
+        }
+        // See comments in getdoc. A successful call which leaves pc at -1 is
+        // treated here as "no such document in the index".
+        if (doc.pc == -1) {
+            std::cout << "ABSENT " << path << std::endl;
+            continue;
+        }
+        std::string sig;
+        // back() is undefined on an empty string, so test for emptiness first
+        if (!doc.getmeta(Rcl::Doc::keysig, &sig) ||
+            sig.empty() || sig.back() == '+') {
+            std::cout << "ERROR " << path << std::endl;
+        }
+    }
+    return true;
+}
diff --git a/src/index/checkindexed.h b/src/index/checkindexed.h
new file mode 100644
index 00000000..6e9f2ea8
--- /dev/null
+++ b/src/index/checkindexed.h
@@ -0,0 +1,34 @@
+/* Copyright (C) 2021 J.F.Dockes
+ *
+ * License: GPL 2.1
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _CHECKINDEXED_H_INCLUDED_ +#define _CHECKINDEXED_H_INCLUDED_ + +#include +#include + +class RclConfig; + +// Diagnostic routine. Reads paths from stdin (one per line) if filepaths is empty. +// For each path, check that the file is indexed, print back its path +// with an ERROR or ABSENT prefix if it's not +extern bool checkindexed(RclConfig *conf, const std::vector& filepaths); + +#endif /* _CHECKINDEXED_H_INCLUDED_ */ diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index b68cc09c..c5ef9471 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -47,6 +47,7 @@ using namespace std; #include "rclmon.h" #include "x11mon.h" #include "cancelcheck.h" +#include "checkindexed.h" #include "rcldb.h" #include "readfile.h" #ifndef DISABLE_WEB_INDEXER @@ -91,10 +92,12 @@ static int op_flags; #define OPTVAL_WEBCACHE_COMPACT 1000 #define OPTVAL_WEBCACHE_BURST 1001 +#define OPTVAL_DIAGS_NOTINDEXED 1002 static struct option long_options[] = { {"webcache-compact", 0, 0, OPTVAL_WEBCACHE_COMPACT}, {"webcache-burst", required_argument, 0, OPTVAL_WEBCACHE_BURST}, + {"notindexed", 0, 0, OPTVAL_DIAGS_NOTINDEXED}, {0, 0, 0, 0} }; @@ -476,6 +479,10 @@ static const char usage [] = " Build stem database for additional language \n" "recollindex -E\n" " Check configuration file for topdirs and other paths existence\n" +"recollindex --webcache-compact : recover wasted space from the Web cache\n" +"recollindex --webcache-burst : extract entries from the Web 
cache to the target\n" +"recollindex --notindexed [filepath [filepath ...]] : check if the file arguments are indexed\n" +" will read file paths from stdin if there are no arguments\n" #ifdef FUTURE_IMPROVEMENT "recollindex -W\n" " Process the Web queue\n" @@ -626,6 +633,8 @@ int main(int argc, char *argv[]) int ret; bool webcache_compact{false}; bool webcache_burst{false}; + bool diags_notindexed{false}; + std::string burstdir; while ((ret = getopt_long(argc, (char *const*)&args[0], "c:CDdEefhikKlmnPp:rR:sS:w:xZz", long_options, NULL)) != -1) { @@ -666,7 +675,8 @@ int main(int argc, char *argv[]) case OPTVAL_WEBCACHE_COMPACT: webcache_compact = true; break; case OPTVAL_WEBCACHE_BURST: burstdir = optarg; webcache_burst = true;break; - + case OPTVAL_DIAGS_NOTINDEXED: diags_notindexed = true;break; + default: Usage(); break; } } @@ -705,7 +715,7 @@ int main(int argc, char *argv[]) } // Auxiliary, non-index-related things. Avoids having a separate binary. - if (webcache_compact || webcache_burst) { + if (webcache_compact || webcache_burst || diags_notindexed) { std::string ccdir = config->getWebcacheDir(); std::string reason; if (webcache_compact) { @@ -718,7 +728,16 @@ int main(int argc, char *argv[]) std::cerr << "Web cache burst failed: " << reason << "\n"; exit(1); } + } else if (diags_notindexed) { + std::vector filepaths; + while (aremain--) { + filepaths.push_back(args[optind++]); + } + if (!checkindexed(config, filepaths)) { + exit(1); + } } + exit(0); }