diff --git a/src/Makefile.am b/src/Makefile.am index 6e55710d..cd01ff39 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -107,6 +107,8 @@ index/fsindexer.cpp \ index/fsindexer.h \ index/idxstatus.h \ index/idxstatus.cpp \ +index/idxdiags.h \ +index/idxdiags.cpp \ index/mimetype.cpp \ index/mimetype.h \ index/rclmon.h \ diff --git a/src/common/rclconfig.cpp b/src/common/rclconfig.cpp index f2f3f282..94f4c5e4 100644 --- a/src/common/rclconfig.cpp +++ b/src/common/rclconfig.cpp @@ -54,6 +54,7 @@ #include "cpuconf.h" #include "execmd.h" #include "md5.h" +#include "idxdiags.h" using namespace std; @@ -754,6 +755,7 @@ bool RclConfig::inStopSuffixes(const string& fni) if (it != STOPSUFFIXES->end()) { LOGDEB2("RclConfig::inStopSuffixes: Found (" << fni << ") [" << ((*it).m_str) << "]\n"); + IdxDiags::theDiags().record(IdxDiags::NoContentSuffix, fni); return true; } else { LOGDEB2("RclConfig::inStopSuffixes: not found [" << fni << "]\n"); @@ -822,35 +824,38 @@ bool RclConfig::getMimeCatTypes(const string& cat, vector& tps) const return true; } -string RclConfig::getMimeHandlerDef(const string &mtype, bool filtertypes) +string RclConfig::getMimeHandlerDef(const string &mtype, bool filtertypes, const std::string& fn) { string hs; if (filtertypes) { if(m_rmtstate.needrecompute()) { m_restrictMTypes.clear(); - stringToStrings(stringtolower((const string&)m_rmtstate.getvalue()), - m_restrictMTypes); + stringToStrings(stringtolower((const string&)m_rmtstate.getvalue()), m_restrictMTypes); } if (m_xmtstate.needrecompute()) { m_excludeMTypes.clear(); - stringToStrings(stringtolower((const string&)m_xmtstate.getvalue()), - m_excludeMTypes); + stringToStrings(stringtolower((const string&)m_xmtstate.getvalue()), m_excludeMTypes); } - if (!m_restrictMTypes.empty() && - !m_restrictMTypes.count(stringtolower(mtype))) { - LOGDEB2("RclConfig::getMimeHandlerDef: not in mime type list\n"); + if (!m_restrictMTypes.empty() && !m_restrictMTypes.count(stringtolower(mtype))) { + 
IdxDiags::theDiags().record(IdxDiags::NotIncludedMime, fn, mtype); + LOGDEB1("RclConfig::getMimeHandlerDef: " << mtype << " not in mime type list\n"); return hs; } - if (!m_excludeMTypes.empty() && - m_excludeMTypes.count(stringtolower(mtype))) { - LOGDEB2("RclConfig::getMimeHandlerDef: in excluded mime list\n"); + if (!m_excludeMTypes.empty() && m_excludeMTypes.count(stringtolower(mtype))) { + IdxDiags::theDiags().record(IdxDiags::ExcludedMime, fn, mtype); + LOGDEB1("RclConfig::getMimeHandlerDef: " << mtype << " in excluded mime list (fn " << + fn << ")\n"); return hs; } } if (!mimeconf->get(mtype, hs, "index")) { - LOGDEB1("getMimeHandlerDef: no handler for '" << mtype << "'\n"); + if (mtype != "inode/directory") { + IdxDiags::theDiags().record(IdxDiags::NoHandler, fn, mtype); + LOGDEB1("getMimeHandlerDef: no handler for '" << mtype << "' (fn " << + fn << ")\n"); + } } return hs; } diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index cf00d094..733cab8e 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -248,7 +248,8 @@ public: string getSuffixFromMimeType(const string &mt) const; /** mimeconf: get input filter for mimetype */ - string getMimeHandlerDef(const string &mimetype, bool filtertypes=false); + string getMimeHandlerDef(const string &mimetype, bool filtertypes=false, + const std::string& fn = std::string()); /** For lines like: "name = some value; attr1 = value1; attr2 = val2" * Separate the value and store the attributes in a ConfSimple diff --git a/src/doc/man/recollindex.1 b/src/doc/man/recollindex.1 index 9656c9da..3726194d 100644 --- a/src/doc/man/recollindex.1 +++ b/src/doc/man/recollindex.1 @@ -15,6 +15,9 @@ recollindex \- indexing command for the Recoll full text search system [ .B \-k ] +[ +.B \--diagfile + ] .br .B recollindex [ @@ -93,6 +96,12 @@ pattern ] .B \--webcache-burst +.B recollindex +[ +.B \-c +] +.B \--notindexed +[path [path ...]] .SH DESCRIPTION The @@ -142,7 +151,44 @@ will try again to process all 
failed files. Please note that .B recollindex may also decide to retry failed files if the auxiliary checking script defined by the "checkneedretryindexscript" configuration variable indicates -that this should happen. +that this should happen. +.PP +If option +.B \--diagfile +is given, the path given as parameter will be truncated and indexing +diagnostics will be written to it. Each line in the file will have a +diagnostic type (reason for the file not to be indexed), the file path, and +a possible additional piece of information, which can be the MIME type or +the archive internal path depending on the issue. The following diagnostic +types are currently defined: +.IP +.B Skipped +: the path matches an element of +.B skippedPaths or +.B skippedNames. +.IP +.B NoContentSuffix +: the file name suffix is found in the +.B noContentSuffixes +list. +.IP +.B MissingHelper +: a helper program is missing. +.IP +.B Error +: general error (see the log). +.IP +.B NoHandler: no handler is defined for the MIME type. +.IP +.B ExcludedMime +: the MIME type is part of the +.B excludedmimetypes +list. +.IP +.B NotIncludedMime +: the +.B onlymimetypes +list is not empty and the MIME type is not in it. .PP If option .B @@ -297,7 +343,12 @@ cache. .B recollindex \--webcache-burst will extract all entries from the Web cache to files created inside . Each cache entry is extracted as two files, for the data and metadata. - +.PP +.B recollindex \--notindexed [path [path ...]] +will check each path and print out those which are absent from the index +(with an "ABSENT" prefix), or caused an indexing error (with an "ERROR" +prefix). If no paths are given on the command line, the command will read +them, one per line, from stdin.
.SH SEE ALSO .PP diff --git a/src/index/checkindexed.cpp b/src/index/checkindexed.cpp index 83c09209..55107912 100644 --- a/src/index/checkindexed.cpp +++ b/src/index/checkindexed.cpp @@ -19,6 +19,8 @@ */ #include "autoconfig.h" +#include "checkindexed.h" + #include #include diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index 5160e278..62515789 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -47,6 +47,7 @@ #include "rclinit.h" #include "extrameta.h" #include "utf8fn.h" +#include "idxdiags.h" #if defined(HAVE_POSIX_FADVISE) #include #include @@ -397,8 +398,7 @@ bool FsIndexer::indexFiles(list& files, int flags) continue; } } - if (processone(*it, &stb, FsTreeWalker::FtwRegular) != - FsTreeWalker::FtwOk) { + if (processone(*it, &stb, FsTreeWalker::FtwRegular) != FsTreeWalker::FtwOk) { LOGERR("FsIndexer::indexFiles: processone failed\n"); goto out; } @@ -560,9 +560,8 @@ void *FsIndexerInternfileWorker(void * fsp) return (void*)1; } LOGDEB0("FsIndexerInternfileWorker: task fn " << tsk->fn << "\n"); - if (fip->processonefile(&myconf, tsk->fn, &tsk->statbuf, - tsk->localfields) != - FsTreeWalker::FtwOk) { + if (fip->processonefile( + &myconf, tsk->fn, &tsk->statbuf, tsk->localfields) != FsTreeWalker::FtwOk) { LOGERR("FsIndexerInternfileWorker: processone failed\n"); tqp->workerExit(); return (void*)0; @@ -584,9 +583,8 @@ void *FsIndexerInternfileWorker(void * fsp) /// Accent and majuscule handling are performed by the db module when doing /// the actual indexing work. The Rcl::Doc created by internfile() /// mostly contains pretty raw utf8 data. 
-FsTreeWalker::Status -FsIndexer::processone(const std::string &fn, const struct PathStat *stp, - FsTreeWalker::CbFlag flg) +FsTreeWalker::Status FsIndexer::processone( + const std::string &fn, const struct PathStat *stp, FsTreeWalker::CbFlag flg) { if (m_updater) { #ifdef IDX_THREADS @@ -610,7 +608,10 @@ FsIndexer::processone(const std::string &fn, const struct PathStat *stp, if (flg == FsTreeWalker::FtwDirReturn) return FsTreeWalker::FtwOk; } - + if (flg == FsTreeWalker::FtwSkipped) { + IdxDiags::theDiags().record(IdxDiags::Skipped, fn); + return FsTreeWalker::FtwOk; + } #ifdef IDX_THREADS if (m_haveInternQ) { InternfileTask *tp = new InternfileTask(fn, stp, m_localfields); @@ -644,10 +645,9 @@ bool FsIndexer::launchAddOrUpdate(const string& udi, const string& parent_udi, return m_db->addOrUpdate(udi, parent_udi, doc); } -FsTreeWalker::Status -FsIndexer::processonefile(RclConfig *config, - const std::string &fn, const struct PathStat *stp, - const map& localfields) +FsTreeWalker::Status FsIndexer::processonefile( + RclConfig *config, const std::string &fn, const struct PathStat *stp, + const map& localfields) { //////////////////// // Check db up to date ? Doing this before file type @@ -693,7 +693,7 @@ FsIndexer::processonefile(RclConfig *config, // If noretryfailed is set, check for a file which previously // failed to index, and avoid re-processing it if (needupdate && m_noretryfailed && existingDoc && - !oldsig.empty() && *oldsig.rbegin() == '+') { + !oldsig.empty() && oldsig.back() == '+') { // Check that the sigs are the same except for the '+'. 
If the file // actually changed, we always retry (maybe it was fixed) string nold = oldsig.substr(0, oldsig.size()-1); @@ -720,8 +720,7 @@ FsIndexer::processonefile(RclConfig *config, return FsTreeWalker::FtwOk; } - LOGDEB0("processone: processing: [" << - displayableBytes(stp->pst_size) << "] " << fn << "\n"); + LOGDEB0("processone: processing: [" << displayableBytes(stp->pst_size) << "] " << fn << "\n"); // Note that we used to do the full path here, but I ended up // believing that it made more sense to use only the file name @@ -813,6 +812,7 @@ FsIndexer::processonefile(RclConfig *config, // myriads of such files, the ext script is executed for them // and fails every time) if (fis == FileInterner::FIError) { + IdxDiags::theDiags().record(IdxDiags::Error, fn, doc.ipath); doc.sig += cstr_plus; } @@ -822,8 +822,7 @@ FsIndexer::processonefile(RclConfig *config, // Add document to database. If there is an ipath, add it // as a child of the file document. - if (!launchAddOrUpdate(udi, doc.ipath.empty() ? - cstr_null : parent_udi, doc)) { + if (!launchAddOrUpdate(udi, doc.ipath.empty() ? cstr_null : parent_udi, doc)) { return FsTreeWalker::FtwError; } diff --git a/src/index/idxdiags.cpp b/src/index/idxdiags.cpp new file mode 100644 index 00000000..ee9bcb56 --- /dev/null +++ b/src/index/idxdiags.cpp @@ -0,0 +1,98 @@ +/* Copyright (C) 2021 J.F.Dockes + * + * License: GPL 2.1 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "autoconfig.h" + +#include +#include + +#include "idxdiags.h" + +static std::mutex diagmutex; + +class IdxDiags::Internal { +public: + ~Internal() { + if (fp) { + fclose(fp); + } + } + FILE *fp{nullptr}; +}; + +IdxDiags::IdxDiags() +{ + m = new Internal; +} + +IdxDiags::~IdxDiags() +{ + delete m; +} + +bool IdxDiags::flush() +{ + std::unique_lock lock(diagmutex); + if (m && m->fp) { + return fflush(m->fp) ? false : true; + } + return true; +} + +static IdxDiags *theInstance; + +IdxDiags& IdxDiags::theDiags() +{ + if (nullptr == theInstance) { + theInstance = new IdxDiags; + } + return *theInstance; +} + +bool IdxDiags::init(const std::string& outpath) +{ + m->fp = fopen(outpath.c_str(), "w"); + if (nullptr == m->fp) { + return false; + } + return true; +} + +bool IdxDiags::record(DiagKind diag, const std::string& path, const std::string& detail) +{ + if (nullptr == m || nullptr == m->fp || (path.empty() && detail.empty())) { + return true; + } + const char *skind = "Unknown"; + switch (diag) { + case Ok: skind = "Ok";break; + case Skipped: skind = "Skipped";break; + case NoContentSuffix: skind = "NoContentSuffix";break; + case MissingHelper: skind = "MissingHelper";break; + case Error: skind = "Error";break; + case NoHandler: skind = "NoHandler";break; + case ExcludedMime: skind = "ExcludedMime";break; + case NotIncludedMime: skind = "NotIncludedMime";break; + } + + std::unique_lock lock(diagmutex); + fprintf(m->fp, "%s %s | %s\n", skind, path.c_str(), detail.c_str()); + return true; +} diff --git a/src/index/idxdiags.h b/src/index/idxdiags.h new file mode 100644 index 00000000..3518700d --- /dev/null +++ b/src/index/idxdiags.h @@ -0,0 +1,50 @@ +/* Copyright (C) 2021 J.F.Dockes + * + * License: GPL 2.1 + * + * This 
program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _IDXDIAGS_H_INCLUDED_ +#define _IDXDIAGS_H_INCLUDED_ + +#include + +class IdxDiags { +public: + enum DiagKind {Ok, Skipped, NoContentSuffix, MissingHelper, Error, NoHandler, + ExcludedMime, NotIncludedMime}; + + // Retrieve a reference to the single instance. + static IdxDiags& theDiags(); + + // Initialize, setting the output file path. outpath will be truncated. + // No locking: this must be called from the main thread, before going multithread. + // If init is never called, further calls to record() or flush() will be noops. + bool init(const std::string& outpath); + + // Record a reason for a document not to be indexed. 
+ bool record(DiagKind diag, const std::string& path, const std::string& detail = std::string()); + bool flush(); + + class Internal; +private: + Internal *m; + IdxDiags(); + ~IdxDiags(); +}; + +#endif /* _IDXDIAGS_H_INCLUDED_ */ diff --git a/src/index/rclmonrcv.cpp b/src/index/rclmonrcv.cpp index 2882c703..e3fc0df1 100644 --- a/src/index/rclmonrcv.cpp +++ b/src/index/rclmonrcv.cpp @@ -66,20 +66,16 @@ static RclMonitor *makeMonitor(); */ class WalkCB : public FsTreeWalkerCB { public: - WalkCB(RclConfig *conf, RclMonitor *mon, RclMonEventQueue *queue, - FsTreeWalker& walker) - : m_config(conf), m_mon(mon), m_queue(queue), m_walker(walker) - {} + WalkCB(RclConfig *conf, RclMonitor *mon, RclMonEventQueue *queue, FsTreeWalker& walker) + : m_config(conf), m_mon(mon), m_queue(queue), m_walker(walker) {} virtual ~WalkCB() {} - virtual FsTreeWalker::Status - processone(const string &fn, const struct PathStat *st, - FsTreeWalker::CbFlag flg) { + virtual FsTreeWalker::Status processone( + const string &fn, const struct PathStat *st, FsTreeWalker::CbFlag flg) { MONDEB("rclMonRcvRun: processone " << fn << " m_mon " << m_mon << " m_mon->ok " << (m_mon ? m_mon->ok() : false) << std::endl); - if (flg == FsTreeWalker::FtwDirEnter || - flg == FsTreeWalker::FtwDirReturn) { + if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwDirReturn) { m_config->setKeyDir(fn); // Set up skipped patterns for this subtree. m_walker.setSkippedNames(m_config->getSkippedNames()); @@ -106,8 +102,7 @@ public: m_mon->saved_errno != ENOENT) return FsTreeWalker::FtwError; } - } else if (!m_mon->generatesExist() && - flg == FsTreeWalker::FtwRegular) { + } else if (!m_mon->generatesExist() && flg == FsTreeWalker::FtwRegular) { // Have to synthetize events for regular files existence // at startup because the monitor does not do it // Note 2011-09-29: no sure this is actually needed. 
We just ran diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index c5ef9471..64c5de0c 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -62,6 +62,7 @@ using namespace std; #include "checkretryfailed.h" #include "idxstatus.h" #include "circache.h" +#include "idxdiags.h" // Command line options static int op_flags; @@ -93,11 +94,13 @@ static int op_flags; #define OPTVAL_WEBCACHE_COMPACT 1000 #define OPTVAL_WEBCACHE_BURST 1001 #define OPTVAL_DIAGS_NOTINDEXED 1002 +#define OPTVAL_DIAGS_DIAGFILE 1003 static struct option long_options[] = { {"webcache-compact", 0, 0, OPTVAL_WEBCACHE_COMPACT}, {"webcache-burst", required_argument, 0, OPTVAL_WEBCACHE_BURST}, {"notindexed", 0, 0, OPTVAL_DIAGS_NOTINDEXED}, + {"diagfile", required_argument, 0, OPTVAL_DIAGS_DIAGFILE}, {0, 0, 0, 0} }; @@ -110,6 +113,7 @@ static ConfIndexer *confindexer; static void cleanup() { deleteZ(confindexer); + IdxDiags::theDiags().flush(); recoll_exitready(); } @@ -274,20 +278,15 @@ static void setMyPriority(const RclConfig *config) class MakeListWalkerCB : public FsTreeWalkerCB { public: MakeListWalkerCB(list& files, const vector& selpats) - : m_files(files), m_pats(selpats) - { - } - virtual FsTreeWalker::Status - processone(const string& fn, const struct PathStat *, - FsTreeWalker::CbFlag flg) { + : m_files(files), m_pats(selpats) {} + virtual FsTreeWalker::Status processone( + const string& fn, const struct PathStat *, FsTreeWalker::CbFlag flg) { if (flg== FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwRegular){ if (m_pats.empty()) { - cerr << "Selecting " << fn << endl; m_files.push_back(fn); } else { - for (vector::const_iterator it = m_pats.begin(); - it != m_pats.end(); it++) { - if (fnmatch(it->c_str(), fn.c_str(), 0) == 0) { + for (const auto& pat : m_pats) { + if (fnmatch(pat.c_str(), fn.c_str(), 0) == 0) { m_files.push_back(fn); break; } @@ -451,6 +450,8 @@ static const char usage [] = " -Z : in place reset: consider all documents as changed. 
Can also\n" " be combined with -i or -r but not -m\n" " -k : retry files on which we previously failed\n" +" --diagfile : list skipped or otherwise not indexed documents to \n" +" will be truncated\n" #ifdef RCL_MONITOR "recollindex -m [-w ] -x [-D] [-C]\n" " Perform real time indexing. Don't become a daemon if -D is set.\n" @@ -636,6 +637,7 @@ int main(int argc, char *argv[]) bool diags_notindexed{false}; std::string burstdir; + std::string diagfile; while ((ret = getopt_long(argc, (char *const*)&args[0], "c:CDdEefhikKlmnPp:rR:sS:w:xZz", long_options, NULL)) != -1) { switch (ret) { @@ -676,7 +678,7 @@ int main(int argc, char *argv[]) case OPTVAL_WEBCACHE_COMPACT: webcache_compact = true; break; case OPTVAL_WEBCACHE_BURST: burstdir = optarg; webcache_burst = true;break; case OPTVAL_DIAGS_NOTINDEXED: diags_notindexed = true;break; - + case OPTVAL_DIAGS_DIAGFILE: diagfile = optarg;break; default: Usage(); break; } } @@ -790,6 +792,12 @@ int main(int argc, char *argv[]) } } + if (!diagfile.empty()) { + if (!IdxDiags::theDiags().init(diagfile)) { + std::cerr << "Could not initialize diags file " << diagfile << "\n"; + LOGERR("recollindex: Could not initialize diags file " << diagfile << "\n"); + } + } bool rezero((op_flags & OPT_z) != 0); bool inPlaceReset((op_flags & OPT_Z) != 0); diff --git a/src/index/webqueue.cpp b/src/index/webqueue.cpp index 4ed890e1..46cfa01a 100644 --- a/src/index/webqueue.cpp +++ b/src/index/webqueue.cpp @@ -411,8 +411,7 @@ WebQueueIndexer::processone( if (flg != FsTreeWalker::FtwRegular) return FsTreeWalker::FtwOk; - string dotpath = path_cat(path_getfather(path), - string(DOTFILEPREFIX) + path_getsimple(path)); + string dotpath = path_cat(path_getfather(path), string(DOTFILEPREFIX) + path_getsimple(path)); LOGDEB("WebQueueIndexer: prc1: [" << path << "]\n"); WebQueueDotFile dotfile(m_config, dotpath); diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 68336a31..c60e1e21 100644 --- a/src/internfile/internfile.cpp 
+++ b/src/internfile/internfile.cpp @@ -240,17 +240,15 @@ void FileInterner::init(const string &f, const struct PathStat *stp, m_mimetype = l_mime; // Look for appropriate handler (might still return empty) - RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview); + RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview, f); if (!df || df->is_unknown()) { // No real handler for this type, for now :( - LOGDEB("FileInterner:: unprocessed mime: [" << l_mime << "] [" << f << - "]\n"); + LOGDEB("FileInterner:: unprocessed mime: [" << l_mime << "] [" << f << "]\n"); if (!df) return; } - df->set_property(Dijon::Filter::OPERATING_MODE, - m_forPreview ? "view" : "index"); + df->set_property(Dijon::Filter::OPERATING_MODE, m_forPreview ? "view" : "index"); df->set_property(Dijon::Filter::DJF_UDI, udi); df->set_docsize(docsize); @@ -271,8 +269,7 @@ FileInterner::FileInterner(const string &data, RclConfig *cnf, init(data, cnf, flags, imime); } -void FileInterner::init(const string &data, RclConfig *, - int, const string& imime) +void FileInterner::init(const string &data, RclConfig *, int, const string& imime) { if (imime.empty()) { LOGERR("FileInterner: inmemory constructor needs input mime type\n"); @@ -281,7 +278,7 @@ void FileInterner::init(const string &data, RclConfig *, m_mimetype = imime; // Look for appropriate handler (might still return empty) - RecollFilter *df = getMimeHandler(m_mimetype, m_cfg, !m_forPreview); + RecollFilter *df = getMimeHandler(m_mimetype, m_cfg, !m_forPreview, m_fn); if (!df) { // No handler for this type, for now :( if indexallfilenames @@ -289,8 +286,7 @@ void FileInterner::init(const string &data, RclConfig *, LOGDEB("FileInterner:: unprocessed mime [" << m_mimetype << "]\n"); return; } - df->set_property(Dijon::Filter::OPERATING_MODE, - m_forPreview ? "view" : "index"); + df->set_property(Dijon::Filter::OPERATING_MODE, m_forPreview ? 
"view" : "index"); df->set_docsize(data.length()); if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { @@ -741,12 +737,11 @@ int FileInterner::addHandler() getKeyValue(docdata, cstr_dj_keyipath, ipathel); bool dofilter = !m_forPreview && (mimetype.compare(cstr_texthtml) || !ipathel.empty()); - RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter); + RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter, m_fn); if (!newflt) { // If we can't find a handler, this doc can't be handled // but there can be other ones so we go on - LOGINFO("FileInterner::addHandler: no filter for [" << mimetype << - "]\n"); + LOGINFO("FileInterner::addHandler: no filter for [" << mimetype << "]\n"); return ADD_CONTINUE; } newflt->set_property(Dijon::Filter::OPERATING_MODE, diff --git a/src/internfile/mh_exec.cpp b/src/internfile/mh_exec.cpp index 396cea98..47ad9db7 100644 --- a/src/internfile/mh_exec.cpp +++ b/src/internfile/mh_exec.cpp @@ -29,6 +29,7 @@ #include "smallut.h" #include "md5ut.h" #include "rclconfig.h" +#include "idxdiags.h" using namespace std; @@ -186,6 +187,7 @@ bool MimeHandlerExec::next_document() missingHelper = true; m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd; whatHelper = m_reason; + IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn); } else if (output.find("RECFILTERROR") == 0) { // If the output string begins with RECFILTERROR, then it's // interpretable error information out from a recoll script @@ -193,6 +195,7 @@ bool MimeHandlerExec::next_document() std::string::size_type pos; if ((pos = output.find("RECFILTERROR ")) == 0) { if (output.find("HELPERNOTFOUND") != string::npos) { + IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn); missingHelper = true; whatHelper = output.substr(pos); } diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index 59bf83e5..bf68a9b1 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -33,6 +33,7 @@ #include "mimetype.h" #include 
"idfile.h" #include "rclutil.h" +#include "idxdiags.h" using namespace std; @@ -72,6 +73,7 @@ bool MimeHandlerExecMultiple::startCmd() vectormyparams(params.begin() + 1, params.end()); if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) { + IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn); m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd; missingHelper = true; whatHelper = cmd; @@ -113,6 +115,7 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data) if ((pos = ibuf.find("RECFILTERROR ")) == 0) { m_reason = ibuf; if (ibuf.find("HELPERNOTFOUND") != string::npos) { + IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn); missingHelper = true; whatHelper = ibuf.substr(pos); } diff --git a/src/internfile/mimehandler.cpp b/src/internfile/mimehandler.cpp index d587f505..8540f047 100644 --- a/src/internfile/mimehandler.cpp +++ b/src/internfile/mimehandler.cpp @@ -256,8 +256,8 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs, } /* Get handler/filter object for given mime type: */ -RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg, - bool filtertypes) +RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg, + bool filtertypes, const std::string& fn) { LOGDEB("getMimeHandler: mtype [" << mtype << "] filtertypes " << filtertypes << "\n"); @@ -270,7 +270,7 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg, // indexedmimetypes but an html handler could still be in the // cache because it was needed by some other interning stack). string hs; - hs = cfg->getMimeHandlerDef(mtype, filtertypes); + hs = cfg->getMimeHandlerDef(mtype, filtertypes, fn); string id; if (!hs.empty()) { diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 57012771..dc867aa1 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -169,7 +169,7 @@ protected: * indexedmimetypes (if this is set at all). 
*/ extern RecollFilter *getMimeHandler(const std::string &mtyp, RclConfig *cfg, - bool filtertypes); + bool filtertypes, const std::string& fn = std::string()); /// Free up filter for reuse (you can also delete it) extern void returnMimeHandler(RecollFilter *); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 9676d2a4..3ec9caf0 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -2401,7 +2401,7 @@ bool Db::dbStats(DbStats& res, bool listfailed) try { Xapian::Document doc = xdb.get_document(docid); string sig = doc.get_value(VALUE_SIG); - if (sig.empty() || sig[sig.size()-1] != '+') { + if (sig.empty() || sig.back() != '+') { continue; } string data = doc.get_data(); diff --git a/src/utils/fstreewalk.cpp b/src/utils/fstreewalk.cpp index c36a98e9..939074b1 100644 --- a/src/utils/fstreewalk.cpp +++ b/src/utils/fstreewalk.cpp @@ -411,9 +411,7 @@ FsTreeWalker::Status FsTreeWalker::iwalk(const string &top, // Skipped file names match ? if (!data->skippedNames.empty()) { if (inSkippedNames(dname)) { - if (data->options & FtwOnlySkipped) { - cb.processone(path_cat(top, dname), nullptr, FtwSkipped); - } + cb.processone(path_cat(top, dname), nullptr, FtwSkipped); continue; } } @@ -428,9 +426,7 @@ FsTreeWalker::Status FsTreeWalker::iwalk(const string &top, // this was broken by 1.13.00 and the systematic use of // FNM_LEADING_DIR if (inSkippedPaths(fn, false)) { - if (data->options & FtwOnlySkipped) { - cb.processone(fn, nullptr, FtwSkipped); - } + cb.processone(fn, nullptr, FtwSkipped); continue; } }