Add recollindex option to write file not indexed reasons to diagnostics file

This commit is contained in:
Jean-Francois Dockes 2021-04-01 10:32:04 +02:00
parent 485a0fc650
commit 4756b1252b
18 changed files with 288 additions and 81 deletions

View File

@ -107,6 +107,8 @@ index/fsindexer.cpp \
index/fsindexer.h \ index/fsindexer.h \
index/idxstatus.h \ index/idxstatus.h \
index/idxstatus.cpp \ index/idxstatus.cpp \
index/idxdiags.h \
index/idxdiags.cpp \
index/mimetype.cpp \ index/mimetype.cpp \
index/mimetype.h \ index/mimetype.h \
index/rclmon.h \ index/rclmon.h \

View File

@ -54,6 +54,7 @@
#include "cpuconf.h" #include "cpuconf.h"
#include "execmd.h" #include "execmd.h"
#include "md5.h" #include "md5.h"
#include "idxdiags.h"
using namespace std; using namespace std;
@ -754,6 +755,7 @@ bool RclConfig::inStopSuffixes(const string& fni)
if (it != STOPSUFFIXES->end()) { if (it != STOPSUFFIXES->end()) {
LOGDEB2("RclConfig::inStopSuffixes: Found (" << fni << ") [" << LOGDEB2("RclConfig::inStopSuffixes: Found (" << fni << ") [" <<
((*it).m_str) << "]\n"); ((*it).m_str) << "]\n");
IdxDiags::theDiags().record(IdxDiags::NoContentSuffix, fni);
return true; return true;
} else { } else {
LOGDEB2("RclConfig::inStopSuffixes: not found [" << fni << "]\n"); LOGDEB2("RclConfig::inStopSuffixes: not found [" << fni << "]\n");
@ -822,35 +824,38 @@ bool RclConfig::getMimeCatTypes(const string& cat, vector<string>& tps) const
return true; return true;
} }
string RclConfig::getMimeHandlerDef(const string &mtype, bool filtertypes) string RclConfig::getMimeHandlerDef(const string &mtype, bool filtertypes, const std::string& fn)
{ {
string hs; string hs;
if (filtertypes) { if (filtertypes) {
if(m_rmtstate.needrecompute()) { if(m_rmtstate.needrecompute()) {
m_restrictMTypes.clear(); m_restrictMTypes.clear();
stringToStrings(stringtolower((const string&)m_rmtstate.getvalue()), stringToStrings(stringtolower((const string&)m_rmtstate.getvalue()), m_restrictMTypes);
m_restrictMTypes);
} }
if (m_xmtstate.needrecompute()) { if (m_xmtstate.needrecompute()) {
m_excludeMTypes.clear(); m_excludeMTypes.clear();
stringToStrings(stringtolower((const string&)m_xmtstate.getvalue()), stringToStrings(stringtolower((const string&)m_xmtstate.getvalue()), m_excludeMTypes);
m_excludeMTypes);
} }
if (!m_restrictMTypes.empty() && if (!m_restrictMTypes.empty() && !m_restrictMTypes.count(stringtolower(mtype))) {
!m_restrictMTypes.count(stringtolower(mtype))) { IdxDiags::theDiags().record(IdxDiags::NotIncludedMime, fn, mtype);
LOGDEB2("RclConfig::getMimeHandlerDef: not in mime type list\n"); LOGDEB1("RclConfig::getMimeHandlerDef: " << mtype << " not in mime type list\n");
return hs; return hs;
} }
if (!m_excludeMTypes.empty() && if (!m_excludeMTypes.empty() && m_excludeMTypes.count(stringtolower(mtype))) {
m_excludeMTypes.count(stringtolower(mtype))) { IdxDiags::theDiags().record(IdxDiags::ExcludedMime, fn, mtype);
LOGDEB2("RclConfig::getMimeHandlerDef: in excluded mime list\n"); LOGDEB1("RclConfig::getMimeHandlerDef: " << mtype << " in excluded mime list (fn " <<
fn << ")\n");
return hs; return hs;
} }
} }
if (!mimeconf->get(mtype, hs, "index")) { if (!mimeconf->get(mtype, hs, "index")) {
LOGDEB1("getMimeHandlerDef: no handler for '" << mtype << "'\n"); if (mtype != "inode/directory") {
IdxDiags::theDiags().record(IdxDiags::NoHandler, fn, mtype);
LOGDEB1("getMimeHandlerDef: no handler for '" << mtype << "' (fn " <<
fn << ")\n");
}
} }
return hs; return hs;
} }

View File

@ -248,7 +248,8 @@ public:
string getSuffixFromMimeType(const string &mt) const; string getSuffixFromMimeType(const string &mt) const;
/** mimeconf: get input filter for mimetype */ /** mimeconf: get input filter for mimetype */
string getMimeHandlerDef(const string &mimetype, bool filtertypes=false); string getMimeHandlerDef(const string &mimetype, bool filtertypes=false,
const std::string& fn = std::string());
/** For lines like: "name = some value; attr1 = value1; attr2 = val2" /** For lines like: "name = some value; attr1 = value1; attr2 = val2"
* Separate the value and store the attributes in a ConfSimple * Separate the value and store the attributes in a ConfSimple

View File

@ -15,6 +15,9 @@ recollindex \- indexing command for the Recoll full text search system
[ [
.B \-k .B \-k
] ]
[
.B \--diagfile
<diagpath> ]
.br .br
.B recollindex .B recollindex
[ [
@ -93,6 +96,12 @@ pattern
<cfdir>] <cfdir>]
.B \--webcache-burst .B \--webcache-burst
<destdir> <destdir>
.B recollindex
[
.B \-c
<cfdir>]
.B \--notindexed
[path [path ...]]
.SH DESCRIPTION .SH DESCRIPTION
The The
@ -145,6 +154,43 @@ defined by the "checkneedretryindexscript" configuration variable indicates
that this should happen. that this should happen.
.PP .PP
If option If option
.B \--diagfile
is given, the path given as parameter will be truncated and indexing
diagnostics will be written to it. Each line in the file will have a
diagnostic type (reason for the file not to be indexed), the file path, and
a possible additional piece of information, which can be the MIME type or
the archive internal path depending on the issue. The following diagnostic
types are currently defined:
.IP
.B Skipped
: the path matches an element of
.B skippedPaths
or
.BR skippedNames .
.IP
.B NoContentSuffix
: the file name suffix is found in the
.B noContentSuffixes
list.
.IP
.B MissingHelper
: a helper program is missing.
.IP
.B Error
: general error (see the log).
.IP
.B NoHandler
: no handler is defined for the MIME type.
.IP
.B ExcludedMime
: the MIME type is part of the
.B excludedmimetypes
list.
.IP
.B NotIncludedMime
: the
.B onlymimetypes
list is not empty and the MIME type is not in it.
.PP
If option
.B .B
\-m \-m
is given, recollindex is started for real time monitoring, using the is given, recollindex is started for real time monitoring, using the
@ -297,7 +343,12 @@ cache.
.B recollindex \--webcache-burst <destdir> .B recollindex \--webcache-burst <destdir>
will extract all entries from the Web cache to files created inside will extract all entries from the Web cache to files created inside
<destdir>. Each cache entry is extracted as two files, for the data and metadata. <destdir>. Each cache entry is extracted as two files, for the data and metadata.
.PP
.B recollindex \--notindexed [path [path ...]]
will check each path and print out those which are absent from the index
(with an "ABSENT" prefix), or caused an indexing error (with an "ERROR"
prefix). If no paths are given on the command line, the command will read
them, one per line, from stdin.
.SH SEE ALSO .SH SEE ALSO
.PP .PP

View File

@ -19,6 +19,8 @@
*/ */
#include "autoconfig.h" #include "autoconfig.h"
#include "checkindexed.h"
#include <stdio.h> #include <stdio.h>
#include <iostream> #include <iostream>

View File

@ -47,6 +47,7 @@
#include "rclinit.h" #include "rclinit.h"
#include "extrameta.h" #include "extrameta.h"
#include "utf8fn.h" #include "utf8fn.h"
#include "idxdiags.h"
#if defined(HAVE_POSIX_FADVISE) #if defined(HAVE_POSIX_FADVISE)
#include <unistd.h> #include <unistd.h>
#include <fcntl.h> #include <fcntl.h>
@ -397,8 +398,7 @@ bool FsIndexer::indexFiles(list<string>& files, int flags)
continue; continue;
} }
} }
if (processone(*it, &stb, FsTreeWalker::FtwRegular) != if (processone(*it, &stb, FsTreeWalker::FtwRegular) != FsTreeWalker::FtwOk) {
FsTreeWalker::FtwOk) {
LOGERR("FsIndexer::indexFiles: processone failed\n"); LOGERR("FsIndexer::indexFiles: processone failed\n");
goto out; goto out;
} }
@ -560,9 +560,8 @@ void *FsIndexerInternfileWorker(void * fsp)
return (void*)1; return (void*)1;
} }
LOGDEB0("FsIndexerInternfileWorker: task fn " << tsk->fn << "\n"); LOGDEB0("FsIndexerInternfileWorker: task fn " << tsk->fn << "\n");
if (fip->processonefile(&myconf, tsk->fn, &tsk->statbuf, if (fip->processonefile(
tsk->localfields) != &myconf, tsk->fn, &tsk->statbuf, tsk->localfields) != FsTreeWalker::FtwOk) {
FsTreeWalker::FtwOk) {
LOGERR("FsIndexerInternfileWorker: processone failed\n"); LOGERR("FsIndexerInternfileWorker: processone failed\n");
tqp->workerExit(); tqp->workerExit();
return (void*)0; return (void*)0;
@ -584,9 +583,8 @@ void *FsIndexerInternfileWorker(void * fsp)
/// Accent and majuscule handling are performed by the db module when doing /// Accent and majuscule handling are performed by the db module when doing
/// the actual indexing work. The Rcl::Doc created by internfile() /// the actual indexing work. The Rcl::Doc created by internfile()
/// mostly contains pretty raw utf8 data. /// mostly contains pretty raw utf8 data.
FsTreeWalker::Status FsTreeWalker::Status FsIndexer::processone(
FsIndexer::processone(const std::string &fn, const struct PathStat *stp, const std::string &fn, const struct PathStat *stp, FsTreeWalker::CbFlag flg)
FsTreeWalker::CbFlag flg)
{ {
if (m_updater) { if (m_updater) {
#ifdef IDX_THREADS #ifdef IDX_THREADS
@ -610,7 +608,10 @@ FsIndexer::processone(const std::string &fn, const struct PathStat *stp,
if (flg == FsTreeWalker::FtwDirReturn) if (flg == FsTreeWalker::FtwDirReturn)
return FsTreeWalker::FtwOk; return FsTreeWalker::FtwOk;
} }
if (flg == FsTreeWalker::FtwSkipped) {
IdxDiags::theDiags().record(IdxDiags::Skipped, fn);
return FsTreeWalker::FtwOk;
}
#ifdef IDX_THREADS #ifdef IDX_THREADS
if (m_haveInternQ) { if (m_haveInternQ) {
InternfileTask *tp = new InternfileTask(fn, stp, m_localfields); InternfileTask *tp = new InternfileTask(fn, stp, m_localfields);
@ -644,10 +645,9 @@ bool FsIndexer::launchAddOrUpdate(const string& udi, const string& parent_udi,
return m_db->addOrUpdate(udi, parent_udi, doc); return m_db->addOrUpdate(udi, parent_udi, doc);
} }
FsTreeWalker::Status FsTreeWalker::Status FsIndexer::processonefile(
FsIndexer::processonefile(RclConfig *config, RclConfig *config, const std::string &fn, const struct PathStat *stp,
const std::string &fn, const struct PathStat *stp, const map<string, string>& localfields)
const map<string, string>& localfields)
{ {
//////////////////// ////////////////////
// Check db up to date ? Doing this before file type // Check db up to date ? Doing this before file type
@ -693,7 +693,7 @@ FsIndexer::processonefile(RclConfig *config,
// If noretryfailed is set, check for a file which previously // If noretryfailed is set, check for a file which previously
// failed to index, and avoid re-processing it // failed to index, and avoid re-processing it
if (needupdate && m_noretryfailed && existingDoc && if (needupdate && m_noretryfailed && existingDoc &&
!oldsig.empty() && *oldsig.rbegin() == '+') { !oldsig.empty() && oldsig.back() == '+') {
// Check that the sigs are the same except for the '+'. If the file // Check that the sigs are the same except for the '+'. If the file
// actually changed, we always retry (maybe it was fixed) // actually changed, we always retry (maybe it was fixed)
string nold = oldsig.substr(0, oldsig.size()-1); string nold = oldsig.substr(0, oldsig.size()-1);
@ -720,8 +720,7 @@ FsIndexer::processonefile(RclConfig *config,
return FsTreeWalker::FtwOk; return FsTreeWalker::FtwOk;
} }
LOGDEB0("processone: processing: [" << LOGDEB0("processone: processing: [" << displayableBytes(stp->pst_size) << "] " << fn << "\n");
displayableBytes(stp->pst_size) << "] " << fn << "\n");
// Note that we used to do the full path here, but I ended up // Note that we used to do the full path here, but I ended up
// believing that it made more sense to use only the file name // believing that it made more sense to use only the file name
@ -813,6 +812,7 @@ FsIndexer::processonefile(RclConfig *config,
// myriads of such files, the ext script is executed for them // myriads of such files, the ext script is executed for them
// and fails every time) // and fails every time)
if (fis == FileInterner::FIError) { if (fis == FileInterner::FIError) {
IdxDiags::theDiags().record(IdxDiags::Error, fn, doc.ipath);
doc.sig += cstr_plus; doc.sig += cstr_plus;
} }
@ -822,8 +822,7 @@ FsIndexer::processonefile(RclConfig *config,
// Add document to database. If there is an ipath, add it // Add document to database. If there is an ipath, add it
// as a child of the file document. // as a child of the file document.
if (!launchAddOrUpdate(udi, doc.ipath.empty() ? if (!launchAddOrUpdate(udi, doc.ipath.empty() ? cstr_null : parent_udi, doc)) {
cstr_null : parent_udi, doc)) {
return FsTreeWalker::FtwError; return FsTreeWalker::FtwError;
} }

98
src/index/idxdiags.cpp Normal file
View File

@ -0,0 +1,98 @@
/* Copyright (C) 2021 J.F.Dockes
*
* License: GPL 2.1
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "autoconfig.h"
#include <stdio.h>
#include <mutex>
#include "idxdiags.h"
// Serializes access to the diagnostics output stream: taken by IdxDiags::flush() and around
// the write in IdxDiags::record().
static std::mutex diagmutex;
// Private implementation data for IdxDiags: owns the diagnostics output stream.
class IdxDiags::Internal {
public:
    Internal() = default;
    // This object owns fp and closes it on destruction: forbid copies so that the same
    // stream can never be closed twice.
    Internal(const Internal&) = delete;
    Internal& operator=(const Internal&) = delete;
    ~Internal() {
        if (fp) {
            fclose(fp);
        }
    }
    // Output stream. Stays null until IdxDiags::init() successfully opens the file.
    FILE *fp{nullptr};
};
// Build the object with a fresh private data block. The output stream inside it starts out
// null, so nothing is written until init() is called.
IdxDiags::IdxDiags()
    : m(new Internal)
{
}
// Release the private data block; its own destructor closes the output stream if one is open.
IdxDiags::~IdxDiags() { delete m; }
// Push any buffered diagnostics out to the file.
// Returns true if there is no open stream (nothing to do) or if fflush() succeeds,
// false if fflush() reports an error.
bool IdxDiags::flush()
{
    std::lock_guard<std::mutex> guard(diagmutex);
    if (nullptr == m || nullptr == m->fp) {
        return true;
    }
    return fflush(m->fp) == 0;
}
// Return the single IdxDiags instance, creating it on first use.
// The instance lives in a function-local static pointer: the language guarantees that its
// initialization runs exactly once even if the first calls come from several threads at the
// same time. The object is intentionally never deleted, so it remains usable for as long as
// the process runs (including during exit-time cleanup).
IdxDiags& IdxDiags::theDiags()
{
    static IdxDiags *theInstance = new IdxDiags;
    return *theInstance;
}
// Open the diagnostics output file.
// outpath is opened in "w" mode, so an existing file is truncated.
// Returns true if the file could be opened, false otherwise (the object then stays in its
// "no output" state and record()/flush() do nothing).
// No locking is done here.
bool IdxDiags::init(const std::string& outpath)
{
    // If init() is called again, close the previous stream instead of leaking it.
    if (m->fp) {
        fclose(m->fp);
        m->fp = nullptr;
    }
    m->fp = fopen(outpath.c_str(), "w");
    return m->fp != nullptr;
}
// Append one diagnostic line to the output file, formatted as "<kind> <path> | <detail>".
// diag:   the reason, written as its symbolic name ("Unknown" for a value not listed below).
// path:   the path of the file concerned.
// detail: optional extra information.
// Returns true if nothing had to be written (no open output file, or both path and detail
// empty) or if the line was written; false if the write failed.
bool IdxDiags::record(DiagKind diag, const std::string& path, const std::string& detail)
{
    // No output file or nothing to report: silently succeed.
    if (nullptr == m || nullptr == m->fp || (path.empty() && detail.empty())) {
        return true;
    }
    const char *skind = "Unknown";
    switch (diag) {
    case Ok: skind = "Ok"; break;
    case Skipped: skind = "Skipped"; break;
    case NoContentSuffix: skind = "NoContentSuffix"; break;
    case MissingHelper: skind = "MissingHelper"; break;
    case Error: skind = "Error"; break;
    case NoHandler: skind = "NoHandler"; break;
    case ExcludedMime: skind = "ExcludedMime"; break;
    case NotIncludedMime: skind = "NotIncludedMime"; break;
    }
    // Only the actual write needs to be serialized.
    std::unique_lock<std::mutex> lock(diagmutex);
    // fprintf() returns a negative value on output error: report it instead of always
    // claiming success.
    return fprintf(m->fp, "%s %s | %s\n", skind, path.c_str(), detail.c_str()) >= 0;
}

50
src/index/idxdiags.h Normal file
View File

@ -0,0 +1,50 @@
/* Copyright (C) 2021 J.F.Dockes
*
* License: GPL 2.1
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifndef _IDXDIAGS_H_INCLUDED_
#define _IDXDIAGS_H_INCLUDED_
#include <string>
// Records the reasons why files were not indexed, one line per event, in a diagnostics file.
// There is a single instance, obtained through theDiags().
class IdxDiags {
public:
    // Reason recorded for a file.
    enum DiagKind {
        Ok,
        Skipped,          // The path matches an element of skippedPaths or skippedNames.
        NoContentSuffix,  // The file name suffix is found in the noContentSuffixes list.
        MissingHelper,    // A helper program is missing.
        Error,            // General error (see the log).
        NoHandler,        // No handler is defined for the MIME type.
        ExcludedMime,     // The MIME type is part of the excludedmimetypes list.
        NotIncludedMime   // The onlymimetypes list is not empty and the MIME type is not in it.
    };

    // Single instance: copying makes no sense and would share the private data pointer.
    IdxDiags(const IdxDiags&) = delete;
    IdxDiags& operator=(const IdxDiags&) = delete;

    // Retrieve a reference to the single instance.
    static IdxDiags& theDiags();

    // Initialize, setting the output file path. outpath will be truncated.
    // No locking: this must be called from the main thread, before going multithread.
    // If init is never called, further calls to record() or flush() will be noops.
    // Returns false if the file could not be opened.
    bool init(const std::string& outpath);

    // Record a reason for a document not to be indexed. detail is an optional additional
    // piece of information (for example the MIME type).
    bool record(DiagKind diag, const std::string& path, const std::string& detail = std::string());

    // Flush buffered output to the file. Returns false if flushing fails.
    bool flush();

    class Internal;
private:
    Internal *m;
    IdxDiags();
    ~IdxDiags();
};
#endif /* _IDXDIAGS_H_INCLUDED_ */

View File

@ -66,20 +66,16 @@ static RclMonitor *makeMonitor();
*/ */
class WalkCB : public FsTreeWalkerCB { class WalkCB : public FsTreeWalkerCB {
public: public:
WalkCB(RclConfig *conf, RclMonitor *mon, RclMonEventQueue *queue, WalkCB(RclConfig *conf, RclMonitor *mon, RclMonEventQueue *queue, FsTreeWalker& walker)
FsTreeWalker& walker) : m_config(conf), m_mon(mon), m_queue(queue), m_walker(walker) {}
: m_config(conf), m_mon(mon), m_queue(queue), m_walker(walker)
{}
virtual ~WalkCB() {} virtual ~WalkCB() {}
virtual FsTreeWalker::Status virtual FsTreeWalker::Status processone(
processone(const string &fn, const struct PathStat *st, const string &fn, const struct PathStat *st, FsTreeWalker::CbFlag flg) {
FsTreeWalker::CbFlag flg) {
MONDEB("rclMonRcvRun: processone " << fn << " m_mon " << m_mon << MONDEB("rclMonRcvRun: processone " << fn << " m_mon " << m_mon <<
" m_mon->ok " << (m_mon ? m_mon->ok() : false) << std::endl); " m_mon->ok " << (m_mon ? m_mon->ok() : false) << std::endl);
if (flg == FsTreeWalker::FtwDirEnter || if (flg == FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwDirReturn) {
flg == FsTreeWalker::FtwDirReturn) {
m_config->setKeyDir(fn); m_config->setKeyDir(fn);
// Set up skipped patterns for this subtree. // Set up skipped patterns for this subtree.
m_walker.setSkippedNames(m_config->getSkippedNames()); m_walker.setSkippedNames(m_config->getSkippedNames());
@ -106,8 +102,7 @@ public:
m_mon->saved_errno != ENOENT) m_mon->saved_errno != ENOENT)
return FsTreeWalker::FtwError; return FsTreeWalker::FtwError;
} }
} else if (!m_mon->generatesExist() && } else if (!m_mon->generatesExist() && flg == FsTreeWalker::FtwRegular) {
flg == FsTreeWalker::FtwRegular) {
// Have to synthetize events for regular files existence // Have to synthetize events for regular files existence
// at startup because the monitor does not do it // at startup because the monitor does not do it
// Note 2011-09-29: no sure this is actually needed. We just ran // Note 2011-09-29: no sure this is actually needed. We just ran

View File

@ -62,6 +62,7 @@ using namespace std;
#include "checkretryfailed.h" #include "checkretryfailed.h"
#include "idxstatus.h" #include "idxstatus.h"
#include "circache.h" #include "circache.h"
#include "idxdiags.h"
// Command line options // Command line options
static int op_flags; static int op_flags;
@ -93,11 +94,13 @@ static int op_flags;
#define OPTVAL_WEBCACHE_COMPACT 1000 #define OPTVAL_WEBCACHE_COMPACT 1000
#define OPTVAL_WEBCACHE_BURST 1001 #define OPTVAL_WEBCACHE_BURST 1001
#define OPTVAL_DIAGS_NOTINDEXED 1002 #define OPTVAL_DIAGS_NOTINDEXED 1002
#define OPTVAL_DIAGS_DIAGFILE 1003
static struct option long_options[] = { static struct option long_options[] = {
{"webcache-compact", 0, 0, OPTVAL_WEBCACHE_COMPACT}, {"webcache-compact", 0, 0, OPTVAL_WEBCACHE_COMPACT},
{"webcache-burst", required_argument, 0, OPTVAL_WEBCACHE_BURST}, {"webcache-burst", required_argument, 0, OPTVAL_WEBCACHE_BURST},
{"notindexed", 0, 0, OPTVAL_DIAGS_NOTINDEXED}, {"notindexed", 0, 0, OPTVAL_DIAGS_NOTINDEXED},
{"diagfile", required_argument, 0, OPTVAL_DIAGS_DIAGFILE},
{0, 0, 0, 0} {0, 0, 0, 0}
}; };
@ -110,6 +113,7 @@ static ConfIndexer *confindexer;
static void cleanup() static void cleanup()
{ {
deleteZ(confindexer); deleteZ(confindexer);
IdxDiags::theDiags().flush();
recoll_exitready(); recoll_exitready();
} }
@ -274,20 +278,15 @@ static void setMyPriority(const RclConfig *config)
class MakeListWalkerCB : public FsTreeWalkerCB { class MakeListWalkerCB : public FsTreeWalkerCB {
public: public:
MakeListWalkerCB(list<string>& files, const vector<string>& selpats) MakeListWalkerCB(list<string>& files, const vector<string>& selpats)
: m_files(files), m_pats(selpats) : m_files(files), m_pats(selpats) {}
{ virtual FsTreeWalker::Status processone(
} const string& fn, const struct PathStat *, FsTreeWalker::CbFlag flg) {
virtual FsTreeWalker::Status
processone(const string& fn, const struct PathStat *,
FsTreeWalker::CbFlag flg) {
if (flg== FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwRegular){ if (flg== FsTreeWalker::FtwDirEnter || flg == FsTreeWalker::FtwRegular){
if (m_pats.empty()) { if (m_pats.empty()) {
cerr << "Selecting " << fn << endl;
m_files.push_back(fn); m_files.push_back(fn);
} else { } else {
for (vector<string>::const_iterator it = m_pats.begin(); for (const auto& pat : m_pats) {
it != m_pats.end(); it++) { if (fnmatch(pat.c_str(), fn.c_str(), 0) == 0) {
if (fnmatch(it->c_str(), fn.c_str(), 0) == 0) {
m_files.push_back(fn); m_files.push_back(fn);
break; break;
} }
@ -451,6 +450,8 @@ static const char usage [] =
" -Z : in place reset: consider all documents as changed. Can also\n" " -Z : in place reset: consider all documents as changed. Can also\n"
" be combined with -i or -r but not -m\n" " be combined with -i or -r but not -m\n"
" -k : retry files on which we previously failed\n" " -k : retry files on which we previously failed\n"
" --diagfile <outputpath> : list skipped or otherwise not indexed documents to <outputpath>\n"
" <outputpath> will be truncated\n"
#ifdef RCL_MONITOR #ifdef RCL_MONITOR
"recollindex -m [-w <secs>] -x [-D] [-C]\n" "recollindex -m [-w <secs>] -x [-D] [-C]\n"
" Perform real time indexing. Don't become a daemon if -D is set.\n" " Perform real time indexing. Don't become a daemon if -D is set.\n"
@ -636,6 +637,7 @@ int main(int argc, char *argv[])
bool diags_notindexed{false}; bool diags_notindexed{false};
std::string burstdir; std::string burstdir;
std::string diagfile;
while ((ret = getopt_long(argc, (char *const*)&args[0], "c:CDdEefhikKlmnPp:rR:sS:w:xZz", while ((ret = getopt_long(argc, (char *const*)&args[0], "c:CDdEefhikKlmnPp:rR:sS:w:xZz",
long_options, NULL)) != -1) { long_options, NULL)) != -1) {
switch (ret) { switch (ret) {
@ -676,7 +678,7 @@ int main(int argc, char *argv[])
case OPTVAL_WEBCACHE_COMPACT: webcache_compact = true; break; case OPTVAL_WEBCACHE_COMPACT: webcache_compact = true; break;
case OPTVAL_WEBCACHE_BURST: burstdir = optarg; webcache_burst = true;break; case OPTVAL_WEBCACHE_BURST: burstdir = optarg; webcache_burst = true;break;
case OPTVAL_DIAGS_NOTINDEXED: diags_notindexed = true;break; case OPTVAL_DIAGS_NOTINDEXED: diags_notindexed = true;break;
case OPTVAL_DIAGS_DIAGFILE: diagfile = optarg;break;
default: Usage(); break; default: Usage(); break;
} }
} }
@ -790,6 +792,12 @@ int main(int argc, char *argv[])
} }
} }
if (!diagfile.empty()) {
if (!IdxDiags::theDiags().init(diagfile)) {
std::cerr << "Could not initialize diags file " << diagfile << "\n";
LOGERR("recollindex: Could not initialize diags file " << diagfile << "\n");
}
}
bool rezero((op_flags & OPT_z) != 0); bool rezero((op_flags & OPT_z) != 0);
bool inPlaceReset((op_flags & OPT_Z) != 0); bool inPlaceReset((op_flags & OPT_Z) != 0);

View File

@ -411,8 +411,7 @@ WebQueueIndexer::processone(
if (flg != FsTreeWalker::FtwRegular) if (flg != FsTreeWalker::FtwRegular)
return FsTreeWalker::FtwOk; return FsTreeWalker::FtwOk;
string dotpath = path_cat(path_getfather(path), string dotpath = path_cat(path_getfather(path), string(DOTFILEPREFIX) + path_getsimple(path));
string(DOTFILEPREFIX) + path_getsimple(path));
LOGDEB("WebQueueIndexer: prc1: [" << path << "]\n"); LOGDEB("WebQueueIndexer: prc1: [" << path << "]\n");
WebQueueDotFile dotfile(m_config, dotpath); WebQueueDotFile dotfile(m_config, dotpath);

View File

@ -240,17 +240,15 @@ void FileInterner::init(const string &f, const struct PathStat *stp,
m_mimetype = l_mime; m_mimetype = l_mime;
// Look for appropriate handler (might still return empty) // Look for appropriate handler (might still return empty)
RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview); RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview, f);
if (!df || df->is_unknown()) { if (!df || df->is_unknown()) {
// No real handler for this type, for now :( // No real handler for this type, for now :(
LOGDEB("FileInterner:: unprocessed mime: [" << l_mime << "] [" << f << LOGDEB("FileInterner:: unprocessed mime: [" << l_mime << "] [" << f << "]\n");
"]\n");
if (!df) if (!df)
return; return;
} }
df->set_property(Dijon::Filter::OPERATING_MODE, df->set_property(Dijon::Filter::OPERATING_MODE, m_forPreview ? "view" : "index");
m_forPreview ? "view" : "index");
df->set_property(Dijon::Filter::DJF_UDI, udi); df->set_property(Dijon::Filter::DJF_UDI, udi);
df->set_docsize(docsize); df->set_docsize(docsize);
@ -271,8 +269,7 @@ FileInterner::FileInterner(const string &data, RclConfig *cnf,
init(data, cnf, flags, imime); init(data, cnf, flags, imime);
} }
void FileInterner::init(const string &data, RclConfig *, void FileInterner::init(const string &data, RclConfig *, int, const string& imime)
int, const string& imime)
{ {
if (imime.empty()) { if (imime.empty()) {
LOGERR("FileInterner: inmemory constructor needs input mime type\n"); LOGERR("FileInterner: inmemory constructor needs input mime type\n");
@ -281,7 +278,7 @@ void FileInterner::init(const string &data, RclConfig *,
m_mimetype = imime; m_mimetype = imime;
// Look for appropriate handler (might still return empty) // Look for appropriate handler (might still return empty)
RecollFilter *df = getMimeHandler(m_mimetype, m_cfg, !m_forPreview); RecollFilter *df = getMimeHandler(m_mimetype, m_cfg, !m_forPreview, m_fn);
if (!df) { if (!df) {
// No handler for this type, for now :( if indexallfilenames // No handler for this type, for now :( if indexallfilenames
@ -289,8 +286,7 @@ void FileInterner::init(const string &data, RclConfig *,
LOGDEB("FileInterner:: unprocessed mime [" << m_mimetype << "]\n"); LOGDEB("FileInterner:: unprocessed mime [" << m_mimetype << "]\n");
return; return;
} }
df->set_property(Dijon::Filter::OPERATING_MODE, df->set_property(Dijon::Filter::OPERATING_MODE, m_forPreview ? "view" : "index");
m_forPreview ? "view" : "index");
df->set_docsize(data.length()); df->set_docsize(data.length());
if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
@ -741,12 +737,11 @@ int FileInterner::addHandler()
getKeyValue(docdata, cstr_dj_keyipath, ipathel); getKeyValue(docdata, cstr_dj_keyipath, ipathel);
bool dofilter = !m_forPreview && bool dofilter = !m_forPreview &&
(mimetype.compare(cstr_texthtml) || !ipathel.empty()); (mimetype.compare(cstr_texthtml) || !ipathel.empty());
RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter); RecollFilter *newflt = getMimeHandler(mimetype, m_cfg, dofilter, m_fn);
if (!newflt) { if (!newflt) {
// If we can't find a handler, this doc can't be handled // If we can't find a handler, this doc can't be handled
// but there can be other ones so we go on // but there can be other ones so we go on
LOGINFO("FileInterner::addHandler: no filter for [" << mimetype << LOGINFO("FileInterner::addHandler: no filter for [" << mimetype << "]\n");
"]\n");
return ADD_CONTINUE; return ADD_CONTINUE;
} }
newflt->set_property(Dijon::Filter::OPERATING_MODE, newflt->set_property(Dijon::Filter::OPERATING_MODE,

View File

@ -29,6 +29,7 @@
#include "smallut.h" #include "smallut.h"
#include "md5ut.h" #include "md5ut.h"
#include "rclconfig.h" #include "rclconfig.h"
#include "idxdiags.h"
using namespace std; using namespace std;
@ -186,6 +187,7 @@ bool MimeHandlerExec::next_document()
missingHelper = true; missingHelper = true;
m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd; m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd;
whatHelper = m_reason; whatHelper = m_reason;
IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn);
} else if (output.find("RECFILTERROR") == 0) { } else if (output.find("RECFILTERROR") == 0) {
// If the output string begins with RECFILTERROR, then it's // If the output string begins with RECFILTERROR, then it's
// interpretable error information out from a recoll script // interpretable error information out from a recoll script
@ -193,6 +195,7 @@ bool MimeHandlerExec::next_document()
std::string::size_type pos; std::string::size_type pos;
if ((pos = output.find("RECFILTERROR ")) == 0) { if ((pos = output.find("RECFILTERROR ")) == 0) {
if (output.find("HELPERNOTFOUND") != string::npos) { if (output.find("HELPERNOTFOUND") != string::npos) {
IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn);
missingHelper = true; missingHelper = true;
whatHelper = output.substr(pos); whatHelper = output.substr(pos);
} }

View File

@ -33,6 +33,7 @@
#include "mimetype.h" #include "mimetype.h"
#include "idfile.h" #include "idfile.h"
#include "rclutil.h" #include "rclutil.h"
#include "idxdiags.h"
using namespace std; using namespace std;
@ -72,6 +73,7 @@ bool MimeHandlerExecMultiple::startCmd()
vector<string>myparams(params.begin() + 1, params.end()); vector<string>myparams(params.begin() + 1, params.end());
if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) { if (m_cmd.startExec(cmd, myparams, 1, 1) < 0) {
IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn);
m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd; m_reason = string("RECFILTERROR HELPERNOTFOUND ") + cmd;
missingHelper = true; missingHelper = true;
whatHelper = cmd; whatHelper = cmd;
@ -113,6 +115,7 @@ bool MimeHandlerExecMultiple::readDataElement(string& name, string &data)
if ((pos = ibuf.find("RECFILTERROR ")) == 0) { if ((pos = ibuf.find("RECFILTERROR ")) == 0) {
m_reason = ibuf; m_reason = ibuf;
if (ibuf.find("HELPERNOTFOUND") != string::npos) { if (ibuf.find("HELPERNOTFOUND") != string::npos) {
IdxDiags::theDiags().record(IdxDiags::MissingHelper, m_fn);
missingHelper = true; missingHelper = true;
whatHelper = ibuf.substr(pos); whatHelper = ibuf.substr(pos);
} }

View File

@ -257,7 +257,7 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
/* Get handler/filter object for given mime type: */ /* Get handler/filter object for given mime type: */
RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg, RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
bool filtertypes) bool filtertypes, const std::string& fn)
{ {
LOGDEB("getMimeHandler: mtype [" << mtype << "] filtertypes " << LOGDEB("getMimeHandler: mtype [" << mtype << "] filtertypes " <<
filtertypes << "\n"); filtertypes << "\n");
@ -270,7 +270,7 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
// indexedmimetypes but an html handler could still be in the // indexedmimetypes but an html handler could still be in the
// cache because it was needed by some other interning stack). // cache because it was needed by some other interning stack).
string hs; string hs;
hs = cfg->getMimeHandlerDef(mtype, filtertypes); hs = cfg->getMimeHandlerDef(mtype, filtertypes, fn);
string id; string id;
if (!hs.empty()) { if (!hs.empty()) {

View File

@ -169,7 +169,7 @@ protected:
* indexedmimetypes (if this is set at all). * indexedmimetypes (if this is set at all).
*/ */
extern RecollFilter *getMimeHandler(const std::string &mtyp, RclConfig *cfg, extern RecollFilter *getMimeHandler(const std::string &mtyp, RclConfig *cfg,
bool filtertypes); bool filtertypes, const std::string& fn = std::string());
/// Free up filter for reuse (you can also delete it) /// Free up filter for reuse (you can also delete it)
extern void returnMimeHandler(RecollFilter *); extern void returnMimeHandler(RecollFilter *);

View File

@ -2401,7 +2401,7 @@ bool Db::dbStats(DbStats& res, bool listfailed)
try { try {
Xapian::Document doc = xdb.get_document(docid); Xapian::Document doc = xdb.get_document(docid);
string sig = doc.get_value(VALUE_SIG); string sig = doc.get_value(VALUE_SIG);
if (sig.empty() || sig[sig.size()-1] != '+') { if (sig.empty() || sig.back() != '+') {
continue; continue;
} }
string data = doc.get_data(); string data = doc.get_data();

View File

@ -411,9 +411,7 @@ FsTreeWalker::Status FsTreeWalker::iwalk(const string &top,
// Skipped file names match ? // Skipped file names match ?
if (!data->skippedNames.empty()) { if (!data->skippedNames.empty()) {
if (inSkippedNames(dname)) { if (inSkippedNames(dname)) {
if (data->options & FtwOnlySkipped) { cb.processone(path_cat(top, dname), nullptr, FtwSkipped);
cb.processone(path_cat(top, dname), nullptr, FtwSkipped);
}
continue; continue;
} }
} }
@ -428,9 +426,7 @@ FsTreeWalker::Status FsTreeWalker::iwalk(const string &top,
// this was broken by 1.13.00 and the systematic use of // this was broken by 1.13.00 and the systematic use of
// FNM_LEADING_DIR // FNM_LEADING_DIR
if (inSkippedPaths(fn, false)) { if (inSkippedPaths(fn, false)) {
if (data->options & FtwOnlySkipped) { cb.processone(fn, nullptr, FtwSkipped);
cb.processone(fn, nullptr, FtwSkipped);
}
continue; continue;
} }
} }