// recoll/src/index/indexer.cpp
// 2020-05-30 15:54:49 +02:00
//
// 436 lines
// 13 KiB
// C++

/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "autoconfig.h"
#include <stdio.h>
#include <errno.h>
#include <algorithm>
#include "cstr.h"
#include "log.h"
#include "recollindex.h"
#include "indexer.h"
#include "fsindexer.h"
#ifndef DISABLE_WEB_INDEXER
#include "webqueue.h"
#endif
#include "mimehandler.h"
#include "pathut.h"
#include "idxstatus.h"
#include "execmd.h"
#ifdef RCL_USE_ASPELL
#include "rclaspell.h"
#endif
using std::list;
using std::string;
using std::vector;
// Global stop request flag. This is checked in a number of places in the
// indexing routines. In this file, ConfIndexer::index() reads it to tell an
// interrupted run from a failed one when recording the failure reason.
int stopindexing;
// This would more logically live in recollindex.cpp, but then librecoll would
// have an undefined symbol.
// Store for the reason strings recorded through addIdxReason(), keyed by the
// name of the reporting component.
ConfSimple idxreasons;
// Record an indexing status/failure reason in the global idxreasons store.
//
// @param who    key under which the reason is stored (the reporting
//               component, e.g. "indexer", "stemming", "aspell").
// @param reason free text. It is passed through neutchars() with the
//               "\r\n" character set before being stored, so that the
//               value holds no carriage return or line feed
//               (NOTE(review): exact substitution is up to neutchars()).
//
// A failure to store the value is only reported on stderr.
void addIdxReason(string who, string reason)
{
    const string oneline = neutchars(reason, "\r\n");
    if (idxreasons.set(who, oneline)) {
        return;
    }
    std::cerr << "addIdxReason: confsimple set failed\n";
}
#ifndef DISABLE_WEB_INDEXER
// Run the recoll-we-move-files.py helper script, but only if the
// modification time of the downloads directory differs from the one seen
// during the previous call.
//
// The downloads directory is read from the "webdownloadsdir" configuration
// parameter and defaults to ~/Downloads. The directory, the command path and
// the last seen mtime are kept in function-local statics, so they persist
// across calls.
//
// @param config configuration used to read the parameter and to locate the
//               command through findFilter().
// @return false if the command could not be located or if its execution
//         status was not 0, true otherwise (including when the script did
//         not need to run).
bool runWebFilesMoverScript(RclConfig *config)
{
    static string downloadsdir;
    if (downloadsdir.empty() &&
        !config->getConfParam("webdownloadsdir", downloadsdir)) {
        downloadsdir = path_tildexpand("~/Downloads");
    }

    // On Windows the command is the python interpreter and the script is
    // its first argument. Elsewhere the script is executed directly.
    static string cmdpath;
    vector<string> args;
#ifdef _WIN32
    const static string cmdnm{"python"};
    args.push_back(config->findFilter("recoll-we-move-files.py"));
#else
    const static string cmdnm{"recoll-we-move-files.py"};
#endif
    if (cmdpath.empty()) {
        cmdpath = config->findFilter(cmdnm);
    }
    if (cmdpath.empty()) {
        LOGERR("runWFMoverScript: recoll-we-move-files.py not found\n");
        return false;
    }

    // Arrange to not actually run the script if the directory did not
    // change. If the stat fails, presumably because Downloads does not
    // exist or is not accessible, both dirmtime and the freshly read value
    // stay at 0 and the script is never executed, which is the right thing.
    static time_t dirmtime;
    struct PathStat st;
    time_t currentmtime = 0;
    if (path_fileprops(downloadsdir.c_str(), &st) == 0) {
        currentmtime = st.pst_mtime;
    }
    if (dirmtime == currentmtime) {
        return true;
    }

    // The script is going to change the directory, so updating dirmtime
    // before it runs means that it will be executed one time too many (that
    // run will find nothing to do). We can't set the mtime after the run
    // instead, because files could be created while the script is running.
    dirmtime = currentmtime;
    ExecCmd cmd;
    return cmd.doexec(cmdpath, args) == 0;
}
#endif
// Build an indexer front-end for the given configuration.
//
// @param cnf     configuration; kept as a pointer and also used to
//                construct the database member.
// @param updfunc status updater; stored as is. The methods of this class
//                test it for null before each use.
//
// The filesystem and web queue indexers are not created here: both
// pointers start out null and are allocated by the indexing methods.
// Web queue processing is off unless the "processwebqueue" parameter
// read below turns it on.
ConfIndexer::ConfIndexer(RclConfig *cnf, DbIxStatusUpdater *updfunc)
    : m_config(cnf),
      m_db(cnf),
      m_fsindexer(nullptr),
      m_doweb(false),
      m_webindexer(nullptr),
      m_updater(updfunc)
{
    m_config->getConfParam("processwebqueue", &m_doweb);
}
// Destructor: disposes of the indexer objects which the indexing methods of
// this class allocated with new.
ConfIndexer::~ConfIndexer()
{
// Filesystem indexer, if any was created.
deleteZ(m_fsindexer);
#ifndef DISABLE_WEB_INDEXER
// Web queue indexer: only handled when web indexing is compiled in.
deleteZ(m_webindexer);
#endif
}
// Determine if this is likely the first time that the user runs
// indexing. We don't look at the xapiandb as this may have been
// explicitly removed for valid reasons, but at the indexing status
// file, which should be unexistant-or-empty only before any indexing
// has ever run
bool ConfIndexer::runFirstIndexing()
{
// Indexing status file existing and not empty ?
if (path_filesize(m_config->getIdxStatusFile()) > 0) {
LOGDEB0("ConfIndexer::runFirstIndexing: no: status file not empty\n");
return false;
}
// And only do this if the user has kept the default topdirs (~).
vector<string> tdl = m_config->getTopdirs();
if (tdl.size() != 1 || tdl[0].compare(path_canon(path_tildexpand("~")))) {
LOGDEB0("ConfIndexer::runFirstIndexing: no: not home only\n");
return false;
}
return true;
}
// Preliminary filesystem pass used for a first indexing run: replace the
// filesystem indexer with a fresh one and run it with the IxFQuickShallow
// flag, while the database flush threshold is temporarily set to 2. The
// database is then flushed and the previous threshold restored.
//
// The status of the indexing pass itself is not checked.
//
// @return false only if the filesystem indexer could not be created,
//         true otherwise.
bool ConfIndexer::firstFsIndexingSequence()
{
    LOGDEB("ConfIndexer::firstFsIndexingSequence\n");

    deleteZ(m_fsindexer);
    m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
    if (nullptr == m_fsindexer) {
        return false;
    }

    // Save the configured threshold so that it can be put back afterwards.
    const int savedFlushMb = m_db.getFlushMb();
    m_db.setFlushMb(2);
    m_fsindexer->index(IxFQuickShallow);
    m_db.doFlush();
    m_db.setFlushMb(savedFlushMb);

    return true;
}
bool ConfIndexer::index(bool resetbefore, ixType typestorun, int flags)
{
Rcl::Db::OpenMode mode = resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
if (!m_db.open(mode)) {
LOGERR("ConfIndexer: error opening database " << m_config->getDbDir() <<
" : " << m_db.getReason() << "\n");
addIdxReason("indexer", m_db.getReason());
return false;
}
m_config->setKeyDir(cstr_null);
if (typestorun & IxTFs) {
if (runFirstIndexing()) {
firstFsIndexingSequence();
}
deleteZ(m_fsindexer);
m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
if (!m_fsindexer || !m_fsindexer->index(flags)) {
if (stopindexing) {
addIdxReason("indexer", "Indexing was interrupted.");
} else {
addIdxReason("indexer", "Index creation failed. See log.");
}
m_db.close();
return false;
}
}
#ifndef DISABLE_WEB_INDEXER
if (m_doweb && (typestorun & IxTWebQueue)) {
runWebFilesMoverScript(m_config);
deleteZ(m_webindexer);
m_webindexer = new WebQueueIndexer(m_config, &m_db, m_updater);
if (!m_webindexer || !m_webindexer->index()) {
m_db.close();
addIdxReason("indexer", "Web index creation failed. See log");
return false;
}
}
#endif
if (typestorun == IxTAll) {
// Get rid of all database entries that don't exist in the
// filesystem anymore. Only if all *configured* indexers ran.
if (m_updater && !m_updater->update(DbIxStatus::DBIXS_PURGE, "")) {
m_db.close();
addIdxReason("indexer", "Index purge failed. See log");
return false;
}
m_db.purge();
}
// The close would be done in our destructor, but we want status
// here. Makes no sense to check for cancel, we'll have to close
// anyway
if (m_updater)
m_updater->update(DbIxStatus::DBIXS_CLOSING, string());
if (!m_db.close()) {
LOGERR("ConfIndexer::index: error closing database in " <<
m_config->getDbDir() << "\n");
addIdxReason("indexer", "Index close/flush failed. See log");
return false;
}
if (m_updater && !m_updater->update(DbIxStatus::DBIXS_CLOSING, string()))
return false;
bool ret = true;
if (!createStemmingDatabases()) {
ret = false;
}
if (m_updater && !m_updater->update(DbIxStatus::DBIXS_CLOSING, string()))
return false;
// Don't fail indexing because of an aspell issue: we ignore the status.
// Messages were written to the reasons output
(void)createAspellDict();
clearMimeHandlerCache();
if (m_updater)
m_updater->update(DbIxStatus::DBIXS_DONE, string());
return ret;
}
// Index a specific list of files.
//
// @param ifiles in/out. On input, the paths to index. Each is made canonic
//               relative to the original working directory recorded in the
//               configuration, and the list is sorted. On a normal return
//               the parameter is overwritten with the working list as left
//               by the indexers (presumably the entries which were not
//               processed, going by the debug message; verify against
//               FsIndexer::indexFiles()). It is left untouched if the
//               database open or close fails.
// @param flag   indexing flags: passed to the filesystem indexer, and
//               tested here for IxFNoWeb (skip the web queue indexer) and
//               IxFDoPurge (purge the database before closing).
// @return false if the database could not be opened or closed, or if an
//         indexer failed or could not be created, true otherwise.
bool ConfIndexer::indexFiles(list<string>& ifiles, int flag)
{
    // Work on a canonic and sorted copy of the input
    string startdir = m_config->getOrigCwd();
    list<string> worklist;
    for (const auto& fn : ifiles) {
        worklist.push_back(path_canon(fn, &startdir));
    }
    worklist.sort();

    if (!m_db.open(Rcl::Db::DbUpd)) {
        LOGERR("ConfIndexer: indexFiles error opening database " <<
               m_config->getDbDir() << "\n");
        return false;
    }
    m_config->setKeyDir(cstr_null);

    // Filesystem indexer first, created on demand
    if (!m_fsindexer) {
        m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
    }
    bool ok = false;
    if (m_fsindexer) {
        ok = m_fsindexer->indexFiles(worklist, flag);
    }
    LOGDEB2("ConfIndexer::indexFiles: fsindexer returned " << ok << ", " <<
            worklist.size() << " files remainining\n");

#ifndef DISABLE_WEB_INDEXER
    // Then give what is left to the web queue indexer, if it is enabled
    // and the caller did not exclude it. It is only called if the
    // filesystem step succeeded.
    const bool tryweb = m_doweb && !worklist.empty() && !(flag & IxFNoWeb);
    if (tryweb) {
        if (!m_webindexer) {
            m_webindexer = new WebQueueIndexer(m_config, &m_db, m_updater);
        }
        if (!m_webindexer) {
            ok = false;
        } else {
            ok = ok && m_webindexer->indexFiles(worklist);
        }
    }
#endif

    if (flag & IxFDoPurge) {
        m_db.purge();
    }

    // The close would be done in our destructor, but we want status here
    if (!m_db.close()) {
        LOGERR("ConfIndexer::index: error closing database in " <<
               m_config->getDbDir() << "\n");
        return false;
    }

    ifiles = worklist;
    clearMimeHandlerCache();
    return ok;
}
// Update the index for specific documents. The docs come from an index
// query, so the udi, backend etc. fields are filled.
//
// The documents are translated to paths by docsToPaths() and the result is
// handed to indexFiles().
//
// @param docs documents to update.
// @param flag indexing flags, passed on to indexFiles().
// @return true if no path was obtained (nothing to do), else the
//         indexFiles() status.
bool ConfIndexer::updateDocs(vector<Rcl::Doc> &docs, IxFlag flag)
{
    vector<string> paths;
    docsToPaths(docs, paths);
    if (paths.empty()) {
        return true;
    }
    list<string> files(paths.begin(), paths.end());
    return indexFiles(files, flag);
}
// Remove the index data for a specific list of files.
//
// @param files paths to purge. They are made canonic relative to the
//              original working directory recorded in the configuration
//              and sorted into a private copy. The parameter itself is
//              not modified by this method.
// @param flag  only tested for IxFNoWeb, which excludes the web queue
//              indexer.
// @return false if the database could not be opened or closed, or if an
//         indexer failed or could not be created, true otherwise.
bool ConfIndexer::purgeFiles(list<string> &files, int flag)
{
    // Work on a canonic and sorted copy of the input
    string startdir = m_config->getOrigCwd();
    list<string> worklist;
    for (const auto& fn : files) {
        worklist.push_back(path_canon(fn, &startdir));
    }
    worklist.sort();

    if (!m_db.open(Rcl::Db::DbUpd)) {
        LOGERR("ConfIndexer: purgeFiles error opening database " <<
               m_config->getDbDir() << "\n");
        return false;
    }
    m_config->setKeyDir(cstr_null);

    // Filesystem indexer first, created on demand
    if (!m_fsindexer) {
        m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
    }
    bool ok = false;
    if (m_fsindexer) {
        ok = m_fsindexer->purgeFiles(worklist);
    }

#ifndef DISABLE_WEB_INDEXER
    // Then the web queue indexer, if it is enabled, not excluded by the
    // caller, and there are entries left. It is only called if the
    // filesystem step succeeded.
    const bool tryweb = m_doweb && !worklist.empty() && !(flag & IxFNoWeb);
    if (tryweb) {
        if (!m_webindexer) {
            m_webindexer = new WebQueueIndexer(m_config, &m_db, m_updater);
        }
        if (!m_webindexer) {
            ok = false;
        } else {
            ok = ok && m_webindexer->purgeFiles(worklist);
        }
    }
#endif

    // The close would be done in our destructor, but we want status here
    if (!m_db.close()) {
        LOGERR("ConfIndexer::purgefiles: error closing database in " <<
               m_config->getDbDir() << "\n");
        return false;
    }
    return ok;
}
// Create stemming databases. We also remove those which are not
// configured.
bool ConfIndexer::createStemmingDatabases()
{
string slangs;
bool ret = true;
if (m_config->getConfParam("indexstemminglanguages", slangs)) {
if (!m_db.open(Rcl::Db::DbUpd)) {
LOGERR("ConfIndexer::createStemmingDb: could not open db\n");
addIdxReason("stemming", "could not open db");
return false;
}
vector<string> langs;
stringToStrings(slangs, langs);
// Get the list of existing stem dbs from the database (some may have
// been manually created, we just keep those from the config
vector<string> dblangs = m_db.getStemLangs();
vector<string>::const_iterator it;
for (it = dblangs.begin(); it != dblangs.end(); it++) {
if (find(langs.begin(), langs.end(), *it) == langs.end())
m_db.deleteStemDb(*it);
}
ret = ret && m_db.createStemDbs(langs);
if (!ret) {
addIdxReason("stemming", "stem db creation failed");
}
}
m_db.close();
return ret;
}
bool ConfIndexer::createStemDb(const string &lang)
{
if (!m_db.open(Rcl::Db::DbUpd))
return false;
vector<string> langs;
stringToStrings(lang, langs);
return m_db.createStemDbs(langs);
}
// Create the aspell dictionary from the index contents. The language for
// the dictionary is handled internally by the aspell module, either from a
// configuration variable or the NLS environment.
//
// When the code is built without RCL_USE_ASPELL this only emits a debug
// message and returns true.
//
// @return true if aspell use is disabled or if the dictionary was built,
//         false if the database could not be opened, or if the aspell
//         initialisation or the dictionary build failed. The last two
//         cases also disable further attempts for the life of the process.
bool ConfIndexer::createAspellDict()
{
    LOGDEB2("ConfIndexer::createAspellDict()\n");
#ifdef RCL_USE_ASPELL
    // For the benefit of the real-time indexer, noaspell is only
    // initialized from the configuration once: the marker value below
    // means "not read yet". The flag can then be set to true if the
    // dictionary generation fails, which avoids retrying it forever.
    const int notread = -12345;
    static int noaspell = notread;
    if (noaspell == notread) {
        noaspell = false;
        m_config->getConfParam("noaspell", &noaspell);
    }
    if (noaspell) {
        return true;
    }

    if (!m_db.open(Rcl::Db::DbRO)) {
        LOGERR("ConfIndexer::createAspellDict: could not open db\n");
        return false;
    }

    Aspell aspell(m_config);
    string reason;
    const bool initok = aspell.init(reason);
    if (!initok) {
        LOGERR("ConfIndexer::createAspellDict: aspell init failed: " <<
               reason << "\n");
        noaspell = true;
        return false;
    }

    LOGDEB("ConfIndexer::createAspellDict: creating dictionary\n");
    const bool buildok = aspell.buildDict(m_db, reason);
    if (!buildok) {
        LOGERR("ConfIndexer::createAspellDict: aspell buildDict failed: " <<
               reason << "\n");
        // Only the build failure is recorded in the indexing reasons
        addIdxReason("aspell", reason);
        noaspell = true;
        return false;
    }
#endif
    return true;
}
// Return the stemmer names as reported by Rcl::Db::getStemmerNames().
// This is a plain forwarding call: no ConfIndexer member is used.
vector<string> ConfIndexer::getStemmerNames()
{
return Rcl::Db::getStemmerNames();
}