473 lines
14 KiB
C++
473 lines
14 KiB
C++
#ifndef lint
|
|
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.38 2006-10-16 15:33:08 dockes Exp $ (C) 2004 J.F.Dockes";
|
|
#endif
|
|
/*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the
|
|
* Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "autoconfig.h"
|
|
#endif
|
|
|
|
#include <stdio.h>
|
|
#include <sys/stat.h>
|
|
#include <unistd.h>
|
|
#include <errno.h>
|
|
#include <strings.h>
|
|
#include <fnmatch.h>
|
|
|
|
#include <iostream>
|
|
#include <list>
|
|
#include <map>
|
|
#include <algorithm>
|
|
|
|
#include "pathut.h"
|
|
#include "conftree.h"
|
|
#include "rclconfig.h"
|
|
#include "fstreewalk.h"
|
|
#include "rcldb.h"
|
|
#include "readfile.h"
|
|
#include "indexer.h"
|
|
#include "csguess.h"
|
|
#include "transcode.h"
|
|
#include "debuglog.h"
|
|
#include "internfile.h"
|
|
#include "smallut.h"
|
|
#include "wipedir.h"
|
|
|
|
#ifdef RCL_USE_ASPELL
|
|
#include "rclaspell.h"
|
|
#endif
|
|
|
|
#ifndef NO_NAMESPACES
|
|
using namespace std;
|
|
#endif /* NO_NAMESPACES */
|
|
|
|
#ifndef deleteZ
// Delete the object pointed to by X, then zero the pointer so that it can
// not be used or deleted again by accident. The do/while(0) wrapper makes the
// macro expand to a single statement, so that it stays correct inside an
// unbraced if/else.
#define deleteZ(X) do { delete (X); (X) = 0; } while (0)
#endif
|
|
|
|
/// Destructor: gets rid of the temporary directory, if one was recorded in
/// m_tmpdir, then closes the database.
DbIndexer::~DbIndexer()
{
    const bool haveTmpDir = m_tmpdir.length() != 0;
    if (haveTmpDir) {
        // rmdir() can only remove an empty directory: wipedir() is expected
        // to take care of the contents first.
        wipedir(m_tmpdir);
        const int rc = rmdir(m_tmpdir.c_str());
        if (rc < 0) {
            LOGERR(("DbIndexer::~DbIndexer: cannot clear temp dir %s\n",
                    m_tmpdir.c_str()));
        }
    }
    m_db.close();
}
|
|
|
|
// Index each directory in the topdirs for a given db
|
|
bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
|
|
{
|
|
if (!init(resetbefore))
|
|
return false;
|
|
|
|
if (m_updater) {
|
|
m_updater->status.reset();
|
|
m_updater->status.dbtotdocs = m_db.docCnt();
|
|
}
|
|
|
|
for (list<string>::const_iterator it = topdirs->begin();
|
|
it != topdirs->end(); it++) {
|
|
LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(),
|
|
m_dbdir.c_str()));
|
|
|
|
// Set the current directory in config so that subsequent
|
|
// getConfParams() will get local values
|
|
m_config->setKeyDir(*it);
|
|
int abslen;
|
|
if (m_config->getConfParam("idxabsmlen", &abslen))
|
|
m_db.setAbstractParams(abslen, -1, -1);
|
|
|
|
// Set up skipped patterns for this subtree. This probably should be
|
|
// done in the directory change code in processone() instead.
|
|
m_walker.clearSkippedNames();
|
|
string skipped;
|
|
if (m_config->getConfParam("skippedNames", skipped)) {
|
|
list<string> skpl;
|
|
stringToStrings(skipped, skpl);
|
|
m_walker.setSkippedNames(skpl);
|
|
}
|
|
|
|
// Walk the directory tree
|
|
if (m_walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
|
|
LOGERR(("DbIndexer::index: error while indexing %s: %s\n",
|
|
it->c_str(), m_walker.getReason().c_str()));
|
|
return false;
|
|
}
|
|
}
|
|
if (m_updater) {
|
|
m_updater->status.fn.erase();
|
|
m_updater->status.phase = DbIxStatus::DBIXS_PURGE;
|
|
m_updater->update();
|
|
}
|
|
|
|
// Get rid of all database entries that don't exist in the
|
|
// filesystem anymore.
|
|
m_db.purge();
|
|
|
|
// Create stemming databases. We also remove those which are not
|
|
// configured.
|
|
string slangs;
|
|
if (m_config->getConfParam("indexstemminglanguages", slangs)) {
|
|
list<string> langs;
|
|
stringToStrings(slangs, langs);
|
|
|
|
// Get the list of existing stem dbs from the database (some may have
|
|
// been manually created, we just keep those from the config
|
|
list<string> dblangs = m_db.getStemLangs();
|
|
list<string>::const_iterator it;
|
|
for (it = dblangs.begin(); it != dblangs.end(); it++) {
|
|
if (find(langs.begin(), langs.end(), *it) == langs.end())
|
|
m_db.deleteStemDb(*it);
|
|
}
|
|
for (it = langs.begin(); it != langs.end(); it++) {
|
|
if (m_updater) {
|
|
m_updater->status.phase = DbIxStatus::DBIXS_STEMDB;
|
|
m_updater->status.fn = *it;
|
|
m_updater->update();
|
|
}
|
|
m_db.createStemDb(*it);
|
|
}
|
|
}
|
|
|
|
createAspellDict();
|
|
|
|
// The close would be done in our destructor, but we want status here
|
|
if (m_updater) {
|
|
m_updater->status.phase = DbIxStatus::DBIXS_CLOSING;
|
|
m_updater->status.fn.erase();
|
|
m_updater->update();
|
|
}
|
|
if (!m_db.close()) {
|
|
LOGERR(("DbIndexer::index: error closing database in %s\n",
|
|
m_dbdir.c_str()));
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool DbIndexer::init(bool resetbefore)
|
|
{
|
|
if (!maketmpdir(m_tmpdir)) {
|
|
LOGERR(("DbIndexer: cannot create temporary directory\n"));
|
|
return false;
|
|
}
|
|
if (!m_db.open(m_dbdir, resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd)) {
|
|
LOGERR(("DbIndexer: error opening database in %s\n", m_dbdir.c_str()));
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool DbIndexer::createStemDb(const string &lang)
|
|
{
|
|
if (!init())
|
|
return false;
|
|
return m_db.createStemDb(lang);
|
|
}
|
|
|
|
// The language for the aspell dictionary is handled internally by the aspell
|
|
// module, either from a configuration variable or the NLS environment.
|
|
bool DbIndexer::createAspellDict()
|
|
{
|
|
LOGDEB2(("DbIndexer::createAspellDict()\n"));
|
|
#ifdef RCL_USE_ASPELL
|
|
if (!init())
|
|
return false;
|
|
Aspell aspell(m_config);
|
|
string reason;
|
|
if (!aspell.init(reason)) {
|
|
LOGERR(("DbIndexer::createAspellDict: aspell init failed: %s\n",
|
|
reason.c_str()));
|
|
return false;
|
|
}
|
|
LOGDEB(("DbIndexer::createAspellDict: creating dictionary\n"));
|
|
if (!aspell.buildDict(m_db, reason)) {
|
|
LOGERR(("DbIndexer::createAspellDict: aspell buildDict failed: %s\n",
|
|
reason.c_str()));
|
|
return false;
|
|
}
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Index individual files, out of a full tree run. No database purging
|
|
*/
|
|
bool DbIndexer::indexFiles(const list<string> &filenames)
|
|
{
|
|
if (!init())
|
|
return false;
|
|
|
|
list<string>::const_iterator it;
|
|
for (it = filenames.begin(); it != filenames.end(); it++) {
|
|
string dir = path_getfather(*it);
|
|
m_config->setKeyDir(dir);
|
|
int abslen;
|
|
if (m_config->getConfParam("idxabsmlen", &abslen))
|
|
m_db.setAbstractParams(abslen, -1, -1);
|
|
struct stat stb;
|
|
if (stat(it->c_str(), &stb) != 0) {
|
|
LOGERR(("DbIndexer::indexFiles: stat(%s): %s", it->c_str(),
|
|
strerror(errno)));
|
|
continue;
|
|
}
|
|
if (!S_ISREG(stb.st_mode)) {
|
|
LOGERR(("DbIndexer::indexFiles: %s: not a regular file\n",
|
|
it->c_str()));
|
|
continue;
|
|
}
|
|
|
|
static string lstdir;
|
|
static list<string> skpl;
|
|
if (lstdir.compare(dir)) {
|
|
LOGDEB(("Recomputing list of skipped names\n"));
|
|
string skipped;
|
|
if (m_config->getConfParam("skippedNames", skipped)) {
|
|
stringToStrings(skipped, skpl);
|
|
lstdir = dir;
|
|
}
|
|
}
|
|
if (!skpl.empty()) {
|
|
list<string>::const_iterator skit;
|
|
string fn = path_getsimple(*it);
|
|
for (skit = skpl.begin(); skit != skpl.end(); skit++) {
|
|
if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) {
|
|
LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str()));
|
|
goto skipped;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (processone(*it, &stb, FsTreeWalker::FtwRegular) !=
|
|
FsTreeWalker::FtwOk) {
|
|
LOGERR(("DbIndexer::indexFiles: Database error\n"));
|
|
return false;
|
|
}
|
|
skipped:
|
|
false; // Need a statement here to make compiler happy ??
|
|
}
|
|
|
|
// The close would be done in our destructor, but we want status here
|
|
if (!m_db.close()) {
|
|
LOGERR(("DbIndexer::indexfiles: error closing database in %s\n",
|
|
m_dbdir.c_str()));
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/// This method gets called for every file and directory found by the
|
|
/// tree walker.
|
|
///
|
|
/// It checks with the db if the file has changed and needs to be
|
|
/// reindexed. If so, it calls internfile() which will identify the
|
|
/// file type and call an appropriate handler to convert the document into
|
|
/// internal format, which we then add to the database.
|
|
///
|
|
/// Accent and majuscule handling are performed by the db module when doing
|
|
/// the actual indexing work. The Rcl::Doc created by internfile()
|
|
/// mostly contains pretty raw utf8 data.
|
|
FsTreeWalker::Status
|
|
DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|
FsTreeWalker::CbFlag flg)
|
|
{
|
|
if (m_updater && !m_updater->update()) {
|
|
return FsTreeWalker::FtwStop;
|
|
}
|
|
// If we're changing directories, possibly adjust parameters (set
|
|
// the current directory in configuration object)
|
|
if (flg == FsTreeWalker::FtwDirEnter ||
|
|
flg == FsTreeWalker::FtwDirReturn) {
|
|
m_config->setKeyDir(fn);
|
|
int abslen;
|
|
if (m_config->getConfParam("idxabsmlen", &abslen))
|
|
m_db.setAbstractParams(abslen, -1, -1);
|
|
return FsTreeWalker::FtwOk;
|
|
}
|
|
|
|
// Check db up to date ? Doing this before file type
|
|
// identification means that, if usesystemfilecommand is switched
|
|
// from on to off it may happen that some files which are now
|
|
// without mime type will not be purged from the db, resulting
|
|
// in possible 'cannot intern file' messages at query time...
|
|
if (!m_db.needUpdate(fn, stp)) {
|
|
LOGDEB(("indexfile: up to date: %s\n", fn.c_str()));
|
|
if (m_updater) {
|
|
m_updater->status.fn = fn;
|
|
if (!m_updater->update()) {
|
|
return FsTreeWalker::FtwStop;
|
|
}
|
|
}
|
|
return FsTreeWalker::FtwOk;
|
|
}
|
|
|
|
FileInterner interner(fn, m_config, m_tmpdir);
|
|
|
|
// File name transcoded to utf8 for indexation.
|
|
string charset = m_config->getDefCharset(true);
|
|
// If this fails, the file name won't be indexed, no big deal
|
|
// Note that we used to do the full path here, but I ended up believing
|
|
// that it made more sense to use only the file name
|
|
string utf8fn;
|
|
transcode(path_getsimple(fn), utf8fn, charset, "UTF-8");
|
|
|
|
FileInterner::Status fis = FileInterner::FIAgain;
|
|
bool hadNullIpath = false;
|
|
Rcl::Doc doc;
|
|
char ascdate[20];
|
|
sprintf(ascdate, "%ld", long(stp->st_ctime));
|
|
while (fis == FileInterner::FIAgain) {
|
|
doc.erase();
|
|
|
|
string ipath;
|
|
fis = interner.internfile(doc, ipath);
|
|
if (fis == FileInterner::FIError) {
|
|
// We dont stop indexing for one bad doc
|
|
return FsTreeWalker::FtwOk;
|
|
}
|
|
|
|
// Set the date if this was not done in the document handler
|
|
if (doc.fmtime.empty()) {
|
|
doc.fmtime = ascdate;
|
|
}
|
|
|
|
// Internal access path for multi-document files
|
|
if (ipath.empty())
|
|
hadNullIpath = true;
|
|
else
|
|
doc.ipath = ipath;
|
|
|
|
doc.utf8fn = utf8fn;
|
|
|
|
// Add document to database
|
|
if (!m_db.add(fn, doc, stp))
|
|
return FsTreeWalker::FtwError;
|
|
|
|
// Tell what we are doing and check for interrupt request
|
|
if (m_updater) {
|
|
if ((++(m_updater->status.docsdone) % 10) == 0) {
|
|
m_updater->status.fn = fn;
|
|
if (!ipath.empty())
|
|
m_updater->status.fn += "|" + ipath;
|
|
if (!m_updater->update()) {
|
|
return FsTreeWalker::FtwStop;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// If we had no instance with a null ipath, we create an empty
|
|
// document to stand for the file itself, to be used mainly for up
|
|
// to date checks. Typically this happens for an mbox file.
|
|
if (hadNullIpath == false) {
|
|
LOGDEB1(("Creating empty doc for file\n"));
|
|
Rcl::Doc fileDoc;
|
|
fileDoc.fmtime = doc.fmtime;
|
|
fileDoc.utf8fn = doc.utf8fn;
|
|
fileDoc.mimetype = doc.mimetype;
|
|
if (!m_db.add(fn, fileDoc, stp))
|
|
return FsTreeWalker::FtwError;
|
|
}
|
|
|
|
return FsTreeWalker::FtwOk;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
// ConfIndexer methods: ConfIndexer is the top-level object, that can index
|
|
// multiple directories to multiple databases.
|
|
|
|
/// Destructor: destroys the per-database indexer if one is still around,
/// and zeroes the pointer.
ConfIndexer::~ConfIndexer()
{
    delete m_dbindexer;
    m_dbindexer = 0;
}
|
|
|
|
bool ConfIndexer::index(bool resetbefore)
|
|
{
|
|
list<string> tdl = m_config->getTopdirs();
|
|
if (tdl.empty()) {
|
|
m_reason = "Top directory list (topdirs param.) not found in config"
|
|
"or Directory list parse error";
|
|
return false;
|
|
}
|
|
|
|
// Each top level directory to be indexed can be associated with a
|
|
// different database. We first group the directories by database:
|
|
// it is important that all directories for a database be indexed
|
|
// at once so that deleted file cleanup works
|
|
list<string>::iterator dirit;
|
|
map<string, list<string> > dbmap;
|
|
map<string, list<string> >::iterator dbit;
|
|
for (dirit = tdl.begin(); dirit != tdl.end(); dirit++) {
|
|
string dbdir;
|
|
string doctopdir = *dirit;
|
|
{ // Check top dirs. Must not be symlinks
|
|
struct stat st;
|
|
if (lstat(doctopdir.c_str(), &st) < 0) {
|
|
LOGERR(("ConfIndexer::index: cant stat %s\n",
|
|
doctopdir.c_str()));
|
|
m_reason = "Stat error for: " + doctopdir;
|
|
return false;
|
|
}
|
|
if (S_ISLNK(st.st_mode)) {
|
|
LOGERR(("ConfIndexer::index: no symlinks allowed in topdirs: %s\n",
|
|
doctopdir.c_str()));
|
|
m_reason = doctopdir + " is a symbolic link";
|
|
return false;
|
|
}
|
|
}
|
|
m_config->setKeyDir(doctopdir);
|
|
dbdir = m_config->getDbDir();
|
|
if (dbdir.empty()) {
|
|
LOGERR(("ConfIndexer::index: no database directory in "
|
|
"configuration for %s\n", doctopdir.c_str()));
|
|
m_reason = "No database directory set for " + doctopdir;
|
|
return false;
|
|
}
|
|
dbit = dbmap.find(dbdir);
|
|
if (dbit == dbmap.end()) {
|
|
list<string> l;
|
|
l.push_back(doctopdir);
|
|
dbmap[dbdir] = l;
|
|
} else {
|
|
dbit->second.push_back(doctopdir);
|
|
}
|
|
}
|
|
m_config->setKeyDir("");
|
|
|
|
// The dbmap now has dbdir as key and directory lists as values.
|
|
// Index each directory group in turn
|
|
for (dbit = dbmap.begin(); dbit != dbmap.end(); dbit++) {
|
|
//cout << dbit->first << " -> ";
|
|
//list<string>::const_iterator dit;
|
|
//for (dit = dbit->second.begin(); dit != dbit->second.end(); dit++) {
|
|
// cout << *dit << " ";
|
|
//}
|
|
//cout << endl;
|
|
m_dbindexer = new DbIndexer(m_config, dbit->first, m_updater);
|
|
if (!m_dbindexer->indexDb(resetbefore, &dbit->second)) {
|
|
deleteZ(m_dbindexer);
|
|
m_reason = "Failed indexing in " + dbit->first;
|
|
return false;
|
|
}
|
|
deleteZ(m_dbindexer);
|
|
}
|
|
return true;
|
|
}
|