beaglequeue indexFiles
This commit is contained in:
parent
bbba826c06
commit
6ef7b546f2
@ -173,7 +173,8 @@ BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
||||
{
|
||||
|
||||
if (!m_config->getConfParam("beaglequeuedir", m_queuedir))
|
||||
m_queuedir = path_tildexpand("~/.beagle/ToIndex");
|
||||
m_queuedir = path_tildexpand("~/.beagle/ToIndex/");
|
||||
path_catslash(m_queuedir);
|
||||
|
||||
if (m_db && m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0) {
|
||||
string reason;
|
||||
@ -336,6 +337,42 @@ bool BeagleQueueIndexer::index()
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Index individual files belonging to the Beagle queue directory.
 *
 * Entries which are handed to processone() are removed from the list;
 * every entry which is skipped (empty name, not directly inside
 * m_queuedir, dot file, lstat() failure, not a regular file) is left in
 * place so that the caller can see what was not consumed here.
 *
 * @param files in/out list of file paths.
 * @return false if there is no database to index into, else true.
 */
bool BeagleQueueIndexer::indexFiles(list<string>& files)
{
    if (!m_db) {
        LOGERR(("BeagleQueueIndexer::indexfiles no db??\n"));
        return false;
    }

    // No increment in the loop header: erasing from a std::list invalidates
    // the iterator to the erased element, so we must continue from the
    // iterator returned by erase() and only step forward explicitly when
    // the current entry is kept.
    for (list<string>::iterator it = files.begin(); it != files.end();) {
        if (it->empty()) {
            ++it;
            continue;//??
        }

        // Only files located directly inside the queue directory are ours.
        string father = path_getfather(*it);
        if (father.compare(m_queuedir)) {
            LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nq)\n",
                    it->c_str()));
            ++it;
            continue;
        }

        // Ignore dot files
        string fn = path_getsimple(*it);
        if (fn.empty() || fn.at(0) == '.') {
            ++it;
            continue;
        }

        struct stat st;
        if (lstat(it->c_str(), &st) != 0) {
            LOGERR(("BeagleQueueIndexer::indexfiles: cant stat [%s]\n",
                    it->c_str()));
            ++it;
            continue;
        }
        if (!S_ISREG(st.st_mode)) {
            LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nr)\n",
                    it->c_str()));
            ++it;
            continue;
        }

        // The status returned by processone() is not checked: the entry is
        // taken off the list whatever the outcome.
        processone(*it, &st, FsTreeWalker::FtwRegular);
        it = files.erase(it);
    }
    return true;
}
|
||||
|
||||
FsTreeWalker::Status
|
||||
BeagleQueueIndexer::processone(const string &path,
|
||||
const struct stat *stp,
|
||||
|
||||
@ -51,6 +51,13 @@ public:
|
||||
FsTreeWalker::Status
|
||||
processone(const string &, const struct stat *, FsTreeWalker::CbFlag);
|
||||
|
||||
/** Index a list of files. No db cleaning or stemdb updating */
|
||||
bool indexFiles(list<string>& files);
|
||||
/** Purge a list of files. No way to do this currently and dont want
|
||||
* to do anything as this is mostly called by the monitor when *I* delete
|
||||
* files inside the queue dir */
|
||||
bool purgeFiles(list<string>& files) {return true;}
|
||||
|
||||
bool getFromCache(const string& udi, Rcl::Doc &doc, string& data,
|
||||
string *hittype = 0);
|
||||
private:
|
||||
|
||||
@ -86,18 +86,19 @@ bool FsIndexer::init()
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (m_tdl.empty()) {
|
||||
m_tdl = m_config->getTopdirs();
|
||||
if (m_tdl.empty()) {
|
||||
LOGERR(("FsIndexers: no topdirs list defined\n"));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Recursively index each directory in the topdirs:
|
||||
bool FsIndexer::index()
|
||||
{
|
||||
list<string> topdirs = m_config->getTopdirs();
|
||||
if (topdirs.empty()) {
|
||||
LOGERR(("FsIndexer::indexTrees: no valid topdirs in config\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!init())
|
||||
return false;
|
||||
|
||||
@ -108,8 +109,8 @@ bool FsIndexer::index()
|
||||
|
||||
m_walker.setSkippedPaths(m_config->getSkippedPaths());
|
||||
|
||||
for (list<string>::const_iterator it = topdirs.begin();
|
||||
it != topdirs.end(); it++) {
|
||||
for (list<string>::const_iterator it = m_tdl.begin();
|
||||
it != m_tdl.end(); it++) {
|
||||
LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(),
|
||||
getDbDir().c_str()));
|
||||
|
||||
@ -151,60 +152,119 @@ bool FsIndexer::index()
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool matchesSkipped(const list<string>& tdl,
|
||||
const list<string>& skpnl,
|
||||
const list<string>& skppl,
|
||||
const string& path)
|
||||
{
|
||||
// First check what (if any) topdir this is in:
|
||||
string td;
|
||||
for (list<string>::const_iterator it = tdl.begin(); it != tdl.end(); it++) {
|
||||
if (path.find(*it) == 0) {
|
||||
td = *it;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (td.empty()) {
|
||||
LOGDEB(("FsIndexer::indexFiles: skipping [%s] (ntd)\n", path.c_str()));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check path against skippedPaths. If we find a system where
|
||||
// FNM_LEADING_DIR is undefined (its unposixy), will have to do this for
|
||||
// all ascendant paths up to the topdir
|
||||
for (list<string>::const_iterator it = skppl.begin();
|
||||
it != skppl.end(); it++) {
|
||||
if (fnmatch(it->c_str(), path.c_str(), FNM_PATHNAME|FNM_LEADING_DIR)
|
||||
== 0) {
|
||||
LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpp)\n",
|
||||
path.c_str()));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Then check all path components up to the topdir against skippedNames
|
||||
if (!skpnl.empty()) {
|
||||
string mpath = path;
|
||||
while (mpath.length() >= td.length() && mpath.length() > 1) {
|
||||
string fn = path_getsimple(mpath);
|
||||
for (list<string>::const_iterator it = skpnl.begin();
|
||||
it != skpnl.end(); it++) {
|
||||
LOGDEB2(("Checking [%s] against [%s]\n",
|
||||
fn.c_str(), it->c_str()));
|
||||
if (fnmatch(it->c_str(), fn.c_str(), 0) == 0) {
|
||||
LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpn)\n",
|
||||
path.c_str()));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
string::size_type len = mpath.length();
|
||||
mpath = path_getfather(mpath);
|
||||
// getfather normally returns a path ending with /, getsimple
|
||||
// would then return ''
|
||||
if (!mpath.empty() && mpath[mpath.size()-1] == '/')
|
||||
mpath.erase(mpath.size()-1);
|
||||
// should not be necessary, but lets be prudent. If the
|
||||
// path did not shorten, something is seriously amiss
|
||||
// (could be an assert actually)
|
||||
if (mpath.length() >= len)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Index individual files, out of a full tree run. No database purging
|
||||
*/
|
||||
bool FsIndexer::indexFiles(const list<string> &filenames)
|
||||
bool FsIndexer::indexFiles(list<string>& files)
|
||||
{
|
||||
if (!init())
|
||||
return false;
|
||||
|
||||
list<string>::const_iterator it;
|
||||
for (it = filenames.begin(); it != filenames.end(); it++) {
|
||||
string dir = path_getfather(*it);
|
||||
m_config->setKeyDir(dir);
|
||||
int abslen;
|
||||
if (m_config->getConfParam("idxabsmlen", &abslen))
|
||||
m_db->setAbstractParams(abslen, -1, -1);
|
||||
for (list<string>::iterator it = files.begin();
|
||||
it != files.end(); it++) {
|
||||
|
||||
struct stat stb;
|
||||
if (lstat(it->c_str(), &stb) != 0) {
|
||||
LOGERR(("FsIndexer::indexFiles: lstat(%s): %s", it->c_str(),
|
||||
strerror(errno)));
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we get to indexing directory names one day, will need to test
|
||||
// against dbdir here to avoid modification loops (with rclmon).
|
||||
if (!S_ISREG(stb.st_mode)) {
|
||||
LOGDEB2(("FsIndexer::indexFiles: %s: not a regular file\n",
|
||||
LOGDEB(("FsIndexer::indexFiles: skipping [%s] (nr)\n",
|
||||
it->c_str()));
|
||||
continue;
|
||||
}
|
||||
|
||||
string dir = path_getfather(*it);
|
||||
m_config->setKeyDir(dir);
|
||||
static string lstdir;
|
||||
static list<string> skpl;
|
||||
static list<string> skpnl;
|
||||
static list<string> skppl;
|
||||
if (lstdir.compare(dir)) {
|
||||
LOGDEB(("Recomputing list of skipped names\n"));
|
||||
skpl = m_config->getSkippedNames();
|
||||
skpnl = m_config->getSkippedNames();
|
||||
skppl = m_config->getSkippedPaths();
|
||||
lstdir = dir;
|
||||
}
|
||||
if (!skpl.empty()) {
|
||||
list<string>::const_iterator skit;
|
||||
string fn = path_getsimple(*it);
|
||||
for (skit = skpl.begin(); skit != skpl.end(); skit++) {
|
||||
if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) {
|
||||
LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str()));
|
||||
goto skipped;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check path against indexed areas and skipped names/paths
|
||||
if (matchesSkipped(m_tdl, skpnl, skppl, *it))
|
||||
continue;
|
||||
|
||||
int abslen;
|
||||
if (m_config->getConfParam("idxabsmlen", &abslen))
|
||||
m_db->setAbstractParams(abslen, -1, -1);
|
||||
|
||||
if (processone(*it, &stb, FsTreeWalker::FtwRegular) !=
|
||||
FsTreeWalker::FtwOk) {
|
||||
LOGERR(("FsIndexer::indexFiles: processone failed\n"));
|
||||
return false;
|
||||
}
|
||||
skipped:
|
||||
false; // Need a statement here to make compiler happy ??
|
||||
files.erase(it);
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -212,19 +272,25 @@ bool FsIndexer::indexFiles(const list<string> &filenames)
|
||||
|
||||
|
||||
/** Purge docs for given files out of the database */
|
||||
bool FsIndexer::purgeFiles(const list<string> &filenames)
|
||||
bool FsIndexer::purgeFiles(list<string>& files)
|
||||
{
|
||||
if (!init())
|
||||
return false;
|
||||
|
||||
list<string>::const_iterator it;
|
||||
for (it = filenames.begin(); it != filenames.end(); it++) {
|
||||
for (list<string>::iterator it = files.begin();
|
||||
it != files.end(); it++) {
|
||||
string udi;
|
||||
make_udi(*it, "", udi);
|
||||
if (!m_db->purgeFile(udi)) {
|
||||
// rcldb::purgefile returns true if the udi was either not
|
||||
// found or deleted, false only in case of actual error
|
||||
bool existed;
|
||||
if (!m_db->purgeFile(udi, &existed)) {
|
||||
LOGERR(("FsIndexer::purgeFiles: Database error\n"));
|
||||
return false;
|
||||
}
|
||||
// If we actually deleted something, take it off the list
|
||||
if (existed) {
|
||||
files.erase(it);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
@ -18,6 +18,11 @@
|
||||
#define _fsindexer_h_included_
|
||||
/* @(#$Id: $ (C) 2009 J.F.Dockes */
|
||||
|
||||
#include <list>
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::list;
|
||||
#endif
|
||||
|
||||
#include "fstreewalk.h"
|
||||
#include "rcldb.h"
|
||||
|
||||
@ -58,10 +63,10 @@ class FsIndexer : public FsTreeWalkerCB {
|
||||
bool index();
|
||||
|
||||
/** Index a list of files. No db cleaning or stemdb updating */
|
||||
bool indexFiles(const std::list<string> &files);
|
||||
bool indexFiles(list<string> &files);
|
||||
|
||||
/** Purge a list of files. */
|
||||
bool purgeFiles(const std::list<string> &files);
|
||||
bool purgeFiles(list<string> &files);
|
||||
|
||||
/** Tree walker callback method */
|
||||
FsTreeWalker::Status
|
||||
@ -74,6 +79,7 @@ class FsIndexer : public FsTreeWalkerCB {
|
||||
string m_tmpdir;
|
||||
string m_reason;
|
||||
DbIxStatusUpdater *m_updater;
|
||||
list<string> m_tdl;
|
||||
|
||||
// The configuration can set attribute fields to be inherited by
|
||||
// all files in a file system area. Ie: set "apptag = thunderbird"
|
||||
|
||||
@ -26,6 +26,8 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.71 2008-12-17 08:01:40 dockes Exp
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "indexer.h"
|
||||
#include "fsindexer.h"
|
||||
@ -104,44 +106,14 @@ bool ConfIndexer::index(bool resetbefore, ixType typestorun)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ConfIndexer::initTopDirs()
|
||||
bool ConfIndexer::indexFiles(std::list<string> &files)
|
||||
{
|
||||
if (m_tdl.empty()) {
|
||||
m_tdl = m_config->getTopdirs();
|
||||
if (m_tdl.empty()) {
|
||||
m_reason = "Top directory list (topdirs param.) "
|
||||
"not found in config or Directory list parse error";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ConfIndexer::indexFiles(const std::list<string> &files)
|
||||
{
|
||||
if (!initTopDirs())
|
||||
return false;
|
||||
|
||||
list<string> myfiles;
|
||||
for (list<string>::const_iterator it = files.begin();
|
||||
it != files.end(); it++) {
|
||||
string fn = path_canon(*it);
|
||||
bool ok = false;
|
||||
// Check that this file name belongs to one of our subtrees
|
||||
for (list<string>::iterator dit = m_tdl.begin();
|
||||
dit != m_tdl.end(); dit++) {
|
||||
if (fn.find(*dit) == 0) {
|
||||
myfiles.push_back(fn);
|
||||
ok = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!ok) {
|
||||
m_reason += string("File ") + fn + string(" not in indexed area\n");
|
||||
}
|
||||
myfiles.push_back(path_canon(*it));
|
||||
}
|
||||
if (myfiles.empty())
|
||||
return true;
|
||||
myfiles.sort();
|
||||
|
||||
if (!m_db.open(Rcl::Db::DbUpd)) {
|
||||
LOGERR(("ConfIndexer: indexFiles error opening database %s\n",
|
||||
@ -149,9 +121,21 @@ bool ConfIndexer::indexFiles(const std::list<string> &files)
|
||||
return false;
|
||||
}
|
||||
m_config->setKeyDir("");
|
||||
bool ret = false;
|
||||
if (!m_fsindexer)
|
||||
m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
|
||||
bool ret = m_fsindexer->indexFiles(files);
|
||||
if (m_fsindexer)
|
||||
ret = m_fsindexer->indexFiles(files);
|
||||
|
||||
if (m_dobeagle && !myfiles.empty()) {
|
||||
if (!m_beagler)
|
||||
m_beagler = new BeagleQueueIndexer(m_config, &m_db, m_updater);
|
||||
if (m_beagler) {
|
||||
ret = ret && m_beagler->indexFiles(myfiles);
|
||||
} else {
|
||||
ret = false;
|
||||
}
|
||||
}
|
||||
|
||||
// The close would be done in our destructor, but we want status here
|
||||
if (!m_db.close()) {
|
||||
@ -162,31 +146,40 @@ bool ConfIndexer::indexFiles(const std::list<string> &files)
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool ConfIndexer::purgeFiles(const std::list<string> &files)
|
||||
bool ConfIndexer::purgeFiles(std::list<string> &files)
|
||||
{
|
||||
if (!initTopDirs())
|
||||
return false;
|
||||
|
||||
list<string> myfiles;
|
||||
for (list<string>::const_iterator it = files.begin();
|
||||
it != files.end(); it++) {
|
||||
myfiles.push_back(path_canon(*it));
|
||||
}
|
||||
myfiles.sort();
|
||||
|
||||
if (!m_db.open(Rcl::Db::DbUpd)) {
|
||||
LOGERR(("ConfIndexer: purgeFiles error opening database %s\n",
|
||||
m_config->getDbDir().c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ret = false;
|
||||
m_config->setKeyDir("");
|
||||
if (!m_fsindexer)
|
||||
m_fsindexer = new FsIndexer(m_config, &m_db, m_updater);
|
||||
bool ret = m_fsindexer->purgeFiles(files);
|
||||
if (m_fsindexer)
|
||||
ret = m_fsindexer->purgeFiles(myfiles);
|
||||
|
||||
if (m_dobeagle && !myfiles.empty()) {
|
||||
if (!m_beagler)
|
||||
m_beagler = new BeagleQueueIndexer(m_config, &m_db, m_updater);
|
||||
if (m_beagler) {
|
||||
ret = ret && m_beagler->purgeFiles(myfiles);
|
||||
} else {
|
||||
ret = false;
|
||||
}
|
||||
}
|
||||
|
||||
// The close would be done in our destructor, but we want status here
|
||||
if (!m_db.close()) {
|
||||
LOGERR(("ConfIndexer::index: error closing database in %s\n",
|
||||
LOGERR(("ConfIndexer::purgefiles: error closing database in %s\n",
|
||||
m_config->getDbDir().c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -85,10 +85,10 @@ class ConfIndexer {
|
||||
static list<string> getStemmerNames();
|
||||
|
||||
/** Index a list of files. No db cleaning or stemdb updating */
|
||||
bool indexFiles(const std::list<string> &files);
|
||||
bool indexFiles(std::list<string> &files);
|
||||
|
||||
/** Purge a list of files. */
|
||||
bool purgeFiles(const std::list<string> &files);
|
||||
bool purgeFiles(std::list<string> &files);
|
||||
|
||||
private:
|
||||
RclConfig *m_config;
|
||||
@ -97,10 +97,7 @@ class ConfIndexer {
|
||||
bool m_dobeagle;
|
||||
BeagleQueueIndexer *m_beagler;
|
||||
DbIxStatusUpdater *m_updater;
|
||||
string m_reason;
|
||||
list<string> m_tdl;
|
||||
|
||||
bool initTopDirs();
|
||||
string m_reason;
|
||||
};
|
||||
|
||||
#endif /* _INDEXER_H_INCLUDED_ */
|
||||
|
||||
@ -97,7 +97,7 @@ static bool makeIndexer(RclConfig *config)
|
||||
// this case we're called repeatedly in the same process, and the
|
||||
// confindexer is only created once by makeIndexer (but the db closed and
|
||||
// flushed every time)
|
||||
bool indexfiles(RclConfig *config, const list<string> &filenames)
|
||||
bool indexfiles(RclConfig *config, list<string> &filenames)
|
||||
{
|
||||
if (filenames.empty())
|
||||
return true;
|
||||
@ -107,7 +107,7 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
|
||||
}
|
||||
|
||||
// Delete a list of files. Same comments about call contexts as indexfiles.
|
||||
bool purgefiles(RclConfig *config, const list<string> &filenames)
|
||||
bool purgefiles(RclConfig *config, list<string> &filenames)
|
||||
{
|
||||
if (filenames.empty())
|
||||
return true;
|
||||
|
||||
@ -20,8 +20,8 @@
|
||||
|
||||
/** Helper methods in recollindex.cpp for initial checks/setup to index
|
||||
* a list of files (either from the monitor or the command line) */
|
||||
extern bool indexfiles(RclConfig *config, const list<string> &filenames);
|
||||
extern bool purgefiles(RclConfig *config, const list<string> &filenames);
|
||||
extern bool indexfiles(RclConfig *config, list<string> &filenames);
|
||||
extern bool purgefiles(RclConfig *config, list<string> &filenames);
|
||||
extern bool createAuxDbs(RclConfig *config);
|
||||
|
||||
extern int stopindexing;
|
||||
|
||||
@ -1270,7 +1270,7 @@ bool Db::purge()
|
||||
}
|
||||
|
||||
/* Delete document(s) for given unique identifier (doc and descendents) */
|
||||
bool Db::purgeFile(const string &udi)
|
||||
bool Db::purgeFile(const string &udi, bool *existed)
|
||||
{
|
||||
LOGDEB(("Db:purgeFile: [%s]\n", udi.c_str()));
|
||||
if (m_ndb == 0 || !m_ndb->m_iswritable)
|
||||
@ -1280,8 +1280,12 @@ bool Db::purgeFile(const string &udi)
|
||||
string ermsg;
|
||||
try {
|
||||
Xapian::PostingIterator docid = db.postlist_begin(uniterm);
|
||||
if (docid == db.postlist_end(uniterm))
|
||||
if (docid == db.postlist_end(uniterm)) {
|
||||
if (existed)
|
||||
*existed = false;
|
||||
return true;
|
||||
}
|
||||
*existed = true;
|
||||
LOGDEB(("purgeFile: delete docid %d\n", *docid));
|
||||
db.delete_document(*docid);
|
||||
vector<Xapian::docid> docids;
|
||||
|
||||
@ -121,7 +121,7 @@ class Db {
|
||||
const Doc &doc);
|
||||
|
||||
/** Delete document(s) for given UDI, including subdocs */
|
||||
bool purgeFile(const string &udi);
|
||||
bool purgeFile(const string &udi, bool *existed = 0);
|
||||
|
||||
/** Remove documents that no longer exist in the file system. This
|
||||
* depends on the update map, which is built during
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user