allow independent creation / deletion of stem dbs
This commit is contained in:
parent
c4ce5cf691
commit
dac569ab51
@ -1,29 +1,29 @@
|
||||
#*
|
||||
*.cache
|
||||
*.core
|
||||
*.o
|
||||
*~
|
||||
*.core
|
||||
*.cache
|
||||
#*
|
||||
.#*
|
||||
.#*
|
||||
.moc
|
||||
.obj
|
||||
.ui
|
||||
.#*
|
||||
CVS
|
||||
alldeps
|
||||
.#*
|
||||
autom4*
|
||||
TAGS
|
||||
alldeps
|
||||
autom4*
|
||||
config.cache
|
||||
config.log
|
||||
config.status
|
||||
excludefile
|
||||
lib/librcl.a
|
||||
makesrcdist.sh
|
||||
recollinstall
|
||||
mk/localdefs
|
||||
sysconf
|
||||
qtgui/Makefile
|
||||
qtgui/preview/Makefile
|
||||
qtgui/preview/preview.pro
|
||||
qtgui/preview/pvmain.cpp
|
||||
lib/librcl.a
|
||||
recollinstall
|
||||
sampleconf/recoll.conf
|
||||
sysconf
|
||||
wxgui
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
|
||||
#include "pathut.h"
|
||||
#include "conftree.h"
|
||||
@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
|
||||
// filesystem anymore.
|
||||
db.purge();
|
||||
|
||||
// Create stemming databases
|
||||
// Create stemming databases. We also remove those which are not
|
||||
// configured.
|
||||
string slangs;
|
||||
if (config->getConfParam("indexstemminglanguages", slangs)) {
|
||||
list<string> langs;
|
||||
stringToStrings(slangs, langs);
|
||||
for (list<string>::const_iterator it = langs.begin();
|
||||
it != langs.end(); it++) {
|
||||
|
||||
// Get the list of existing stem dbs from the database (some may have
|
||||
// been manually created, we just keep those from the config
|
||||
list<string> dblangs = db.getStemLangs();
|
||||
list<string>::const_iterator it;
|
||||
for (it = dblangs.begin(); it != dblangs.end(); it++) {
|
||||
if (find(langs.begin(), langs.end(), *it) == langs.end())
|
||||
db.deleteStemDb(*it);
|
||||
}
|
||||
for (it = langs.begin(); it != langs.end(); it++) {
|
||||
db.createStemDb(*it);
|
||||
}
|
||||
}
|
||||
@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DbIndexer::createStemDb(const string &lang)
|
||||
{
|
||||
if (!init())
|
||||
return false;
|
||||
return db.createStemDb(lang);
|
||||
}
|
||||
|
||||
/**
|
||||
Index individual files, out of a full tree run. No database purging
|
||||
*/
|
||||
bool DbIndexer::indexFiles(const list<string> &filenames)
|
||||
{
|
||||
if (!init())
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _INDEXER_H_INCLUDED_
|
||||
#define _INDEXER_H_INCLUDED_
|
||||
/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
@ -24,10 +24,12 @@ class DbIndexer;
|
||||
class ConfIndexer {
|
||||
public:
|
||||
enum runStatus {IndexerOk, IndexerError};
|
||||
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {}
|
||||
virtual ~ConfIndexer();
|
||||
/** Worker function: doe the actual indexing */
|
||||
bool index(bool resetbefore = false);
|
||||
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0)
|
||||
{
|
||||
}
|
||||
virtual ~ConfIndexer();
|
||||
/** Worker function: doe the actual indexing */
|
||||
bool index(bool resetbefore = false);
|
||||
private:
|
||||
RclConfig *config;
|
||||
DbIndexer *dbindexer; // Object to process directories for a given db
|
||||
@ -36,10 +38,10 @@ class ConfIndexer {
|
||||
/** Index things into one database
|
||||
|
||||
Tree indexing: we inherit from FsTreeWalkerCB so that the processone()
|
||||
method is called by the file-system tree walk code for each file and
|
||||
directory. We keep all state needed while indexing, and finally call
|
||||
the methods to purge the db of stale entries and create the stemming
|
||||
databases.
|
||||
method is called by the file-system tree walk code for each file and
|
||||
directory. We keep all state needed while indexing, and finally call
|
||||
the methods to purge the db of stale entries and create the stemming
|
||||
databases.
|
||||
|
||||
Single file(s) indexing: no database purging or stem db updating.
|
||||
*/
|
||||
@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB {
|
||||
/** Index a list of files. No db cleaning or stemdb updating */
|
||||
bool indexFiles(const std::list<std::string> &files);
|
||||
|
||||
/** Create stem database for given language */
|
||||
bool createStemDb(const string &lang);
|
||||
|
||||
/** Tree walker callback method */
|
||||
FsTreeWalker::Status
|
||||
processone(const std::string &, const struct stat *,
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
@ -19,10 +19,12 @@ using namespace std;
|
||||
#include "pathut.h"
|
||||
|
||||
|
||||
// Globals for exit cleanup
|
||||
ConfIndexer *confindexer;
|
||||
DbIndexer *dbindexer;
|
||||
|
||||
bool indexfiles(RclConfig *config, const list<string> &filenames)
|
||||
// Index a list of files
|
||||
static bool indexfiles(RclConfig *config, const list<string> &filenames)
|
||||
{
|
||||
if (filenames.empty())
|
||||
return true;
|
||||
@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
|
||||
return dbindexer->indexFiles(filenames);
|
||||
}
|
||||
|
||||
// Create additional stem database
|
||||
static bool createstemdb(RclConfig *config, const string &lang)
|
||||
{
|
||||
// Note that we do not bother to check for multiple databases,
|
||||
// which are currently a fiction anyway.
|
||||
string dbdir;
|
||||
if (!config->getConfParam("dbdir", dbdir)) {
|
||||
LOGERR(("createstemdb: no database directory in configuration\n"));
|
||||
return false;
|
||||
}
|
||||
dbdir = path_tildexpand(dbdir);
|
||||
dbindexer = new DbIndexer(config, dbdir);
|
||||
return dbindexer->createStemDb(lang);
|
||||
}
|
||||
|
||||
static void cleanup()
|
||||
{
|
||||
delete confindexer;
|
||||
@ -63,15 +80,19 @@ static int op_flags;
|
||||
#define OPT_z 0x2
|
||||
#define OPT_h 0x4
|
||||
#define OPT_i 0x8
|
||||
#define OPT_s 0x10
|
||||
|
||||
static const char usage [] =
|
||||
" recollindex [-hz] \n"
|
||||
" recollindex -i <filename [filename ...]>\n"
|
||||
"\n"
|
||||
"recollindex [-hz] \n"
|
||||
" Normal index run\n"
|
||||
"recollindex -i <filename [filename ...]>\n"
|
||||
" Index individual files. No db purge or stem database updates\n"
|
||||
"recollindex -s <lang>\n"
|
||||
" Build stem database for language <lang>\n"
|
||||
"Options:\n"
|
||||
" -h : print this message\n"
|
||||
" -z : reset database before starting indexation\n\n"
|
||||
" -i <filename [filename ...]> : index individual files. No db purge or stem\n"
|
||||
" database updates in this case\n"
|
||||
;
|
||||
|
||||
static void
|
||||
@ -97,6 +118,7 @@ int main(int argc, const char **argv)
|
||||
case 'z': op_flags |= OPT_z; break;
|
||||
case 'h': op_flags |= OPT_h; break;
|
||||
case 'i': op_flags |= OPT_i; break;
|
||||
case 's': op_flags |= OPT_s; break;
|
||||
default: Usage(); break;
|
||||
}
|
||||
b1: argc--; argv++;
|
||||
@ -108,7 +130,6 @@ int main(int argc, const char **argv)
|
||||
|
||||
string reason;
|
||||
RclConfig *config = recollinit(cleanup, sigcleanup, reason);
|
||||
|
||||
if (config == 0 || !config->ok()) {
|
||||
cerr << "Configuration problem: " << reason << endl;
|
||||
exit(1);
|
||||
@ -130,6 +151,11 @@ int main(int argc, const char **argv)
|
||||
}
|
||||
}
|
||||
exit(!indexfiles(config, filenames));
|
||||
} else if (op_flags & OPT_s) {
|
||||
if (argc != 1)
|
||||
Usage();
|
||||
string lang = *argv++; argc--;
|
||||
exit(!createstemdb(config, lang));
|
||||
} else {
|
||||
confindexer = new ConfIndexer(config);
|
||||
bool rezero(op_flags & OPT_z);
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
@ -23,6 +23,7 @@ using namespace std;
|
||||
#include "smallut.h"
|
||||
#include "pathhash.h"
|
||||
#include "utf8iter.h"
|
||||
#include "wipedir.h"
|
||||
|
||||
#include "xapian.h"
|
||||
#include <xapian/stem.h>
|
||||
@ -67,23 +68,24 @@ Rcl::Db::~Db()
|
||||
ndb->iswritable));
|
||||
if (ndb->isopen == false)
|
||||
return;
|
||||
string ermsg;
|
||||
const char *ermsg = "Unknown error";
|
||||
try {
|
||||
LOGDEB(("Rcl::Db::~Db: closing native database\n"));
|
||||
if (ndb->iswritable == true)
|
||||
if (ndb->iswritable == true) {
|
||||
ndb->wdb.flush();
|
||||
}
|
||||
delete ndb;
|
||||
return;
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg();
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (const string &s) {
|
||||
ermsg = s;
|
||||
ermsg = s.c_str();
|
||||
} catch (const char *s) {
|
||||
ermsg = s;
|
||||
} catch (...) {
|
||||
ermsg = "Caught unknown exception";
|
||||
}
|
||||
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str()));
|
||||
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
|
||||
}
|
||||
|
||||
bool Rcl::Db::open(const string& dir, OpenMode mode)
|
||||
@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
|
||||
LOGERR(("Rcl::Db::open: already open\n"));
|
||||
return false;
|
||||
}
|
||||
string ermsg;
|
||||
const char *ermsg = "Unknown";
|
||||
try {
|
||||
switch (mode) {
|
||||
case DbUpd:
|
||||
@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
|
||||
ndb->basedir = dir;
|
||||
return true;
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg();
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (const string &s) {
|
||||
ermsg = s;
|
||||
ermsg = s.c_str();
|
||||
} catch (const char *s) {
|
||||
ermsg = s;
|
||||
} catch (...) {
|
||||
ermsg = "Caught unknown exception";
|
||||
}
|
||||
LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n",
|
||||
dir.c_str(), ermsg.c_str()));
|
||||
dir.c_str(), ermsg));
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -148,7 +150,7 @@ bool Rcl::Db::close()
|
||||
ndb->iswritable));
|
||||
if (ndb->isopen == false)
|
||||
return true;
|
||||
string ermsg;
|
||||
const char *ermsg = "Unknown";
|
||||
try {
|
||||
if (ndb->iswritable == true) {
|
||||
ndb->wdb.flush();
|
||||
@ -159,16 +161,15 @@ bool Rcl::Db::close()
|
||||
if (pdata)
|
||||
return true;
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg();
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (const string &s) {
|
||||
ermsg = s;
|
||||
ermsg = s.c_str();
|
||||
} catch (const char *s) {
|
||||
ermsg = s;
|
||||
} catch (...) {
|
||||
ermsg = "Caught unknown exception";
|
||||
}
|
||||
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n",
|
||||
ermsg.c_str()));
|
||||
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg));
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB {
|
||||
// Callback for the document to word splitting class during indexation
|
||||
bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
|
||||
{
|
||||
// cerr << "splitCb: term " << term << endl;
|
||||
//string printable;
|
||||
//transcode(term, printable, "UTF-8", "ISO-8859-1");
|
||||
//cerr << "Adding " << printable << endl;
|
||||
#if 0
|
||||
LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
|
||||
string printable;
|
||||
if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
|
||||
LOGDEB((" [%s]\n", printable.c_str()));
|
||||
}
|
||||
#endif
|
||||
|
||||
const char *ermsg;
|
||||
try {
|
||||
// 1 is the value for wdfinc in index_text when called from omindex
|
||||
// TOBEDONE: check what this is used for
|
||||
// Note: 1 is the within document frequency increment. It would
|
||||
// be possible to assign different weigths to doc parts (ie title)
|
||||
// by using a higher value
|
||||
curpos = pos;
|
||||
doc.add_posting(term, basepos + curpos, 1);
|
||||
return true;
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (...) {
|
||||
LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
|
||||
return false;
|
||||
ermsg= "Unknown error";
|
||||
}
|
||||
return true;
|
||||
LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Unaccent and lowercase data, replace \n\r with spaces
|
||||
@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out)
|
||||
return true;
|
||||
}
|
||||
|
||||
/* omindex direct */
|
||||
/* From omindex direct */
|
||||
/* Truncate a string to a given maxlength, avoiding cutting off midword
|
||||
* if reasonably possible. */
|
||||
string
|
||||
@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen)
|
||||
|
||||
output += " ...";
|
||||
}
|
||||
|
||||
// replace newlines with spaces
|
||||
size_t i = 0;
|
||||
while ((i = output.find('\n', i)) != string::npos) output[i] = ' ';
|
||||
// No need to replace newlines with spaces, we do this in dumb_string()
|
||||
return output;
|
||||
}
|
||||
|
||||
// Truncate longer path and uniquize with hash . The goad for this is
|
||||
// Truncate longer path and uniquize with hash . The goal for this is
|
||||
// to avoid xapian max term length limitations, not to gain space (we
|
||||
// gain very little even with very short maxlens like 30)
|
||||
#define HASHPATH
|
||||
#define PATHHASHLEN 150
|
||||
|
||||
// Add document in internal form to the database: index the terms in
|
||||
@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
|
||||
// Split and index file name. This supposes that it's either ascii
|
||||
// or utf-8. If this fails, we just go on. We need a config
|
||||
// parameter for file name charset
|
||||
// parameter for file name charset.
|
||||
// Do we really want to fold case here ?
|
||||
if (dumb_string(fn, noacc)) {
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
|
||||
// Split body and index terms
|
||||
// Split and index body
|
||||
if (!dumb_string(doc.text, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
|
||||
// Split keywords and index terms
|
||||
// Split and index keywords
|
||||
if (!dumb_string(doc.keywords, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
splitter.text_to_words(noacc);
|
||||
splitData.basepos += splitData.curpos + 100;
|
||||
|
||||
// Split abstract and index terms
|
||||
// Split and index abstract
|
||||
if (!dumb_string(doc.abstract, noacc)) {
|
||||
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
||||
return false;
|
||||
@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
|
||||
// Path name
|
||||
string hash;
|
||||
#ifdef HASHPATH
|
||||
pathHash(fn, hash, PATHHASHLEN);
|
||||
#else
|
||||
hash = fn;
|
||||
#endif
|
||||
LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
|
||||
|
||||
string pathterm = "P" + hash;
|
||||
newdocument.add_term(pathterm);
|
||||
|
||||
// File path + internal path: document unique identifier for
|
||||
// documents inside multidocument files.
|
||||
// Internal path: with path, makes unique identifier for documents
|
||||
// inside multidocument files.
|
||||
string uniterm;
|
||||
if (!doc.ipath.empty()) {
|
||||
uniterm = "Q" + hash + "|" + doc.ipath;
|
||||
@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
string record = "url=file://" + fn;
|
||||
record += "\nmtype=" + doc.mimetype;
|
||||
record += "\nfmtime=" + doc.fmtime;
|
||||
if (!doc.dmtime.empty())
|
||||
if (!doc.dmtime.empty()) {
|
||||
record += "\ndmtime=" + doc.dmtime;
|
||||
}
|
||||
record += "\norigcharset=" + doc.origcharset;
|
||||
record += "\ncaption=" + doc.title;
|
||||
record += "\nkeywords=" + doc.keywords;
|
||||
@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
record += "\nipath=" + doc.ipath;
|
||||
}
|
||||
record += "\n";
|
||||
|
||||
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
|
||||
const char *fnc = fn.c_str();
|
||||
|
||||
// Add db entry or update existing entry:
|
||||
try {
|
||||
Xapian::docid did =
|
||||
@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
||||
}
|
||||
} catch (...) {
|
||||
// FIXME: is this ever actually needed?
|
||||
ndb->wdb.add_document(newdocument);
|
||||
LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n",
|
||||
fnc));
|
||||
try {
|
||||
ndb->wdb.add_document(newdocument);
|
||||
LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n",
|
||||
fnc));
|
||||
} catch (...) {
|
||||
LOGERR(("Rcl::Db::add: failed again after replace_document\n"));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Test if given filename has changed since last indexed:
|
||||
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
||||
{
|
||||
if (pdata == 0)
|
||||
@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
||||
|
||||
// If no document exist with this path, we do need update
|
||||
string hash;
|
||||
#ifdef HASHPATH
|
||||
pathHash(filename, hash, PATHHASHLEN);
|
||||
#else
|
||||
hash = filename;
|
||||
#endif
|
||||
string pathterm = "P" + hash;
|
||||
if (!ndb->wdb.term_exists(pathterm)) {
|
||||
LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str()));
|
||||
return true;
|
||||
}
|
||||
const char *ermsg;
|
||||
|
||||
// Look for all documents with this path. We need to look at all
|
||||
// to set their existence flag. We check the update time on the
|
||||
@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
||||
// file changed)
|
||||
Xapian::PostingIterator doc;
|
||||
try {
|
||||
if (!ndb->wdb.term_exists(pathterm)) {
|
||||
LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
|
||||
return true;
|
||||
}
|
||||
|
||||
Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
|
||||
for (Xapian::PostingIterator docid = docid0;
|
||||
docid != ndb->wdb.postlist_end(pathterm); docid++) {
|
||||
@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
||||
if (*docid < ndb->updated.size())
|
||||
ndb->updated[*docid] = true;
|
||||
}
|
||||
return false;
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (...) {
|
||||
return true;
|
||||
ermsg= "Unknown error";
|
||||
}
|
||||
|
||||
return false;
|
||||
LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg));
|
||||
return true;
|
||||
}
|
||||
|
||||
const static string stemdirstem = "stem_";
|
||||
/// Compute name of stem db for given base database and language
|
||||
static string stemdbname(const string& basename, string lang)
|
||||
{
|
||||
string nm = path_cat(basename, string("stem_") + lang);
|
||||
string nm = path_cat(basename, stemdirstem + lang);
|
||||
return nm;
|
||||
}
|
||||
|
||||
// Is char non-lowercase ascii ?
|
||||
// Deciding if we try to stem the term. If it has numerals or capitals
|
||||
// we don't
|
||||
inline static bool
|
||||
p_notlowerorutf(unsigned int c)
|
||||
{
|
||||
@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c)
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete stem db for given language
|
||||
*/
|
||||
bool Rcl::Db::deleteStemDb(const string& lang)
|
||||
{
|
||||
LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str()));
|
||||
if (pdata == 0)
|
||||
return false;
|
||||
Native *ndb = (Native *)pdata;
|
||||
if (ndb->isopen == false)
|
||||
return false;
|
||||
|
||||
string dir = stemdbname(ndb->basedir, lang);
|
||||
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create database of stem to parents associations for a given language.
|
||||
* We walk the list of all terms, stem them, and create another Xapian db
|
||||
@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang)
|
||||
if (pdata == 0)
|
||||
return false;
|
||||
Native *ndb = (Native *)pdata;
|
||||
if (ndb->isopen == false || ndb->iswritable == false)
|
||||
if (ndb->isopen == false)
|
||||
return false;
|
||||
|
||||
// First build the in-memory stem database:
|
||||
@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang)
|
||||
}
|
||||
assocs.insert(pair<string,string>(stem, *it));
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
||||
return false;
|
||||
} catch (...) {
|
||||
LOGERR(("Stem database build failed: no stemmer for %s ? \n",
|
||||
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
||||
lang.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
class DirWiper {
|
||||
public:
|
||||
string dir;
|
||||
bool do_it;
|
||||
DirWiper(string d) : dir(d), do_it(true) {}
|
||||
~DirWiper() {
|
||||
if (do_it) {
|
||||
wipedir(dir);
|
||||
rmdir(dir.c_str());
|
||||
}
|
||||
}
|
||||
};
|
||||
// Create xapian database for stem relations
|
||||
string stemdbdir = stemdbname(ndb->basedir, lang);
|
||||
string ermsg = "NOERROR";
|
||||
// We want to get rid of the db dir in case of error. This gets disarmed
|
||||
// just before success return.
|
||||
DirWiper wiper(stemdbdir);
|
||||
const char *ermsg = "NOERROR";
|
||||
Xapian::WritableDatabase sdb;
|
||||
try {
|
||||
sdb = Xapian::WritableDatabase(stemdbdir,
|
||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg();
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (const string &s) {
|
||||
ermsg = s;
|
||||
ermsg = s.c_str();
|
||||
} catch (const char *s) {
|
||||
ermsg = s;
|
||||
} catch (...) {
|
||||
@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang)
|
||||
}
|
||||
if (ermsg != "NOERROR") {
|
||||
LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
|
||||
stemdbdir.c_str(), ermsg.c_str()));
|
||||
stemdbdir.c_str(), ermsg));
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang)
|
||||
}
|
||||
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
||||
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
||||
wiper.do_it = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * List the languages for which a stem database directory exists.
 *
 * Globs the main database directory for entries whose name starts with
 * the stem directory prefix (stemdirstem), then strips the leading path
 * and that prefix from each match.
 *
 * @return the list of language names; empty if there is no native
 *         database object or nothing matched
 */
list<string> Rcl::Db::getStemLangs()
{
    list<string> langs;
    LOGDEB(("Rcl::Db::getStemLang\n"));
    if (pdata == 0)
        return langs;

    Native *ndb = (Native *)pdata;
    langs = path_dirglob(ndb->basedir, stemdirstem + "*");

    list<string>::iterator cur = langs.begin();
    while (cur != langs.end()) {
        // Keep only what follows the prefix in the last path element
        string leaf = path_basename(*cur);
        *cur = leaf.substr(stemdirstem.length());
        ++cur;
    }
    return langs;
}
|
||||
|
||||
|
||||
/**
|
||||
* This is called at the end of an indexing session, to delete the
|
||||
* documents for files that are no longer there. We also build the
|
||||
@ -658,7 +721,11 @@ bool Rcl::Db::purge()
|
||||
// and does nothing). Maybe related to the exceptions below when
|
||||
// trying to delete an unexistant document ?
|
||||
// Flushing before trying the deletes seeems to work around the problem
|
||||
ndb->wdb.flush();
|
||||
try {
|
||||
ndb->wdb.flush();
|
||||
} catch (...) {
|
||||
LOGDEB(("Rcl::Db::purge: 1st flush failed\n"));
|
||||
}
|
||||
for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
|
||||
if (!ndb->updated[docid]) {
|
||||
try {
|
||||
@ -669,7 +736,11 @@ bool Rcl::Db::purge()
|
||||
}
|
||||
}
|
||||
}
|
||||
ndb->wdb.flush();
|
||||
try {
|
||||
ndb->wdb.flush();
|
||||
} catch (...) {
|
||||
LOGDEB(("Rcl::Db::purge: 2nd flush failed\n"));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -749,7 +820,6 @@ class wsQData : public TextSplitCB {
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// Turn string into list of xapian queries. There is little
|
||||
// interpretation done on the string (no +term -term or filename:term
|
||||
// stuff). We just separate words and phrases, and interpret
|
||||
@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
|
||||
Native *ndb = (Native *)pdata;
|
||||
|
||||
string hash;
|
||||
#ifdef HASHPATH
|
||||
pathHash(fn, hash, PATHHASHLEN);
|
||||
#else
|
||||
hash = fn;
|
||||
#endif
|
||||
string pathterm = "P" + hash;
|
||||
if (!ndb->db.term_exists(pathterm)) {
|
||||
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
|
||||
pathterm.c_str(), pathterm.length()));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Look for all documents with this path, searching for the one
|
||||
// with the appropriate ipath. This is very inefficient.
|
||||
const char *ermsg = "";
|
||||
try {
|
||||
if (!ndb->db.term_exists(pathterm)) {
|
||||
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
|
||||
pathterm.c_str(), pathterm.length()));
|
||||
return false;
|
||||
}
|
||||
for (Xapian::PostingIterator docid =
|
||||
ndb->db.postlist_begin(pathterm);
|
||||
docid != ndb->db.postlist_end(pathterm); docid++) {
|
||||
@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
|
||||
if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
|
||||
return true;
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (const string &s) {
|
||||
ermsg = s.c_str();
|
||||
} catch (const char *s) {
|
||||
ermsg = s;
|
||||
} catch (...) {
|
||||
return false;
|
||||
ermsg = "Caught unknown exception";
|
||||
}
|
||||
if (*ermsg) {
|
||||
LOGERR(("Rcl::Db::getDoc: %s\n", ermsg));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#ifndef _DB_H_INCLUDED_
|
||||
#define _DB_H_INCLUDED_
|
||||
/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
@ -102,6 +102,7 @@ public:
|
||||
bool needUpdate(const string &filename, const struct stat *stp);
|
||||
bool purge();
|
||||
bool createStemDb(const string &lang);
|
||||
bool deleteStemDb(const string &lang);
|
||||
|
||||
// Query-related functions
|
||||
|
||||
@ -127,6 +128,10 @@ public:
|
||||
/** Get results count for current query */
|
||||
int getResCnt();
|
||||
|
||||
/** Get a list of existing stemming databases */
|
||||
std::list<std::string> getStemLangs();
|
||||
|
||||
/** Things we don't want to have here. */
|
||||
friend class Rcl::DbPops;
|
||||
|
||||
private:
|
||||
|
||||
@ -15,8 +15,8 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h
|
||||
$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
|
||||
-DTEST_FSTREEWALK fstreewalk.cpp
|
||||
|
||||
PATHUT_OBJS= trpathut.o pathut.o
|
||||
trpathut : $(PATHUT_OBJS)
|
||||
PATHUT_OBJS= trpathut.o pathut.o $(BIGLIB)
|
||||
trpathut : $(PATHUT_OBJS)
|
||||
$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
|
||||
trpathut.o : pathut.cpp pathut.h
|
||||
$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp
|
||||
|
||||
@ -1,15 +1,21 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
|
||||
#ifndef TEST_PATHUT
|
||||
#include <unistd.h>
|
||||
#include <sys/param.h>
|
||||
#include <pwd.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <stack>
|
||||
|
||||
#include "pathut.h"
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::list;
|
||||
using std::stack;
|
||||
#endif /* NO_NAMESPACES */
|
||||
|
||||
void path_catslash(std::string &s) {
|
||||
@ -61,6 +67,18 @@ string path_getsimple(const string &s) {
|
||||
return simple;
|
||||
}
|
||||
|
||||
// Return the last element of path s (as computed by path_getsimple()),
// with the trailing suffix suff removed when the element ends with it.
// The suffix is only stripped if it is non-empty and strictly shorter
// than the element, so the result is never empty because of stripping.
string path_basename(const string &s, const string &suff)
{
    string simple = path_getsimple(s);

    // Nothing to strip, or stripping would consume the whole name
    if (suff.empty() || simple.length() <= suff.length())
        return simple;

    // The last occurrence of suff must sit exactly at the end
    const string::size_type cut = simple.rfind(suff);
    if (cut == string::npos || cut + suff.length() != simple.length())
        return simple;

    return simple.substr(0, cut);
}
|
||||
|
||||
string path_home()
|
||||
{
|
||||
uid_t uid = getuid();
|
||||
@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s)
|
||||
return o;
|
||||
}
|
||||
|
||||
#include <smallut.h>
|
||||
extern std::string path_canon(const std::string &is)
|
||||
{
|
||||
if (is.length() == 0)
|
||||
return is;
|
||||
string s = is;
|
||||
if (s[0] != '/') {
|
||||
char buf[MAXPATHLEN];
|
||||
if (!getcwd(buf, MAXPATHLEN)) {
|
||||
return "";
|
||||
}
|
||||
s = path_cat(string(buf), s);
|
||||
}
|
||||
list<string>elems;
|
||||
stringToTokens(s, elems, "/");
|
||||
list<string> cleaned;
|
||||
for (list<string>::const_iterator it = elems.begin();
|
||||
it != elems.end(); it++){
|
||||
if (*it == "..") {
|
||||
if (!cleaned.empty())
|
||||
cleaned.pop_back();
|
||||
} else if (it->empty() || *it == ".") {
|
||||
} else {
|
||||
cleaned.push_back(*it);
|
||||
}
|
||||
}
|
||||
string ret;
|
||||
if (!cleaned.empty()) {
|
||||
for (list<string>::const_iterator it = cleaned.begin();
|
||||
it != cleaned.end(); it++) {
|
||||
ret += "/";
|
||||
ret += *it;
|
||||
}
|
||||
} else {
|
||||
ret = "/";
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#include <glob.h>
|
||||
#include <sys/stat.h>
|
||||
// Return the paths matching 'pattern' inside directory 'dir'.
// The pattern is appended to dir with path_cat() and passed to glob(3)
// with no flags. Every match is returned as glob() reports it; entries
// are not checked for being directories.
// Returns an empty list if glob() reports no match or an error.
list<std::string> path_dirglob(const std::string &dir,
                               const std::string pattern)
{
    list<string> res;
    // Value-initialised so that globfree() below is safe on every path
    glob_t mglob = glob_t();
    string mypat = path_cat(dir, pattern);

    if (glob(mypat.c_str(), 0, 0, &mglob) == 0) {
        // gl_pathc is a size_t: use an unsigned index
        for (size_t i = 0; i < mglob.gl_pathc; i++) {
            res.push_back(mglob.gl_pathv[i]);
        }
    }
    // glob() may have allocated partial results even when it fails
    // (GLOB_ABORTED / GLOB_NOSPACE), so always release them.
    globfree(&mglob);
    return res;
}
|
||||
|
||||
|
||||
#else // TEST_PATHUT
|
||||
|
||||
#include <iostream>
|
||||
@ -108,7 +184,7 @@ using namespace std;
|
||||
const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
|
||||
"/dir1/dir2",
|
||||
"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
|
||||
"/dir/.c",
|
||||
"/dir/.c", "/dir/toto.txt", "toto.txt1"
|
||||
};
|
||||
|
||||
const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
|
||||
@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string);
|
||||
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
string s;
|
||||
list<string>::const_iterator it;
|
||||
#if 0
|
||||
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
||||
cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl;
|
||||
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
||||
cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl;
|
||||
}
|
||||
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
||||
cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl;
|
||||
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
||||
cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl;
|
||||
}
|
||||
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
||||
cout << tstvec[i] << " Basename " <<
|
||||
path_basename(tstvec[i], ".txt") << endl;
|
||||
}
|
||||
#endif
|
||||
string s;
|
||||
|
||||
#if 0
|
||||
for (int i = 0; i < nttvec; i++) {
|
||||
cout << "tildexp: '" << ttvec[i] << "' -> '" <<
|
||||
path_tildexpand(ttvec[i]) << "'" << endl;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
const string canontst[] = {"/dir1/../../..", "/////", "",
|
||||
"/dir1/../../.././/////dir2///////",
|
||||
"../../",
|
||||
"../../../../../../../../../../"
|
||||
};
|
||||
unsigned int nttvec = sizeof(canontst) / sizeof(string);
|
||||
for (unsigned int i = 0; i < nttvec; i++) {
|
||||
cout << "canon: '" << canontst[i] << "' -> '" <<
|
||||
path_canon(canontst[i]) << "'" << endl;
|
||||
}
|
||||
#endif
|
||||
#if 1
|
||||
if (argc != 3) {
|
||||
fprintf(stderr, "Usage: trpathut <dir> <pattern>\n");
|
||||
exit(1);
|
||||
}
|
||||
string dir=argv[1], pattern=argv[2];
|
||||
list<string> matched = path_dirglob(dir, pattern);
|
||||
for (it = matched.begin(); it != matched.end();it++) {
|
||||
cout << *it << endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1,14 +1,19 @@
|
||||
#ifndef _PATHUT_H_INCLUDED_
|
||||
#define _PATHUT_H_INCLUDED_
|
||||
/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
|
||||
extern void path_catslash(std::string &s);
|
||||
extern std::string path_cat(const std::string &s1, const std::string &s2);
|
||||
extern std::string path_getsimple(const std::string &s);
|
||||
extern std::string path_basename(const std::string &s, const std::string &suff="");
|
||||
extern std::string path_getfather(const std::string &s);
|
||||
extern std::string path_home();
|
||||
extern std::string path_tildexpand(const std::string &s);
|
||||
|
||||
extern std::string path_canon(const std::string &s);
|
||||
extern std::list<std::string> path_dirglob(const std::string &dir,
|
||||
const std::string pattern);
|
||||
#endif /* _PATHUT_H_INCLUDED_ */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user