allow independent creation / deletion of stem dbs

This commit is contained in:
dockes 2006-01-09 16:53:31 +00:00
parent c4ce5cf691
commit dac569ab51
9 changed files with 364 additions and 122 deletions

View File

@ -1,29 +1,29 @@
#*
*.cache
*.core
*.o *.o
*~ *~
*.core .#*
*.cache .#*
#*
.moc .moc
.obj .obj
.ui .ui
.#*
CVS CVS
alldeps
.#*
autom4*
TAGS TAGS
alldeps
autom4*
config.cache config.cache
config.log config.log
config.status config.status
excludefile excludefile
lib/librcl.a
makesrcdist.sh makesrcdist.sh
recollinstall
mk/localdefs mk/localdefs
sysconf
qtgui/Makefile qtgui/Makefile
qtgui/preview/Makefile qtgui/preview/Makefile
qtgui/preview/preview.pro qtgui/preview/preview.pro
qtgui/preview/pvmain.cpp qtgui/preview/pvmain.cpp
lib/librcl.a recollinstall
sampleconf/recoll.conf sampleconf/recoll.conf
sysconf
wxgui wxgui

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp
#include <iostream> #include <iostream>
#include <list> #include <list>
#include <map> #include <map>
#include <algorithm>
#include "pathut.h" #include "pathut.h"
#include "conftree.h" #include "conftree.h"
@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
// filesystem anymore. // filesystem anymore.
db.purge(); db.purge();
// Create stemming databases // Create stemming databases. We also remove those which are not
// configured.
string slangs; string slangs;
if (config->getConfParam("indexstemminglanguages", slangs)) { if (config->getConfParam("indexstemminglanguages", slangs)) {
list<string> langs; list<string> langs;
stringToStrings(slangs, langs); stringToStrings(slangs, langs);
for (list<string>::const_iterator it = langs.begin();
it != langs.end(); it++) { // Get the list of existing stem dbs from the database (some may have
// been manually created, we just keep those from the config
list<string> dblangs = db.getStemLangs();
list<string>::const_iterator it;
for (it = dblangs.begin(); it != dblangs.end(); it++) {
if (find(langs.begin(), langs.end(), *it) == langs.end())
db.deleteStemDb(*it);
}
for (it = langs.begin(); it != langs.end(); it++) {
db.createStemDb(*it); db.createStemDb(*it);
} }
} }
@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore)
return true; return true;
} }
bool DbIndexer::createStemDb(const string &lang)
{
if (!init())
return false;
return db.createStemDb(lang);
}
/**
Index individual files, out of a full tree run. No database purging
*/
bool DbIndexer::indexFiles(const list<string> &filenames) bool DbIndexer::indexFiles(const list<string> &filenames)
{ {
if (!init()) if (!init())

View File

@ -1,6 +1,6 @@
#ifndef _INDEXER_H_INCLUDED_ #ifndef _INDEXER_H_INCLUDED_
#define _INDEXER_H_INCLUDED_ #define _INDEXER_H_INCLUDED_
/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
@ -24,10 +24,12 @@ class DbIndexer;
class ConfIndexer { class ConfIndexer {
public: public:
enum runStatus {IndexerOk, IndexerError}; enum runStatus {IndexerOk, IndexerError};
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {} ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0)
virtual ~ConfIndexer(); {
/** Worker function: doe the actual indexing */ }
bool index(bool resetbefore = false); virtual ~ConfIndexer();
/** Worker function: doe the actual indexing */
bool index(bool resetbefore = false);
private: private:
RclConfig *config; RclConfig *config;
DbIndexer *dbindexer; // Object to process directories for a given db DbIndexer *dbindexer; // Object to process directories for a given db
@ -36,10 +38,10 @@ class ConfIndexer {
/** Index things into one database /** Index things into one database
Tree indexing: we inherits FsTreeWalkerCB so that, the processone() Tree indexing: we inherits FsTreeWalkerCB so that, the processone()
method is called by the file-system tree walk code for each file and method is called by the file-system tree walk code for each file and
directory. We keep all state needed while indexing, and finally call directory. We keep all state needed while indexing, and finally call
the methods to purge the db of stale entries and create the stemming the methods to purge the db of stale entries and create the stemming
databases. databases.
Single file(s) indexing: no database purging or stem db updating. Single file(s) indexing: no database purging or stem db updating.
*/ */
@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB {
/** Index a list of files. No db cleaning or stemdb updating */ /** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(const std::list<std::string> &files); bool indexFiles(const std::list<std::string> &files);
/** Create stem database for given language */
bool createStemDb(const string &lang);
/** Tree walker callback method */ /** Tree walker callback method */
FsTreeWalker::Status FsTreeWalker::Status
processone(const std::string &, const struct stat *, processone(const std::string &, const struct stat *,

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
@ -19,10 +19,12 @@ using namespace std;
#include "pathut.h" #include "pathut.h"
// Globals for exit cleanup
ConfIndexer *confindexer; ConfIndexer *confindexer;
DbIndexer *dbindexer; DbIndexer *dbindexer;
bool indexfiles(RclConfig *config, const list<string> &filenames) // Index a list of files
static bool indexfiles(RclConfig *config, const list<string> &filenames)
{ {
if (filenames.empty()) if (filenames.empty())
return true; return true;
@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
return dbindexer->indexFiles(filenames); return dbindexer->indexFiles(filenames);
} }
// Create additional stem database
static bool createstemdb(RclConfig *config, const string &lang)
{
// Note that we do not bother to check for multiple databases,
// which are currently a fiction anyway.
string dbdir;
if (!config->getConfParam("dbdir", dbdir)) {
LOGERR(("createstemdb: no database directory in configuration\n"));
return false;
}
dbdir = path_tildexpand(dbdir);
dbindexer = new DbIndexer(config, dbdir);
return dbindexer->createStemDb(lang);
}
static void cleanup() static void cleanup()
{ {
delete confindexer; delete confindexer;
@ -63,15 +80,19 @@ static int op_flags;
#define OPT_z 0x2 #define OPT_z 0x2
#define OPT_h 0x4 #define OPT_h 0x4
#define OPT_i 0x8 #define OPT_i 0x8
#define OPT_s 0x10
static const char usage [] = static const char usage [] =
" recollindex [-hz] \n" "\n"
" recollindex -i <filename [filename ...]>\n" "recollindex [-hz] \n"
" Normal index run\n"
"recollindex -i <filename [filename ...]>\n"
" Index individual files. No db purge or stem database updates\n"
"recollindex -s <lang>\n"
" Build stem database for language <lang>\n"
"Options:\n" "Options:\n"
" -h : print this message\n" " -h : print this message\n"
" -z : reset database before starting indexation\n\n" " -z : reset database before starting indexation\n\n"
" -i <filename [filename ...]> : index individual files. No db purge or stem\n"
" database updates in this case\n"
; ;
static void static void
@ -97,6 +118,7 @@ int main(int argc, const char **argv)
case 'z': op_flags |= OPT_z; break; case 'z': op_flags |= OPT_z; break;
case 'h': op_flags |= OPT_h; break; case 'h': op_flags |= OPT_h; break;
case 'i': op_flags |= OPT_i; break; case 'i': op_flags |= OPT_i; break;
case 's': op_flags |= OPT_s; break;
default: Usage(); break; default: Usage(); break;
} }
b1: argc--; argv++; b1: argc--; argv++;
@ -108,7 +130,6 @@ int main(int argc, const char **argv)
string reason; string reason;
RclConfig *config = recollinit(cleanup, sigcleanup, reason); RclConfig *config = recollinit(cleanup, sigcleanup, reason);
if (config == 0 || !config->ok()) { if (config == 0 || !config->ok()) {
cerr << "Configuration problem: " << reason << endl; cerr << "Configuration problem: " << reason << endl;
exit(1); exit(1);
@ -130,6 +151,11 @@ int main(int argc, const char **argv)
} }
} }
exit(!indexfiles(config, filenames)); exit(!indexfiles(config, filenames));
} else if (op_flags & OPT_s) {
if (argc != 1)
Usage();
string lang = *argv++; argc--;
exit(!createstemdb(config, lang));
} else { } else {
confindexer = new ConfIndexer(config); confindexer = new ConfIndexer(config);
bool rezero(op_flags & OPT_z); bool rezero(op_flags & OPT_z);

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <sys/stat.h> #include <sys/stat.h>
@ -23,6 +23,7 @@ using namespace std;
#include "smallut.h" #include "smallut.h"
#include "pathhash.h" #include "pathhash.h"
#include "utf8iter.h" #include "utf8iter.h"
#include "wipedir.h"
#include "xapian.h" #include "xapian.h"
#include <xapian/stem.h> #include <xapian/stem.h>
@ -67,23 +68,24 @@ Rcl::Db::~Db()
ndb->iswritable)); ndb->iswritable));
if (ndb->isopen == false) if (ndb->isopen == false)
return; return;
string ermsg; const char *ermsg = "Unknown error";
try { try {
LOGDEB(("Rcl::Db::~Db: closing native database\n")); LOGDEB(("Rcl::Db::~Db: closing native database\n"));
if (ndb->iswritable == true) if (ndb->iswritable == true) {
ndb->wdb.flush(); ndb->wdb.flush();
}
delete ndb; delete ndb;
return; return;
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
ermsg = e.get_msg(); ermsg = e.get_msg().c_str();
} catch (const string &s) { } catch (const string &s) {
ermsg = s; ermsg = s.c_str();
} catch (const char *s) { } catch (const char *s) {
ermsg = s; ermsg = s;
} catch (...) { } catch (...) {
ermsg = "Caught unknown exception"; ermsg = "Caught unknown exception";
} }
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str())); LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
} }
bool Rcl::Db::open(const string& dir, OpenMode mode) bool Rcl::Db::open(const string& dir, OpenMode mode)
@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
LOGERR(("Rcl::Db::open: already open\n")); LOGERR(("Rcl::Db::open: already open\n"));
return false; return false;
} }
string ermsg; const char *ermsg = "Unknown";
try { try {
switch (mode) { switch (mode) {
case DbUpd: case DbUpd:
@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
ndb->basedir = dir; ndb->basedir = dir;
return true; return true;
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
ermsg = e.get_msg(); ermsg = e.get_msg().c_str();
} catch (const string &s) { } catch (const string &s) {
ermsg = s; ermsg = s.c_str();
} catch (const char *s) { } catch (const char *s) {
ermsg = s; ermsg = s;
} catch (...) { } catch (...) {
ermsg = "Caught unknown exception"; ermsg = "Caught unknown exception";
} }
LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n", LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n",
dir.c_str(), ermsg.c_str())); dir.c_str(), ermsg));
return false; return false;
} }
@ -148,7 +150,7 @@ bool Rcl::Db::close()
ndb->iswritable)); ndb->iswritable));
if (ndb->isopen == false) if (ndb->isopen == false)
return true; return true;
string ermsg; const char *ermsg = "Unknown";
try { try {
if (ndb->iswritable == true) { if (ndb->iswritable == true) {
ndb->wdb.flush(); ndb->wdb.flush();
@ -159,16 +161,15 @@ bool Rcl::Db::close()
if (pdata) if (pdata)
return true; return true;
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
ermsg = e.get_msg(); ermsg = e.get_msg().c_str();
} catch (const string &s) { } catch (const string &s) {
ermsg = s; ermsg = s.c_str();
} catch (const char *s) { } catch (const char *s) {
ermsg = s; ermsg = s;
} catch (...) { } catch (...) {
ermsg = "Caught unknown exception"; ermsg = "Caught unknown exception";
} }
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg));
ermsg.c_str()));
return false; return false;
} }
@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB {
// Callback for the document to word splitting class during indexation // Callback for the document to word splitting class during indexation
bool mySplitterCB::takeword(const std::string &term, int pos, int, int) bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
{ {
// cerr << "splitCb: term " << term << endl; #if 0
//string printable; LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
//transcode(term, printable, "UTF-8", "ISO-8859-1"); string printable;
//cerr << "Adding " << printable << endl; if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
LOGDEB((" [%s]\n", printable.c_str()));
}
#endif
const char *ermsg;
try { try {
// 1 is the value for wdfinc in index_text when called from omindex // Note: 1 is the within document frequency increment. It would
// TOBEDONE: check what this is used for // be possible to assign different weigths to doc parts (ie title)
// by using a higher value
curpos = pos; curpos = pos;
doc.add_posting(term, basepos + curpos, 1); doc.add_posting(term, basepos + curpos, 1);
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (...) { } catch (...) {
LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n")); ermsg= "Unknown error";
return false;
} }
return true; LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg));
return false;
} }
// Unaccent and lowercase data, replace \n\r with spaces // Unaccent and lowercase data, replace \n\r with spaces
@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out)
return true; return true;
} }
/* omindex direct */ /* From omindex direct */
/* Truncate a string to a given maxlength, avoiding cutting off midword /* Truncate a string to a given maxlength, avoiding cutting off midword
* if reasonably possible. */ * if reasonably possible. */
string string
@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen)
output += " ..."; output += " ...";
} }
// No need to replace newlines with spaces, we do this in dumb_string()
// replace newlines with spaces
size_t i = 0;
while ((i = output.find('\n', i)) != string::npos) output[i] = ' ';
return output; return output;
} }
// Truncate longer path and uniquize with hash . The goad for this is // Truncate longer path and uniquize with hash . The goal for this is
// to avoid xapian max term length limitations, not to gain space (we // to avoid xapian max term length limitations, not to gain space (we
// gain very little even with very short maxlens like 30) // gain very little even with very short maxlens like 30)
#define HASHPATH
#define PATHHASHLEN 150 #define PATHHASHLEN 150
// Add document in internal form to the database: index the terms in // Add document in internal form to the database: index the terms in
@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
// Split and index file name. This supposes that it's either ascii // Split and index file name. This supposes that it's either ascii
// or utf-8. If this fails, we just go on. We need a config // or utf-8. If this fails, we just go on. We need a config
// parameter for file name charset // parameter for file name charset.
// Do we really want to fold case here ?
if (dumb_string(fn, noacc)) { if (dumb_string(fn, noacc)) {
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
// Split body and index terms // Split and index body
if (!dumb_string(doc.text, noacc)) { if (!dumb_string(doc.text, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n")); LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false; return false;
@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
// Split keywords and index terms // Split and index keywords
if (!dumb_string(doc.keywords, noacc)) { if (!dumb_string(doc.keywords, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n")); LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false; return false;
@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc); splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100; splitData.basepos += splitData.curpos + 100;
// Split abstract and index terms // Split and index abstract
if (!dumb_string(doc.abstract, noacc)) { if (!dumb_string(doc.abstract, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n")); LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false; return false;
@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
// Path name // Path name
string hash; string hash;
#ifdef HASHPATH
pathHash(fn, hash, PATHHASHLEN); pathHash(fn, hash, PATHHASHLEN);
#else
hash = fn;
#endif
LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str())); LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
string pathterm = "P" + hash; string pathterm = "P" + hash;
newdocument.add_term(pathterm); newdocument.add_term(pathterm);
// File path + internal path: document unique identifier for // Internal path: with path, makes unique identifier for documents
// documents inside multidocument files. // inside multidocument files.
string uniterm; string uniterm;
if (!doc.ipath.empty()) { if (!doc.ipath.empty()) {
uniterm = "Q" + hash + "|" + doc.ipath; uniterm = "Q" + hash + "|" + doc.ipath;
@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
string record = "url=file://" + fn; string record = "url=file://" + fn;
record += "\nmtype=" + doc.mimetype; record += "\nmtype=" + doc.mimetype;
record += "\nfmtime=" + doc.fmtime; record += "\nfmtime=" + doc.fmtime;
if (!doc.dmtime.empty()) if (!doc.dmtime.empty()) {
record += "\ndmtime=" + doc.dmtime; record += "\ndmtime=" + doc.dmtime;
}
record += "\norigcharset=" + doc.origcharset; record += "\norigcharset=" + doc.origcharset;
record += "\ncaption=" + doc.title; record += "\ncaption=" + doc.title;
record += "\nkeywords=" + doc.keywords; record += "\nkeywords=" + doc.keywords;
@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
record += "\nipath=" + doc.ipath; record += "\nipath=" + doc.ipath;
} }
record += "\n"; record += "\n";
LOGDEB1(("Newdocument data: %s\n", record.c_str())); LOGDEB1(("Newdocument data: %s\n", record.c_str()));
newdocument.set_data(record); newdocument.set_data(record);
const char *fnc = fn.c_str(); const char *fnc = fn.c_str();
// Add db entry or update existing entry: // Add db entry or update existing entry:
try { try {
Xapian::docid did = Xapian::docid did =
@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
} }
} catch (...) { } catch (...) {
// FIXME: is this ever actually needed? // FIXME: is this ever actually needed?
ndb->wdb.add_document(newdocument); try {
LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n", ndb->wdb.add_document(newdocument);
fnc)); LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n",
fnc));
} catch (...) {
LOGERR(("Rcl::Db::add: failed again after replace_document\n"));
return false;
}
} }
return true; return true;
} }
// Test if given filename has changed since last indexed:
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
{ {
if (pdata == 0) if (pdata == 0)
@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
// If no document exist with this path, we do need update // If no document exist with this path, we do need update
string hash; string hash;
#ifdef HASHPATH
pathHash(filename, hash, PATHHASHLEN); pathHash(filename, hash, PATHHASHLEN);
#else
hash = filename;
#endif
string pathterm = "P" + hash; string pathterm = "P" + hash;
if (!ndb->wdb.term_exists(pathterm)) { const char *ermsg;
LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str()));
return true;
}
// Look for all documents with this path. We need to look at all // Look for all documents with this path. We need to look at all
// to set their existence flag. We check the update time on the // to set their existence flag. We check the update time on the
@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
// file changed) // file changed)
Xapian::PostingIterator doc; Xapian::PostingIterator doc;
try { try {
if (!ndb->wdb.term_exists(pathterm)) {
LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
return true;
}
Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm); Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
for (Xapian::PostingIterator docid = docid0; for (Xapian::PostingIterator docid = docid0;
docid != ndb->wdb.postlist_end(pathterm); docid++) { docid != ndb->wdb.postlist_end(pathterm); docid++) {
@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
if (*docid < ndb->updated.size()) if (*docid < ndb->updated.size())
ndb->updated[*docid] = true; ndb->updated[*docid] = true;
} }
return false;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (...) { } catch (...) {
return true; ermsg= "Unknown error";
} }
LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg));
return false; return true;
} }
const static string stemdirstem = "stem_";
/// Compute name of stem db for given base database and language /// Compute name of stem db for given base database and language
static string stemdbname(const string& basename, string lang) static string stemdbname(const string& basename, string lang)
{ {
string nm = path_cat(basename, string("stem_") + lang); string nm = path_cat(basename, stemdirstem + lang);
return nm; return nm;
} }
// Is char non-lowercase ascii ? // Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool inline static bool
p_notlowerorutf(unsigned int c) p_notlowerorutf(unsigned int c)
{ {
@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c)
return false; return false;
} }
/**
* Delete stem db for given language
*/
bool Rcl::Db::deleteStemDb(const string& lang)
{
LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str()));
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
if (ndb->isopen == false)
return false;
string dir = stemdbname(ndb->basedir, lang);
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
return true;
return false;
}
/** /**
* Create database of stem to parents associations for a given language. * Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db * We walk the list of all terms, stem them, and create another Xapian db
@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang)
if (pdata == 0) if (pdata == 0)
return false; return false;
Native *ndb = (Native *)pdata; Native *ndb = (Native *)pdata;
if (ndb->isopen == false || ndb->iswritable == false) if (ndb->isopen == false)
return false; return false;
// First build the in-memory stem database: // First build the in-memory stem database:
@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang)
} }
assocs.insert(pair<string,string>(stem, *it)); assocs.insert(pair<string,string>(stem, *it));
} }
} catch (const Xapian::Error &e) {
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
return false;
} catch (...) { } catch (...) {
LOGERR(("Stem database build failed: no stemmer for %s ? \n", LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
lang.c_str())); lang.c_str()));
return false; return false;
} }
class DirWiper {
public:
string dir;
bool do_it;
DirWiper(string d) : dir(d), do_it(true) {}
~DirWiper() {
if (do_it) {
wipedir(dir);
rmdir(dir.c_str());
}
}
};
// Create xapian database for stem relations // Create xapian database for stem relations
string stemdbdir = stemdbname(ndb->basedir, lang); string stemdbdir = stemdbname(ndb->basedir, lang);
string ermsg = "NOERROR"; // We want to get rid of the db dir in case of error. This gets disarmed
// just before success return.
DirWiper wiper(stemdbdir);
const char *ermsg = "NOERROR";
Xapian::WritableDatabase sdb; Xapian::WritableDatabase sdb;
try { try {
sdb = Xapian::WritableDatabase(stemdbdir, sdb = Xapian::WritableDatabase(stemdbdir,
Xapian::DB_CREATE_OR_OVERWRITE); Xapian::DB_CREATE_OR_OVERWRITE);
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
ermsg = e.get_msg(); ermsg = e.get_msg().c_str();
} catch (const string &s) { } catch (const string &s) {
ermsg = s; ermsg = s.c_str();
} catch (const char *s) { } catch (const char *s) {
ermsg = s; ermsg = s;
} catch (...) { } catch (...) {
@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang)
} }
if (ermsg != "NOERROR") { if (ermsg != "NOERROR") {
LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n", LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
stemdbdir.c_str(), ermsg.c_str())); stemdbdir.c_str(), ermsg));
return false; return false;
} }
@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang)
} }
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst)); assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
wiper.do_it = false;
return true; return true;
} }
list<string> Rcl::Db::getStemLangs()
{
list<string> dirs;
LOGDEB(("Rcl::Db::getStemLang\n"));
if (pdata == 0)
return dirs;
Native *ndb = (Native *)pdata;
string pattern = stemdirstem + "*";
dirs = path_dirglob(ndb->basedir, pattern);
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
*it = path_basename(*it);
*it = it->substr(stemdirstem.length(), string::npos);
}
return dirs;
}
/** /**
* This is called at the end of an indexing session, to delete the * This is called at the end of an indexing session, to delete the
* documents for files that are no longer there. We also build the * documents for files that are no longer there. We also build the
@ -658,7 +721,11 @@ bool Rcl::Db::purge()
// and does nothing). Maybe related to the exceptions below when // and does nothing). Maybe related to the exceptions below when
// trying to delete an unexistant document ? // trying to delete an unexistant document ?
// Flushing before trying the deletes seeems to work around the problem // Flushing before trying the deletes seeems to work around the problem
ndb->wdb.flush(); try {
ndb->wdb.flush();
} catch (...) {
LOGDEB(("Rcl::Db::purge: 1st flush failed\n"));
}
for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) { for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
if (!ndb->updated[docid]) { if (!ndb->updated[docid]) {
try { try {
@ -669,7 +736,11 @@ bool Rcl::Db::purge()
} }
} }
} }
ndb->wdb.flush(); try {
ndb->wdb.flush();
} catch (...) {
LOGDEB(("Rcl::Db::purge: 2nd flush failed\n"));
}
return true; return true;
} }
@ -749,7 +820,6 @@ class wsQData : public TextSplitCB {
}; };
//
// Turn string into list of xapian queries. There is little // Turn string into list of xapian queries. There is little
// interpretation done on the string (no +term -term or filename:term // interpretation done on the string (no +term -term or filename:term
// stuff). We just separate words and phrases, and interpret // stuff). We just separate words and phrases, and interpret
@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
Native *ndb = (Native *)pdata; Native *ndb = (Native *)pdata;
string hash; string hash;
#ifdef HASHPATH
pathHash(fn, hash, PATHHASHLEN); pathHash(fn, hash, PATHHASHLEN);
#else
hash = fn;
#endif
string pathterm = "P" + hash; string pathterm = "P" + hash;
if (!ndb->db.term_exists(pathterm)) {
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
pathterm.c_str(), pathterm.length()));
return false;
}
// Look for all documents with this path, searching for the one // Look for all documents with this path, searching for the one
// with the appropriate ipath. This is very inefficient. // with the appropriate ipath. This is very inefficient.
const char *ermsg = "";
try { try {
if (!ndb->db.term_exists(pathterm)) {
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
pathterm.c_str(), pathterm.length()));
return false;
}
for (Xapian::PostingIterator docid = for (Xapian::PostingIterator docid =
ndb->db.postlist_begin(pathterm); ndb->db.postlist_begin(pathterm);
docid != ndb->db.postlist_end(pathterm); docid++) { docid != ndb->db.postlist_end(pathterm); docid++) {
@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
if (dbDataToRclDoc(data, doc) && doc.ipath == ipath) if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
return true; return true;
} }
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) { } catch (...) {
return false; ermsg = "Caught unknown exception";
}
if (*ermsg) {
LOGERR(("Rcl::Db::getDoc: %s\n", ermsg));
} }
return false; return false;
} }

View File

@ -1,6 +1,6 @@
#ifndef _DB_H_INCLUDED_ #ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_ #define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list> #include <list>
@ -102,6 +102,7 @@ public:
bool needUpdate(const string &filename, const struct stat *stp); bool needUpdate(const string &filename, const struct stat *stp);
bool purge(); bool purge();
bool createStemDb(const string &lang); bool createStemDb(const string &lang);
bool deleteStemDb(const string &lang);
// Query-related functions // Query-related functions
@ -127,6 +128,10 @@ public:
/** Get results count for current query */ /** Get results count for current query */
int getResCnt(); int getResCnt();
/** Get a list of existing stemming databases */
std::list<std::string> getStemLangs();
/** Things we don't want to have here. */
friend class Rcl::DbPops; friend class Rcl::DbPops;
private: private:

View File

@ -15,8 +15,8 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h
$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \ $(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
-DTEST_FSTREEWALK fstreewalk.cpp -DTEST_FSTREEWALK fstreewalk.cpp
PATHUT_OBJS= trpathut.o pathut.o PATHUT_OBJS= trpathut.o pathut.o $(BIGLIB)
trpathut : $(PATHUT_OBJS) trpathut : $(PATHUT_OBJS)
$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS) $(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
trpathut.o : pathut.cpp pathut.h trpathut.o : pathut.cpp pathut.h
$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp $(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp

View File

@ -1,15 +1,21 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
#ifndef TEST_PATHUT #ifndef TEST_PATHUT
#include <unistd.h> #include <unistd.h>
#include <sys/param.h>
#include <pwd.h> #include <pwd.h>
#include <iostream> #include <iostream>
#include <list>
#include <stack>
#include "pathut.h" #include "pathut.h"
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
using std::string; using std::string;
using std::list;
using std::stack;
#endif /* NO_NAMESPACES */ #endif /* NO_NAMESPACES */
void path_catslash(std::string &s) { void path_catslash(std::string &s) {
@ -61,6 +67,18 @@ string path_getsimple(const string &s) {
return simple; return simple;
} }
string path_basename(const string &s, const string &suff)
{
    // Last path component of s, as computed by path_getsimple()
    const string leaf = path_getsimple(s);
    const string::size_type sufflen = suff.length();

    // Nothing to strip when no suffix was given, or when the component
    // is not strictly longer than the suffix (a name equal to the
    // suffix is returned whole).
    if (sufflen == 0 || leaf.length() <= sufflen)
        return leaf;

    // Strip the suffix only if it is exactly the tail of the component
    const string::size_type cut = leaf.length() - sufflen;
    if (leaf.compare(cut, sufflen, suff) == 0)
        return leaf.substr(0, cut);

    return leaf;
}
string path_home() string path_home()
{ {
uid_t uid = getuid(); uid_t uid = getuid();
@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s)
return o; return o;
} }
#include <smallut.h>
extern std::string path_canon(const std::string &is)
{
if (is.length() == 0)
return is;
string s = is;
if (s[0] != '/') {
char buf[MAXPATHLEN];
if (!getcwd(buf, MAXPATHLEN)) {
return "";
}
s = path_cat(string(buf), s);
}
list<string>elems;
stringToTokens(s, elems, "/");
list<string> cleaned;
for (list<string>::const_iterator it = elems.begin();
it != elems.end(); it++){
if (*it == "..") {
if (!cleaned.empty())
cleaned.pop_back();
} else if (it->empty() || *it == ".") {
} else {
cleaned.push_back(*it);
}
}
string ret;
if (!cleaned.empty()) {
for (list<string>::const_iterator it = cleaned.begin();
it != cleaned.end(); it++) {
ret += "/";
ret += *it;
}
} else {
ret = "/";
}
return ret;
}
#include <glob.h>
#include <sys/stat.h>
list<std::string> path_dirglob(const std::string &dir,
                               const std::string pattern)
{
    // Return the paths matching the glob(3) expression obtained by
    // joining 'dir' and 'pattern' with path_cat(). The list is empty if
    // nothing matches or if glob() reports an error.
    // NOTE(review): 'dir' is part of the expression handed to glob(), so
    // wildcard characters in it are interpreted too - confirm callers
    // never pass such directory names.
    list<string> res;

    // Zero-initialized so that the globfree() below is safe on any path
    glob_t mglob = glob_t();
    string mypat = path_cat(dir, pattern);

    if (glob(mypat.c_str(), 0, 0, &mglob) == 0) {
        // gl_pathc is a size_t: use an unsigned index to avoid a
        // signed/unsigned comparison
        for (size_t i = 0; i < mglob.gl_pathc; i++) {
            res.push_back(mglob.gl_pathv[i]);
        }
    }

    // Always release the glob storage: a failed glob() (read error, out
    // of memory) may still have allocated partial results.
    globfree(&mglob);
    return res;
}
#else // TEST_PATHUT #else // TEST_PATHUT
#include <iostream> #include <iostream>
@ -108,7 +184,7 @@ using namespace std;
const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2", const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
"/dir1/dir2", "/dir1/dir2",
"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c", "./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
"/dir/.c", "/dir/.c", "/dir/toto.txt", "toto.txt1"
}; };
const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub", const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string);
int main(int argc, const char **argv) int main(int argc, const char **argv)
{ {
string s;
list<string>::const_iterator it;
#if 0 #if 0
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) { for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl; cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl;
} }
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) { for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl; cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl;
}
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " Basename " <<
path_basename(tstvec[i], ".txt") << endl;
} }
#endif #endif
string s;
#if 0
for (int i = 0; i < nttvec; i++) { for (int i = 0; i < nttvec; i++) {
cout << "tildexp: '" << ttvec[i] << "' -> '" << cout << "tildexp: '" << ttvec[i] << "' -> '" <<
path_tildexpand(ttvec[i]) << "'" << endl; path_tildexpand(ttvec[i]) << "'" << endl;
} }
#endif
#if 0
const string canontst[] = {"/dir1/../../..", "/////", "",
"/dir1/../../.././/////dir2///////",
"../../",
"../../../../../../../../../../"
};
unsigned int nttvec = sizeof(canontst) / sizeof(string);
for (unsigned int i = 0; i < nttvec; i++) {
cout << "canon: '" << canontst[i] << "' -> '" <<
path_canon(canontst[i]) << "'" << endl;
}
#endif
#if 1
if (argc != 3) {
fprintf(stderr, "Usage: trpathut <dir> <pattern>\n");
exit(1);
}
string dir=argv[1], pattern=argv[2];
list<string> matched = path_dirglob(dir, pattern);
for (it = matched.begin(); it != matched.end();it++) {
cout << *it << endl;
}
#endif
return 0; return 0;
} }

View File

@ -1,14 +1,19 @@
#ifndef _PATHUT_H_INCLUDED_ #ifndef _PATHUT_H_INCLUDED_
#define _PATHUT_H_INCLUDED_ #define _PATHUT_H_INCLUDED_
/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#include <list>
extern void path_catslash(std::string &s); extern void path_catslash(std::string &s);
extern std::string path_cat(const std::string &s1, const std::string &s2); extern std::string path_cat(const std::string &s1, const std::string &s2);
extern std::string path_getsimple(const std::string &s); extern std::string path_getsimple(const std::string &s);
extern std::string path_basename(const std::string &s, const std::string &suff="");
extern std::string path_getfather(const std::string &s); extern std::string path_getfather(const std::string &s);
extern std::string path_home(); extern std::string path_home();
extern std::string path_tildexpand(const std::string &s); extern std::string path_tildexpand(const std::string &s);
extern std::string path_canon(const std::string &s);
extern std::list<std::string> path_dirglob(const std::string &dir,
const std::string pattern);
#endif /* _PATHUT_H_INCLUDED_ */ #endif /* _PATHUT_H_INCLUDED_ */