allow independent creation / deletion of stem dbs

This commit is contained in:
dockes 2006-01-09 16:53:31 +00:00
parent c4ce5cf691
commit dac569ab51
9 changed files with 364 additions and 122 deletions

View File

@ -1,29 +1,29 @@
#*
*.cache
*.core
*.o
*~
*.core
*.cache
#*
.#*
.#*
.moc
.obj
.ui
.#*
CVS
alldeps
.#*
autom4*
TAGS
alldeps
autom4*
config.cache
config.log
config.status
excludefile
lib/librcl.a
makesrcdist.sh
recollinstall
mk/localdefs
sysconf
qtgui/Makefile
qtgui/preview/Makefile
qtgui/preview/preview.pro
qtgui/preview/pvmain.cpp
lib/librcl.a
recollinstall
sampleconf/recoll.conf
sysconf
wxgui

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp
#include <iostream>
#include <list>
#include <map>
#include <algorithm>
#include "pathut.h"
#include "conftree.h"
@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
// filesystem anymore.
db.purge();
// Create stemming databases
// Create stemming databases. We also remove those which are not
// configured.
string slangs;
if (config->getConfParam("indexstemminglanguages", slangs)) {
list<string> langs;
stringToStrings(slangs, langs);
for (list<string>::const_iterator it = langs.begin();
it != langs.end(); it++) {
// Get the list of existing stem dbs from the database (some may have
// been manually created, we just keep those from the config
list<string> dblangs = db.getStemLangs();
list<string>::const_iterator it;
for (it = dblangs.begin(); it != dblangs.end(); it++) {
if (find(langs.begin(), langs.end(), *it) == langs.end())
db.deleteStemDb(*it);
}
for (it = langs.begin(); it != langs.end(); it++) {
db.createStemDb(*it);
}
}
@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore)
return true;
}
/**
 * Create the stem database for one language.
 *
 * init() is run first; when it fails nothing else is attempted and
 * false is returned. Otherwise the result of db.createStemDb() is
 * passed back to the caller unchanged.
 *
 * @param lang stemming language to build the database for
 * @return true only if both init() and db.createStemDb() succeeded
 */
bool DbIndexer::createStemDb(const string &lang)
{
    // && short-circuits, so the db is never touched after a failed init()
    return init() && db.createStemDb(lang);
}
/**
Index individual files, out of a full tree run. No database purging
*/
bool DbIndexer::indexFiles(const list<string> &filenames)
{
if (!init())

View File

@ -1,6 +1,6 @@
#ifndef _INDEXER_H_INCLUDED_
#define _INDEXER_H_INCLUDED_
/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -24,10 +24,12 @@ class DbIndexer;
class ConfIndexer {
public:
enum runStatus {IndexerOk, IndexerError};
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {}
virtual ~ConfIndexer();
/** Worker function: doe the actual indexing */
bool index(bool resetbefore = false);
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0)
{
}
virtual ~ConfIndexer();
/** Worker function: doe the actual indexing */
bool index(bool resetbefore = false);
private:
RclConfig *config;
DbIndexer *dbindexer; // Object to process directories for a given db
@ -36,10 +38,10 @@ class ConfIndexer {
/** Index things into one database
Tree indexing: we inherit from FsTreeWalkerCB so that the processone()
method is called by the file-system tree walk code for each file and
directory. We keep all state needed while indexing, and finally call
the methods to purge the db of stale entries and create the stemming
databases.
method is called by the file-system tree walk code for each file and
directory. We keep all state needed while indexing, and finally call
the methods to purge the db of stale entries and create the stemming
databases.
Single file(s) indexing: no database purging or stem db updating.
*/
@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB {
/** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(const std::list<std::string> &files);
/** Create stem database for given language */
bool createStemDb(const string &lang);
/** Tree walker callback method */
FsTreeWalker::Status
processone(const std::string &, const struct stat *,

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
@ -19,10 +19,12 @@ using namespace std;
#include "pathut.h"
// Globals for exit cleanup
ConfIndexer *confindexer;
DbIndexer *dbindexer;
bool indexfiles(RclConfig *config, const list<string> &filenames)
// Index a list of files
static bool indexfiles(RclConfig *config, const list<string> &filenames)
{
if (filenames.empty())
return true;
@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
return dbindexer->indexFiles(filenames);
}
// Build one additional stem database, for language 'lang'.
//
// The database location is read from the "dbdir" configuration
// parameter and tilde-expanded. The DbIndexer created here is stored in
// the file-level 'dbindexer' pointer rather than in a local.
// Returns false if "dbdir" is not configured, else the result of
// DbIndexer::createStemDb().
static bool createstemdb(RclConfig *config, const string &lang)
{
    // Only a single database is handled here: multiple databases are
    // currently a fiction anyway, so we do not bother to check for them.
    string confdir;
    const bool havedir = config->getConfParam("dbdir", confdir);
    if (havedir) {
        string expanded = path_tildexpand(confdir);
        dbindexer = new DbIndexer(config, expanded);
        return dbindexer->createStemDb(lang);
    }

    LOGERR(("createstemdb: no database directory in configuration\n"));
    return false;
}
static void cleanup()
{
delete confindexer;
@ -63,15 +80,19 @@ static int op_flags;
#define OPT_z 0x2
#define OPT_h 0x4
#define OPT_i 0x8
#define OPT_s 0x10
static const char usage [] =
" recollindex [-hz] \n"
" recollindex -i <filename [filename ...]>\n"
"\n"
"recollindex [-hz] \n"
" Normal index run\n"
"recollindex -i <filename [filename ...]>\n"
" Index individual files. No db purge or stem database updates\n"
"recollindex -s <lang>\n"
" Build stem database for language <lang>\n"
"Options:\n"
" -h : print this message\n"
" -z : reset database before starting indexation\n\n"
" -i <filename [filename ...]> : index individual files. No db purge or stem\n"
" database updates in this case\n"
;
static void
@ -97,6 +118,7 @@ int main(int argc, const char **argv)
case 'z': op_flags |= OPT_z; break;
case 'h': op_flags |= OPT_h; break;
case 'i': op_flags |= OPT_i; break;
case 's': op_flags |= OPT_s; break;
default: Usage(); break;
}
b1: argc--; argv++;
@ -108,7 +130,6 @@ int main(int argc, const char **argv)
string reason;
RclConfig *config = recollinit(cleanup, sigcleanup, reason);
if (config == 0 || !config->ok()) {
cerr << "Configuration problem: " << reason << endl;
exit(1);
@ -130,6 +151,11 @@ int main(int argc, const char **argv)
}
}
exit(!indexfiles(config, filenames));
} else if (op_flags & OPT_s) {
if (argc != 1)
Usage();
string lang = *argv++; argc--;
exit(!createstemdb(config, lang));
} else {
confindexer = new ConfIndexer(config);
bool rezero(op_flags & OPT_z);

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -23,6 +23,7 @@ using namespace std;
#include "smallut.h"
#include "pathhash.h"
#include "utf8iter.h"
#include "wipedir.h"
#include "xapian.h"
#include <xapian/stem.h>
@ -67,23 +68,24 @@ Rcl::Db::~Db()
ndb->iswritable));
if (ndb->isopen == false)
return;
string ermsg;
const char *ermsg = "Unknown error";
try {
LOGDEB(("Rcl::Db::~Db: closing native database\n"));
if (ndb->iswritable == true)
if (ndb->iswritable == true) {
ndb->wdb.flush();
}
delete ndb;
return;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s;
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str()));
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
}
bool Rcl::Db::open(const string& dir, OpenMode mode)
@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
LOGERR(("Rcl::Db::open: already open\n"));
return false;
}
string ermsg;
const char *ermsg = "Unknown";
try {
switch (mode) {
case DbUpd:
@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
ndb->basedir = dir;
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s;
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n",
dir.c_str(), ermsg.c_str()));
dir.c_str(), ermsg));
return false;
}
@ -148,7 +150,7 @@ bool Rcl::Db::close()
ndb->iswritable));
if (ndb->isopen == false)
return true;
string ermsg;
const char *ermsg = "Unknown";
try {
if (ndb->iswritable == true) {
ndb->wdb.flush();
@ -159,16 +161,15 @@ bool Rcl::Db::close()
if (pdata)
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s;
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n",
ermsg.c_str()));
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg));
return false;
}
@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB {
// Callback for the document to word splitting class during indexation
bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
{
// cerr << "splitCb: term " << term << endl;
//string printable;
//transcode(term, printable, "UTF-8", "ISO-8859-1");
//cerr << "Adding " << printable << endl;
#if 0
LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
string printable;
if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
LOGDEB((" [%s]\n", printable.c_str()));
}
#endif
const char *ermsg;
try {
// 1 is the value for wdfinc in index_text when called from omindex
// TOBEDONE: check what this is used for
// Note: 1 is the within document frequency increment. It would
// be possible to assign different weigths to doc parts (ie title)
// by using a higher value
curpos = pos;
doc.add_posting(term, basepos + curpos, 1);
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (...) {
LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
return false;
ermsg= "Unknown error";
}
return true;
LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg));
return false;
}
// Unaccent and lowercase data, replace \n\r with spaces
@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out)
return true;
}
/* omindex direct */
/* From omindex direct */
/* Truncate a string to a given maxlength, avoiding cutting off midword
* if reasonably possible. */
string
@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen)
output += " ...";
}
// replace newlines with spaces
size_t i = 0;
while ((i = output.find('\n', i)) != string::npos) output[i] = ' ';
// No need to replace newlines with spaces, we do this in dumb_string()
return output;
}
// Truncate longer path and uniquize with hash . The goad for this is
// Truncate longer path and uniquize with hash . The goal for this is
// to avoid xapian max term length limitations, not to gain space (we
// gain very little even with very short maxlens like 30)
#define HASHPATH
#define PATHHASHLEN 150
// Add document in internal form to the database: index the terms in
@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
// Split and index file name. This supposes that it's either ascii
// or utf-8. If this fails, we just go on. We need a config
// parameter for file name charset
// parameter for file name charset.
// Do we really want to fold case here ?
if (dumb_string(fn, noacc)) {
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
// Split body and index terms
// Split and index body
if (!dumb_string(doc.text, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false;
@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
// Split keywords and index terms
// Split and index keywords
if (!dumb_string(doc.keywords, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false;
@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
// Split abstract and index terms
// Split and index abstract
if (!dumb_string(doc.abstract, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false;
@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
// Path name
string hash;
#ifdef HASHPATH
pathHash(fn, hash, PATHHASHLEN);
#else
hash = fn;
#endif
LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
string pathterm = "P" + hash;
newdocument.add_term(pathterm);
// File path + internal path: document unique identifier for
// documents inside multidocument files.
// Internal path: with path, makes unique identifier for documents
// inside multidocument files.
string uniterm;
if (!doc.ipath.empty()) {
uniterm = "Q" + hash + "|" + doc.ipath;
@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
string record = "url=file://" + fn;
record += "\nmtype=" + doc.mimetype;
record += "\nfmtime=" + doc.fmtime;
if (!doc.dmtime.empty())
if (!doc.dmtime.empty()) {
record += "\ndmtime=" + doc.dmtime;
}
record += "\norigcharset=" + doc.origcharset;
record += "\ncaption=" + doc.title;
record += "\nkeywords=" + doc.keywords;
@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
record += "\nipath=" + doc.ipath;
}
record += "\n";
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
newdocument.set_data(record);
const char *fnc = fn.c_str();
// Add db entry or update existing entry:
try {
Xapian::docid did =
@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
}
} catch (...) {
// FIXME: is this ever actually needed?
ndb->wdb.add_document(newdocument);
LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n",
fnc));
try {
ndb->wdb.add_document(newdocument);
LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n",
fnc));
} catch (...) {
LOGERR(("Rcl::Db::add: failed again after replace_document\n"));
return false;
}
}
return true;
}
// Test if given filename has changed since last indexed:
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
{
if (pdata == 0)
@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
// If no document exist with this path, we do need update
string hash;
#ifdef HASHPATH
pathHash(filename, hash, PATHHASHLEN);
#else
hash = filename;
#endif
string pathterm = "P" + hash;
if (!ndb->wdb.term_exists(pathterm)) {
LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str()));
return true;
}
const char *ermsg;
// Look for all documents with this path. We need to look at all
// to set their existence flag. We check the update time on the
@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
// file changed)
Xapian::PostingIterator doc;
try {
if (!ndb->wdb.term_exists(pathterm)) {
LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
return true;
}
Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
for (Xapian::PostingIterator docid = docid0;
docid != ndb->wdb.postlist_end(pathterm); docid++) {
@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
if (*docid < ndb->updated.size())
ndb->updated[*docid] = true;
}
return false;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (...) {
return true;
ermsg= "Unknown error";
}
return false;
LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg));
return true;
}
const static string stemdirstem = "stem_";
/// Compute name of stem db for given base database and language
static string stemdbname(const string& basename, string lang)
{
string nm = path_cat(basename, string("stem_") + lang);
string nm = path_cat(basename, stemdirstem + lang);
return nm;
}
// Is char non-lowercase ascii ?
// Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool
p_notlowerorutf(unsigned int c)
{
@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c)
return false;
}
/**
* Delete stem db for given language
*/
bool Rcl::Db::deleteStemDb(const string& lang)
{
LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str()));
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
if (ndb->isopen == false)
return false;
string dir = stemdbname(ndb->basedir, lang);
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
return true;
return false;
}
/**
* Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db
@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang)
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
if (ndb->isopen == false || ndb->iswritable == false)
if (ndb->isopen == false)
return false;
// First build the in-memory stem database:
@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang)
}
assocs.insert(pair<string,string>(stem, *it));
}
} catch (const Xapian::Error &e) {
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
return false;
} catch (...) {
LOGERR(("Stem database build failed: no stemmer for %s ? \n",
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
lang.c_str()));
return false;
}
// Cleanup helper: unless 'do_it' has been cleared, the destructor calls
// wipedir() on 'dir' and then rmdir()s it. The owner sets do_it to
// false once the directory must be kept.
struct DirWiper {
    string dir;   // directory to get rid of
    bool do_it;   // true while the removal is still wanted

    DirWiper(string d) : dir(d), do_it(true) {}

    ~DirWiper() {
        if (!do_it)
            return;
        wipedir(dir);
        rmdir(dir.c_str());
    }
};
// Create xapian database for stem relations
string stemdbdir = stemdbname(ndb->basedir, lang);
string ermsg = "NOERROR";
// We want to get rid of the db dir in case of error. This gets disarmed
// just before success return.
DirWiper wiper(stemdbdir);
const char *ermsg = "NOERROR";
Xapian::WritableDatabase sdb;
try {
sdb = Xapian::WritableDatabase(stemdbdir,
Xapian::DB_CREATE_OR_OVERWRITE);
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s;
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang)
}
if (ermsg != "NOERROR") {
LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
stemdbdir.c_str(), ermsg.c_str()));
stemdbdir.c_str(), ermsg));
return false;
}
@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang)
}
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
wiper.do_it = false;
return true;
}
/**
 * List the languages for which a stem database directory exists.
 *
 * Entries matching "<stemdirstem>*" under the base directory of the
 * main database are collected with path_dirglob(); each match is then
 * reduced to its last path element and the stemdirstem prefix is cut
 * off, leaving only the language name.
 *
 * @return the language names, or an empty list if there is no native
 *         database object
 */
list<string> Rcl::Db::getStemLangs()
{
    list<string> langs;
    LOGDEB(("Rcl::Db::getStemLang\n"));

    if (pdata != 0) {
        Native *native = (Native *)pdata;
        langs = path_dirglob(native->basedir, stemdirstem + "*");

        const string::size_type preflen = stemdirstem.length();
        for (list<string>::iterator cur = langs.begin();
             cur != langs.end(); ++cur) {
            // Full path -> directory name -> language suffix
            const string leaf = path_basename(*cur);
            *cur = leaf.substr(preflen);
        }
    }
    return langs;
}
/**
* This is called at the end of an indexing session, to delete the
* documents for files that are no longer there. We also build the
@ -658,7 +721,11 @@ bool Rcl::Db::purge()
// and does nothing). Maybe related to the exceptions below when
// trying to delete an unexistant document ?
// Flushing before trying the deletes seeems to work around the problem
ndb->wdb.flush();
try {
ndb->wdb.flush();
} catch (...) {
LOGDEB(("Rcl::Db::purge: 1st flush failed\n"));
}
for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
if (!ndb->updated[docid]) {
try {
@ -669,7 +736,11 @@ bool Rcl::Db::purge()
}
}
}
ndb->wdb.flush();
try {
ndb->wdb.flush();
} catch (...) {
LOGDEB(("Rcl::Db::purge: 2nd flush failed\n"));
}
return true;
}
@ -749,7 +820,6 @@ class wsQData : public TextSplitCB {
};
//
// Turn string into list of xapian queries. There is little
// interpretation done on the string (no +term -term or filename:term
// stuff). We just separate words and phrases, and interpret
@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
Native *ndb = (Native *)pdata;
string hash;
#ifdef HASHPATH
pathHash(fn, hash, PATHHASHLEN);
#else
hash = fn;
#endif
string pathterm = "P" + hash;
if (!ndb->db.term_exists(pathterm)) {
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
pathterm.c_str(), pathterm.length()));
return false;
}
// Look for all documents with this path, searching for the one
// with the appropriate ipath. This is very inefficient.
const char *ermsg = "";
try {
if (!ndb->db.term_exists(pathterm)) {
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
pathterm.c_str(), pathterm.length()));
return false;
}
for (Xapian::PostingIterator docid =
ndb->db.postlist_begin(pathterm);
docid != ndb->db.postlist_end(pathterm); docid++) {
@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
return true;
}
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
return false;
ermsg = "Caught unknown exception";
}
if (*ermsg) {
LOGERR(("Rcl::Db::getDoc: %s\n", ermsg));
}
return false;
}

View File

@ -1,6 +1,6 @@
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -102,6 +102,7 @@ public:
bool needUpdate(const string &filename, const struct stat *stp);
bool purge();
bool createStemDb(const string &lang);
bool deleteStemDb(const string &lang);
// Query-related functions
@ -127,6 +128,10 @@ public:
/** Get results count for current query */
int getResCnt();
/** Get a list of existing stemming databases */
std::list<std::string> getStemLangs();
/** Things we don't want to have here. */
friend class Rcl::DbPops;
private:

View File

@ -15,8 +15,8 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h
$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
-DTEST_FSTREEWALK fstreewalk.cpp
PATHUT_OBJS= trpathut.o pathut.o
trpathut : $(PATHUT_OBJS)
PATHUT_OBJS= trpathut.o pathut.o $(BIGLIB)
trpathut : $(PATHUT_OBJS)
$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
trpathut.o : pathut.cpp pathut.h
$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp

View File

@ -1,15 +1,21 @@
#ifndef lint
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_PATHUT
#include <unistd.h>
#include <sys/param.h>
#include <pwd.h>
#include <iostream>
#include <list>
#include <stack>
#include "pathut.h"
#ifndef NO_NAMESPACES
using std::string;
using std::list;
using std::stack;
#endif /* NO_NAMESPACES */
void path_catslash(std::string &s) {
@ -61,6 +67,18 @@ string path_getsimple(const string &s) {
return simple;
}
// Return the simple name of 's' (as computed by path_getsimple()),
// with the trailing 'suff' removed when it is present.
//
// The suffix is only stripped when it is non-empty and strictly shorter
// than the simple name, so a name equal to the suffix is returned whole.
string path_basename(const string &s, const string &suff)
{
    const string leaf = path_getsimple(s);
    const string::size_type sufflen = suff.length();

    if (sufflen == 0 || leaf.length() <= sufflen)
        return leaf;

    // Compare the tail of the name against the suffix
    const string::size_type cut = leaf.length() - sufflen;
    if (leaf.compare(cut, sufflen, suff) == 0)
        return leaf.substr(0, cut);

    return leaf;
}
string path_home()
{
uid_t uid = getuid();
@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s)
return o;
}
#include <smallut.h>
extern std::string path_canon(const std::string &is)
{
if (is.length() == 0)
return is;
string s = is;
if (s[0] != '/') {
char buf[MAXPATHLEN];
if (!getcwd(buf, MAXPATHLEN)) {
return "";
}
s = path_cat(string(buf), s);
}
list<string>elems;
stringToTokens(s, elems, "/");
list<string> cleaned;
for (list<string>::const_iterator it = elems.begin();
it != elems.end(); it++){
if (*it == "..") {
if (!cleaned.empty())
cleaned.pop_back();
} else if (it->empty() || *it == ".") {
} else {
cleaned.push_back(*it);
}
}
string ret;
if (!cleaned.empty()) {
for (list<string>::const_iterator it = cleaned.begin();
it != cleaned.end(); it++) {
ret += "/";
ret += *it;
}
} else {
ret = "/";
}
return ret;
}
#include <glob.h>
#include <sys/stat.h>
// Return the paths matching 'pattern' inside directory 'dir'.
//
// The glob expression is path_cat(dir, pattern), and the returned
// entries are the strings produced by glob() for it.
//
// @param dir      directory to look into
// @param pattern  shell-style pattern, relative to 'dir'
// @return the matched paths; empty if glob() reports a failure (which
//         includes the "no match" case)
list<std::string> path_dirglob(const std::string &dir,
                               const std::string pattern)
{
    list<string> res;
    // Zero-initialised so that globfree() below is safe whatever glob()
    // did (or did not do) with the structure.
    glob_t mglob = glob_t();
    const string mypat = path_cat(dir, pattern);

    if (glob(mypat.c_str(), 0, 0, &mglob) == 0) {
        // gl_pathc is a size_t: use an unsigned index
        for (size_t i = 0; i < mglob.gl_pathc; i++) {
            res.push_back(mglob.gl_pathv[i]);
        }
    }

    // Always release: on GLOB_ABORTED / GLOB_NOSPACE glob() may have left
    // partial results in mglob, which the old early return leaked.
    globfree(&mglob);
    return res;
}
#else // TEST_PATHUT
#include <iostream>
@ -108,7 +184,7 @@ using namespace std;
const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
"/dir1/dir2",
"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
"/dir/.c",
"/dir/.c", "/dir/toto.txt", "toto.txt1"
};
const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string);
int main(int argc, const char **argv)
{
string s;
list<string>::const_iterator it;
#if 0
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl;
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl;
}
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl;
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl;
}
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " Basename " <<
path_basename(tstvec[i], ".txt") << endl;
}
#endif
string s;
#if 0
for (int i = 0; i < nttvec; i++) {
cout << "tildexp: '" << ttvec[i] << "' -> '" <<
path_tildexpand(ttvec[i]) << "'" << endl;
}
#endif
#if 0
const string canontst[] = {"/dir1/../../..", "/////", "",
"/dir1/../../.././/////dir2///////",
"../../",
"../../../../../../../../../../"
};
unsigned int nttvec = sizeof(canontst) / sizeof(string);
for (unsigned int i = 0; i < nttvec; i++) {
cout << "canon: '" << canontst[i] << "' -> '" <<
path_canon(canontst[i]) << "'" << endl;
}
#endif
#if 1
if (argc != 3) {
fprintf(stderr, "Usage: trpathut <dir> <pattern>\n");
exit(1);
}
string dir=argv[1], pattern=argv[2];
list<string> matched = path_dirglob(dir, pattern);
for (it = matched.begin(); it != matched.end();it++) {
cout << *it << endl;
}
#endif
return 0;
}

View File

@ -1,14 +1,19 @@
#ifndef _PATHUT_H_INCLUDED_
#define _PATHUT_H_INCLUDED_
/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
extern void path_catslash(std::string &s);
extern std::string path_cat(const std::string &s1, const std::string &s2);
extern std::string path_getsimple(const std::string &s);
extern std::string path_basename(const std::string &s, const std::string &suff="");
extern std::string path_getfather(const std::string &s);
extern std::string path_home();
extern std::string path_tildexpand(const std::string &s);
extern std::string path_canon(const std::string &s);
extern std::list<std::string> path_dirglob(const std::string &dir,
const std::string pattern);
#endif /* _PATHUT_H_INCLUDED_ */