allow independent creation / deletion of stem dbs

This commit is contained in:
dockes 2006-01-09 16:53:31 +00:00
parent c4ce5cf691
commit dac569ab51
9 changed files with 364 additions and 122 deletions

View File

@ -1,29 +1,29 @@
#*
*.cache
*.core
*.o
*~
*.core
*.cache
#*
.#*
.#*
.moc
.obj
.ui
.#*
CVS
alldeps
.#*
autom4*
TAGS
alldeps
autom4*
config.cache
config.log
config.status
excludefile
lib/librcl.a
makesrcdist.sh
recollinstall
mk/localdefs
sysconf
qtgui/Makefile
qtgui/preview/Makefile
qtgui/preview/preview.pro
qtgui/preview/pvmain.cpp
lib/librcl.a
recollinstall
sampleconf/recoll.conf
sysconf
wxgui

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp
#include <iostream>
#include <list>
#include <map>
#include <algorithm>
#include "pathut.h"
#include "conftree.h"
@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
// filesystem anymore.
db.purge();
// Create stemming databases
// Create stemming databases. We also remove those which are not
// configured.
string slangs;
if (config->getConfParam("indexstemminglanguages", slangs)) {
list<string> langs;
stringToStrings(slangs, langs);
for (list<string>::const_iterator it = langs.begin();
it != langs.end(); it++) {
// Get the list of existing stem dbs from the database (some may have
// been manually created, we just keep those from the config)
list<string> dblangs = db.getStemLangs();
list<string>::const_iterator it;
for (it = dblangs.begin(); it != dblangs.end(); it++) {
if (find(langs.begin(), langs.end(), *it) == langs.end())
db.deleteStemDb(*it);
}
for (it = langs.begin(); it != langs.end(); it++) {
db.createStemDb(*it);
}
}
@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore)
return true;
}
// Create the stem database for language 'lang'. init() is called first
// and the database is not touched if it fails.
bool DbIndexer::createStemDb(const string &lang)
{
    return init() && db.createStemDb(lang);
}
/**
Index individual files, out of a full tree run. No database purging
*/
bool DbIndexer::indexFiles(const list<string> &filenames)
{
if (!init())

View File

@ -1,6 +1,6 @@
#ifndef _INDEXER_H_INCLUDED_
#define _INDEXER_H_INCLUDED_
/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -24,7 +24,9 @@ class DbIndexer;
class ConfIndexer {
public:
enum runStatus {IndexerOk, IndexerError};
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {}
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0)
{
}
virtual ~ConfIndexer();
/** Worker function: does the actual indexing */
bool index(bool resetbefore = false);
@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB {
/** Index a list of files. No db cleaning or stemdb updating */
bool indexFiles(const std::list<std::string> &files);
/** Create stem database for given language */
bool createStemDb(const string &lang);
/** Tree walker callback method */
FsTreeWalker::Status
processone(const std::string &, const struct stat *,

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
@ -19,10 +19,12 @@ using namespace std;
#include "pathut.h"
// Globals for exit cleanup
ConfIndexer *confindexer;
DbIndexer *dbindexer;
bool indexfiles(RclConfig *config, const list<string> &filenames)
// Index a list of files
static bool indexfiles(RclConfig *config, const list<string> &filenames)
{
if (filenames.empty())
return true;
@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
return dbindexer->indexFiles(filenames);
}
// Build an additional stem database for language 'lang', using the
// database directory named by the "dbdir" configuration parameter.
// Returns false if the parameter is missing or the creation fails.
static bool createstemdb(RclConfig *config, const string &lang)
{
    // Multiple databases are not checked for: they are currently a
    // fiction anyway.
    string dbdir;
    if (config->getConfParam("dbdir", dbdir)) {
        dbdir = path_tildexpand(dbdir);
        // Kept in the file-level global declared for exit cleanup
        dbindexer = new DbIndexer(config, dbdir);
        return dbindexer->createStemDb(lang);
    }
    LOGERR(("createstemdb: no database directory in configuration\n"));
    return false;
}
static void cleanup()
{
delete confindexer;
@ -63,15 +80,19 @@ static int op_flags;
#define OPT_z 0x2
#define OPT_h 0x4
#define OPT_i 0x8
#define OPT_s 0x10
static const char usage [] =
"\n"
"recollindex [-hz] \n"
" Normal index run\n"
"recollindex -i <filename [filename ...]>\n"
" Index individual files. No db purge or stem database updates\n"
"recollindex -s <lang>\n"
" Build stem database for language <lang>\n"
"Options:\n"
" -h : print this message\n"
" -z : reset database before starting indexation\n\n"
" -i <filename [filename ...]> : index individual files. No db purge or stem\n"
" database updates in this case\n"
;
static void
@ -97,6 +118,7 @@ int main(int argc, const char **argv)
case 'z': op_flags |= OPT_z; break;
case 'h': op_flags |= OPT_h; break;
case 'i': op_flags |= OPT_i; break;
case 's': op_flags |= OPT_s; break;
default: Usage(); break;
}
b1: argc--; argv++;
@ -108,7 +130,6 @@ int main(int argc, const char **argv)
string reason;
RclConfig *config = recollinit(cleanup, sigcleanup, reason);
if (config == 0 || !config->ok()) {
cerr << "Configuration problem: " << reason << endl;
exit(1);
@ -130,6 +151,11 @@ int main(int argc, const char **argv)
}
}
exit(!indexfiles(config, filenames));
} else if (op_flags & OPT_s) {
if (argc != 1)
Usage();
string lang = *argv++; argc--;
exit(!createstemdb(config, lang));
} else {
confindexer = new ConfIndexer(config);
bool rezero(op_flags & OPT_z);

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@ -23,6 +23,7 @@ using namespace std;
#include "smallut.h"
#include "pathhash.h"
#include "utf8iter.h"
#include "wipedir.h"
#include "xapian.h"
#include <xapian/stem.h>
@ -67,23 +68,24 @@ Rcl::Db::~Db()
ndb->iswritable));
if (ndb->isopen == false)
return;
string ermsg;
const char *ermsg = "Unknown error";
try {
LOGDEB(("Rcl::Db::~Db: closing native database\n"));
if (ndb->iswritable == true)
if (ndb->iswritable == true) {
ndb->wdb.flush();
}
delete ndb;
return;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s;
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str()));
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
}
bool Rcl::Db::open(const string& dir, OpenMode mode)
@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
LOGERR(("Rcl::Db::open: already open\n"));
return false;
}
string ermsg;
const char *ermsg = "Unknown";
try {
switch (mode) {
case DbUpd:
@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
ndb->basedir = dir;
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s;
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n",
dir.c_str(), ermsg.c_str()));
dir.c_str(), ermsg));
return false;
}
@ -148,7 +150,7 @@ bool Rcl::Db::close()
ndb->iswritable));
if (ndb->isopen == false)
return true;
string ermsg;
const char *ermsg = "Unknown";
try {
if (ndb->iswritable == true) {
ndb->wdb.flush();
@ -159,16 +161,15 @@ bool Rcl::Db::close()
if (pdata)
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s;
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n",
ermsg.c_str()));
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg));
return false;
}
@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB {
// Callback for the document to word splitting class during indexation
bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
{
// cerr << "splitCb: term " << term << endl;
//string printable;
//transcode(term, printable, "UTF-8", "ISO-8859-1");
//cerr << "Adding " << printable << endl;
#if 0
LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
string printable;
if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
LOGDEB((" [%s]\n", printable.c_str()));
}
#endif
const char *ermsg;
try {
// 1 is the value for wdfinc in index_text when called from omindex
// TOBEDONE: check what this is used for
// Note: 1 is the within document frequency increment. It would
// be possible to assign different weights to doc parts (i.e. title)
// by using a higher value
curpos = pos;
doc.add_posting(term, basepos + curpos, 1);
} catch (...) {
LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
return false;
}
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (...) {
ermsg= "Unknown error";
}
LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg));
return false;
}
// Unaccent and lowercase data, replace \n\r with spaces
@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out)
return true;
}
/* omindex direct */
/* From omindex direct */
/* Truncate a string to a given maxlength, avoiding cutting off midword
* if reasonably possible. */
string
@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen)
output += " ...";
}
// replace newlines with spaces
size_t i = 0;
while ((i = output.find('\n', i)) != string::npos) output[i] = ' ';
// No need to replace newlines with spaces, we do this in dumb_string()
return output;
}
// Truncate longer path and uniquize with hash . The goad for this is
// Truncate longer path and uniquize with hash . The goal for this is
// to avoid xapian max term length limitations, not to gain space (we
// gain very little even with very short maxlens like 30)
#define HASHPATH
#define PATHHASHLEN 150
// Add document in internal form to the database: index the terms in
@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
// Split and index file name. This supposes that it's either ascii
// or utf-8. If this fails, we just go on. We need a config
// parameter for file name charset
// parameter for file name charset.
// Do we really want to fold case here ?
if (dumb_string(fn, noacc)) {
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
// Split body and index terms
// Split and index body
if (!dumb_string(doc.text, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false;
@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
// Split keywords and index terms
// Split and index keywords
if (!dumb_string(doc.keywords, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false;
@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
splitter.text_to_words(noacc);
splitData.basepos += splitData.curpos + 100;
// Split abstract and index terms
// Split and index abstract
if (!dumb_string(doc.abstract, noacc)) {
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
return false;
@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
// Path name
string hash;
#ifdef HASHPATH
pathHash(fn, hash, PATHHASHLEN);
#else
hash = fn;
#endif
LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
string pathterm = "P" + hash;
newdocument.add_term(pathterm);
// File path + internal path: document unique identifier for
// documents inside multidocument files.
// Internal path: with path, makes unique identifier for documents
// inside multidocument files.
string uniterm;
if (!doc.ipath.empty()) {
uniterm = "Q" + hash + "|" + doc.ipath;
@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
string record = "url=file://" + fn;
record += "\nmtype=" + doc.mimetype;
record += "\nfmtime=" + doc.fmtime;
if (!doc.dmtime.empty())
if (!doc.dmtime.empty()) {
record += "\ndmtime=" + doc.dmtime;
}
record += "\norigcharset=" + doc.origcharset;
record += "\ncaption=" + doc.title;
record += "\nkeywords=" + doc.keywords;
@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
record += "\nipath=" + doc.ipath;
}
record += "\n";
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
newdocument.set_data(record);
const char *fnc = fn.c_str();
// Add db entry or update existing entry:
try {
Xapian::docid did =
@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
}
} catch (...) {
// FIXME: is this ever actually needed?
try {
ndb->wdb.add_document(newdocument);
LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n",
fnc));
} catch (...) {
LOGERR(("Rcl::Db::add: failed again after replace_document\n"));
return false;
}
}
return true;
}
// Test if given filename has changed since last indexed:
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
{
if (pdata == 0)
@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
// If no document exist with this path, we do need update
string hash;
#ifdef HASHPATH
pathHash(filename, hash, PATHHASHLEN);
#else
hash = filename;
#endif
string pathterm = "P" + hash;
if (!ndb->wdb.term_exists(pathterm)) {
LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str()));
return true;
}
const char *ermsg;
// Look for all documents with this path. We need to look at all
// to set their existence flag. We check the update time on the
@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
// file changed)
Xapian::PostingIterator doc;
try {
if (!ndb->wdb.term_exists(pathterm)) {
LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
return true;
}
Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
for (Xapian::PostingIterator docid = docid0;
docid != ndb->wdb.postlist_end(pathterm); docid++) {
@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
if (*docid < ndb->updated.size())
ndb->updated[*docid] = true;
}
return false;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (...) {
ermsg= "Unknown error";
}
LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg));
return true;
}
return false;
}
const static string stemdirstem = "stem_";
/// Compute name of stem db for given base database and language
static string stemdbname(const string& basename, string lang)
{
string nm = path_cat(basename, string("stem_") + lang);
string nm = path_cat(basename, stemdirstem + lang);
return nm;
}
// Is char non-lowercase ascii ?
// Deciding if we try to stem the term. If it has numerals or capitals
// we don't
inline static bool
p_notlowerorutf(unsigned int c)
{
@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c)
return false;
}
/**
* Delete stem db for given language
*/
bool Rcl::Db::deleteStemDb(const string& lang)
{
LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str()));
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
if (ndb->isopen == false)
return false;
string dir = stemdbname(ndb->basedir, lang);
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
return true;
return false;
}
/**
* Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db
@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang)
if (pdata == 0)
return false;
Native *ndb = (Native *)pdata;
if (ndb->isopen == false || ndb->iswritable == false)
if (ndb->isopen == false)
return false;
// First build the in-memory stem database:
@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang)
}
assocs.insert(pair<string,string>(stem, *it));
}
} catch (const Xapian::Error &e) {
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
return false;
} catch (...) {
LOGERR(("Stem database build failed: no stemmer for %s ? \n",
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
lang.c_str()));
return false;
}
// Cleanup helper: unless do_it has been cleared, the destructor empties
// and then removes the directory it was constructed with.
struct DirWiper {
    string dir;
    bool do_it;

    DirWiper(string d)
        : dir(d), do_it(true)
    {
    }

    ~DirWiper()
    {
        if (!do_it)
            return;
        wipedir(dir);
        rmdir(dir.c_str());
    }
};
// Create xapian database for stem relations
string stemdbdir = stemdbname(ndb->basedir, lang);
string ermsg = "NOERROR";
// We want to get rid of the db dir in case of error. This gets disarmed
// just before success return.
DirWiper wiper(stemdbdir);
const char *ermsg = "NOERROR";
Xapian::WritableDatabase sdb;
try {
sdb = Xapian::WritableDatabase(stemdbdir,
Xapian::DB_CREATE_OR_OVERWRITE);
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s;
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang)
}
if (ermsg != "NOERROR") {
LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
stemdbdir.c_str(), ermsg.c_str()));
stemdbdir.c_str(), ermsg));
return false;
}
@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang)
}
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
wiper.do_it = false;
return true;
}
// Return the language part of the name of every entry matching
// stemdirstem + "*" inside the base database directory. The list is
// empty when there is no native database object.
list<string> Rcl::Db::getStemLangs()
{
    list<string> langs;
    LOGDEB(("Rcl::Db::getStemLang\n"));
    if (pdata == 0)
        return langs;
    Native *ndb = (Native *)pdata;

    string pattern = stemdirstem + "*";
    langs = path_dirglob(ndb->basedir, pattern);

    const string::size_type preflen = stemdirstem.length();
    list<string>::iterator cur = langs.begin();
    while (cur != langs.end()) {
        // Reduce the full path to its basename, then drop the prefix
        // so that only the language name remains
        const string simple = path_basename(*cur);
        *cur = simple.substr(preflen, string::npos);
        ++cur;
    }
    return langs;
}
/**
* This is called at the end of an indexing session, to delete the
* documents for files that are no longer there. We also build the
@ -658,7 +721,11 @@ bool Rcl::Db::purge()
// and does nothing). Maybe related to the exceptions below when
// trying to delete a nonexistent document ?
// Flushing before trying the deletes seems to work around the problem
try {
ndb->wdb.flush();
} catch (...) {
LOGDEB(("Rcl::Db::purge: 1st flush failed\n"));
}
for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
if (!ndb->updated[docid]) {
try {
@ -669,7 +736,11 @@ bool Rcl::Db::purge()
}
}
}
try {
ndb->wdb.flush();
} catch (...) {
LOGDEB(("Rcl::Db::purge: 2nd flush failed\n"));
}
return true;
}
@ -749,7 +820,6 @@ class wsQData : public TextSplitCB {
};
//
// Turn string into list of xapian queries. There is little
// interpretation done on the string (no +term -term or filename:term
// stuff). We just separate words and phrases, and interpret
@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
Native *ndb = (Native *)pdata;
string hash;
#ifdef HASHPATH
pathHash(fn, hash, PATHHASHLEN);
#else
hash = fn;
#endif
string pathterm = "P" + hash;
// Look for all documents with this path, searching for the one
// with the appropriate ipath. This is very inefficient.
const char *ermsg = "";
try {
if (!ndb->db.term_exists(pathterm)) {
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
pathterm.c_str(), pathterm.length()));
return false;
}
// Look for all documents with this path, searching for the one
// with the appropriate ipath. This is very inefficient.
try {
for (Xapian::PostingIterator docid =
ndb->db.postlist_begin(pathterm);
docid != ndb->db.postlist_end(pathterm); docid++) {
@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
return true;
}
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
return false;
ermsg = "Caught unknown exception";
}
if (*ermsg) {
LOGERR(("Rcl::Db::getDoc: %s\n", ermsg));
}
return false;
}

View File

@ -1,6 +1,6 @@
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -102,6 +102,7 @@ public:
bool needUpdate(const string &filename, const struct stat *stp);
bool purge();
bool createStemDb(const string &lang);
bool deleteStemDb(const string &lang);
// Query-related functions
@ -127,6 +128,10 @@ public:
/** Get results count for current query */
int getResCnt();
/** Get a list of existing stemming databases */
std::list<std::string> getStemLangs();
/** Things we don't want to have here. */
friend class Rcl::DbPops;
private:

View File

@ -15,7 +15,7 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h
$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
-DTEST_FSTREEWALK fstreewalk.cpp
PATHUT_OBJS= trpathut.o pathut.o
PATHUT_OBJS= trpathut.o pathut.o $(BIGLIB)
trpathut : $(PATHUT_OBJS)
$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
trpathut.o : pathut.cpp pathut.h

View File

@ -1,15 +1,21 @@
#ifndef lint
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#ifndef TEST_PATHUT
#include <unistd.h>
#include <sys/param.h>
#include <pwd.h>
#include <iostream>
#include <list>
#include <stack>
#include "pathut.h"
#ifndef NO_NAMESPACES
using std::string;
using std::list;
using std::stack;
#endif /* NO_NAMESPACES */
void path_catslash(std::string &s) {
@ -61,6 +67,18 @@ string path_getsimple(const string &s) {
return simple;
}
// Compute path_getsimple(s) (presumably the last path element -- confirm
// against its definition) and strip 'suff' from its end. The suffix is
// only removed when it is non-empty and the name is strictly longer
// than it, so a name equal to the suffix is returned unchanged.
string path_basename(const string &s, const string &suff)
{
    string simple = path_getsimple(s);
    const string::size_type sufflen = suff.length();
    if (sufflen == 0 || simple.length() <= sufflen)
        return simple;

    // Compare the tail of the name against the suffix
    const string::size_type tailpos = simple.length() - sufflen;
    if (simple.compare(tailpos, sufflen, suff) == 0)
        simple.erase(tailpos);
    return simple;
}
string path_home()
{
uid_t uid = getuid();
@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s)
return o;
}
#include <smallut.h>
extern std::string path_canon(const std::string &is)
{
if (is.length() == 0)
return is;
string s = is;
if (s[0] != '/') {
char buf[MAXPATHLEN];
if (!getcwd(buf, MAXPATHLEN)) {
return "";
}
s = path_cat(string(buf), s);
}
list<string>elems;
stringToTokens(s, elems, "/");
list<string> cleaned;
for (list<string>::const_iterator it = elems.begin();
it != elems.end(); it++){
if (*it == "..") {
if (!cleaned.empty())
cleaned.pop_back();
} else if (it->empty() || *it == ".") {
} else {
cleaned.push_back(*it);
}
}
string ret;
if (!cleaned.empty()) {
for (list<string>::const_iterator it = cleaned.begin();
it != cleaned.end(); it++) {
ret += "/";
ret += *it;
}
} else {
ret = "/";
}
return ret;
}
#include <glob.h>
#include <sys/stat.h>
// Return the paths matching 'pattern' inside directory 'dir', as
// expanded by glob(). The result is empty when nothing matches or when
// glob() reports an error.
// NOTE(review): despite the name, no check is made here that the
// matches are directories -- confirm whether callers rely on that.
list<std::string> path_dirglob(const std::string &dir,
                               const std::string pattern)
{
    list<string> res;
    // Zero-initialized so that globfree() is safe on every path below
    glob_t mglob = glob_t();
    const string mypat = path_cat(dir, pattern);
    if (glob(mypat.c_str(), 0, 0, &mglob) == 0) {
        // gl_pathc is unsigned: use a matching index type
        for (size_t i = 0; i < mglob.gl_pathc; i++) {
            res.push_back(mglob.gl_pathv[i]);
        }
    }
    // glob() may have allocated partial results even when it fails, so
    // the structure is released whatever the return code was.
    globfree(&mglob);
    return res;
}
#else // TEST_PATHUT
#include <iostream>
@ -108,7 +184,7 @@ using namespace std;
const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
"/dir1/dir2",
"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
"/dir/.c",
"/dir/.c", "/dir/toto.txt", "toto.txt1"
};
const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string);
int main(int argc, const char **argv)
{
string s;
list<string>::const_iterator it;
#if 0
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl;
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl;
}
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl;
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl;
}
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
cout << tstvec[i] << " Basename " <<
path_basename(tstvec[i], ".txt") << endl;
}
#endif
string s;
#if 0
for (int i = 0; i < nttvec; i++) {
cout << "tildexp: '" << ttvec[i] << "' -> '" <<
path_tildexpand(ttvec[i]) << "'" << endl;
}
#endif
#if 0
const string canontst[] = {"/dir1/../../..", "/////", "",
"/dir1/../../.././/////dir2///////",
"../../",
"../../../../../../../../../../"
};
unsigned int nttvec = sizeof(canontst) / sizeof(string);
for (unsigned int i = 0; i < nttvec; i++) {
cout << "canon: '" << canontst[i] << "' -> '" <<
path_canon(canontst[i]) << "'" << endl;
}
#endif
#if 1
if (argc != 3) {
fprintf(stderr, "Usage: trpathut <dir> <pattern>\n");
exit(1);
}
string dir=argv[1], pattern=argv[2];
list<string> matched = path_dirglob(dir, pattern);
for (it = matched.begin(); it != matched.end();it++) {
cout << *it << endl;
}
#endif
return 0;
}

View File

@ -1,14 +1,19 @@
#ifndef _PATHUT_H_INCLUDED_
#define _PATHUT_H_INCLUDED_
/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
extern void path_catslash(std::string &s);
extern std::string path_cat(const std::string &s1, const std::string &s2);
extern std::string path_getsimple(const std::string &s);
extern std::string path_basename(const std::string &s, const std::string &suff="");
extern std::string path_getfather(const std::string &s);
extern std::string path_home();
extern std::string path_tildexpand(const std::string &s);
extern std::string path_canon(const std::string &s);
extern std::list<std::string> path_dirglob(const std::string &dir,
const std::string pattern);
#endif /* _PATHUT_H_INCLUDED_ */