From dac569ab51ccddef8d42162aacd139d686965157 Mon Sep 17 00:00:00 2001 From: dockes Date: Mon, 9 Jan 2006 16:53:31 +0000 Subject: [PATCH] allow independant creation / deletion of stem dbs --- src/excludefile | 20 ++-- src/index/indexer.cpp | 28 ++++- src/index/indexer.h | 23 ++-- src/index/recollindex.cpp | 40 +++++-- src/rcldb/rcldb.cpp | 236 +++++++++++++++++++++++++------------- src/rcldb/rcldb.h | 7 +- src/utils/Makefile | 4 +- src/utils/pathut.cpp | 121 +++++++++++++++++-- src/utils/pathut.h | 7 +- 9 files changed, 364 insertions(+), 122 deletions(-) diff --git a/src/excludefile b/src/excludefile index bd7984f0..3c40ff36 100644 --- a/src/excludefile +++ b/src/excludefile @@ -1,29 +1,29 @@ +#* +*.cache +*.core *.o *~ -*.core -*.cache -#* +.#* +.#* .moc .obj .ui -.#* CVS -alldeps -.#* -autom4* TAGS +alldeps +autom4* config.cache config.log config.status excludefile +lib/librcl.a makesrcdist.sh -recollinstall mk/localdefs -sysconf qtgui/Makefile qtgui/preview/Makefile qtgui/preview/preview.pro qtgui/preview/pvmain.cpp -lib/librcl.a +recollinstall sampleconf/recoll.conf +sysconf wxgui diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 43ad1ff8..829ba167 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp #include #include #include +#include #include "pathut.h" #include "conftree.h" @@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list *topdirs) // filesystem anymore. db.purge(); - // Create stemming databases + // Create stemming databases. We also remove those which are not + // configured. string slangs; if (config->getConfParam("indexstemminglanguages", slangs)) { list langs; stringToStrings(slangs, langs); - for (list::const_iterator it = langs.begin(); - it != langs.end(); it++) { + + // Get the list of existing stem dbs from the database (some may have + // been manually created, we just keep those from the config + list dblangs = db.getStemLangs(); + list::const_iterator it; + for (it = dblangs.begin(); it != dblangs.end(); it++) { + if (find(langs.begin(), langs.end(), *it) == langs.end()) + db.deleteStemDb(*it); + } + for (it = langs.begin(); it != langs.end(); it++) { db.createStemDb(*it); } } @@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore) return true; } +bool DbIndexer::createStemDb(const string &lang) +{ + if (!init()) + return false; + return db.createStemDb(lang); +} + +/** + Index individual files, out of a full tree run. No database purging +*/ bool DbIndexer::indexFiles(const list &filenames) { if (!init()) diff --git a/src/index/indexer.h b/src/index/indexer.h index 54eb7cc5..6fa9748f 100644 --- a/src/index/indexer.h +++ b/src/index/indexer.h @@ -1,6 +1,6 @@ #ifndef _INDEXER_H_INCLUDED_ #define _INDEXER_H_INCLUDED_ -/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -24,10 +24,12 @@ class DbIndexer; class ConfIndexer { public: enum runStatus {IndexerOk, IndexerError}; - ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {} - virtual ~ConfIndexer(); - /** Worker function: doe the actual indexing */ - bool index(bool resetbefore = false); + ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) + { + } + virtual ~ConfIndexer(); + /** Worker function: doe the actual indexing */ + bool index(bool resetbefore = false); private: RclConfig *config; DbIndexer *dbindexer; // Object to process directories for a given db @@ -36,10 +38,10 @@ class ConfIndexer { /** Index things into one database Tree indexing: we inherits FsTreeWalkerCB so that, the processone() -method is called by the file-system tree walk code for each file and -directory. We keep all state needed while indexing, and finally call -the methods to purge the db of stale entries and create the stemming -databases. + method is called by the file-system tree walk code for each file and + directory. We keep all state needed while indexing, and finally call + the methods to purge the db of stale entries and create the stemming + databases. Single file(s) indexing: no database purging or stem db updating. */ @@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB { /** Index a list of files. No db cleaning or stemdb updating */ bool indexFiles(const std::list &files); + /** Create stem database for given language */ + bool createStemDb(const string &lang); + /** Tree walker callback method */ FsTreeWalker::Status processone(const std::string &, const struct stat *, diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index a3f0e0bd..d33ba70e 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -19,10 +19,12 @@ using namespace std; #include "pathut.h" +// Globals for exit cleanup ConfIndexer *confindexer; DbIndexer *dbindexer; -bool indexfiles(RclConfig *config, const list &filenames) +// Index a list of files +static bool indexfiles(RclConfig *config, const list &filenames) { if (filenames.empty()) return true; @@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list &filenames) return dbindexer->indexFiles(filenames); } +// Create additional stem database +static bool createstemdb(RclConfig *config, const string &lang) +{ + // Note that we do not bother to check for multiple databases, + // which are currently a fiction anyway. + string dbdir; + if (!config->getConfParam("dbdir", dbdir)) { + LOGERR(("createstemdb: no database directory in configuration\n")); + return false; + } + dbdir = path_tildexpand(dbdir); + dbindexer = new DbIndexer(config, dbdir); + return dbindexer->createStemDb(lang); +} + static void cleanup() { delete confindexer; @@ -63,15 +80,19 @@ static int op_flags; #define OPT_z 0x2 #define OPT_h 0x4 #define OPT_i 0x8 +#define OPT_s 0x10 static const char usage [] = -" recollindex [-hz] \n" -" recollindex -i \n" +"\n" +"recollindex [-hz] \n" +" Normal index run\n" +"recollindex -i \n" +" Index individual files. No db purge or stem database updates\n" +"recollindex -s \n" +" Build stem database for language \n" "Options:\n" " -h : print this message\n" " -z : reset database before starting indexation\n\n" -" -i : index individual files. No db purge or stem\n" -" database updates in this case\n" ; static void @@ -97,6 +118,7 @@ int main(int argc, const char **argv) case 'z': op_flags |= OPT_z; break; case 'h': op_flags |= OPT_h; break; case 'i': op_flags |= OPT_i; break; + case 's': op_flags |= OPT_s; break; default: Usage(); break; } b1: argc--; argv++; @@ -108,7 +130,6 @@ int main(int argc, const char **argv) string reason; RclConfig *config = recollinit(cleanup, sigcleanup, reason); - if (config == 0 || !config->ok()) { cerr << "Configuration problem: " << reason << endl; exit(1); @@ -130,6 +151,11 @@ int main(int argc, const char **argv) } } exit(!indexfiles(config, filenames)); + } else if (op_flags & OPT_s) { + if (argc != 1) + Usage(); + string lang = *argv++; argc--; + exit(!createstemdb(config, lang)); } else { confindexer = new ConfIndexer(config); bool rezero(op_flags & OPT_z); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 3c1e0646..cd15ef6a 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -23,6 +23,7 @@ using namespace std; #include "smallut.h" #include "pathhash.h" #include "utf8iter.h" +#include "wipedir.h" #include "xapian.h" #include @@ -67,23 +68,24 @@ Rcl::Db::~Db() ndb->iswritable)); if (ndb->isopen == false) return; - string ermsg; + const char *ermsg = "Unknown error"; try { LOGDEB(("Rcl::Db::~Db: closing native database\n")); - if (ndb->iswritable == true) + if (ndb->iswritable == true) { ndb->wdb.flush(); + } delete ndb; return; } catch (const Xapian::Error &e) { - ermsg = e.get_msg(); + ermsg = e.get_msg().c_str(); } catch (const string &s) { - ermsg = s; + ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } - LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str())); + LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg)); } bool Rcl::Db::open(const string& dir, OpenMode mode) @@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode) LOGERR(("Rcl::Db::open: already open\n")); return false; } - string ermsg; + const char *ermsg = "Unknown"; try { switch (mode) { case DbUpd: @@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode) ndb->basedir = dir; return true; } catch (const Xapian::Error &e) { - ermsg = e.get_msg(); + ermsg = e.get_msg().c_str(); } catch (const string &s) { - ermsg = s; + ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n", - dir.c_str(), ermsg.c_str())); + dir.c_str(), ermsg)); return false; } @@ -148,7 +150,7 @@ bool Rcl::Db::close() ndb->iswritable)); if (ndb->isopen == false) return true; - string ermsg; + const char *ermsg = "Unknown"; try { if (ndb->iswritable == true) { ndb->wdb.flush(); @@ -159,16 +161,15 @@ bool Rcl::Db::close() if (pdata) return true; } catch (const Xapian::Error &e) { - ermsg = e.get_msg(); + ermsg = e.get_msg().c_str(); } catch (const string &s) { - ermsg = s; + ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { ermsg = "Caught unknown exception"; } - LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", - ermsg.c_str())); + LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg)); return false; } @@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB { // Callback for the document to word splitting class during indexation bool mySplitterCB::takeword(const std::string &term, int pos, int, int) { - // cerr << "splitCb: term " << term << endl; - //string printable; - //transcode(term, printable, "UTF-8", "ISO-8859-1"); - //cerr << "Adding " << printable << endl; +#if 0 + LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str())); + string printable; + if (transcode(term, printable, "UTF-8", "ISO-8859-1")) { + LOGDEB((" [%s]\n", printable.c_str())); + } +#endif + const char *ermsg; try { - // 1 is the value for wdfinc in index_text when called from omindex - // TOBEDONE: check what this is used for + // Note: 1 is the within document frequency increment. It would + // be possible to assign different weigths to doc parts (ie title) + // by using a higher value curpos = pos; doc.add_posting(term, basepos + curpos, 1); + return true; + } catch (const Xapian::Error &e) { + ermsg = e.get_msg().c_str(); } catch (...) { - LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n")); - return false; + ermsg= "Unknown error"; } - return true; + LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg)); + return false; } // Unaccent and lowercase data, replace \n\r with spaces @@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out) return true; } -/* omindex direct */ +/* From omindex direct */ /* Truncate a string to a given maxlength, avoiding cutting off midword * if reasonably possible. */ string @@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen) output += " ..."; } - - // replace newlines with spaces - size_t i = 0; - while ((i = output.find('\n', i)) != string::npos) output[i] = ' '; + // No need to replace newlines with spaces, we do this in dumb_string() return output; } -// Truncate longer path and uniquize with hash . The goad for this is +// Truncate longer path and uniquize with hash . The goal for this is // to avoid xapian max term length limitations, not to gain space (we // gain very little even with very short maxlens like 30) -#define HASHPATH #define PATHHASHLEN 150 // Add document in internal form to the database: index the terms in @@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) // Split and index file name. This supposes that it's either ascii // or utf-8. If this fails, we just go on. We need a config - // parameter for file name charset + // parameter for file name charset. + // Do we really want to fold case here ? if (dumb_string(fn, noacc)) { splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; @@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; - // Split body and index terms + // Split and index body if (!dumb_string(doc.text, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); return false; @@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; - // Split keywords and index terms + // Split and index keywords if (!dumb_string(doc.keywords, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); return false; @@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) splitter.text_to_words(noacc); splitData.basepos += splitData.curpos + 100; - // Split abstract and index terms + // Split and index abstract if (!dumb_string(doc.abstract, noacc)) { LOGERR(("Rcl::Db::add: dumb_string failed\n")); return false; @@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) // Path name string hash; -#ifdef HASHPATH pathHash(fn, hash, PATHHASHLEN); -#else - hash = fn; -#endif LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str())); - string pathterm = "P" + hash; newdocument.add_term(pathterm); - // File path + internal path: document unique identifier for - // documents inside multidocument files. + // Internal path: with path, makes unique identifier for documents + // inside multidocument files. string uniterm; if (!doc.ipath.empty()) { uniterm = "Q" + hash + "|" + doc.ipath; @@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) string record = "url=file://" + fn; record += "\nmtype=" + doc.mimetype; record += "\nfmtime=" + doc.fmtime; - if (!doc.dmtime.empty()) + if (!doc.dmtime.empty()) { record += "\ndmtime=" + doc.dmtime; + } record += "\norigcharset=" + doc.origcharset; record += "\ncaption=" + doc.title; record += "\nkeywords=" + doc.keywords; @@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) record += "\nipath=" + doc.ipath; } record += "\n"; - LOGDEB1(("Newdocument data: %s\n", record.c_str())); newdocument.set_data(record); const char *fnc = fn.c_str(); - // Add db entry or update existing entry: try { Xapian::docid did = @@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc) } } catch (...) { // FIXME: is this ever actually needed? - ndb->wdb.add_document(newdocument); - LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n", - fnc)); + try { + ndb->wdb.add_document(newdocument); + LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n", + fnc)); + } catch (...) { + LOGERR(("Rcl::Db::add: failed again after replace_document\n")); + return false; + } } return true; } +// Test if given filename has changed since last indexed: bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) { if (pdata == 0) @@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) // If no document exist with this path, we do need update string hash; -#ifdef HASHPATH pathHash(filename, hash, PATHHASHLEN); -#else - hash = filename; -#endif string pathterm = "P" + hash; - if (!ndb->wdb.term_exists(pathterm)) { - LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str())); - return true; - } + const char *ermsg; // Look for all documents with this path. We need to look at all // to set their existence flag. We check the update time on the @@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) // file changed) Xapian::PostingIterator doc; try { + if (!ndb->wdb.term_exists(pathterm)) { + LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str())); + return true; + } + Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm); for (Xapian::PostingIterator docid = docid0; docid != ndb->wdb.postlist_end(pathterm); docid++) { @@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) if (*docid < ndb->updated.size()) ndb->updated[*docid] = true; } + return false; + } catch (const Xapian::Error &e) { + ermsg = e.get_msg().c_str(); } catch (...) { - return true; + ermsg= "Unknown error"; } - - return false; + LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg)); + return true; } +const static string stemdirstem = "stem_"; /// Compute name of stem db for given base database and language static string stemdbname(const string& basename, string lang) { - string nm = path_cat(basename, string("stem_") + lang); + string nm = path_cat(basename, stemdirstem + lang); return nm; } -// Is char non-lowercase ascii ? +// Deciding if we try to stem the term. If it has numerals or capitals +// we don't inline static bool p_notlowerorutf(unsigned int c) { @@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c) return false; } +/** + * Delete stem db for given language + */ +bool Rcl::Db::deleteStemDb(const string& lang) +{ + LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str())); + if (pdata == 0) + return false; + Native *ndb = (Native *)pdata; + if (ndb->isopen == false) + return false; + + string dir = stemdbname(ndb->basedir, lang); + if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0) + return true; + return false; +} + /** * Create database of stem to parents associations for a given language. * We walk the list of all terms, stem them, and create another Xapian db @@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang) if (pdata == 0) return false; Native *ndb = (Native *)pdata; - if (ndb->isopen == false || ndb->iswritable == false) + if (ndb->isopen == false) return false; // First build the in-memory stem database: @@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang) } assocs.insert(pair(stem, *it)); } + } catch (const Xapian::Error &e) { + LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str())); + return false; } catch (...) { - LOGERR(("Stem database build failed: no stemmer for %s ? \n", + LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", lang.c_str())); return false; } + class DirWiper { + public: + string dir; + bool do_it; + DirWiper(string d) : dir(d), do_it(true) {} + ~DirWiper() { + if (do_it) { + wipedir(dir); + rmdir(dir.c_str()); + } + } + }; // Create xapian database for stem relations string stemdbdir = stemdbname(ndb->basedir, lang); - string ermsg = "NOERROR"; + // We want to get rid of the db dir in case of error. This gets disarmed + // just before success return. + DirWiper wiper(stemdbdir); + const char *ermsg = "NOERROR"; Xapian::WritableDatabase sdb; try { sdb = Xapian::WritableDatabase(stemdbdir, Xapian::DB_CREATE_OR_OVERWRITE); } catch (const Xapian::Error &e) { - ermsg = e.get_msg(); + ermsg = e.get_msg().c_str(); } catch (const string &s) { - ermsg = s; + ermsg = s.c_str(); } catch (const char *s) { ermsg = s; } catch (...) { @@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang) } if (ermsg != "NOERROR") { LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n", - stemdbdir.c_str(), ermsg.c_str())); + stemdbdir.c_str(), ermsg)); return false; } @@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang) } LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", assocs.size(), stemdiff, stemmultiple, nostem, stemconst)); + wiper.do_it = false; return true; } +list Rcl::Db::getStemLangs() +{ + list dirs; + LOGDEB(("Rcl::Db::getStemLang\n")); + if (pdata == 0) + return dirs; + Native *ndb = (Native *)pdata; + string pattern = stemdirstem + "*"; + dirs = path_dirglob(ndb->basedir, pattern); + for (list::iterator it = dirs.begin(); it != dirs.end(); it++) { + *it = path_basename(*it); + *it = it->substr(stemdirstem.length(), string::npos); + } + return dirs; +} + + /** * This is called at the end of an indexing session, to delete the * documents for files that are no longer there. We also build the @@ -658,7 +721,11 @@ bool Rcl::Db::purge() // and does nothing). Maybe related to the exceptions below when // trying to delete an unexistant document ? // Flushing before trying the deletes seeems to work around the problem - ndb->wdb.flush(); + try { + ndb->wdb.flush(); + } catch (...) { + LOGDEB(("Rcl::Db::purge: 1st flush failed\n")); + } for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) { if (!ndb->updated[docid]) { try { @@ -669,7 +736,11 @@ bool Rcl::Db::purge() } } } - ndb->wdb.flush(); + try { + ndb->wdb.flush(); + } catch (...) { + LOGDEB(("Rcl::Db::purge: 2nd flush failed\n")); + } return true; } @@ -749,7 +820,6 @@ class wsQData : public TextSplitCB { }; -// // Turn string into list of xapian queries. There is little // interpretation done on the string (no +term -term or filename:term // stuff). We just separate words and phrases, and interpret @@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc) Native *ndb = (Native *)pdata; string hash; -#ifdef HASHPATH pathHash(fn, hash, PATHHASHLEN); -#else - hash = fn; -#endif string pathterm = "P" + hash; - if (!ndb->db.term_exists(pathterm)) { - LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n", - pathterm.c_str(), pathterm.length())); - return false; - } // Look for all documents with this path, searching for the one // with the appropriate ipath. This is very inefficient. + const char *ermsg = ""; try { + if (!ndb->db.term_exists(pathterm)) { + LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n", + pathterm.c_str(), pathterm.length())); + return false; + } for (Xapian::PostingIterator docid = ndb->db.postlist_begin(pathterm); docid != ndb->db.postlist_end(pathterm); docid++) { @@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc) if (dbDataToRclDoc(data, doc) && doc.ipath == ipath) return true; } + } catch (const Xapian::Error &e) { + ermsg = e.get_msg().c_str(); + } catch (const string &s) { + ermsg = s.c_str(); + } catch (const char *s) { + ermsg = s; } catch (...) { - return false; + ermsg = "Caught unknown exception"; + } + if (*ermsg) { + LOGERR(("Rcl::Db::getDoc: %s\n", ermsg)); } return false; } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 5ac1c87f..6cce5c4b 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -1,6 +1,6 @@ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -102,6 +102,7 @@ public: bool needUpdate(const string &filename, const struct stat *stp); bool purge(); bool createStemDb(const string &lang); + bool deleteStemDb(const string &lang); // Query-related functions @@ -127,6 +128,10 @@ public: /** Get results count for current query */ int getResCnt(); + /** Get a list of existing stemming databases */ + std::list getStemLangs(); + + /** Things we don't want to have here. */ friend class Rcl::DbPops; private: diff --git a/src/utils/Makefile b/src/utils/Makefile index 8d6f8f7a..2e4e187b 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -15,8 +15,8 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h $(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \ -DTEST_FSTREEWALK fstreewalk.cpp -PATHUT_OBJS= trpathut.o pathut.o -trpathut : $(PATHUT_OBJS) +PATHUT_OBJS= trpathut.o pathut.o $(BIGLIB) +trpathut : $(PATHUT_OBJS) $(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS) trpathut.o : pathut.cpp pathut.h $(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp diff --git a/src/utils/pathut.cpp b/src/utils/pathut.cpp index 51a2c265..7c93509e 100644 --- a/src/utils/pathut.cpp +++ b/src/utils/pathut.cpp @@ -1,15 +1,21 @@ #ifndef lint -static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_PATHUT #include +#include #include + #include +#include +#include #include "pathut.h" #ifndef NO_NAMESPACES using std::string; +using std::list; +using std::stack; #endif /* NO_NAMESPACES */ void path_catslash(std::string &s) { @@ -61,6 +67,18 @@ string path_getsimple(const string &s) { return simple; } +string path_basename(const string &s, const string &suff) +{ + string simple = path_getsimple(s); + string::size_type pos = string::npos; + if (suff.length() && simple.length() > suff.length()) { + pos = simple.rfind(suff); + if (pos != string::npos && pos + suff.length() == simple.length()) + return simple.substr(0, pos); + } + return simple; +} + string path_home() { uid_t uid = getuid(); @@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s) return o; } +#include +extern std::string path_canon(const std::string &is) +{ + if (is.length() == 0) + return is; + string s = is; + if (s[0] != '/') { + char buf[MAXPATHLEN]; + if (!getcwd(buf, MAXPATHLEN)) { + return ""; + } + s = path_cat(string(buf), s); + } + listelems; + stringToTokens(s, elems, "/"); + list cleaned; + for (list::const_iterator it = elems.begin(); + it != elems.end(); it++){ + if (*it == "..") { + if (!cleaned.empty()) + cleaned.pop_back(); + } else if (it->empty() || *it == ".") { + } else { + cleaned.push_back(*it); + } + } + string ret; + if (!cleaned.empty()) { + for (list::const_iterator it = cleaned.begin(); + it != cleaned.end(); it++) { + ret += "/"; + ret += *it; + } + } else { + ret = "/"; + } + return ret; +} + +#include +#include +list path_dirglob(const std::string &dir, + const std::string pattern) +{ + list res; + glob_t mglob; + string mypat=path_cat(dir, pattern); + if (glob(mypat.c_str(), 0, 0, &mglob)) { + return res; + } + for (int i = 0; i < mglob.gl_pathc; i++) { + res.push_back(mglob.gl_pathv[i]); + } + globfree(&mglob); + return res; +} + + #else // TEST_PATHUT #include @@ -108,7 +184,7 @@ using namespace std; const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2", "/dir1/dir2", "./dir", "./dir1/", "dir", "../dir", "/dir/toto.c", - "/dir/.c", + "/dir/.c", "/dir/toto.txt", "toto.txt1" }; const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub", @@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string); int main(int argc, const char **argv) { + string s; + list::const_iterator it; #if 0 - for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) { - cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl; + for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) { + cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl; } - for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) { - cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl; + for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) { + cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl; + } + for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) { + cout << tstvec[i] << " Basename " << + path_basename(tstvec[i], ".txt") << endl; } #endif - string s; +#if 0 for (int i = 0; i < nttvec; i++) { cout << "tildexp: '" << ttvec[i] << "' -> '" << path_tildexpand(ttvec[i]) << "'" << endl; } - +#endif +#if 0 + const string canontst[] = {"/dir1/../../..", "/////", "", + "/dir1/../../.././/////dir2///////", + "../../", + "../../../../../../../../../../" + }; + unsigned int nttvec = sizeof(canontst) / sizeof(string); + for (unsigned int i = 0; i < nttvec; i++) { + cout << "canon: '" << canontst[i] << "' -> '" << + path_canon(canontst[i]) << "'" << endl; + } +#endif +#if 1 + if (argc != 3) { + fprintf(stderr, "Usage: trpathut \n"); + exit(1); + } + string dir=argv[1], pattern=argv[2]; + list matched = path_dirglob(dir, pattern); + for (it = matched.begin(); it != matched.end();it++) { + cout << *it << endl; + } +#endif return 0; } diff --git a/src/utils/pathut.h b/src/utils/pathut.h index 13425a05..8160238d 100644 --- a/src/utils/pathut.h +++ b/src/utils/pathut.h @@ -1,14 +1,19 @@ #ifndef _PATHUT_H_INCLUDED_ #define _PATHUT_H_INCLUDED_ -/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */ #include +#include extern void path_catslash(std::string &s); extern std::string path_cat(const std::string &s1, const std::string &s2); extern std::string path_getsimple(const std::string &s); +extern std::string path_basename(const std::string &s, const std::string &suff=""); extern std::string path_getfather(const std::string &s); extern std::string path_home(); extern std::string path_tildexpand(const std::string &s); +extern std::string path_canon(const std::string &s); +extern std::list path_dirglob(const std::string &dir, + const std::string pattern); #endif /* _PATHUT_H_INCLUDED_ */