allow independent creation / deletion of stem dbs
This commit is contained in:
parent
c4ce5cf691
commit
dac569ab51
@ -1,29 +1,29 @@
|
|||||||
|
#*
|
||||||
|
*.cache
|
||||||
|
*.core
|
||||||
*.o
|
*.o
|
||||||
*~
|
*~
|
||||||
*.core
|
.#*
|
||||||
*.cache
|
.#*
|
||||||
#*
|
|
||||||
.moc
|
.moc
|
||||||
.obj
|
.obj
|
||||||
.ui
|
.ui
|
||||||
.#*
|
|
||||||
CVS
|
CVS
|
||||||
alldeps
|
|
||||||
.#*
|
|
||||||
autom4*
|
|
||||||
TAGS
|
TAGS
|
||||||
|
alldeps
|
||||||
|
autom4*
|
||||||
config.cache
|
config.cache
|
||||||
config.log
|
config.log
|
||||||
config.status
|
config.status
|
||||||
excludefile
|
excludefile
|
||||||
|
lib/librcl.a
|
||||||
makesrcdist.sh
|
makesrcdist.sh
|
||||||
recollinstall
|
|
||||||
mk/localdefs
|
mk/localdefs
|
||||||
sysconf
|
|
||||||
qtgui/Makefile
|
qtgui/Makefile
|
||||||
qtgui/preview/Makefile
|
qtgui/preview/Makefile
|
||||||
qtgui/preview/preview.pro
|
qtgui/preview/preview.pro
|
||||||
qtgui/preview/pvmain.cpp
|
qtgui/preview/pvmain.cpp
|
||||||
lib/librcl.a
|
recollinstall
|
||||||
sampleconf/recoll.conf
|
sampleconf/recoll.conf
|
||||||
|
sysconf
|
||||||
wxgui
|
wxgui
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
@ -10,6 +10,7 @@ static char rcsid[] = "@(#$Id: indexer.cpp,v 1.20 2005-12-14 11:00:48 dockes Exp
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <list>
|
#include <list>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
#include "pathut.h"
|
#include "pathut.h"
|
||||||
#include "conftree.h"
|
#include "conftree.h"
|
||||||
@ -87,13 +88,22 @@ bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
|
|||||||
// filesystem anymore.
|
// filesystem anymore.
|
||||||
db.purge();
|
db.purge();
|
||||||
|
|
||||||
// Create stemming databases
|
// Create stemming databases. We also remove those which are not
|
||||||
|
// configured.
|
||||||
string slangs;
|
string slangs;
|
||||||
if (config->getConfParam("indexstemminglanguages", slangs)) {
|
if (config->getConfParam("indexstemminglanguages", slangs)) {
|
||||||
list<string> langs;
|
list<string> langs;
|
||||||
stringToStrings(slangs, langs);
|
stringToStrings(slangs, langs);
|
||||||
for (list<string>::const_iterator it = langs.begin();
|
|
||||||
it != langs.end(); it++) {
|
// Get the list of existing stem dbs from the database (some may have
|
||||||
|
// been manually created, we just keep those from the config
|
||||||
|
list<string> dblangs = db.getStemLangs();
|
||||||
|
list<string>::const_iterator it;
|
||||||
|
for (it = dblangs.begin(); it != dblangs.end(); it++) {
|
||||||
|
if (find(langs.begin(), langs.end(), *it) == langs.end())
|
||||||
|
db.deleteStemDb(*it);
|
||||||
|
}
|
||||||
|
for (it = langs.begin(); it != langs.end(); it++) {
|
||||||
db.createStemDb(*it);
|
db.createStemDb(*it);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -120,6 +130,16 @@ bool DbIndexer::init(bool resetbefore)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool DbIndexer::createStemDb(const string &lang)
|
||||||
|
{
|
||||||
|
if (!init())
|
||||||
|
return false;
|
||||||
|
return db.createStemDb(lang);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
Index individual files, out of a full tree run. No database purging
|
||||||
|
*/
|
||||||
bool DbIndexer::indexFiles(const list<string> &filenames)
|
bool DbIndexer::indexFiles(const list<string> &filenames)
|
||||||
{
|
{
|
||||||
if (!init())
|
if (!init())
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _INDEXER_H_INCLUDED_
|
#ifndef _INDEXER_H_INCLUDED_
|
||||||
#define _INDEXER_H_INCLUDED_
|
#define _INDEXER_H_INCLUDED_
|
||||||
/* @(#$Id: indexer.h,v 1.8 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: indexer.h,v 1.9 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
@ -24,10 +24,12 @@ class DbIndexer;
|
|||||||
class ConfIndexer {
|
class ConfIndexer {
|
||||||
public:
|
public:
|
||||||
enum runStatus {IndexerOk, IndexerError};
|
enum runStatus {IndexerOk, IndexerError};
|
||||||
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0) {}
|
ConfIndexer(RclConfig *cnf) : config(cnf), dbindexer(0)
|
||||||
virtual ~ConfIndexer();
|
{
|
||||||
/** Worker function: doe the actual indexing */
|
}
|
||||||
bool index(bool resetbefore = false);
|
virtual ~ConfIndexer();
|
||||||
|
/** Worker function: doe the actual indexing */
|
||||||
|
bool index(bool resetbefore = false);
|
||||||
private:
|
private:
|
||||||
RclConfig *config;
|
RclConfig *config;
|
||||||
DbIndexer *dbindexer; // Object to process directories for a given db
|
DbIndexer *dbindexer; // Object to process directories for a given db
|
||||||
@ -36,10 +38,10 @@ class ConfIndexer {
|
|||||||
/** Index things into one database
|
/** Index things into one database
|
||||||
|
|
||||||
Tree indexing: we inherits FsTreeWalkerCB so that, the processone()
|
Tree indexing: we inherits FsTreeWalkerCB so that, the processone()
|
||||||
method is called by the file-system tree walk code for each file and
|
method is called by the file-system tree walk code for each file and
|
||||||
directory. We keep all state needed while indexing, and finally call
|
directory. We keep all state needed while indexing, and finally call
|
||||||
the methods to purge the db of stale entries and create the stemming
|
the methods to purge the db of stale entries and create the stemming
|
||||||
databases.
|
databases.
|
||||||
|
|
||||||
Single file(s) indexing: no database purging or stem db updating.
|
Single file(s) indexing: no database purging or stem db updating.
|
||||||
*/
|
*/
|
||||||
@ -67,6 +69,9 @@ class DbIndexer : public FsTreeWalkerCB {
|
|||||||
/** Index a list of files. No db cleaning or stemdb updating */
|
/** Index a list of files. No db cleaning or stemdb updating */
|
||||||
bool indexFiles(const std::list<std::string> &files);
|
bool indexFiles(const std::list<std::string> &files);
|
||||||
|
|
||||||
|
/** Create stem database for given language */
|
||||||
|
bool createStemDb(const string &lang);
|
||||||
|
|
||||||
/** Tree walker callback method */
|
/** Tree walker callback method */
|
||||||
FsTreeWalker::Status
|
FsTreeWalker::Status
|
||||||
processone(const std::string &, const struct stat *,
|
processone(const std::string &, const struct stat *,
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.13 2005-12-14 11:00:48 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.14 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
@ -19,10 +19,12 @@ using namespace std;
|
|||||||
#include "pathut.h"
|
#include "pathut.h"
|
||||||
|
|
||||||
|
|
||||||
|
// Globals for exit cleanup
|
||||||
ConfIndexer *confindexer;
|
ConfIndexer *confindexer;
|
||||||
DbIndexer *dbindexer;
|
DbIndexer *dbindexer;
|
||||||
|
|
||||||
bool indexfiles(RclConfig *config, const list<string> &filenames)
|
// Index a list of files
|
||||||
|
static bool indexfiles(RclConfig *config, const list<string> &filenames)
|
||||||
{
|
{
|
||||||
if (filenames.empty())
|
if (filenames.empty())
|
||||||
return true;
|
return true;
|
||||||
@ -42,6 +44,21 @@ bool indexfiles(RclConfig *config, const list<string> &filenames)
|
|||||||
return dbindexer->indexFiles(filenames);
|
return dbindexer->indexFiles(filenames);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create additional stem database
|
||||||
|
static bool createstemdb(RclConfig *config, const string &lang)
|
||||||
|
{
|
||||||
|
// Note that we do not bother to check for multiple databases,
|
||||||
|
// which are currently a fiction anyway.
|
||||||
|
string dbdir;
|
||||||
|
if (!config->getConfParam("dbdir", dbdir)) {
|
||||||
|
LOGERR(("createstemdb: no database directory in configuration\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
dbdir = path_tildexpand(dbdir);
|
||||||
|
dbindexer = new DbIndexer(config, dbdir);
|
||||||
|
return dbindexer->createStemDb(lang);
|
||||||
|
}
|
||||||
|
|
||||||
static void cleanup()
|
static void cleanup()
|
||||||
{
|
{
|
||||||
delete confindexer;
|
delete confindexer;
|
||||||
@ -63,15 +80,19 @@ static int op_flags;
|
|||||||
#define OPT_z 0x2
|
#define OPT_z 0x2
|
||||||
#define OPT_h 0x4
|
#define OPT_h 0x4
|
||||||
#define OPT_i 0x8
|
#define OPT_i 0x8
|
||||||
|
#define OPT_s 0x10
|
||||||
|
|
||||||
static const char usage [] =
|
static const char usage [] =
|
||||||
" recollindex [-hz] \n"
|
"\n"
|
||||||
" recollindex -i <filename [filename ...]>\n"
|
"recollindex [-hz] \n"
|
||||||
|
" Normal index run\n"
|
||||||
|
"recollindex -i <filename [filename ...]>\n"
|
||||||
|
" Index individual files. No db purge or stem database updates\n"
|
||||||
|
"recollindex -s <lang>\n"
|
||||||
|
" Build stem database for language <lang>\n"
|
||||||
"Options:\n"
|
"Options:\n"
|
||||||
" -h : print this message\n"
|
" -h : print this message\n"
|
||||||
" -z : reset database before starting indexation\n\n"
|
" -z : reset database before starting indexation\n\n"
|
||||||
" -i <filename [filename ...]> : index individual files. No db purge or stem\n"
|
|
||||||
" database updates in this case\n"
|
|
||||||
;
|
;
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -97,6 +118,7 @@ int main(int argc, const char **argv)
|
|||||||
case 'z': op_flags |= OPT_z; break;
|
case 'z': op_flags |= OPT_z; break;
|
||||||
case 'h': op_flags |= OPT_h; break;
|
case 'h': op_flags |= OPT_h; break;
|
||||||
case 'i': op_flags |= OPT_i; break;
|
case 'i': op_flags |= OPT_i; break;
|
||||||
|
case 's': op_flags |= OPT_s; break;
|
||||||
default: Usage(); break;
|
default: Usage(); break;
|
||||||
}
|
}
|
||||||
b1: argc--; argv++;
|
b1: argc--; argv++;
|
||||||
@ -108,7 +130,6 @@ int main(int argc, const char **argv)
|
|||||||
|
|
||||||
string reason;
|
string reason;
|
||||||
RclConfig *config = recollinit(cleanup, sigcleanup, reason);
|
RclConfig *config = recollinit(cleanup, sigcleanup, reason);
|
||||||
|
|
||||||
if (config == 0 || !config->ok()) {
|
if (config == 0 || !config->ok()) {
|
||||||
cerr << "Configuration problem: " << reason << endl;
|
cerr << "Configuration problem: " << reason << endl;
|
||||||
exit(1);
|
exit(1);
|
||||||
@ -130,6 +151,11 @@ int main(int argc, const char **argv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
exit(!indexfiles(config, filenames));
|
exit(!indexfiles(config, filenames));
|
||||||
|
} else if (op_flags & OPT_s) {
|
||||||
|
if (argc != 1)
|
||||||
|
Usage();
|
||||||
|
string lang = *argv++; argc--;
|
||||||
|
exit(!createstemdb(config, lang));
|
||||||
} else {
|
} else {
|
||||||
confindexer = new ConfIndexer(config);
|
confindexer = new ConfIndexer(config);
|
||||||
bool rezero(op_flags & OPT_z);
|
bool rezero(op_flags & OPT_z);
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.48 2006-01-06 13:55:44 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.49 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
@ -23,6 +23,7 @@ using namespace std;
|
|||||||
#include "smallut.h"
|
#include "smallut.h"
|
||||||
#include "pathhash.h"
|
#include "pathhash.h"
|
||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
|
#include "wipedir.h"
|
||||||
|
|
||||||
#include "xapian.h"
|
#include "xapian.h"
|
||||||
#include <xapian/stem.h>
|
#include <xapian/stem.h>
|
||||||
@ -67,23 +68,24 @@ Rcl::Db::~Db()
|
|||||||
ndb->iswritable));
|
ndb->iswritable));
|
||||||
if (ndb->isopen == false)
|
if (ndb->isopen == false)
|
||||||
return;
|
return;
|
||||||
string ermsg;
|
const char *ermsg = "Unknown error";
|
||||||
try {
|
try {
|
||||||
LOGDEB(("Rcl::Db::~Db: closing native database\n"));
|
LOGDEB(("Rcl::Db::~Db: closing native database\n"));
|
||||||
if (ndb->iswritable == true)
|
if (ndb->iswritable == true) {
|
||||||
ndb->wdb.flush();
|
ndb->wdb.flush();
|
||||||
|
}
|
||||||
delete ndb;
|
delete ndb;
|
||||||
return;
|
return;
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
ermsg = e.get_msg();
|
ermsg = e.get_msg().c_str();
|
||||||
} catch (const string &s) {
|
} catch (const string &s) {
|
||||||
ermsg = s;
|
ermsg = s.c_str();
|
||||||
} catch (const char *s) {
|
} catch (const char *s) {
|
||||||
ermsg = s;
|
ermsg = s;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
ermsg = "Caught unknown exception";
|
ermsg = "Caught unknown exception";
|
||||||
}
|
}
|
||||||
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg.c_str()));
|
LOGERR(("Rcl::Db::~Db: got exception: %s\n", ermsg));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Rcl::Db::open(const string& dir, OpenMode mode)
|
bool Rcl::Db::open(const string& dir, OpenMode mode)
|
||||||
@ -98,7 +100,7 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
|
|||||||
LOGERR(("Rcl::Db::open: already open\n"));
|
LOGERR(("Rcl::Db::open: already open\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
string ermsg;
|
const char *ermsg = "Unknown";
|
||||||
try {
|
try {
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case DbUpd:
|
case DbUpd:
|
||||||
@ -125,16 +127,16 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
|
|||||||
ndb->basedir = dir;
|
ndb->basedir = dir;
|
||||||
return true;
|
return true;
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
ermsg = e.get_msg();
|
ermsg = e.get_msg().c_str();
|
||||||
} catch (const string &s) {
|
} catch (const string &s) {
|
||||||
ermsg = s;
|
ermsg = s.c_str();
|
||||||
} catch (const char *s) {
|
} catch (const char *s) {
|
||||||
ermsg = s;
|
ermsg = s;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
ermsg = "Caught unknown exception";
|
ermsg = "Caught unknown exception";
|
||||||
}
|
}
|
||||||
LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n",
|
LOGERR(("Rcl::Db::open: exception while opening '%s': %s\n",
|
||||||
dir.c_str(), ermsg.c_str()));
|
dir.c_str(), ermsg));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -148,7 +150,7 @@ bool Rcl::Db::close()
|
|||||||
ndb->iswritable));
|
ndb->iswritable));
|
||||||
if (ndb->isopen == false)
|
if (ndb->isopen == false)
|
||||||
return true;
|
return true;
|
||||||
string ermsg;
|
const char *ermsg = "Unknown";
|
||||||
try {
|
try {
|
||||||
if (ndb->iswritable == true) {
|
if (ndb->iswritable == true) {
|
||||||
ndb->wdb.flush();
|
ndb->wdb.flush();
|
||||||
@ -159,16 +161,15 @@ bool Rcl::Db::close()
|
|||||||
if (pdata)
|
if (pdata)
|
||||||
return true;
|
return true;
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
ermsg = e.get_msg();
|
ermsg = e.get_msg().c_str();
|
||||||
} catch (const string &s) {
|
} catch (const string &s) {
|
||||||
ermsg = s;
|
ermsg = s.c_str();
|
||||||
} catch (const char *s) {
|
} catch (const char *s) {
|
||||||
ermsg = s;
|
ermsg = s;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
ermsg = "Caught unknown exception";
|
ermsg = "Caught unknown exception";
|
||||||
}
|
}
|
||||||
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n",
|
LOGERR(("Rcl::Db:close: exception while deleting db: %s\n", ermsg));
|
||||||
ermsg.c_str()));
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -194,21 +195,29 @@ class mySplitterCB : public TextSplitCB {
|
|||||||
// Callback for the document to word splitting class during indexation
|
// Callback for the document to word splitting class during indexation
|
||||||
bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
|
bool mySplitterCB::takeword(const std::string &term, int pos, int, int)
|
||||||
{
|
{
|
||||||
// cerr << "splitCb: term " << term << endl;
|
#if 0
|
||||||
//string printable;
|
LOGDEB(("mySplitterCB::takeword:splitCb: [%s]\n", term.c_str()));
|
||||||
//transcode(term, printable, "UTF-8", "ISO-8859-1");
|
string printable;
|
||||||
//cerr << "Adding " << printable << endl;
|
if (transcode(term, printable, "UTF-8", "ISO-8859-1")) {
|
||||||
|
LOGDEB((" [%s]\n", printable.c_str()));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const char *ermsg;
|
||||||
try {
|
try {
|
||||||
// 1 is the value for wdfinc in index_text when called from omindex
|
// Note: 1 is the within document frequency increment. It would
|
||||||
// TOBEDONE: check what this is used for
|
// be possible to assign different weigths to doc parts (ie title)
|
||||||
|
// by using a higher value
|
||||||
curpos = pos;
|
curpos = pos;
|
||||||
doc.add_posting(term, basepos + curpos, 1);
|
doc.add_posting(term, basepos + curpos, 1);
|
||||||
|
return true;
|
||||||
|
} catch (const Xapian::Error &e) {
|
||||||
|
ermsg = e.get_msg().c_str();
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
LOGERR(("Rcl::Db: Error occurred during xapian add_posting\n"));
|
ermsg= "Unknown error";
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
return true;
|
LOGERR(("Rcl::Db: xapian add_posting error %s\n", ermsg));
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unaccent and lowercase data, replace \n\r with spaces
|
// Unaccent and lowercase data, replace \n\r with spaces
|
||||||
@ -239,7 +248,7 @@ bool Rcl::dumb_string(const string &in, string &out)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* omindex direct */
|
/* From omindex direct */
|
||||||
/* Truncate a string to a given maxlength, avoiding cutting off midword
|
/* Truncate a string to a given maxlength, avoiding cutting off midword
|
||||||
* if reasonably possible. */
|
* if reasonably possible. */
|
||||||
string
|
string
|
||||||
@ -266,17 +275,13 @@ truncate_to_word(string & input, string::size_type maxlen)
|
|||||||
|
|
||||||
output += " ...";
|
output += " ...";
|
||||||
}
|
}
|
||||||
|
// No need to replace newlines with spaces, we do this in dumb_string()
|
||||||
// replace newlines with spaces
|
|
||||||
size_t i = 0;
|
|
||||||
while ((i = output.find('\n', i)) != string::npos) output[i] = ' ';
|
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Truncate longer path and uniquize with hash . The goad for this is
|
// Truncate longer path and uniquize with hash . The goal for this is
|
||||||
// to avoid xapian max term length limitations, not to gain space (we
|
// to avoid xapian max term length limitations, not to gain space (we
|
||||||
// gain very little even with very short maxlens like 30)
|
// gain very little even with very short maxlens like 30)
|
||||||
#define HASHPATH
|
|
||||||
#define PATHHASHLEN 150
|
#define PATHHASHLEN 150
|
||||||
|
|
||||||
// Add document in internal form to the database: index the terms in
|
// Add document in internal form to the database: index the terms in
|
||||||
@ -310,7 +315,8 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
|||||||
|
|
||||||
// Split and index file name. This supposes that it's either ascii
|
// Split and index file name. This supposes that it's either ascii
|
||||||
// or utf-8. If this fails, we just go on. We need a config
|
// or utf-8. If this fails, we just go on. We need a config
|
||||||
// parameter for file name charset
|
// parameter for file name charset.
|
||||||
|
// Do we really want to fold case here ?
|
||||||
if (dumb_string(fn, noacc)) {
|
if (dumb_string(fn, noacc)) {
|
||||||
splitter.text_to_words(noacc);
|
splitter.text_to_words(noacc);
|
||||||
splitData.basepos += splitData.curpos + 100;
|
splitData.basepos += splitData.curpos + 100;
|
||||||
@ -324,7 +330,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
|||||||
splitter.text_to_words(noacc);
|
splitter.text_to_words(noacc);
|
||||||
splitData.basepos += splitData.curpos + 100;
|
splitData.basepos += splitData.curpos + 100;
|
||||||
|
|
||||||
// Split body and index terms
|
// Split and index body
|
||||||
if (!dumb_string(doc.text, noacc)) {
|
if (!dumb_string(doc.text, noacc)) {
|
||||||
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
||||||
return false;
|
return false;
|
||||||
@ -332,7 +338,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
|||||||
splitter.text_to_words(noacc);
|
splitter.text_to_words(noacc);
|
||||||
splitData.basepos += splitData.curpos + 100;
|
splitData.basepos += splitData.curpos + 100;
|
||||||
|
|
||||||
// Split keywords and index terms
|
// Split and index keywords
|
||||||
if (!dumb_string(doc.keywords, noacc)) {
|
if (!dumb_string(doc.keywords, noacc)) {
|
||||||
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
||||||
return false;
|
return false;
|
||||||
@ -340,7 +346,7 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
|||||||
splitter.text_to_words(noacc);
|
splitter.text_to_words(noacc);
|
||||||
splitData.basepos += splitData.curpos + 100;
|
splitData.basepos += splitData.curpos + 100;
|
||||||
|
|
||||||
// Split abstract and index terms
|
// Split and index abstract
|
||||||
if (!dumb_string(doc.abstract, noacc)) {
|
if (!dumb_string(doc.abstract, noacc)) {
|
||||||
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
LOGERR(("Rcl::Db::add: dumb_string failed\n"));
|
||||||
return false;
|
return false;
|
||||||
@ -354,18 +360,13 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
|||||||
|
|
||||||
// Path name
|
// Path name
|
||||||
string hash;
|
string hash;
|
||||||
#ifdef HASHPATH
|
|
||||||
pathHash(fn, hash, PATHHASHLEN);
|
pathHash(fn, hash, PATHHASHLEN);
|
||||||
#else
|
|
||||||
hash = fn;
|
|
||||||
#endif
|
|
||||||
LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
|
LOGDEB2(("Rcl::Db::add: pathhash [%s]\n", hash.c_str()));
|
||||||
|
|
||||||
string pathterm = "P" + hash;
|
string pathterm = "P" + hash;
|
||||||
newdocument.add_term(pathterm);
|
newdocument.add_term(pathterm);
|
||||||
|
|
||||||
// File path + internal path: document unique identifier for
|
// Internal path: with path, makes unique identifier for documents
|
||||||
// documents inside multidocument files.
|
// inside multidocument files.
|
||||||
string uniterm;
|
string uniterm;
|
||||||
if (!doc.ipath.empty()) {
|
if (!doc.ipath.empty()) {
|
||||||
uniterm = "Q" + hash + "|" + doc.ipath;
|
uniterm = "Q" + hash + "|" + doc.ipath;
|
||||||
@ -395,8 +396,9 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
|||||||
string record = "url=file://" + fn;
|
string record = "url=file://" + fn;
|
||||||
record += "\nmtype=" + doc.mimetype;
|
record += "\nmtype=" + doc.mimetype;
|
||||||
record += "\nfmtime=" + doc.fmtime;
|
record += "\nfmtime=" + doc.fmtime;
|
||||||
if (!doc.dmtime.empty())
|
if (!doc.dmtime.empty()) {
|
||||||
record += "\ndmtime=" + doc.dmtime;
|
record += "\ndmtime=" + doc.dmtime;
|
||||||
|
}
|
||||||
record += "\norigcharset=" + doc.origcharset;
|
record += "\norigcharset=" + doc.origcharset;
|
||||||
record += "\ncaption=" + doc.title;
|
record += "\ncaption=" + doc.title;
|
||||||
record += "\nkeywords=" + doc.keywords;
|
record += "\nkeywords=" + doc.keywords;
|
||||||
@ -405,12 +407,10 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
|||||||
record += "\nipath=" + doc.ipath;
|
record += "\nipath=" + doc.ipath;
|
||||||
}
|
}
|
||||||
record += "\n";
|
record += "\n";
|
||||||
|
|
||||||
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
||||||
newdocument.set_data(record);
|
newdocument.set_data(record);
|
||||||
|
|
||||||
const char *fnc = fn.c_str();
|
const char *fnc = fn.c_str();
|
||||||
|
|
||||||
// Add db entry or update existing entry:
|
// Add db entry or update existing entry:
|
||||||
try {
|
try {
|
||||||
Xapian::docid did =
|
Xapian::docid did =
|
||||||
@ -426,13 +426,19 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &idoc)
|
|||||||
}
|
}
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
// FIXME: is this ever actually needed?
|
// FIXME: is this ever actually needed?
|
||||||
ndb->wdb.add_document(newdocument);
|
try {
|
||||||
LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n",
|
ndb->wdb.add_document(newdocument);
|
||||||
fnc));
|
LOGDEB(("Rcl::Db::add: %s added (failed re-seek for duplicate)\n",
|
||||||
|
fnc));
|
||||||
|
} catch (...) {
|
||||||
|
LOGERR(("Rcl::Db::add: failed again after replace_document\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test if given filename has changed since last indexed:
|
||||||
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
||||||
{
|
{
|
||||||
if (pdata == 0)
|
if (pdata == 0)
|
||||||
@ -441,16 +447,9 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
|||||||
|
|
||||||
// If no document exist with this path, we do need update
|
// If no document exist with this path, we do need update
|
||||||
string hash;
|
string hash;
|
||||||
#ifdef HASHPATH
|
|
||||||
pathHash(filename, hash, PATHHASHLEN);
|
pathHash(filename, hash, PATHHASHLEN);
|
||||||
#else
|
|
||||||
hash = filename;
|
|
||||||
#endif
|
|
||||||
string pathterm = "P" + hash;
|
string pathterm = "P" + hash;
|
||||||
if (!ndb->wdb.term_exists(pathterm)) {
|
const char *ermsg;
|
||||||
LOGDEB1(("Db::needUpdate: path inexistant: %s\n", pathterm.c_str()));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Look for all documents with this path. We need to look at all
|
// Look for all documents with this path. We need to look at all
|
||||||
// to set their existence flag. We check the update time on the
|
// to set their existence flag. We check the update time on the
|
||||||
@ -459,6 +458,11 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
|||||||
// file changed)
|
// file changed)
|
||||||
Xapian::PostingIterator doc;
|
Xapian::PostingIterator doc;
|
||||||
try {
|
try {
|
||||||
|
if (!ndb->wdb.term_exists(pathterm)) {
|
||||||
|
LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
|
Xapian::PostingIterator docid0 = ndb->wdb.postlist_begin(pathterm);
|
||||||
for (Xapian::PostingIterator docid = docid0;
|
for (Xapian::PostingIterator docid = docid0;
|
||||||
docid != ndb->wdb.postlist_end(pathterm); docid++) {
|
docid != ndb->wdb.postlist_end(pathterm); docid++) {
|
||||||
@ -491,21 +495,26 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
|
|||||||
if (*docid < ndb->updated.size())
|
if (*docid < ndb->updated.size())
|
||||||
ndb->updated[*docid] = true;
|
ndb->updated[*docid] = true;
|
||||||
}
|
}
|
||||||
|
return false;
|
||||||
|
} catch (const Xapian::Error &e) {
|
||||||
|
ermsg = e.get_msg().c_str();
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
return true;
|
ermsg= "Unknown error";
|
||||||
}
|
}
|
||||||
|
LOGERR(("Db::needUpdate: error while checking existence: %s\n", ermsg));
|
||||||
return false;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const static string stemdirstem = "stem_";
|
||||||
/// Compute name of stem db for given base database and language
|
/// Compute name of stem db for given base database and language
|
||||||
static string stemdbname(const string& basename, string lang)
|
static string stemdbname(const string& basename, string lang)
|
||||||
{
|
{
|
||||||
string nm = path_cat(basename, string("stem_") + lang);
|
string nm = path_cat(basename, stemdirstem + lang);
|
||||||
return nm;
|
return nm;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Is char non-lowercase ascii ?
|
// Deciding if we try to stem the term. If it has numerals or capitals
|
||||||
|
// we don't
|
||||||
inline static bool
|
inline static bool
|
||||||
p_notlowerorutf(unsigned int c)
|
p_notlowerorutf(unsigned int c)
|
||||||
{
|
{
|
||||||
@ -514,6 +523,24 @@ p_notlowerorutf(unsigned int c)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete stem db for given language
|
||||||
|
*/
|
||||||
|
bool Rcl::Db::deleteStemDb(const string& lang)
|
||||||
|
{
|
||||||
|
LOGDEB(("Rcl::Db::deleteStemDb(%s)\n", lang.c_str()));
|
||||||
|
if (pdata == 0)
|
||||||
|
return false;
|
||||||
|
Native *ndb = (Native *)pdata;
|
||||||
|
if (ndb->isopen == false)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
string dir = stemdbname(ndb->basedir, lang);
|
||||||
|
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create database of stem to parents associations for a given language.
|
* Create database of stem to parents associations for a given language.
|
||||||
* We walk the list of all terms, stem them, and create another Xapian db
|
* We walk the list of all terms, stem them, and create another Xapian db
|
||||||
@ -526,7 +553,7 @@ bool Rcl::Db::createStemDb(const string& lang)
|
|||||||
if (pdata == 0)
|
if (pdata == 0)
|
||||||
return false;
|
return false;
|
||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
if (ndb->isopen == false || ndb->iswritable == false)
|
if (ndb->isopen == false)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// First build the in-memory stem database:
|
// First build the in-memory stem database:
|
||||||
@ -562,23 +589,41 @@ bool Rcl::Db::createStemDb(const string& lang)
|
|||||||
}
|
}
|
||||||
assocs.insert(pair<string,string>(stem, *it));
|
assocs.insert(pair<string,string>(stem, *it));
|
||||||
}
|
}
|
||||||
|
} catch (const Xapian::Error &e) {
|
||||||
|
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
||||||
|
return false;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
LOGERR(("Stem database build failed: no stemmer for %s ? \n",
|
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
||||||
lang.c_str()));
|
lang.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class DirWiper {
|
||||||
|
public:
|
||||||
|
string dir;
|
||||||
|
bool do_it;
|
||||||
|
DirWiper(string d) : dir(d), do_it(true) {}
|
||||||
|
~DirWiper() {
|
||||||
|
if (do_it) {
|
||||||
|
wipedir(dir);
|
||||||
|
rmdir(dir.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
// Create xapian database for stem relations
|
// Create xapian database for stem relations
|
||||||
string stemdbdir = stemdbname(ndb->basedir, lang);
|
string stemdbdir = stemdbname(ndb->basedir, lang);
|
||||||
string ermsg = "NOERROR";
|
// We want to get rid of the db dir in case of error. This gets disarmed
|
||||||
|
// just before success return.
|
||||||
|
DirWiper wiper(stemdbdir);
|
||||||
|
const char *ermsg = "NOERROR";
|
||||||
Xapian::WritableDatabase sdb;
|
Xapian::WritableDatabase sdb;
|
||||||
try {
|
try {
|
||||||
sdb = Xapian::WritableDatabase(stemdbdir,
|
sdb = Xapian::WritableDatabase(stemdbdir,
|
||||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
ermsg = e.get_msg();
|
ermsg = e.get_msg().c_str();
|
||||||
} catch (const string &s) {
|
} catch (const string &s) {
|
||||||
ermsg = s;
|
ermsg = s.c_str();
|
||||||
} catch (const char *s) {
|
} catch (const char *s) {
|
||||||
ermsg = s;
|
ermsg = s;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
@ -586,7 +631,7 @@ bool Rcl::Db::createStemDb(const string& lang)
|
|||||||
}
|
}
|
||||||
if (ermsg != "NOERROR") {
|
if (ermsg != "NOERROR") {
|
||||||
LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
|
LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
|
||||||
stemdbdir.c_str(), ermsg.c_str()));
|
stemdbdir.c_str(), ermsg));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -632,9 +677,27 @@ bool Rcl::Db::createStemDb(const string& lang)
|
|||||||
}
|
}
|
||||||
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
||||||
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
||||||
|
wiper.do_it = false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * List the languages for which a stem database directory currently exists.
 *
 * Globs the index base directory for entries whose name starts with
 * stemdirstem, then reduces each match to what follows that prefix in its
 * last path component. Returns an empty list when there is no native data.
 */
list<string> Rcl::Db::getStemLangs()
{
    LOGDEB(("Rcl::Db::getStemLang\n"));

    list<string> langs;
    if (pdata == 0)
        return langs;
    Native *native = (Native *)pdata;

    langs = path_dirglob(native->basedir, stemdirstem + "*");

    // Each match is a full path: keep the simple name, then drop the
    // common directory-name prefix so only the language part remains.
    const string::size_type prefixlen = stemdirstem.length();
    list<string>::iterator cur = langs.begin();
    while (cur != langs.end()) {
        *cur = path_basename(*cur).substr(prefixlen);
        ++cur;
    }
    return langs;
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is called at the end of an indexing session, to delete the
|
* This is called at the end of an indexing session, to delete the
|
||||||
* documents for files that are no longer there. We also build the
|
* documents for files that are no longer there. We also build the
|
||||||
@ -658,7 +721,11 @@ bool Rcl::Db::purge()
|
|||||||
// and does nothing). Maybe related to the exceptions below when
|
// and does nothing). Maybe related to the exceptions below when
|
||||||
// trying to delete an unexistant document ?
|
// trying to delete an unexistant document ?
|
||||||
// Flushing before trying the deletes seeems to work around the problem
|
// Flushing before trying the deletes seeems to work around the problem
|
||||||
ndb->wdb.flush();
|
try {
|
||||||
|
ndb->wdb.flush();
|
||||||
|
} catch (...) {
|
||||||
|
LOGDEB(("Rcl::Db::purge: 1st flush failed\n"));
|
||||||
|
}
|
||||||
for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
|
for (Xapian::docid docid = 1; docid < ndb->updated.size(); ++docid) {
|
||||||
if (!ndb->updated[docid]) {
|
if (!ndb->updated[docid]) {
|
||||||
try {
|
try {
|
||||||
@ -669,7 +736,11 @@ bool Rcl::Db::purge()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ndb->wdb.flush();
|
try {
|
||||||
|
ndb->wdb.flush();
|
||||||
|
} catch (...) {
|
||||||
|
LOGDEB(("Rcl::Db::purge: 2nd flush failed\n"));
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -749,7 +820,6 @@ class wsQData : public TextSplitCB {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
//
|
|
||||||
// Turn string into list of xapian queries. There is little
|
// Turn string into list of xapian queries. There is little
|
||||||
// interpretation done on the string (no +term -term or filename:term
|
// interpretation done on the string (no +term -term or filename:term
|
||||||
// stuff). We just separate words and phrases, and interpret
|
// stuff). We just separate words and phrases, and interpret
|
||||||
@ -1124,21 +1194,18 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
|
|||||||
Native *ndb = (Native *)pdata;
|
Native *ndb = (Native *)pdata;
|
||||||
|
|
||||||
string hash;
|
string hash;
|
||||||
#ifdef HASHPATH
|
|
||||||
pathHash(fn, hash, PATHHASHLEN);
|
pathHash(fn, hash, PATHHASHLEN);
|
||||||
#else
|
|
||||||
hash = fn;
|
|
||||||
#endif
|
|
||||||
string pathterm = "P" + hash;
|
string pathterm = "P" + hash;
|
||||||
if (!ndb->db.term_exists(pathterm)) {
|
|
||||||
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
|
|
||||||
pathterm.c_str(), pathterm.length()));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Look for all documents with this path, searching for the one
|
// Look for all documents with this path, searching for the one
|
||||||
// with the appropriate ipath. This is very inefficient.
|
// with the appropriate ipath. This is very inefficient.
|
||||||
|
const char *ermsg = "";
|
||||||
try {
|
try {
|
||||||
|
if (!ndb->db.term_exists(pathterm)) {
|
||||||
|
LOGDEB(("Db::getDoc: path inexistant: [%s] len %d\n",
|
||||||
|
pathterm.c_str(), pathterm.length()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
for (Xapian::PostingIterator docid =
|
for (Xapian::PostingIterator docid =
|
||||||
ndb->db.postlist_begin(pathterm);
|
ndb->db.postlist_begin(pathterm);
|
||||||
docid != ndb->db.postlist_end(pathterm); docid++) {
|
docid != ndb->db.postlist_end(pathterm); docid++) {
|
||||||
@ -1148,8 +1215,17 @@ bool Rcl::Db::getDoc(const string &fn, const string &ipath, Doc &doc)
|
|||||||
if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
|
if (dbDataToRclDoc(data, doc) && doc.ipath == ipath)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
} catch (const Xapian::Error &e) {
|
||||||
|
ermsg = e.get_msg().c_str();
|
||||||
|
} catch (const string &s) {
|
||||||
|
ermsg = s.c_str();
|
||||||
|
} catch (const char *s) {
|
||||||
|
ermsg = s;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
return false;
|
ermsg = "Caught unknown exception";
|
||||||
|
}
|
||||||
|
if (*ermsg) {
|
||||||
|
LOGERR(("Rcl::Db::getDoc: %s\n", ermsg));
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _DB_H_INCLUDED_
|
#ifndef _DB_H_INCLUDED_
|
||||||
#define _DB_H_INCLUDED_
|
#define _DB_H_INCLUDED_
|
||||||
/* @(#$Id: rcldb.h,v 1.20 2005-12-02 16:18:20 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: rcldb.h,v 1.21 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
@ -102,6 +102,7 @@ public:
|
|||||||
bool needUpdate(const string &filename, const struct stat *stp);
|
bool needUpdate(const string &filename, const struct stat *stp);
|
||||||
bool purge();
|
bool purge();
|
||||||
bool createStemDb(const string &lang);
|
bool createStemDb(const string &lang);
|
||||||
|
bool deleteStemDb(const string &lang);
|
||||||
|
|
||||||
// Query-related functions
|
// Query-related functions
|
||||||
|
|
||||||
@ -127,6 +128,10 @@ public:
|
|||||||
/** Get results count for current query */
|
/** Get results count for current query */
|
||||||
int getResCnt();
|
int getResCnt();
|
||||||
|
|
||||||
|
/** Get a list of existing stemming databases */
|
||||||
|
std::list<std::string> getStemLangs();
|
||||||
|
|
||||||
|
/** Things we don't want to have here. */
|
||||||
friend class Rcl::DbPops;
|
friend class Rcl::DbPops;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|||||||
@ -15,8 +15,8 @@ trfstreewalk.o : fstreewalk.cpp fstreewalk.h
|
|||||||
$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
|
$(CXX) -o trfstreewalk.o -c $(CXXFLAGS) \
|
||||||
-DTEST_FSTREEWALK fstreewalk.cpp
|
-DTEST_FSTREEWALK fstreewalk.cpp
|
||||||
|
|
||||||
PATHUT_OBJS= trpathut.o pathut.o
|
PATHUT_OBJS= trpathut.o pathut.o $(BIGLIB)
|
||||||
trpathut : $(PATHUT_OBJS)
|
trpathut : $(PATHUT_OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
|
$(CXX) $(CXXFLAGS) -o trpathut $(PATHUT_OBJS)
|
||||||
trpathut.o : pathut.cpp pathut.h
|
trpathut.o : pathut.cpp pathut.h
|
||||||
$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp
|
$(CXX) -o trpathut.o -c $(CXXFLAGS) -DTEST_PATHUT pathut.cpp
|
||||||
|
|||||||
@ -1,15 +1,21 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.6 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: pathut.cpp,v 1.7 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef TEST_PATHUT
|
#ifndef TEST_PATHUT
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include <sys/param.h>
|
||||||
#include <pwd.h>
|
#include <pwd.h>
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <list>
|
||||||
|
#include <stack>
|
||||||
|
|
||||||
#include "pathut.h"
|
#include "pathut.h"
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using std::string;
|
using std::string;
|
||||||
|
using std::list;
|
||||||
|
using std::stack;
|
||||||
#endif /* NO_NAMESPACES */
|
#endif /* NO_NAMESPACES */
|
||||||
|
|
||||||
void path_catslash(std::string &s) {
|
void path_catslash(std::string &s) {
|
||||||
@ -61,6 +67,18 @@ string path_getsimple(const string &s) {
|
|||||||
return simple;
|
return simple;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Return the simple name of a path (as computed by path_getsimple()),
 * optionally with a trailing suffix removed.
 *
 * The suffix is stripped only when it is non-empty, strictly shorter than
 * the simple name, and located at the very end of it. In every other case
 * the simple name is returned unchanged.
 */
string path_basename(const string &s, const string &suff)
{
    const string name = path_getsimple(s);
    const string::size_type sufflen = suff.length();

    // No suffix requested, or the suffix would consume the whole name.
    if (sufflen == 0 || name.length() <= sufflen)
        return name;

    // Compare the tail of the name against the suffix.
    const string::size_type cut = name.length() - sufflen;
    if (name.compare(cut, sufflen, suff) == 0)
        return name.substr(0, cut);
    return name;
}
|
||||||
|
|
||||||
string path_home()
|
string path_home()
|
||||||
{
|
{
|
||||||
uid_t uid = getuid();
|
uid_t uid = getuid();
|
||||||
@ -98,6 +116,64 @@ extern string path_tildexpand(const string &s)
|
|||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include <smallut.h>
|
||||||
|
extern std::string path_canon(const std::string &is)
|
||||||
|
{
|
||||||
|
if (is.length() == 0)
|
||||||
|
return is;
|
||||||
|
string s = is;
|
||||||
|
if (s[0] != '/') {
|
||||||
|
char buf[MAXPATHLEN];
|
||||||
|
if (!getcwd(buf, MAXPATHLEN)) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
s = path_cat(string(buf), s);
|
||||||
|
}
|
||||||
|
list<string>elems;
|
||||||
|
stringToTokens(s, elems, "/");
|
||||||
|
list<string> cleaned;
|
||||||
|
for (list<string>::const_iterator it = elems.begin();
|
||||||
|
it != elems.end(); it++){
|
||||||
|
if (*it == "..") {
|
||||||
|
if (!cleaned.empty())
|
||||||
|
cleaned.pop_back();
|
||||||
|
} else if (it->empty() || *it == ".") {
|
||||||
|
} else {
|
||||||
|
cleaned.push_back(*it);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
string ret;
|
||||||
|
if (!cleaned.empty()) {
|
||||||
|
for (list<string>::const_iterator it = cleaned.begin();
|
||||||
|
it != cleaned.end(); it++) {
|
||||||
|
ret += "/";
|
||||||
|
ret += *it;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
ret = "/";
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#include <glob.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
/**
 * Expand a glob(3) pattern relative to a directory.
 *
 * @param dir      directory the pattern is joined to with path_cat().
 * @param pattern  glob(3) pattern, matched with default flags.
 * @return the paths reported by glob(), in the order it returned them.
 *         The list is empty when nothing matches or when glob() fails.
 *         No check is made here that the matches are directories.
 */
list<std::string> path_dirglob(const std::string &dir,
                               const std::string pattern)
{
    list<string> res;
    // Value-initialized so that globfree() below is safe even if glob()
    // bails out before touching the structure.
    glob_t mglob = glob_t();
    string mypat = path_cat(dir, pattern);

    if (glob(mypat.c_str(), 0, 0, &mglob) == 0) {
        // gl_pathc is a size_t: use an unsigned index.
        for (size_t i = 0; i < mglob.gl_pathc; i++) {
            res.push_back(mglob.gl_pathv[i]);
        }
    }

    // glob() may leave partial results allocated when it returns an error,
    // so the structure is released on every path, not only on success.
    globfree(&mglob);
    return res;
}
|
||||||
|
|
||||||
|
|
||||||
#else // TEST_PATHUT
|
#else // TEST_PATHUT
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
@ -108,7 +184,7 @@ using namespace std;
|
|||||||
const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
|
const char *tstvec[] = {"", "/", "/dir", "/dir/", "/dir1/dir2",
|
||||||
"/dir1/dir2",
|
"/dir1/dir2",
|
||||||
"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
|
"./dir", "./dir1/", "dir", "../dir", "/dir/toto.c",
|
||||||
"/dir/.c",
|
"/dir/.c", "/dir/toto.txt", "toto.txt1"
|
||||||
};
|
};
|
||||||
|
|
||||||
const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
|
const string ttvec[] = {"/dir", "", "~", "~/sub", "~root", "~root/sub",
|
||||||
@ -117,22 +193,51 @@ int nttvec = sizeof(ttvec) / sizeof(string);
|
|||||||
|
|
||||||
int main(int argc, const char **argv)
|
int main(int argc, const char **argv)
|
||||||
{
|
{
|
||||||
|
string s;
|
||||||
|
list<string>::const_iterator it;
|
||||||
#if 0
|
#if 0
|
||||||
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
||||||
cout << tstvec[i] << " FATHER " << path_getfather(tstvec[i]) << endl;
|
cout << tstvec[i] << " Father " << path_getfather(tstvec[i]) << endl;
|
||||||
}
|
}
|
||||||
for (int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
||||||
cout << tstvec[i] << " SIMPLE " << path_getsimple(tstvec[i]) << endl;
|
cout << tstvec[i] << " Simple " << path_getsimple(tstvec[i]) << endl;
|
||||||
|
}
|
||||||
|
for (unsigned int i = 0;i < sizeof(tstvec) / sizeof(char *); i++) {
|
||||||
|
cout << tstvec[i] << " Basename " <<
|
||||||
|
path_basename(tstvec[i], ".txt") << endl;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
string s;
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
for (int i = 0; i < nttvec; i++) {
|
for (int i = 0; i < nttvec; i++) {
|
||||||
cout << "tildexp: '" << ttvec[i] << "' -> '" <<
|
cout << "tildexp: '" << ttvec[i] << "' -> '" <<
|
||||||
path_tildexpand(ttvec[i]) << "'" << endl;
|
path_tildexpand(ttvec[i]) << "'" << endl;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
const string canontst[] = {"/dir1/../../..", "/////", "",
|
||||||
|
"/dir1/../../.././/////dir2///////",
|
||||||
|
"../../",
|
||||||
|
"../../../../../../../../../../"
|
||||||
|
};
|
||||||
|
unsigned int nttvec = sizeof(canontst) / sizeof(string);
|
||||||
|
for (unsigned int i = 0; i < nttvec; i++) {
|
||||||
|
cout << "canon: '" << canontst[i] << "' -> '" <<
|
||||||
|
path_canon(canontst[i]) << "'" << endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if 1
|
||||||
|
if (argc != 3) {
|
||||||
|
fprintf(stderr, "Usage: trpathut <dir> <pattern>\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
string dir=argv[1], pattern=argv[2];
|
||||||
|
list<string> matched = path_dirglob(dir, pattern);
|
||||||
|
for (it = matched.begin(); it != matched.end();it++) {
|
||||||
|
cout << *it << endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,14 +1,19 @@
|
|||||||
#ifndef _PATHUT_H_INCLUDED_
|
#ifndef _PATHUT_H_INCLUDED_
|
||||||
#define _PATHUT_H_INCLUDED_
|
#define _PATHUT_H_INCLUDED_
|
||||||
/* @(#$Id: pathut.h,v 1.4 2005-12-13 12:42:59 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: pathut.h,v 1.5 2006-01-09 16:53:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <list>
|
||||||
|
|
||||||
extern void path_catslash(std::string &s);
|
extern void path_catslash(std::string &s);
|
||||||
extern std::string path_cat(const std::string &s1, const std::string &s2);
|
extern std::string path_cat(const std::string &s1, const std::string &s2);
|
||||||
extern std::string path_getsimple(const std::string &s);
|
extern std::string path_getsimple(const std::string &s);
|
||||||
|
extern std::string path_basename(const std::string &s, const std::string &suff="");
|
||||||
extern std::string path_getfather(const std::string &s);
|
extern std::string path_getfather(const std::string &s);
|
||||||
extern std::string path_home();
|
extern std::string path_home();
|
||||||
extern std::string path_tildexpand(const std::string &s);
|
extern std::string path_tildexpand(const std::string &s);
|
||||||
|
|
||||||
|
extern std::string path_canon(const std::string &s);
|
||||||
|
extern std::list<std::string> path_dirglob(const std::string &dir,
|
||||||
|
const std::string pattern);
|
||||||
#endif /* _PATHUT_H_INCLUDED_ */
|
#endif /* _PATHUT_H_INCLUDED_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user