Reimplemented the stem expansion mechanism over Xapian synonyms feature
This commit is contained in:
parent
909d92b218
commit
bd0f002c1a
@ -235,7 +235,7 @@ bool ConfIndexer::createStemmingDatabases()
|
||||
{
|
||||
string slangs;
|
||||
if (m_config->getConfParam("indexstemminglanguages", slangs)) {
|
||||
if (!m_db.open(Rcl::Db::DbRO)) {
|
||||
if (!m_db.open(Rcl::Db::DbUpd)) {
|
||||
LOGERR(("ConfIndexer::createStemmingDb: could not open db\n"))
|
||||
return false;
|
||||
}
|
||||
@ -262,7 +262,7 @@ bool ConfIndexer::createStemmingDatabases()
|
||||
|
||||
bool ConfIndexer::createStemDb(const string &lang)
|
||||
{
|
||||
if (!m_db.open(Rcl::Db::DbRO)) {
|
||||
if (!m_db.open(Rcl::Db::DbUpd)) {
|
||||
return false;
|
||||
}
|
||||
return m_db.createStemDb(lang);
|
||||
|
||||
@ -6,8 +6,8 @@ LIBS = librcl.a
|
||||
|
||||
all: $(LIBS)
|
||||
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
OBJS = rclaspell.o beaglequeuecache.o cstr.o rclconfig.o rclinit.o textsplit.o unacpp.o beaglequeue.o bglfetcher.o fetcher.o fsfetcher.o fsindexer.o indexer.o mimetype.o subtreelist.o htmlparse.o internfile.o mh_exec.o mh_execm.o mh_html.o mh_mail.o mh_mbox.o mh_text.o mimehandler.o myhtmlparse.o txtdcode.o docseq.o docseqdb.o docseqhist.o filtseq.o dynconf.o plaintorich.o recollq.o reslistpager.o sortseq.o wasastringtoquery.o wasatorcl.o rcldb.o rcldoc.o rclquery.o searchdata.o stemdb.o stoplist.o synfamily.o unac.o base64.o circache.o closefrom.o conftree.o copyfile.o debuglog.o ecrontab.o execmd.o fstreewalk.o idfile.o fileudi.o md5.o mimeparse.o netcon.o pathut.o pxattr.o rclionice.o readfile.o smallut.o transcode.o wipedir.o x11mon.o mime-parsefull.o mime-parseonlyheader.o mime-printbody.o mime.o convert.o iodevice.o iofactory.o
|
||||
DEPS = rclaspell.dep.stamp beaglequeuecache.dep.stamp cstr.dep.stamp rclconfig.dep.stamp rclinit.dep.stamp textsplit.dep.stamp unacpp.dep.stamp beaglequeue.dep.stamp bglfetcher.dep.stamp fetcher.dep.stamp fsfetcher.dep.stamp fsindexer.dep.stamp indexer.dep.stamp mimetype.dep.stamp subtreelist.dep.stamp htmlparse.dep.stamp internfile.dep.stamp mh_exec.dep.stamp mh_execm.dep.stamp mh_html.dep.stamp mh_mail.dep.stamp mh_mbox.dep.stamp mh_text.dep.stamp mimehandler.dep.stamp myhtmlparse.dep.stamp txtdcode.dep.stamp docseq.dep.stamp docseqdb.dep.stamp docseqhist.dep.stamp filtseq.dep.stamp dynconf.dep.stamp plaintorich.dep.stamp recollq.dep.stamp reslistpager.dep.stamp sortseq.dep.stamp wasastringtoquery.dep.stamp wasatorcl.dep.stamp rcldb.dep.stamp rcldoc.dep.stamp rclquery.dep.stamp searchdata.dep.stamp stemdb.dep.stamp stoplist.dep.stamp synfamily.dep.stamp unac.dep.stamp base64.dep.stamp circache.dep.stamp closefrom.dep.stamp conftree.dep.stamp copyfile.dep.stamp debuglog.dep.stamp ecrontab.dep.stamp execmd.dep.stamp fstreewalk.dep.stamp idfile.dep.stamp fileudi.dep.stamp md5.dep.stamp mimeparse.dep.stamp netcon.dep.stamp pathut.dep.stamp pxattr.dep.stamp rclionice.dep.stamp readfile.dep.stamp smallut.dep.stamp transcode.dep.stamp wipedir.dep.stamp x11mon.dep.stamp mime-parsefull.dep.stamp mime-parseonlyheader.dep.stamp mime-printbody.dep.stamp mime.dep.stamp convert.dep.stamp iodevice.dep.stamp iofactory.dep.stamp
|
||||
|
||||
librcl.a : $(DEPS) $(OBJS)
|
||||
ar ru librcl.a $(OBJS)
|
||||
@ -99,6 +99,8 @@ stemdb.o : ../rcldb/stemdb.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stemdb.cpp
|
||||
stoplist.o : ../rcldb/stoplist.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/stoplist.cpp
|
||||
synfamily.o : ../rcldb/synfamily.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../rcldb/synfamily.cpp
|
||||
unac.o : ../unac/unac.cpp $(depth)/mk/localdefs
|
||||
$(CXX) $(ALL_CXXFLAGS) -c ../unac/unac.cpp
|
||||
base64.o : ../utils/base64.cpp $(depth)/mk/localdefs
|
||||
@ -294,6 +296,9 @@ stemdb.dep.stamp : ../rcldb/stemdb.cpp $(depth)/mk/localdefs
|
||||
stoplist.dep.stamp : ../rcldb/stoplist.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/stoplist.cpp > stoplist.dep
|
||||
touch stoplist.dep.stamp
|
||||
synfamily.dep.stamp : ../rcldb/synfamily.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../rcldb/synfamily.cpp > synfamily.dep
|
||||
touch synfamily.dep.stamp
|
||||
unac.dep.stamp : ../unac/unac.cpp $(depth)/mk/localdefs
|
||||
$(CXX) -M $(ALL_CXXFLAGS) ../unac/unac.cpp > unac.dep
|
||||
touch unac.dep.stamp
|
||||
@ -406,6 +411,7 @@ include rclquery.dep
|
||||
include searchdata.dep
|
||||
include stemdb.dep
|
||||
include stoplist.dep
|
||||
include synfamily.dep
|
||||
include unac.dep
|
||||
include base64.dep
|
||||
include circache.dep
|
||||
|
||||
@ -47,6 +47,7 @@ ${depth}/rcldb/rclquery.cpp \
|
||||
${depth}/rcldb/searchdata.cpp \
|
||||
${depth}/rcldb/stemdb.cpp \
|
||||
${depth}/rcldb/stoplist.cpp \
|
||||
${depth}/rcldb/synfamily.cpp \
|
||||
${depth}/unac/unac.cpp \
|
||||
${depth}/utils/base64.cpp \
|
||||
${depth}/utils/circache.cpp \
|
||||
|
||||
@ -1607,11 +1607,11 @@ bool Db::needUpdate(const string &udi, const string& sig)
|
||||
vector<string> Db::getStemLangs()
|
||||
{
|
||||
LOGDEB(("Db::getStemLang\n"));
|
||||
vector<string> dirs;
|
||||
vector<string> langs;
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||
return dirs;
|
||||
dirs = StemDb::getLangs(m_basedir);
|
||||
return dirs;
|
||||
return langs;
|
||||
langs = StemDb::getLangs(m_ndb->xrdb);
|
||||
return langs;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1620,9 +1620,9 @@ vector<string> Db::getStemLangs()
|
||||
bool Db::deleteStemDb(const string& lang)
|
||||
{
|
||||
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable)
|
||||
return false;
|
||||
return StemDb::deleteDb(m_basedir, lang);
|
||||
return StemDb::deleteDb(m_ndb->xwdb, lang);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1634,10 +1634,12 @@ bool Db::deleteStemDb(const string& lang)
|
||||
bool Db::createStemDb(const string& lang)
|
||||
{
|
||||
LOGDEB(("Db::createStemDb(%s)\n", lang.c_str()));
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) {
|
||||
LOGERR(("createStemDb: db not open or not writable\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
return StemDb::createDb(m_ndb->xdb(), m_basedir, lang);
|
||||
return StemDb::createDb(m_ndb->xwdb, lang);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1842,21 +1844,15 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
bool Db::stemExpand(const string &lang, const string &term,
|
||||
bool Db::stemExpand(const string &langs, const string &term,
|
||||
TermMatchResult& result, int max)
|
||||
{
|
||||
vector<string> dirs(1, m_basedir);
|
||||
dirs.insert(dirs.end(), m_extraDbs.begin(), m_extraDbs.end());
|
||||
for (vector<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
|
||||
vector<string> more;
|
||||
StemDb::stemExpand(*it, lang, term, more);
|
||||
LOGDEB1(("Db::stemExpand: Got %d from %s\n",
|
||||
more.size(), it->c_str()));
|
||||
result.entries.insert(result.entries.end(), more.begin(), more.end());
|
||||
if (result.entries.size() >= (unsigned int)max)
|
||||
break;
|
||||
}
|
||||
LOGDEB1(("Db:::stemExpand: final count %d \n", result.size()));
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||
return false;
|
||||
vector<string> exp;
|
||||
if (!StemDb::stemExpand(m_ndb->xrdb, langs, term, exp))
|
||||
return false;
|
||||
result.entries.insert(result.entries.end(), exp.begin(), exp.end());
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -143,5 +143,16 @@ class Db::Native {
|
||||
|
||||
};
|
||||
|
||||
// Xapian synonyms table abuse:
|
||||
// The Xapian synonyms mechanisms can be put to many uses, but,
|
||||
// unfortunately, it has a global name space (we'd like to be able to open
|
||||
// different synonym tables, but there is only one).
|
||||
// We use prefixes to create separate name spaces, in mostly the same way
|
||||
// that they are used in the main index. See synfamily.h
|
||||
// Prefixes are centrally defined here to avoid collisions
|
||||
//
|
||||
// Stem expansion family prefix. The family member name is the language
|
||||
static const std::string synprefStem("Stm");
|
||||
|
||||
}
|
||||
#endif /* _rcldb_p_h_included_ */
|
||||
|
||||
@ -27,12 +27,16 @@
|
||||
#include <xapian.h>
|
||||
|
||||
#include "stemdb.h"
|
||||
#include "wipedir.h"
|
||||
#include "pathut.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
#include "utf8iter.h"
|
||||
#include "textsplit.h"
|
||||
#include "rcldb.h"
|
||||
#include "rcldb_p.h"
|
||||
#include "synfamily.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -40,47 +44,20 @@ namespace Rcl {
|
||||
namespace StemDb {
|
||||
|
||||
|
||||
static const string cstr_stemdirstem = "stem_";
|
||||
|
||||
/// Compute name of stem db for given base database and language
|
||||
static string stemdbname(const string& dbdir, const string& lang)
|
||||
vector<string> getLangs(Xapian::Database& xdb)
|
||||
{
|
||||
return path_cat(dbdir, cstr_stemdirstem + lang);
|
||||
XapSynFamily fam(xdb, synprefStem);
|
||||
vector<string> langs;
|
||||
(void)fam.getMembers(langs);
|
||||
return langs;
|
||||
}
|
||||
|
||||
vector<string> getLangs(const string& dbdir)
|
||||
bool deleteDb(Xapian::WritableDatabase& xdb, const string& lang)
|
||||
{
|
||||
string pattern = cstr_stemdirstem + "*";
|
||||
vector<string> dirs = path_dirglob(dbdir, pattern);
|
||||
for (vector<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
|
||||
*it = path_basename(*it);
|
||||
*it = it->substr(cstr_stemdirstem.length(), string::npos);
|
||||
}
|
||||
return dirs;
|
||||
XapWritableSynFamily fam(xdb, synprefStem);
|
||||
return fam.deleteMember(lang);
|
||||
}
|
||||
|
||||
bool deleteDb(const string& dbdir, const string& lang)
|
||||
{
|
||||
string dir = stemdbname(dbdir, lang);
|
||||
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Autoclean/delete directory
|
||||
class DirWiper {
|
||||
public:
|
||||
string dir;
|
||||
bool do_it;
|
||||
DirWiper(string d) : dir(d), do_it(true) {}
|
||||
~DirWiper() {
|
||||
if (do_it) {
|
||||
wipedir(dir);
|
||||
rmdir(dir.c_str());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
inline static bool
|
||||
p_notlowerascii(unsigned int c)
|
||||
{
|
||||
@ -89,37 +66,13 @@ p_notlowerascii(unsigned int c)
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool addAssoc(Xapian::WritableDatabase &sdb, const string& stem,
|
||||
const vector<string>& derivs)
|
||||
{
|
||||
Xapian::Document newdocument;
|
||||
newdocument.add_term(stem);
|
||||
// The doc data is just parents=blank-separated-list
|
||||
string record = "parents=";
|
||||
for (vector<string>::const_iterator it = derivs.begin();
|
||||
it != derivs.end(); it++) {
|
||||
record += *it + " ";
|
||||
}
|
||||
record += "\n";
|
||||
LOGDEB2(("createStemDb: stmdoc data: [%s]\n", record.c_str()));
|
||||
newdocument.set_data(record);
|
||||
try {
|
||||
sdb.replace_document(stem, newdocument);
|
||||
} catch (...) {
|
||||
LOGERR(("Db::createstemdb(addAssoc): replace failed\n"));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create database of stem to parents associations for a given language.
|
||||
* We walk the list of all terms, stem them, and create another Xapian db
|
||||
* with documents indexed by a single term (the stem), and with the list of
|
||||
* parent terms in the document data.
|
||||
*/
|
||||
bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
||||
bool createDb(Xapian::WritableDatabase& xdb, const string& lang)
|
||||
{
|
||||
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
||||
Chrono cron;
|
||||
@ -136,6 +89,7 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
||||
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
||||
int stemconst=0; // Stem == term
|
||||
int stemmultiple = 0; // Count of stems with multiple derivatives
|
||||
string ermsg;
|
||||
try {
|
||||
Xapian::Stem stemmer(lang);
|
||||
Xapian::TermIterator it;
|
||||
@ -174,43 +128,18 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
||||
}
|
||||
assocs[stem].push_back(*it);
|
||||
}
|
||||
} catch (const Xapian::Error &e) {
|
||||
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
||||
return false;
|
||||
} catch (...) {
|
||||
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
||||
lang.c_str()));
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n",
|
||||
lang.c_str(), cron.secs()));
|
||||
|
||||
// Create xapian database for stem relations
|
||||
string stemdbdir = stemdbname(dbdir, lang);
|
||||
// We want to get rid of the db dir in case of error. This gets disarmed
|
||||
// just before success return.
|
||||
DirWiper wiper(stemdbdir);
|
||||
string ermsg;
|
||||
Xapian::WritableDatabase sdb;
|
||||
try {
|
||||
sdb = Xapian::WritableDatabase(stemdbdir,
|
||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg();
|
||||
} catch (const string &s) {
|
||||
ermsg = s;
|
||||
} catch (const char *s) {
|
||||
ermsg = s;
|
||||
} catch (...) {
|
||||
ermsg = "Caught unknown exception";
|
||||
}
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
|
||||
stemdbdir.c_str(), ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
XapWritableSynFamily fam(xdb, synprefStem);
|
||||
fam.createMember(lang);
|
||||
|
||||
// Enter pseudo-docs in db by walking the map.
|
||||
for (map<string, vector<string> >::const_iterator it = assocs.begin();
|
||||
it != assocs.end(); it++) {
|
||||
LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str()));
|
||||
@ -219,8 +148,7 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
||||
// even if it doesnt exist as a term
|
||||
if (it->second.size() > 1)
|
||||
++stemmultiple;
|
||||
|
||||
if (!addAssoc(sdb, it->first, it->second)) {
|
||||
if (!fam.addSynonyms(lang, it->first, it->second)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -229,7 +157,7 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
||||
lang.c_str(), cron.secs()));
|
||||
LOGDEB(("Stem map size: %d mult %d const %d no %d \n",
|
||||
assocs.size(), stemmultiple, stemconst, nostem));
|
||||
wiper.do_it = false;
|
||||
fam.listMap(lang);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -247,7 +175,7 @@ static string stringlistdisp(const vector<string>& sl)
|
||||
* Expand term to list of all terms which stem to the same term, for one
|
||||
* expansion language
|
||||
*/
|
||||
static bool stemExpandOne(const std::string& dbdir,
|
||||
static bool stemExpandOne(Xapian::Database& xdb,
|
||||
const std::string& lang,
|
||||
const std::string& term,
|
||||
vector<string>& result)
|
||||
@ -258,37 +186,9 @@ static bool stemExpandOne(const std::string& dbdir,
|
||||
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
|
||||
lang.c_str(), term.c_str(), stem.c_str()));
|
||||
|
||||
// Open stem database
|
||||
string stemdbdir = stemdbname(dbdir, lang);
|
||||
Xapian::Database sdb(stemdbdir);
|
||||
LOGDEB0(("stemExpand: %s lastdocid: %d\n",
|
||||
stemdbdir.c_str(), sdb.get_lastdocid()));
|
||||
|
||||
// Try to fetch the doc from the stem db
|
||||
if (!sdb.term_exists(stem)) {
|
||||
LOGDEB0(("Db::stemExpand: no term for %s\n", stem.c_str()));
|
||||
} else {
|
||||
Xapian::PostingIterator did = sdb.postlist_begin(stem);
|
||||
if (did == sdb.postlist_end(stem)) {
|
||||
LOGDEB0(("stemExpand: no term(1) for %s\n",stem.c_str()));
|
||||
} else {
|
||||
Xapian::Document doc = sdb.get_document(*did);
|
||||
string data = doc.get_data();
|
||||
|
||||
// Build expansion list from database data No need for
|
||||
// a conftree, but we need to massage the data a
|
||||
// little
|
||||
string::size_type pos = data.find('=');
|
||||
string::size_type pos1 = data.rfind('\n');
|
||||
if (pos == string::npos || pos1 == string::npos ||
|
||||
pos1 <= pos+1) {
|
||||
LOGERR(("stemExpand: bad data in db: [%s]\n",
|
||||
data.c_str()));
|
||||
} else {
|
||||
++pos;
|
||||
stringToStrings(data.substr(pos, pos1-pos), result);
|
||||
}
|
||||
}
|
||||
XapSynFamily fam(xdb, synprefStem);
|
||||
if (!fam.synExpand(lang, stem, result)) {
|
||||
// ?
|
||||
}
|
||||
|
||||
// If the user term or stem are not in the list, add them
|
||||
@ -302,8 +202,8 @@ static bool stemExpandOne(const std::string& dbdir,
|
||||
stringlistdisp(result).c_str()));
|
||||
|
||||
} catch (...) {
|
||||
LOGERR(("stemExpand: error accessing stem db. dbdir [%s] lang [%s]\n",
|
||||
dbdir.c_str(), lang.c_str()));
|
||||
LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
|
||||
lang.c_str()));
|
||||
result.push_back(term);
|
||||
return false;
|
||||
}
|
||||
@ -315,18 +215,17 @@ static bool stemExpandOne(const std::string& dbdir,
|
||||
* Expand term to list of all terms which stem to the same term, add the
|
||||
* expansion sets for possibly multiple expansion languages
|
||||
*/
|
||||
bool stemExpand(const std::string& dbdir,
|
||||
bool stemExpand(Xapian::Database& xdb,
|
||||
const std::string& langs,
|
||||
const std::string& term,
|
||||
vector<string>& result)
|
||||
{
|
||||
|
||||
vector<string> llangs;
|
||||
stringToStrings(langs, llangs);
|
||||
for (vector<string>::const_iterator it = llangs.begin();
|
||||
it != llangs.end(); it++) {
|
||||
vector<string> oneexp;
|
||||
stemExpandOne(dbdir, *it, term, oneexp);
|
||||
stemExpandOne(xdb, *it, term, oneexp);
|
||||
result.insert(result.end(), oneexp.begin(), oneexp.end());
|
||||
}
|
||||
sort(result.begin(), result.end());
|
||||
|
||||
@ -58,15 +58,17 @@ namespace Rcl {
|
||||
namespace StemDb {
|
||||
|
||||
/// Get languages of existing stem databases
|
||||
extern std::vector<std::string> getLangs(const std::string& dbdir);
|
||||
extern std::vector<std::string> getLangs(Xapian::Database& xdb);
|
||||
|
||||
/// Delete stem database for given language
|
||||
extern bool deleteDb(const std::string& dbdir, const std::string& lang);
|
||||
extern bool deleteDb(Xapian::WritableDatabase&, const std::string& lang);
|
||||
|
||||
/// Create stem database for given language
|
||||
extern bool createDb(Xapian::Database& xdb,
|
||||
const std::string& dbdir, const std::string& lang);
|
||||
extern bool createDb(Xapian::WritableDatabase&, const std::string& lang);
|
||||
|
||||
/// Expand term to stem siblings
|
||||
extern bool stemExpand(const std::string& dbdir,
|
||||
const std::string& langs,
|
||||
extern bool stemExpand(Xapian::Database& xdb,
|
||||
const std::string& lang,
|
||||
const std::string& term,
|
||||
std::vector<std::string>& result);
|
||||
}
|
||||
|
||||
151
src/rcldb/synfamily.cpp
Normal file
151
src/rcldb/synfamily.cpp
Normal file
@ -0,0 +1,151 @@
|
||||
/* Copyright (C) 2012 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef TEST_SYNFAMILY
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "rcldb.h"
|
||||
#include "rcldb_p.h"
|
||||
#include "synfamily.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
bool XapSynFamily::synExpand(const string& member, const string& term,
|
||||
vector<string>& result)
|
||||
{
|
||||
string key = entryprefix(member) + term;
|
||||
string ermsg;
|
||||
try {
|
||||
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(key);
|
||||
xit != m_rdb.synonyms_end(key); xit++) {
|
||||
result.push_back(*xit);
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("synFamily::synExpand: error for member [%s] term [%s]\n",
|
||||
member.c_str(), term.c_str()));
|
||||
return false;
|
||||
}
|
||||
#if 0
|
||||
string out;
|
||||
stringsToString(result, out);
|
||||
LOGDEB0(("XapSynFamily::synExpand:%s: [%s] -> %s\n", member.c_str(),
|
||||
term.c_str(), out.c_str()));
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Retrieve the names of all members of this family.
 *
 * Reads the synonyms stored under the family's members key and appends
 * each of them to the output vector.
 *
 * @param members output: member names are appended.
 * @return false if an error message was captured during the lookup,
 *         true otherwise.
 */
bool XapSynFamily::getMembers(vector<string>& members)
{
    const string listkey = memberskey();
    string ermsg;
    try {
        Xapian::TermIterator cur = m_rdb.synonyms_begin(listkey);
        const Xapian::TermIterator last = m_rdb.synonyms_end(listkey);
        while (cur != last) {
            members.push_back(*cur);
            ++cur;
        }
    } XCATCHERROR(ermsg);

    if (ermsg.empty())
        return true;

    LOGERR(("XapSynFamily::getMembers: xapian error %s\n", ermsg.c_str()));
    return false;
}
|
||||
|
||||
bool XapSynFamily::listMap(const string& membername)
|
||||
{
|
||||
string key = entryprefix(membername);
|
||||
string ermsg;
|
||||
try {
|
||||
for (Xapian::TermIterator kit = m_rdb.synonym_keys_begin(key);
|
||||
kit != m_rdb.synonym_keys_end(key); kit++) {
|
||||
cout << "[" << *kit << "] -> ";
|
||||
for (Xapian::TermIterator xit = m_rdb.synonyms_begin(*kit);
|
||||
xit != m_rdb.synonyms_end(*kit); xit++) {
|
||||
cout << *xit << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("XapSynFamily::listMap: xapian error %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
vector<string>members;
|
||||
getMembers(members);
|
||||
cout << "All family members: ";
|
||||
for (vector<string>::const_iterator it = members.begin();
|
||||
it != members.end(); it++) {
|
||||
cout << *it << " ";
|
||||
}
|
||||
cout << endl;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XapWritableSynFamily::deleteMember(const string& membername)
|
||||
{
|
||||
string key = entryprefix(membername);
|
||||
|
||||
for (Xapian::TermIterator xit = m_wdb.synonym_keys_begin(key);
|
||||
xit != m_wdb.synonym_keys_end(key); xit++) {
|
||||
m_wdb.clear_synonyms(*xit);
|
||||
}
|
||||
m_wdb.remove_synonym(memberskey(), membername);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XapWritableSynFamily::createMember(const string& membername)
|
||||
{
|
||||
string ermsg;
|
||||
try {
|
||||
m_wdb.add_synonym(memberskey(), membername);
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("XapSynFamily::createMember: xapian error %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XapWritableSynFamily::addSynonyms(const string& membername,
|
||||
const string& term,
|
||||
const vector<string>& trans)
|
||||
{
|
||||
string key = entryprefix(membername) + term;
|
||||
string ermsg;
|
||||
try {
|
||||
for (vector<string>::const_iterator it = trans.begin();
|
||||
it != trans.end(); it++) {
|
||||
m_wdb.add_synonym(key, *it);
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("XapSynFamily::addSynonyms: xapian error %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#else // TEST_SYNFAMILY
|
||||
#endif // TEST_SYNFAMILY
|
||||
107
src/rcldb/synfamily.h
Normal file
107
src/rcldb/synfamily.h
Normal file
@ -0,0 +1,107 @@
|
||||
/* Copyright (C) 2012 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _SYNFAMILY_H_INCLUDED_
|
||||
#define _SYNFAMILY_H_INCLUDED_
|
||||
|
||||
/**
|
||||
* The Xapian synonyms mechanism can be used for many things beyond actual
|
||||
* synonyms, anything that would turn a string into a group of equivalents.
|
||||
* Unfortunately, it has only one keyspace.
|
||||
* This class partitions the Xapian synonyms keyspace by using prefixes and
|
||||
* can provide different applications each with a family of keyspaces.
|
||||
* Two characters are reserved by the class and should not be used inside
|
||||
* either family or member names: ':' and ';'
|
||||
* A synonym key for family "stemdb", member "french", key "thisstem"
|
||||
* looks like:
|
||||
* :stemdb:french:thisstem -> stem siblings
|
||||
* A special entry is used to list all the members for a family, e.g.:
|
||||
* :stemdb;members -> french, english ...
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "xapian.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
class XapSynFamily {
|
||||
public:
|
||||
/**
|
||||
* Construct from readable xapian database and family name (ie: Stm)
|
||||
*/
|
||||
XapSynFamily(Xapian::Database xdb, const std::string& familyname)
|
||||
: m_rdb(xdb)
|
||||
{
|
||||
m_prefix1 = string(":") + familyname;
|
||||
}
|
||||
|
||||
/** Expand one term (e.g.: familier) inside one family number (e.g: french)
|
||||
*/
|
||||
bool synExpand(const std::string& fammember,
|
||||
const std::string& term,
|
||||
std::vector<std::string>& result);
|
||||
|
||||
/** Retrieve all members of this family (e.g: french english german...) */
|
||||
bool getMembers(std::vector<std::string>&);
|
||||
|
||||
/** debug: list map for one member to stdout */
|
||||
bool listMap(const std::string& fam);
|
||||
|
||||
protected:
|
||||
Xapian::Database m_rdb;
|
||||
std::string m_prefix1;
|
||||
string entryprefix(const string& member)
|
||||
{
|
||||
return m_prefix1 + ":" + member + ":";
|
||||
}
|
||||
string memberskey()
|
||||
{
|
||||
return m_prefix1 + ";" + "members";
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
class XapWritableSynFamily : public XapSynFamily {
|
||||
public:
|
||||
/** Construct with Xapian db open for r/w */
|
||||
XapWritableSynFamily(Xapian::WritableDatabase db, const std::string& pfx)
|
||||
: XapSynFamily(db, pfx), m_wdb(db)
|
||||
{
|
||||
}
|
||||
|
||||
/** Delete all entries for one member (e.g. french), and remove from list
|
||||
* of members */
|
||||
bool deleteMember(const std::string& membername);
|
||||
|
||||
/** Add to list of members. Idempotent, does not affect actual expansions */
|
||||
bool createMember(const std::string& membername);
|
||||
|
||||
/** Add expansion list for term inside family member (e.g., inside
|
||||
* the french member, add expansion for familier -> familier,
|
||||
* familierement, ... */
|
||||
bool addSynonyms(const string& membername,
|
||||
const string& term, const vector<string>& trans);
|
||||
|
||||
protected:
|
||||
Xapian::WritableDatabase m_wdb;
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
#endif /* _SYNFAMILY_H_INCLUDED_ */
|
||||
Loading…
x
Reference in New Issue
Block a user