arrange to create all stem dicts in one pass

This commit is contained in:
Jean-Francois Dockes 2012-08-28 13:39:34 +02:00
parent fc8b458222
commit 776800f47a
8 changed files with 75 additions and 45 deletions

View File

@ -250,11 +250,7 @@ bool ConfIndexer::createStemmingDatabases()
if (find(langs.begin(), langs.end(), *it) == langs.end())
m_db.deleteStemDb(*it);
}
for (it = langs.begin(); it != langs.end(); it++) {
if (m_updater && !m_updater->update(DbIxStatus::DBIXS_STEMDB, *it))
return false;
m_db.createStemDb(*it);
}
m_db.createStemDbs(langs);
}
m_db.close();
return true;
@ -265,7 +261,7 @@ bool ConfIndexer::createStemDb(const string &lang)
if (!m_db.open(Rcl::Db::DbUpd)) {
return false;
}
return m_db.createStemDb(lang);
return m_db.createStemDbs(vector<string>(1, lang));
}
// The language for the aspell dictionary is handled internally by the aspell

View File

@ -1623,7 +1623,7 @@ bool Db::deleteStemDb(const string& lang)
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable)
return false;
WritableStemDb db(m_ndb->xwdb);
XapWritableSynFamily db(m_ndb->xwdb, synFamStem);
return db.deleteMember(lang);
}
@ -1633,16 +1633,15 @@ bool Db::deleteStemDb(const string& lang)
* with documents indexed by a single term (the stem), and with the list of
* parent terms in the document data.
*/
bool Db::createStemDb(const string& lang)
bool Db::createStemDbs(const vector<string>& langs)
{
LOGDEB(("Db::createStemDb(%s)\n", lang.c_str()));
LOGDEB(("Db::createStemDbs\n"));
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) {
LOGERR(("createStemDb: db not open or not writable\n"));
return false;
}
WritableStemDb db(m_ndb->xwdb);
return db.createDb(lang);
return createExpansionDbs(m_ndb->xwdb, langs);
}
/**

View File

@ -181,7 +181,7 @@ class Db {
bool purge();
/** Create the stem expansion databases for all the given languages. */
bool createStemDb(const string &lang);
bool createStemDbs(const std::vector<std::string> &langs);
/** Delete stem expansion database for given language. */
bool deleteStemDb(const string &lang);

View File

@ -35,6 +35,7 @@
#include "rcldb.h"
#include "rcldb_p.h"
#include "synfamily.h"
#include "unacpp.h"
#include <iostream>
@ -56,12 +57,19 @@ inline static bool p_notlowerascii(unsigned int c)
/**
* Create the databases of stem to parents associations for each of the
* given languages, in a single pass over the index terms.
*/
bool WritableStemDb::createDb(const string& lang)
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
const vector<string>& langs)
{
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
LOGDEB(("StemDb::createExpansionDbs\n"));
Chrono cron;
createMember(lang);
string prefix = entryprefix(lang);
vector<XapWritableSynFamily> stemdbs;
for (unsigned int i = 0; i < langs.size(); i++) {
stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
stemdbs[i].deleteMember(langs[i]);
stemdbs[i].createMember(langs[i]);
stemdbs[i].setCurrentMemberName(langs[i]);
}
// We walk the list of all terms, and stem each. We skip terms which
// don't look like natural language.
@ -73,10 +81,13 @@ bool WritableStemDb::createDb(const string& lang)
string ermsg;
try {
Xapian::Stem stemmer(lang);
vector<Xapian::Stem> stemmers;
for (unsigned int i = 0; i < langs.size(); i++) {
stemmers.push_back(Xapian::Stem(langs[i]));
}
for (Xapian::TermIterator it = m_wdb.allterms_begin();
it != m_wdb.allterms_end(); it++) {
for (Xapian::TermIterator it = wdb.allterms_begin();
it != wdb.allterms_end(); it++) {
// If the term has any non-lowercase 7bit char (that is,
// numbers, capitals and punctuation) don't stem.
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
@ -102,16 +113,19 @@ bool WritableStemDb::createDb(const string& lang)
continue;
}
string stem = stemmer(*it);
LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
stem.c_str()));
if (stem == *it) {
++stemconst;
continue;
}
m_wdb.add_synonym(prefix + stem, *it);
++allsyns;
// Create stemming synonym for every lang
for (unsigned int i = 0; i < langs.size(); i++) {
string stem = stemmers[i](*it);
if (stem == *it) {
++stemconst;
} else {
stemdbs[i].addSynonym(stem, *it);
LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
(*it).c_str(), langs[i].c_str(), stem.c_str()));
++allsyns;
}
}
}
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
@ -119,7 +133,7 @@ bool WritableStemDb::createDb(const string& lang)
return false;
}
LOGDEB(("StemDb::createDb(%s): done: %.2f S\n", lang.c_str(), cron.secs()));
LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
nostem, stemconst, allsyns));
return true;

View File

@ -76,14 +76,8 @@ private:
std::vector<std::string>& result);
};
// Writable synonym family specialised for stemming: an
// XapWritableSynFamily bound to the synFamStem family prefix.
class WritableStemDb : public XapWritableSynFamily {
public:
    // Attach to the writable Xapian database, selecting the stem family.
    WritableStemDb(Xapian::WritableDatabase& xdb) : XapWritableSynFamily(xdb, synFamStem) {}

    // Create the stem to parents associations for one language
    // (defined out of line).
    bool createDb(const std::string& lang);
};
// Build the stem expansion data for every language in langs while walking
// the term list of wdb a single time. Any existing member for a language is
// deleted and recreated first. Returns false if a Xapian error was caught
// during the walk, true otherwise.
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
const std::vector<std::string>& langs);
}

View File

@ -144,6 +144,7 @@ bool XapWritableSynFamily::addSynonyms(const string& membername,
return true;
}
}
#else // TEST_SYNFAMILY
@ -232,11 +233,11 @@ int main(int argc, char **argv)
// We do stem only for now
string familyname;
if (op_flags & (OPT_a|OPT_c)) {
cerr << "Accents and case not ready" << endl;
return 1;
if (op_flags & OPT_a) {
familyname = Rcl::synFamDiac;
} else if (op_flags &OPT_c) {
familyname = Rcl::synFamCase;
} else {
op_flags |= OPT_s;
familyname = Rcl::synFamStem;
}
if ((op_flags & (OPT_l|OPT_L|OPT_D|OPT_e)) == 0)

View File

@ -35,7 +35,10 @@
#include <string>
#include <vector>
#include "xapian.h"
#include <xapian.h>
#include "debuglog.h"
#include "xmacros.h"
namespace Rcl {
@ -98,15 +101,38 @@ public:
const std::string& term,
const std::vector<std::string>& trans);
// Need to call setCurrentMemberName before addSynonym !
// We don't check it, for speed
virtual void setCurrentMemberName(const std::string& nm)
{
m_currentPrefix = entryprefix(nm);
}
// Record trans as a synonym for term inside the currently selected
// family member. The key stored in the database is the current member
// prefix followed by term.
// Returns true on success, false (after logging) if Xapian raised an error.
virtual bool addSynonym(const std::string& term, const std::string& trans) {
    // Build the prefixed key before entering the guarded section.
    const std::string synkey = m_currentPrefix + term;

    std::string xaperr;
    try {
        m_wdb.add_synonym(synkey, trans);
    } XCATCHERROR(xaperr);

    if (xaperr.empty())
        return true;

    LOGERR(("XapSynFamily::addSynonym: xapian error %s\n",
            xaperr.c_str()));
    return false;
}
protected:
Xapian::WritableDatabase m_wdb;
std::string m_currentPrefix;
};
//
// Prefixes are centrally defined here to avoid collisions
//
// Stem expansion family prefix. The family member name is the language
// Stem expansion family prefix. The family member name is the
// language ("all" for Dia and Cse)
// Stemming family: one member per language.
static const std::string synFamStem("Stm");
// Diacritics (accents) family: presumably a single member named "all" --
// TODO confirm against the code that populates it.
static const std::string synFamDiac("Dia");
// Character case family: presumably a single member named "all" --
// TODO confirm against the code that populates it.
static const std::string synFamCase("Cse");

View File

@ -24,7 +24,7 @@
catch (const Xapian::Error &e) { \
MSG = e.get_msg(); \
if (MSG.empty()) MSG = "Empty error message"; \
} catch (const string &s) { \
} catch (const std::string &s) { \
MSG = s; \
if (MSG.empty()) MSG = "Empty error message"; \
} catch (const char *s) { \