arrange to create all stem dicts in one pass
This commit is contained in:
parent
fc8b458222
commit
776800f47a
@ -250,11 +250,7 @@ bool ConfIndexer::createStemmingDatabases()
|
||||
if (find(langs.begin(), langs.end(), *it) == langs.end())
|
||||
m_db.deleteStemDb(*it);
|
||||
}
|
||||
for (it = langs.begin(); it != langs.end(); it++) {
|
||||
if (m_updater && !m_updater->update(DbIxStatus::DBIXS_STEMDB, *it))
|
||||
return false;
|
||||
m_db.createStemDb(*it);
|
||||
}
|
||||
m_db.createStemDbs(langs);
|
||||
}
|
||||
m_db.close();
|
||||
return true;
|
||||
@ -265,7 +261,7 @@ bool ConfIndexer::createStemDb(const string &lang)
|
||||
if (!m_db.open(Rcl::Db::DbUpd)) {
|
||||
return false;
|
||||
}
|
||||
return m_db.createStemDb(lang);
|
||||
return m_db.createStemDbs(vector<string>(1, lang));
|
||||
}
|
||||
|
||||
// The language for the aspell dictionary is handled internally by the aspell
|
||||
|
||||
@ -1623,7 +1623,7 @@ bool Db::deleteStemDb(const string& lang)
|
||||
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable)
|
||||
return false;
|
||||
WritableStemDb db(m_ndb->xwdb);
|
||||
XapWritableSynFamily db(m_ndb->xwdb, synFamStem);
|
||||
return db.deleteMember(lang);
|
||||
}
|
||||
|
||||
@ -1633,16 +1633,15 @@ bool Db::deleteStemDb(const string& lang)
|
||||
* with documents indexed by a single term (the stem), and with the list of
|
||||
* parent terms in the document data.
|
||||
*/
|
||||
bool Db::createStemDb(const string& lang)
|
||||
bool Db::createStemDbs(const vector<string>& langs)
|
||||
{
|
||||
LOGDEB(("Db::createStemDb(%s)\n", lang.c_str()));
|
||||
LOGDEB(("Db::createStemDbs\n"));
|
||||
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) {
|
||||
LOGERR(("createStemDb: db not open or not writable\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
WritableStemDb db(m_ndb->xwdb);
|
||||
return db.createDb(lang);
|
||||
return createExpansionDbs(m_ndb->xwdb, langs);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -181,7 +181,7 @@ class Db {
|
||||
bool purge();
|
||||
|
||||
/** Create stem expansion database for given language. */
|
||||
bool createStemDb(const string &lang);
|
||||
bool createStemDbs(const std::vector<std::string> &langs);
|
||||
/** Delete stem expansion database for given language. */
|
||||
bool deleteStemDb(const string &lang);
|
||||
|
||||
|
||||
@ -35,6 +35,7 @@
|
||||
#include "rcldb.h"
|
||||
#include "rcldb_p.h"
|
||||
#include "synfamily.h"
|
||||
#include "unacpp.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
@ -56,12 +57,19 @@ inline static bool p_notlowerascii(unsigned int c)
|
||||
/**
|
||||
* Create database of stem to parents associations for a given language.
|
||||
*/
|
||||
bool WritableStemDb::createDb(const string& lang)
|
||||
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
const vector<string>& langs)
|
||||
{
|
||||
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
||||
LOGDEB(("StemDb::createExpansionDbs\n"));
|
||||
Chrono cron;
|
||||
createMember(lang);
|
||||
string prefix = entryprefix(lang);
|
||||
|
||||
vector<XapWritableSynFamily> stemdbs;
|
||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||
stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
|
||||
stemdbs[i].deleteMember(langs[i]);
|
||||
stemdbs[i].createMember(langs[i]);
|
||||
stemdbs[i].setCurrentMemberName(langs[i]);
|
||||
}
|
||||
|
||||
// We walk the list of all terms, and stem each. We skip terms which
|
||||
// don't look like natural language.
|
||||
@ -73,10 +81,13 @@ bool WritableStemDb::createDb(const string& lang)
|
||||
|
||||
string ermsg;
|
||||
try {
|
||||
Xapian::Stem stemmer(lang);
|
||||
vector<Xapian::Stem> stemmers;
|
||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||
stemmers.push_back(Xapian::Stem(langs[i]));
|
||||
}
|
||||
|
||||
for (Xapian::TermIterator it = m_wdb.allterms_begin();
|
||||
it != m_wdb.allterms_end(); it++) {
|
||||
for (Xapian::TermIterator it = wdb.allterms_begin();
|
||||
it != wdb.allterms_end(); it++) {
|
||||
// If the term has any non-lowercase 7bit char (that is,
|
||||
// numbers, capitals and punctuation) don't stem.
|
||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||
@ -102,16 +113,19 @@ bool WritableStemDb::createDb(const string& lang)
|
||||
continue;
|
||||
}
|
||||
|
||||
string stem = stemmer(*it);
|
||||
LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
|
||||
stem.c_str()));
|
||||
if (stem == *it) {
|
||||
++stemconst;
|
||||
continue;
|
||||
}
|
||||
|
||||
m_wdb.add_synonym(prefix + stem, *it);
|
||||
++allsyns;
|
||||
// Create stemming synonym for every lang
|
||||
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||
string stem = stemmers[i](*it);
|
||||
if (stem == *it) {
|
||||
++stemconst;
|
||||
} else {
|
||||
stemdbs[i].addSynonym(stem, *it);
|
||||
LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
|
||||
(*it).c_str(), langs[i].c_str(), stem.c_str()));
|
||||
++allsyns;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
@ -119,7 +133,7 @@ bool WritableStemDb::createDb(const string& lang)
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGDEB(("StemDb::createDb(%s): done: %.2f S\n", lang.c_str(), cron.secs()));
|
||||
LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
|
||||
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
|
||||
nostem, stemconst, allsyns));
|
||||
return true;
|
||||
|
||||
@ -76,14 +76,8 @@ private:
|
||||
std::vector<std::string>& result);
|
||||
};
|
||||
|
||||
class WritableStemDb : public XapWritableSynFamily {
|
||||
public:
|
||||
WritableStemDb(Xapian::WritableDatabase& xdb)
|
||||
: XapWritableSynFamily(xdb, synFamStem)
|
||||
{
|
||||
}
|
||||
bool createDb(const std::string& lang);
|
||||
};
|
||||
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||
const std::vector<std::string>& langs);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -144,6 +144,7 @@ bool XapWritableSynFamily::addSynonyms(const string& membername,
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
#else // TEST_SYNFAMILY
|
||||
@ -232,11 +233,11 @@ int main(int argc, char **argv)
|
||||
|
||||
// We do stem only for now
|
||||
string familyname;
|
||||
if (op_flags & (OPT_a|OPT_c)) {
|
||||
cerr << "Accents and case not ready" << endl;
|
||||
return 1;
|
||||
if (op_flags & OPT_a) {
|
||||
familyname = Rcl::synFamDiac;
|
||||
} else if (op_flags &OPT_c) {
|
||||
familyname = Rcl::synFamCase;
|
||||
} else {
|
||||
op_flags |= OPT_s;
|
||||
familyname = Rcl::synFamStem;
|
||||
}
|
||||
if ((op_flags & (OPT_l|OPT_L|OPT_D|OPT_e)) == 0)
|
||||
|
||||
@ -35,7 +35,10 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "xapian.h"
|
||||
#include <xapian.h>
|
||||
|
||||
#include "debuglog.h"
|
||||
#include "xmacros.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
@ -98,15 +101,38 @@ public:
|
||||
const std::string& term,
|
||||
const std::vector<std::string>& trans);
|
||||
|
||||
// Select the family member targeted by subsequent addSynonym() calls:
// stores entryprefix(nm) in m_currentPrefix.
|
||||
// Must be called before addSynonym(); this is not checked, for speed.
|
||||
virtual void setCurrentMemberName(const std::string& nm)
|
||||
{
|
||||
m_currentPrefix = entryprefix(nm);
|
||||
}
|
||||
// Record trans as a synonym of term under the current member's prefix.
// The key written to the database is m_currentPrefix + term, so
// setCurrentMemberName() must have been called first (not verified here).
// Returns true on success; if the add_synonym() call raises an error that
// XCATCHERROR catches, the message is logged and false is returned.
virtual bool addSynonym(const std::string& term, const std::string& trans)
|
||||
{
|
||||
std::string key = m_currentPrefix + term;
|
||||
std::string ermsg;
|
||||
try {
|
||||
m_wdb.add_synonym(key, trans);
|
||||
// XCATCHERROR supplies the catch clauses and copies the error text
// into ermsg; ermsg stays empty when nothing was thrown.
} XCATCHERROR(ermsg);
|
||||
if (!ermsg.empty()) {
|
||||
LOGERR(("XapSynFamily::addSynonym: xapian error %s\n",
|
||||
ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
Xapian::WritableDatabase m_wdb;
|
||||
std::string m_currentPrefix;
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// Prefixes are centrally defined here to avoid collisions
|
||||
//
|
||||
// Stem expansion family prefix. The family member name is the language
|
||||
// Stem expansion family prefix. The family member name is the
|
||||
// language ("all" for Dia and Cse)
|
||||
static const std::string synFamStem("Stm");
|
||||
static const std::string synFamDiac("Dia");
|
||||
static const std::string synFamCase("Cse");
|
||||
|
||||
@ -24,7 +24,7 @@
|
||||
catch (const Xapian::Error &e) { \
|
||||
MSG = e.get_msg(); \
|
||||
if (MSG.empty()) MSG = "Empty error message"; \
|
||||
} catch (const string &s) { \
|
||||
} catch (const std::string &s) { \
|
||||
MSG = s; \
|
||||
if (MSG.empty()) MSG = "Empty error message"; \
|
||||
} catch (const char *s) { \
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user