arrange to create all stem dicts in one pass
This commit is contained in:
parent
fc8b458222
commit
776800f47a
@ -250,11 +250,7 @@ bool ConfIndexer::createStemmingDatabases()
|
|||||||
if (find(langs.begin(), langs.end(), *it) == langs.end())
|
if (find(langs.begin(), langs.end(), *it) == langs.end())
|
||||||
m_db.deleteStemDb(*it);
|
m_db.deleteStemDb(*it);
|
||||||
}
|
}
|
||||||
for (it = langs.begin(); it != langs.end(); it++) {
|
m_db.createStemDbs(langs);
|
||||||
if (m_updater && !m_updater->update(DbIxStatus::DBIXS_STEMDB, *it))
|
|
||||||
return false;
|
|
||||||
m_db.createStemDb(*it);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
m_db.close();
|
m_db.close();
|
||||||
return true;
|
return true;
|
||||||
@ -265,7 +261,7 @@ bool ConfIndexer::createStemDb(const string &lang)
|
|||||||
if (!m_db.open(Rcl::Db::DbUpd)) {
|
if (!m_db.open(Rcl::Db::DbUpd)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return m_db.createStemDb(lang);
|
return m_db.createStemDbs(vector<string>(1, lang));
|
||||||
}
|
}
|
||||||
|
|
||||||
// The language for the aspell dictionary is handled internally by the aspell
|
// The language for the aspell dictionary is handled internally by the aspell
|
||||||
|
|||||||
@ -1623,7 +1623,7 @@ bool Db::deleteStemDb(const string& lang)
|
|||||||
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
|
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
|
||||||
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable)
|
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable)
|
||||||
return false;
|
return false;
|
||||||
WritableStemDb db(m_ndb->xwdb);
|
XapWritableSynFamily db(m_ndb->xwdb, synFamStem);
|
||||||
return db.deleteMember(lang);
|
return db.deleteMember(lang);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1633,16 +1633,15 @@ bool Db::deleteStemDb(const string& lang)
|
|||||||
* with documents indexed by a single term (the stem), and with the list of
|
* with documents indexed by a single term (the stem), and with the list of
|
||||||
* parent terms in the document data.
|
* parent terms in the document data.
|
||||||
*/
|
*/
|
||||||
bool Db::createStemDb(const string& lang)
|
bool Db::createStemDbs(const vector<string>& langs)
|
||||||
{
|
{
|
||||||
LOGDEB(("Db::createStemDb(%s)\n", lang.c_str()));
|
LOGDEB(("Db::createStemDbs\n"));
|
||||||
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) {
|
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) {
|
||||||
LOGERR(("createStemDb: db not open or not writable\n"));
|
LOGERR(("createStemDb: db not open or not writable\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
WritableStemDb db(m_ndb->xwdb);
|
return createExpansionDbs(m_ndb->xwdb, langs);
|
||||||
return db.createDb(lang);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -181,7 +181,7 @@ class Db {
|
|||||||
bool purge();
|
bool purge();
|
||||||
|
|
||||||
/** Create stem expansion database for given language. */
|
/** Create stem expansion database for given language. */
|
||||||
bool createStemDb(const string &lang);
|
bool createStemDbs(const std::vector<std::string> &langs);
|
||||||
/** Delete stem expansion database for given language. */
|
/** Delete stem expansion database for given language. */
|
||||||
bool deleteStemDb(const string &lang);
|
bool deleteStemDb(const string &lang);
|
||||||
|
|
||||||
|
|||||||
@ -35,6 +35,7 @@
|
|||||||
#include "rcldb.h"
|
#include "rcldb.h"
|
||||||
#include "rcldb_p.h"
|
#include "rcldb_p.h"
|
||||||
#include "synfamily.h"
|
#include "synfamily.h"
|
||||||
|
#include "unacpp.h"
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
@ -56,12 +57,19 @@ inline static bool p_notlowerascii(unsigned int c)
|
|||||||
/**
|
/**
|
||||||
* Create database of stem to parents associations for a given language.
|
* Create database of stem to parents associations for a given language.
|
||||||
*/
|
*/
|
||||||
bool WritableStemDb::createDb(const string& lang)
|
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||||
|
const vector<string>& langs)
|
||||||
{
|
{
|
||||||
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
LOGDEB(("StemDb::createExpansionDbs\n"));
|
||||||
Chrono cron;
|
Chrono cron;
|
||||||
createMember(lang);
|
|
||||||
string prefix = entryprefix(lang);
|
vector<XapWritableSynFamily> stemdbs;
|
||||||
|
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||||
|
stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
|
||||||
|
stemdbs[i].deleteMember(langs[i]);
|
||||||
|
stemdbs[i].createMember(langs[i]);
|
||||||
|
stemdbs[i].setCurrentMemberName(langs[i]);
|
||||||
|
}
|
||||||
|
|
||||||
// We walk the list of all terms, and stem each. We skip terms which
|
// We walk the list of all terms, and stem each. We skip terms which
|
||||||
// don't look like natural language.
|
// don't look like natural language.
|
||||||
@ -73,10 +81,13 @@ bool WritableStemDb::createDb(const string& lang)
|
|||||||
|
|
||||||
string ermsg;
|
string ermsg;
|
||||||
try {
|
try {
|
||||||
Xapian::Stem stemmer(lang);
|
vector<Xapian::Stem> stemmers;
|
||||||
|
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||||
|
stemmers.push_back(Xapian::Stem(langs[i]));
|
||||||
|
}
|
||||||
|
|
||||||
for (Xapian::TermIterator it = m_wdb.allterms_begin();
|
for (Xapian::TermIterator it = wdb.allterms_begin();
|
||||||
it != m_wdb.allterms_end(); it++) {
|
it != wdb.allterms_end(); it++) {
|
||||||
// If the term has any non-lowercase 7bit char (that is,
|
// If the term has any non-lowercase 7bit char (that is,
|
||||||
// numbers, capitals and punctuation) dont stem.
|
// numbers, capitals and punctuation) dont stem.
|
||||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||||
@ -102,16 +113,19 @@ bool WritableStemDb::createDb(const string& lang)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
string stem = stemmer(*it);
|
// Create stemming synonym for every lang
|
||||||
LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
|
for (unsigned int i = 0; i < langs.size(); i++) {
|
||||||
stem.c_str()));
|
string stem = stemmers[i](*it);
|
||||||
if (stem == *it) {
|
if (stem == *it) {
|
||||||
++stemconst;
|
++stemconst;
|
||||||
continue;
|
} else {
|
||||||
}
|
stemdbs[i].addSynonym(stem, *it);
|
||||||
|
LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
|
||||||
|
(*it).c_str(), langs[i].c_str(), stem.c_str()));
|
||||||
|
++allsyns;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
m_wdb.add_synonym(prefix + stem, *it);
|
|
||||||
++allsyns;
|
|
||||||
}
|
}
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
@ -119,7 +133,7 @@ bool WritableStemDb::createDb(const string& lang)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGDEB(("StemDb::createDb(%s): done: %.2f S\n", lang.c_str(), cron.secs()));
|
LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
|
||||||
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
|
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
|
||||||
nostem, stemconst, allsyns));
|
nostem, stemconst, allsyns));
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
@ -76,14 +76,8 @@ private:
|
|||||||
std::vector<std::string>& result);
|
std::vector<std::string>& result);
|
||||||
};
|
};
|
||||||
|
|
||||||
class WritableStemDb : public XapWritableSynFamily {
|
extern bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
||||||
public:
|
const std::vector<std::string>& langs);
|
||||||
WritableStemDb(Xapian::WritableDatabase& xdb)
|
|
||||||
: XapWritableSynFamily(xdb, synFamStem)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
bool createDb(const std::string& lang);
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -144,6 +144,7 @@ bool XapWritableSynFamily::addSynonyms(const string& membername,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else // TEST_SYNFAMILY
|
#else // TEST_SYNFAMILY
|
||||||
@ -232,11 +233,11 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
// We do stem only for now
|
// We do stem only for now
|
||||||
string familyname;
|
string familyname;
|
||||||
if (op_flags & (OPT_a|OPT_c)) {
|
if (op_flags & OPT_a) {
|
||||||
cerr << "Accents and case not ready" << endl;
|
familyname = Rcl::synFamDiac;
|
||||||
return 1;
|
} else if (op_flags &OPT_c) {
|
||||||
|
familyname = Rcl::synFamCase;
|
||||||
} else {
|
} else {
|
||||||
op_flags |= OPT_s;
|
|
||||||
familyname = Rcl::synFamStem;
|
familyname = Rcl::synFamStem;
|
||||||
}
|
}
|
||||||
if ((op_flags & (OPT_l|OPT_L|OPT_D|OPT_e)) == 0)
|
if ((op_flags & (OPT_l|OPT_L|OPT_D|OPT_e)) == 0)
|
||||||
|
|||||||
@ -35,7 +35,10 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "xapian.h"
|
#include <xapian.h>
|
||||||
|
|
||||||
|
#include "debuglog.h"
|
||||||
|
#include "xmacros.h"
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
@ -98,15 +101,38 @@ public:
|
|||||||
const std::string& term,
|
const std::string& term,
|
||||||
const std::vector<std::string>& trans);
|
const std::vector<std::string>& trans);
|
||||||
|
|
||||||
|
// Need to call setCurrentMemberName before addSynonym !
|
||||||
|
// We don't check it, for speed
|
||||||
|
virtual void setCurrentMemberName(const std::string& nm)
|
||||||
|
{
|
||||||
|
m_currentPrefix = entryprefix(nm);
|
||||||
|
}
|
||||||
|
virtual bool addSynonym(const std::string& term, const std::string& trans)
|
||||||
|
{
|
||||||
|
std::string key = m_currentPrefix + term;
|
||||||
|
std::string ermsg;
|
||||||
|
try {
|
||||||
|
m_wdb.add_synonym(key, trans);
|
||||||
|
} XCATCHERROR(ermsg);
|
||||||
|
if (!ermsg.empty()) {
|
||||||
|
LOGERR(("XapSynFamily::addSynonym: xapian error %s\n",
|
||||||
|
ermsg.c_str()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
Xapian::WritableDatabase m_wdb;
|
Xapian::WritableDatabase m_wdb;
|
||||||
|
std::string m_currentPrefix;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Prefixes are centrally defined here to avoid collisions
|
// Prefixes are centrally defined here to avoid collisions
|
||||||
//
|
//
|
||||||
// Stem expansion family prefix. The family member name is the language
|
// Stem expansion family prefix. The family member name is the
|
||||||
|
// language ("all" for Dia and Cse)
|
||||||
static const std::string synFamStem("Stm");
|
static const std::string synFamStem("Stm");
|
||||||
static const std::string synFamDiac("Dia");
|
static const std::string synFamDiac("Dia");
|
||||||
static const std::string synFamCase("Cse");
|
static const std::string synFamCase("Cse");
|
||||||
|
|||||||
@ -24,7 +24,7 @@
|
|||||||
catch (const Xapian::Error &e) { \
|
catch (const Xapian::Error &e) { \
|
||||||
MSG = e.get_msg(); \
|
MSG = e.get_msg(); \
|
||||||
if (MSG.empty()) MSG = "Empty error message"; \
|
if (MSG.empty()) MSG = "Empty error message"; \
|
||||||
} catch (const string &s) { \
|
} catch (const std::string &s) { \
|
||||||
MSG = s; \
|
MSG = s; \
|
||||||
if (MSG.empty()) MSG = "Empty error message"; \
|
if (MSG.empty()) MSG = "Empty error message"; \
|
||||||
} catch (const char *s) { \
|
} catch (const char *s) { \
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user