diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index fc135ba8..bd64c975 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -250,11 +250,7 @@ bool ConfIndexer::createStemmingDatabases() if (find(langs.begin(), langs.end(), *it) == langs.end()) m_db.deleteStemDb(*it); } - for (it = langs.begin(); it != langs.end(); it++) { - if (m_updater && !m_updater->update(DbIxStatus::DBIXS_STEMDB, *it)) - return false; - m_db.createStemDb(*it); - } + m_db.createStemDbs(langs); } m_db.close(); return true; @@ -265,7 +261,7 @@ bool ConfIndexer::createStemDb(const string &lang) if (!m_db.open(Rcl::Db::DbUpd)) { return false; } - return m_db.createStemDb(lang); + return m_db.createStemDbs(vector(1, lang)); } // The language for the aspell dictionary is handled internally by the aspell diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 6362a559..8a00cb95 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1623,7 +1623,7 @@ bool Db::deleteStemDb(const string& lang) LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str())); if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) return false; - WritableStemDb db(m_ndb->xwdb); + XapWritableSynFamily db(m_ndb->xwdb, synFamStem); return db.deleteMember(lang); } @@ -1633,16 +1633,15 @@ bool Db::deleteStemDb(const string& lang) * with documents indexed by a single term (the stem), and with the list of * parent terms in the document data. */ -bool Db::createStemDb(const string& lang) +bool Db::createStemDbs(const vector& langs) { - LOGDEB(("Db::createStemDb(%s)\n", lang.c_str())); + LOGDEB(("Db::createStemDbs\n")); if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) { LOGERR(("createStemDb: db not open or not writable\n")); return false; } - WritableStemDb db(m_ndb->xwdb); - return db.createDb(lang); + return createExpansionDbs(m_ndb->xwdb, langs); } /** diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 465912db..9a3385a1 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -181,7 +181,7 @@ class Db { bool purge(); /** Create stem expansion database for given language. */ - bool createStemDb(const string &lang); + bool createStemDbs(const std::vector &langs); /** Delete stem expansion database for given language. */ bool deleteStemDb(const string &lang); diff --git a/src/rcldb/stemdb.cpp b/src/rcldb/stemdb.cpp index dbc8d690..3ef44a5c 100644 --- a/src/rcldb/stemdb.cpp +++ b/src/rcldb/stemdb.cpp @@ -35,6 +35,7 @@ #include "rcldb.h" #include "rcldb_p.h" #include "synfamily.h" +#include "unacpp.h" #include @@ -56,12 +57,19 @@ inline static bool p_notlowerascii(unsigned int c) /** * Create database of stem to parents associations for a given language. */ -bool WritableStemDb::createDb(const string& lang) +bool createExpansionDbs(Xapian::WritableDatabase& wdb, + const vector& langs) { - LOGDEB(("StemDb::createDb(%s)\n", lang.c_str())); + LOGDEB(("StemDb::createExpansionDbs\n")); Chrono cron; - createMember(lang); - string prefix = entryprefix(lang); + + vector stemdbs; + for (unsigned int i = 0; i < langs.size(); i++) { + stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem)); + stemdbs[i].deleteMember(langs[i]); + stemdbs[i].createMember(langs[i]); + stemdbs[i].setCurrentMemberName(langs[i]); + } // We walk the list of all terms, and stem each. We skip terms which // don't look like natural language. @@ -73,10 +81,13 @@ bool WritableStemDb::createDb(const string& lang) string ermsg; try { - Xapian::Stem stemmer(lang); + vector stemmers; + for (unsigned int i = 0; i < langs.size(); i++) { + stemmers.push_back(Xapian::Stem(langs[i])); + } - for (Xapian::TermIterator it = m_wdb.allterms_begin(); - it != m_wdb.allterms_end(); it++) { + for (Xapian::TermIterator it = wdb.allterms_begin(); + it != wdb.allterms_end(); it++) { // If the term has any non-lowercase 7bit char (that is, // numbers, capitals and punctuation) dont stem. string::iterator sit = (*it).begin(), eit = sit + (*it).length(); @@ -102,16 +113,19 @@ bool WritableStemDb::createDb(const string& lang) continue; } - string stem = stemmer(*it); - LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(), - stem.c_str())); - if (stem == *it) { - ++stemconst; - continue; - } - - m_wdb.add_synonym(prefix + stem, *it); - ++allsyns; + // Create stemming synonym for every lang + for (unsigned int i = 0; i < langs.size(); i++) { + string stem = stemmers[i](*it); + if (stem == *it) { + ++stemconst; + } else { + stemdbs[i].addSynonym(stem, *it); + LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n", + (*it).c_str(), langs[i].c_str(), stem.c_str())); + ++allsyns; + } + } + } } XCATCHERROR(ermsg); if (!ermsg.empty()) { @@ -119,7 +133,7 @@ bool WritableStemDb::createDb(const string& lang) return false; } - LOGDEB(("StemDb::createDb(%s): done: %.2f S\n", lang.c_str(), cron.secs())); + LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs())); LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n", nostem, stemconst, allsyns)); return true; diff --git a/src/rcldb/stemdb.h b/src/rcldb/stemdb.h index 5a332c19..798d3b37 100644 --- a/src/rcldb/stemdb.h +++ b/src/rcldb/stemdb.h @@ -76,14 +76,8 @@ private: std::vector& result); }; -class WritableStemDb : public XapWritableSynFamily { -public: - WritableStemDb(Xapian::WritableDatabase& xdb) - : XapWritableSynFamily(xdb, synFamStem) - { - } - bool createDb(const std::string& lang); -}; +extern bool createExpansionDbs(Xapian::WritableDatabase& wdb, + const std::vector& langs); } diff --git a/src/rcldb/synfamily.cpp b/src/rcldb/synfamily.cpp index fd8ae16b..839214e3 100644 --- a/src/rcldb/synfamily.cpp +++ b/src/rcldb/synfamily.cpp @@ -144,6 +144,7 @@ bool XapWritableSynFamily::addSynonyms(const string& membername, return true; } + } #else // TEST_SYNFAMILY @@ -232,11 +233,11 @@ int main(int argc, char **argv) // We do stem only for now string familyname; - if (op_flags & (OPT_a|OPT_c)) { - cerr << "Accents and case not ready" << endl; - return 1; + if (op_flags & OPT_a) { + familyname = Rcl::synFamDiac; + } else if (op_flags &OPT_c) { + familyname = Rcl::synFamCase; } else { - op_flags |= OPT_s; familyname = Rcl::synFamStem; } if ((op_flags & (OPT_l|OPT_L|OPT_D|OPT_e)) == 0) diff --git a/src/rcldb/synfamily.h b/src/rcldb/synfamily.h index 6d045a4d..664a9f07 100644 --- a/src/rcldb/synfamily.h +++ b/src/rcldb/synfamily.h @@ -35,7 +35,10 @@ #include #include -#include "xapian.h" +#include + +#include "debuglog.h" +#include "xmacros.h" namespace Rcl { @@ -98,15 +101,38 @@ public: const std::string& term, const std::vector& trans); + // Need to call setCurrentMemberName before addSynonym ! + // We don't check it, for speed + virtual void setCurrentMemberName(const std::string& nm) + { + m_currentPrefix = entryprefix(nm); + } + virtual bool addSynonym(const std::string& term, const std::string& trans) + { + std::string key = m_currentPrefix + term; + std::string ermsg; + try { + m_wdb.add_synonym(key, trans); + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("XapSynFamily::addSynonym: xapian error %s\n", + ermsg.c_str())); + return false; + } + return true; + } + protected: Xapian::WritableDatabase m_wdb; + std::string m_currentPrefix; }; // // Prefixes are centrally defined here to avoid collisions // -// Stem expansion family prefix. The family member name is the language +// Stem expansion family prefix. The family member name is the +// language ("all" for Dia and Cse) static const std::string synFamStem("Stm"); static const std::string synFamDiac("Dia"); static const std::string synFamCase("Cse"); diff --git a/src/rcldb/xmacros.h b/src/rcldb/xmacros.h index 030cc8fb..5311cf23 100644 --- a/src/rcldb/xmacros.h +++ b/src/rcldb/xmacros.h @@ -24,7 +24,7 @@ catch (const Xapian::Error &e) { \ MSG = e.get_msg(); \ if (MSG.empty()) MSG = "Empty error message"; \ - } catch (const string &s) { \ + } catch (const std::string &s) { \ MSG = s; \ if (MSG.empty()) MSG = "Empty error message"; \ } catch (const char *s) { \