From fc8b45822270a7603502d5caf129e1645c2e85ec Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Mon, 27 Aug 2012 15:38:08 +0200 Subject: [PATCH] create class StemDb as derived class from XapSynFamily --- src/rcldb/rcldb.cpp | 20 ++++---- src/rcldb/stemdb.cpp | 107 +++++++++++++----------------------------- src/rcldb/stemdb.h | 39 ++++++++++----- src/rcldb/synfamily.h | 3 +- 4 files changed, 72 insertions(+), 97 deletions(-) diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 48e68ae9..6362a559 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -891,7 +891,7 @@ int Db::termDocCnt(const string& _term) return -1; string term; - if (!unacmaybefold(_term, term, "UTF-8", true)) { + if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) { LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str())); return 0; } @@ -1117,7 +1117,7 @@ string Db::getSpellingSuggestion(const string& word) if (m_ndb == 0) return string(); string term; - if (!unacmaybefold(word, term, "UTF-8", true)) { + if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) { LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str())); return string(); } @@ -1316,7 +1316,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, string utf8fn; if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) { string fn; - if (unacmaybefold(utf8fn, fn, "UTF-8", true)) { + if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) { // We should truncate after extracting the extension, but this is // a pathological case anyway if (fn.size() > 230) @@ -1610,7 +1610,8 @@ vector Db::getStemLangs() vector langs; if (m_ndb == 0 || m_ndb->m_isopen == false) return langs; - langs = StemDb::getLangs(m_ndb->xrdb); + StemDb db(m_ndb->xrdb); + db.getMembers(langs); return langs; } @@ -1622,7 +1623,8 @@ bool Db::deleteStemDb(const string& lang) LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str())); if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable) return false; - return StemDb::deleteDb(m_ndb->xwdb, lang); + WritableStemDb db(m_ndb->xwdb); + return db.deleteMember(lang); } /** @@ -1639,7 +1641,8 @@ bool Db::createStemDb(const string& lang) return false; } - return StemDb::createDb(m_ndb->xwdb, lang); + WritableStemDb db(m_ndb->xwdb); + return db.createDb(lang); } /** @@ -1850,7 +1853,8 @@ bool Db::stemExpand(const string &langs, const string &term, if (m_ndb == 0 || m_ndb->m_isopen == false) return false; vector exp; - if (!StemDb::stemExpand(m_ndb->xrdb, langs, term, exp)) + StemDb db(m_ndb->xrdb); + if (!db.stemExpand(langs, term, exp)) return false; result.entries.insert(result.entries.end(), exp.begin(), exp.end()); return true; @@ -1893,7 +1897,7 @@ bool Db::termMatch(MatchType typ, const string &lang, // Get rid of capitals and accents string droot; - if (!unacmaybefold(root, droot, "UTF-8", true)) { + if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) { LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str())); return false; } diff --git a/src/rcldb/stemdb.cpp b/src/rcldb/stemdb.cpp index b6ec5a8e..dbc8d690 100644 --- a/src/rcldb/stemdb.cpp +++ b/src/rcldb/stemdb.cpp @@ -41,25 +41,12 @@ using namespace std; namespace Rcl { -namespace StemDb { - -vector getLangs(Xapian::Database& xdb) -{ - XapSynFamily fam(xdb, synFamStem); - vector langs; - (void)fam.getMembers(langs); - return langs; -} - -bool deleteDb(Xapian::WritableDatabase& xdb, const string& lang) -{ - XapWritableSynFamily fam(xdb, synFamStem); - return fam.deleteMember(lang); -} - -inline static bool -p_notlowerascii(unsigned int c) +// Fast raw detection of non-natural-language words: look for ascii +// chars which are not lowercase letters. Not too sure what islower() +// would do with 8 bit values, so not using it here. If we want to be +// more complete we'd need to go full utf-8 +inline static bool p_notlowerascii(unsigned int c) { if (c < 'a' || (c > 'z' && c < 128)) return true; @@ -68,32 +55,28 @@ p_notlowerascii(unsigned int c) /** * Create database of stem to parents associations for a given language. - * We walk the list of all terms, stem them, and create another Xapian db - * with documents indexed by a single term (the stem), and with the list of - * parent terms in the document data. */ -bool createDb(Xapian::WritableDatabase& xdb, const string& lang) +bool WritableStemDb::createDb(const string& lang) { LOGDEB(("StemDb::createDb(%s)\n", lang.c_str())); Chrono cron; + createMember(lang); + string prefix = entryprefix(lang); - // First build the in-memory stem database: - // We walk the list of all terms, and stem each. - // If the stem is identical to the term, no need to create an entry - // Else, we add an entry to the multimap. - // At the end, we only save stem-terms associations with several terms, the - // others are not useful - // Note: a map > would probably be more efficient - map > assocs; + // We walk the list of all terms, and stem each. We skip terms which + // don't look like natural language. + // If the stem is not identical to the term, we add a synonym entry. // Statistics - int nostem=0; // Dont even try: not-alphanum (incomplete for now) - int stemconst=0; // Stem == term - int stemmultiple = 0; // Count of stems with multiple derivatives + int nostem = 0; // Dont even try: not-alphanum (incomplete for now) + int stemconst = 0; // Stem == term + int allsyns = 0; // Total number of entries created + string ermsg; try { Xapian::Stem stemmer(lang); - Xapian::TermIterator it; - for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) { + + for (Xapian::TermIterator it = m_wdb.allterms_begin(); + it != m_wdb.allterms_end(); it++) { // If the term has any non-lowercase 7bit char (that is, // numbers, capitals and punctuation) dont stem. string::iterator sit = (*it).begin(), eit = sit + (*it).length(); @@ -126,7 +109,9 @@ bool createDb(Xapian::WritableDatabase& xdb, const string& lang) ++stemconst; continue; } - assocs[stem].push_back(*it); + + m_wdb.add_synonym(prefix + stem, *it); + ++allsyns; } } XCATCHERROR(ermsg); if (!ermsg.empty()) { @@ -134,30 +119,9 @@ bool createDb(Xapian::WritableDatabase& xdb, const string& lang) return false; } - LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n", - lang.c_str(), cron.secs())); - - XapWritableSynFamily fam(xdb, synFamStem); - fam.createMember(lang); - - for (map >::const_iterator it = assocs.begin(); - it != assocs.end(); it++) { - LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str())); - // We need an entry even if there is only one derivative - // so that it is possible to search by entering the stem - // even if it doesnt exist as a term - if (it->second.size() > 1) - ++stemmultiple; - if (!fam.addSynonyms(lang, it->first, it->second)) { - return false; - } - } - - LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n", - lang.c_str(), cron.secs())); - LOGDEB(("Stem map size: %d mult %d const %d no %d \n", - assocs.size(), stemmultiple, stemconst, nostem)); - fam.listMap(lang); + LOGDEB(("StemDb::createDb(%s): done: %.2f S\n", lang.c_str(), cron.secs())); + LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n", + nostem, stemconst, allsyns)); return true; } @@ -165,10 +129,9 @@ bool createDb(Xapian::WritableDatabase& xdb, const string& lang) * Expand term to list of all terms which stem to the same term, for one * expansion language */ -static bool stemExpandOne(Xapian::Database& xdb, - const std::string& lang, - const std::string& term, - vector& result) +bool StemDb::expandOne(const std::string& lang, + const std::string& term, + vector& result) { try { Xapian::Stem stemmer(lang); @@ -176,8 +139,7 @@ static bool stemExpandOne(Xapian::Database& xdb, LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n", lang.c_str(), term.c_str(), stem.c_str())); - XapSynFamily fam(xdb, synFamStem); - if (!fam.synExpand(lang, stem, result)) { + if (!synExpand(lang, stem, result)) { // ? } @@ -202,20 +164,18 @@ static bool stemExpandOne(Xapian::Database& xdb, } /** - * Expand term to list of all terms which stem to the same term, add the - * expansion sets for possibly multiple expansion languages + * Expand for one or several languages */ -bool stemExpand(Xapian::Database& xdb, - const std::string& langs, - const std::string& term, - vector& result) +bool StemDb::stemExpand(const std::string& langs, + const std::string& term, + vector& result) { vector llangs; stringToStrings(langs, llangs); for (vector::const_iterator it = llangs.begin(); it != llangs.end(); it++) { vector oneexp; - stemExpandOne(xdb, *it, term, oneexp); + expandOne(*it, term, oneexp); result.insert(result.end(), oneexp.begin(), oneexp.end()); } sort(result.begin(), result.end()); @@ -225,4 +185,3 @@ bool stemExpand(Xapian::Database& xdb, } -} diff --git a/src/rcldb/stemdb.h b/src/rcldb/stemdb.h index 9b726d41..5a332c19 100644 --- a/src/rcldb/stemdb.h +++ b/src/rcldb/stemdb.h @@ -54,24 +54,37 @@ #include +#include "synfamily.h" + namespace Rcl { -namespace StemDb { -/// Get languages of existing stem databases -extern std::vector getLangs(Xapian::Database& xdb); +class StemDb : public XapSynFamily { +public: + StemDb(Xapian::Database& xdb) + : XapSynFamily(xdb, synFamStem) + { + } -/// Delete stem database for given language -extern bool deleteDb(Xapian::WritableDatabase&, const std::string& lang); + /** Expand for a number of languages */ + bool stemExpand(const std::string& langs, + const std::string& term, + std::vector& result); +private: + /** Compute stem and call synExpand() */ + bool expandOne(const std::string& lang, + const std::string& term, + std::vector& result); +}; -/// Create stem database for given language -extern bool createDb(Xapian::WritableDatabase&, const std::string& lang); +class WritableStemDb : public XapWritableSynFamily { +public: + WritableStemDb(Xapian::WritableDatabase& xdb) + : XapWritableSynFamily(xdb, synFamStem) + { + } + bool createDb(const std::string& lang); +}; -/// Expand term to stem siblings -extern bool stemExpand(Xapian::Database& xdb, - const std::string& lang, - const std::string& term, - std::vector& result); -} } #endif /* _STEMDB_H_INCLUDED_ */ diff --git a/src/rcldb/synfamily.h b/src/rcldb/synfamily.h index 36e1470b..6d045a4d 100644 --- a/src/rcldb/synfamily.h +++ b/src/rcldb/synfamily.h @@ -93,8 +93,7 @@ public: virtual bool createMember(const std::string& membername); /** Add expansion list for term inside family member (e.g., inside - * the french member, add expansion for familier -> familier, - * familierement, ... */ + * the english member, add expansion for floor -> floors, flooring.. */ virtual bool addSynonyms(const std::string& membername, const std::string& term, const std::vector& trans);