create class StemDb as derived class from XapSynFamily
This commit is contained in:
parent
913dffc597
commit
fc8b458222
@ -891,7 +891,7 @@ int Db::termDocCnt(const string& _term)
|
|||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
string term;
|
string term;
|
||||||
if (!unacmaybefold(_term, term, "UTF-8", true)) {
|
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -1117,7 +1117,7 @@ string Db::getSpellingSuggestion(const string& word)
|
|||||||
if (m_ndb == 0)
|
if (m_ndb == 0)
|
||||||
return string();
|
return string();
|
||||||
string term;
|
string term;
|
||||||
if (!unacmaybefold(word, term, "UTF-8", true)) {
|
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
||||||
return string();
|
return string();
|
||||||
}
|
}
|
||||||
@ -1316,7 +1316,7 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
string utf8fn;
|
string utf8fn;
|
||||||
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
|
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
|
||||||
string fn;
|
string fn;
|
||||||
if (unacmaybefold(utf8fn, fn, "UTF-8", true)) {
|
if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
// We should truncate after extracting the extension, but this is
|
// We should truncate after extracting the extension, but this is
|
||||||
// a pathological case anyway
|
// a pathological case anyway
|
||||||
if (fn.size() > 230)
|
if (fn.size() > 230)
|
||||||
@ -1610,7 +1610,8 @@ vector<string> Db::getStemLangs()
|
|||||||
vector<string> langs;
|
vector<string> langs;
|
||||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||||
return langs;
|
return langs;
|
||||||
langs = StemDb::getLangs(m_ndb->xrdb);
|
StemDb db(m_ndb->xrdb);
|
||||||
|
db.getMembers(langs);
|
||||||
return langs;
|
return langs;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1622,7 +1623,8 @@ bool Db::deleteStemDb(const string& lang)
|
|||||||
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
|
LOGDEB(("Db::deleteStemDb(%s)\n", lang.c_str()));
|
||||||
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable)
|
if (m_ndb == 0 || m_ndb->m_isopen == false || !m_ndb->m_iswritable)
|
||||||
return false;
|
return false;
|
||||||
return StemDb::deleteDb(m_ndb->xwdb, lang);
|
WritableStemDb db(m_ndb->xwdb);
|
||||||
|
return db.deleteMember(lang);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1639,7 +1641,8 @@ bool Db::createStemDb(const string& lang)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return StemDb::createDb(m_ndb->xwdb, lang);
|
WritableStemDb db(m_ndb->xwdb);
|
||||||
|
return db.createDb(lang);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1850,7 +1853,8 @@ bool Db::stemExpand(const string &langs, const string &term,
|
|||||||
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
if (m_ndb == 0 || m_ndb->m_isopen == false)
|
||||||
return false;
|
return false;
|
||||||
vector<string> exp;
|
vector<string> exp;
|
||||||
if (!StemDb::stemExpand(m_ndb->xrdb, langs, term, exp))
|
StemDb db(m_ndb->xrdb);
|
||||||
|
if (!db.stemExpand(langs, term, exp))
|
||||||
return false;
|
return false;
|
||||||
result.entries.insert(result.entries.end(), exp.begin(), exp.end());
|
result.entries.insert(result.entries.end(), exp.begin(), exp.end());
|
||||||
return true;
|
return true;
|
||||||
@ -1893,7 +1897,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
|
|||||||
|
|
||||||
// Get rid of capitals and accents
|
// Get rid of capitals and accents
|
||||||
string droot;
|
string droot;
|
||||||
if (!unacmaybefold(root, droot, "UTF-8", true)) {
|
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -41,25 +41,12 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
namespace StemDb {
|
|
||||||
|
|
||||||
|
// Fast raw detection of non-natural-language words: look for ascii
|
||||||
vector<string> getLangs(Xapian::Database& xdb)
|
// chars which are not lowercase letters. Not too sure what islower()
|
||||||
{
|
// would do with 8 bit values, so not using it here. If we want to be
|
||||||
XapSynFamily fam(xdb, synFamStem);
|
// more complete we'd need to go full utf-8
|
||||||
vector<string> langs;
|
inline static bool p_notlowerascii(unsigned int c)
|
||||||
(void)fam.getMembers(langs);
|
|
||||||
return langs;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool deleteDb(Xapian::WritableDatabase& xdb, const string& lang)
|
|
||||||
{
|
|
||||||
XapWritableSynFamily fam(xdb, synFamStem);
|
|
||||||
return fam.deleteMember(lang);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static bool
|
|
||||||
p_notlowerascii(unsigned int c)
|
|
||||||
{
|
{
|
||||||
if (c < 'a' || (c > 'z' && c < 128))
|
if (c < 'a' || (c > 'z' && c < 128))
|
||||||
return true;
|
return true;
|
||||||
@ -68,32 +55,28 @@ p_notlowerascii(unsigned int c)
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Create database of stem to parents associations for a given language.
|
* Create database of stem to parents associations for a given language.
|
||||||
* We walk the list of all terms, stem them, and create another Xapian db
|
|
||||||
* with documents indexed by a single term (the stem), and with the list of
|
|
||||||
* parent terms in the document data.
|
|
||||||
*/
|
*/
|
||||||
bool createDb(Xapian::WritableDatabase& xdb, const string& lang)
|
bool WritableStemDb::createDb(const string& lang)
|
||||||
{
|
{
|
||||||
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
||||||
Chrono cron;
|
Chrono cron;
|
||||||
|
createMember(lang);
|
||||||
|
string prefix = entryprefix(lang);
|
||||||
|
|
||||||
// First build the in-memory stem database:
|
// We walk the list of all terms, and stem each. We skip terms which
|
||||||
// We walk the list of all terms, and stem each.
|
// don't look like natural language.
|
||||||
// If the stem is identical to the term, no need to create an entry
|
// If the stem is not identical to the term, we add a synonym entry.
|
||||||
// Else, we add an entry to the multimap.
|
|
||||||
// At the end, we only save stem-terms associations with several terms, the
|
|
||||||
// others are not useful
|
|
||||||
// Note: a map<string, vector<string> > would probably be more efficient
|
|
||||||
map<string, vector<string> > assocs;
|
|
||||||
// Statistics
|
// Statistics
|
||||||
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
int nostem = 0; // Dont even try: not-alphanum (incomplete for now)
|
||||||
int stemconst=0; // Stem == term
|
int stemconst = 0; // Stem == term
|
||||||
int stemmultiple = 0; // Count of stems with multiple derivatives
|
int allsyns = 0; // Total number of entries created
|
||||||
|
|
||||||
string ermsg;
|
string ermsg;
|
||||||
try {
|
try {
|
||||||
Xapian::Stem stemmer(lang);
|
Xapian::Stem stemmer(lang);
|
||||||
Xapian::TermIterator it;
|
|
||||||
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
|
for (Xapian::TermIterator it = m_wdb.allterms_begin();
|
||||||
|
it != m_wdb.allterms_end(); it++) {
|
||||||
// If the term has any non-lowercase 7bit char (that is,
|
// If the term has any non-lowercase 7bit char (that is,
|
||||||
// numbers, capitals and punctuation) dont stem.
|
// numbers, capitals and punctuation) dont stem.
|
||||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||||
@ -126,7 +109,9 @@ bool createDb(Xapian::WritableDatabase& xdb, const string& lang)
|
|||||||
++stemconst;
|
++stemconst;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
assocs[stem].push_back(*it);
|
|
||||||
|
m_wdb.add_synonym(prefix + stem, *it);
|
||||||
|
++allsyns;
|
||||||
}
|
}
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
@ -134,30 +119,9 @@ bool createDb(Xapian::WritableDatabase& xdb, const string& lang)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n",
|
LOGDEB(("StemDb::createDb(%s): done: %.2f S\n", lang.c_str(), cron.secs()));
|
||||||
lang.c_str(), cron.secs()));
|
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
|
||||||
|
nostem, stemconst, allsyns));
|
||||||
XapWritableSynFamily fam(xdb, synFamStem);
|
|
||||||
fam.createMember(lang);
|
|
||||||
|
|
||||||
for (map<string, vector<string> >::const_iterator it = assocs.begin();
|
|
||||||
it != assocs.end(); it++) {
|
|
||||||
LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str()));
|
|
||||||
// We need an entry even if there is only one derivative
|
|
||||||
// so that it is possible to search by entering the stem
|
|
||||||
// even if it doesnt exist as a term
|
|
||||||
if (it->second.size() > 1)
|
|
||||||
++stemmultiple;
|
|
||||||
if (!fam.addSynonyms(lang, it->first, it->second)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n",
|
|
||||||
lang.c_str(), cron.secs()));
|
|
||||||
LOGDEB(("Stem map size: %d mult %d const %d no %d \n",
|
|
||||||
assocs.size(), stemmultiple, stemconst, nostem));
|
|
||||||
fam.listMap(lang);
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,10 +129,9 @@ bool createDb(Xapian::WritableDatabase& xdb, const string& lang)
|
|||||||
* Expand term to list of all terms which stem to the same term, for one
|
* Expand term to list of all terms which stem to the same term, for one
|
||||||
* expansion language
|
* expansion language
|
||||||
*/
|
*/
|
||||||
static bool stemExpandOne(Xapian::Database& xdb,
|
bool StemDb::expandOne(const std::string& lang,
|
||||||
const std::string& lang,
|
const std::string& term,
|
||||||
const std::string& term,
|
vector<string>& result)
|
||||||
vector<string>& result)
|
|
||||||
{
|
{
|
||||||
try {
|
try {
|
||||||
Xapian::Stem stemmer(lang);
|
Xapian::Stem stemmer(lang);
|
||||||
@ -176,8 +139,7 @@ static bool stemExpandOne(Xapian::Database& xdb,
|
|||||||
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
|
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
|
||||||
lang.c_str(), term.c_str(), stem.c_str()));
|
lang.c_str(), term.c_str(), stem.c_str()));
|
||||||
|
|
||||||
XapSynFamily fam(xdb, synFamStem);
|
if (!synExpand(lang, stem, result)) {
|
||||||
if (!fam.synExpand(lang, stem, result)) {
|
|
||||||
// ?
|
// ?
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -202,20 +164,18 @@ static bool stemExpandOne(Xapian::Database& xdb,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Expand term to list of all terms which stem to the same term, add the
|
* Expand for one or several languages
|
||||||
* expansion sets for possibly multiple expansion languages
|
|
||||||
*/
|
*/
|
||||||
bool stemExpand(Xapian::Database& xdb,
|
bool StemDb::stemExpand(const std::string& langs,
|
||||||
const std::string& langs,
|
const std::string& term,
|
||||||
const std::string& term,
|
vector<string>& result)
|
||||||
vector<string>& result)
|
|
||||||
{
|
{
|
||||||
vector<string> llangs;
|
vector<string> llangs;
|
||||||
stringToStrings(langs, llangs);
|
stringToStrings(langs, llangs);
|
||||||
for (vector<string>::const_iterator it = llangs.begin();
|
for (vector<string>::const_iterator it = llangs.begin();
|
||||||
it != llangs.end(); it++) {
|
it != llangs.end(); it++) {
|
||||||
vector<string> oneexp;
|
vector<string> oneexp;
|
||||||
stemExpandOne(xdb, *it, term, oneexp);
|
expandOne(*it, term, oneexp);
|
||||||
result.insert(result.end(), oneexp.begin(), oneexp.end());
|
result.insert(result.end(), oneexp.begin(), oneexp.end());
|
||||||
}
|
}
|
||||||
sort(result.begin(), result.end());
|
sort(result.begin(), result.end());
|
||||||
@ -225,4 +185,3 @@ bool stemExpand(Xapian::Database& xdb,
|
|||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|||||||
@ -54,24 +54,37 @@
|
|||||||
|
|
||||||
#include <xapian.h>
|
#include <xapian.h>
|
||||||
|
|
||||||
|
#include "synfamily.h"
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
namespace StemDb {
|
|
||||||
|
|
||||||
/// Get languages of existing stem databases
|
class StemDb : public XapSynFamily {
|
||||||
extern std::vector<std::string> getLangs(Xapian::Database& xdb);
|
public:
|
||||||
|
StemDb(Xapian::Database& xdb)
|
||||||
|
: XapSynFamily(xdb, synFamStem)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
/// Delete stem database for given language
|
/** Expand for a number of languages */
|
||||||
extern bool deleteDb(Xapian::WritableDatabase&, const std::string& lang);
|
bool stemExpand(const std::string& langs,
|
||||||
|
const std::string& term,
|
||||||
|
std::vector<std::string>& result);
|
||||||
|
private:
|
||||||
|
/** Compute stem and call synExpand() */
|
||||||
|
bool expandOne(const std::string& lang,
|
||||||
|
const std::string& term,
|
||||||
|
std::vector<std::string>& result);
|
||||||
|
};
|
||||||
|
|
||||||
/// Create stem database for given language
|
class WritableStemDb : public XapWritableSynFamily {
|
||||||
extern bool createDb(Xapian::WritableDatabase&, const std::string& lang);
|
public:
|
||||||
|
WritableStemDb(Xapian::WritableDatabase& xdb)
|
||||||
|
: XapWritableSynFamily(xdb, synFamStem)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
bool createDb(const std::string& lang);
|
||||||
|
};
|
||||||
|
|
||||||
/// Expand term to stem siblings
|
|
||||||
extern bool stemExpand(Xapian::Database& xdb,
|
|
||||||
const std::string& lang,
|
|
||||||
const std::string& term,
|
|
||||||
std::vector<std::string>& result);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* _STEMDB_H_INCLUDED_ */
|
#endif /* _STEMDB_H_INCLUDED_ */
|
||||||
|
|||||||
@ -93,8 +93,7 @@ public:
|
|||||||
virtual bool createMember(const std::string& membername);
|
virtual bool createMember(const std::string& membername);
|
||||||
|
|
||||||
/** Add expansion list for term inside family member (e.g., inside
|
/** Add expansion list for term inside family member (e.g., inside
|
||||||
* the french member, add expansion for familier -> familier,
|
* the english member, add expansion for floor -> floors, flooring.. */
|
||||||
* familierement, ... */
|
|
||||||
virtual bool addSynonyms(const std::string& membername,
|
virtual bool addSynonyms(const std::string& membername,
|
||||||
const std::string& term,
|
const std::string& term,
|
||||||
const std::vector<std::string>& trans);
|
const std::vector<std::string>& trans);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user