Fixed stemming, which was completely broken in 1.13
This commit is contained in:
parent
f84c7685b0
commit
fb9f128e26
@ -195,6 +195,10 @@ bool ConfIndexer::createStemmingDatabases()
|
|||||||
{
|
{
|
||||||
string slangs;
|
string slangs;
|
||||||
if (m_config->getConfParam("indexstemminglanguages", slangs)) {
|
if (m_config->getConfParam("indexstemminglanguages", slangs)) {
|
||||||
|
if (!m_db.open(Rcl::Db::DbRO)) {
|
||||||
|
LOGERR(("ConfIndexer::createStemmingDb: could not open db\n"))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
list<string> langs;
|
list<string> langs;
|
||||||
stringToStrings(slangs, langs);
|
stringToStrings(slangs, langs);
|
||||||
|
|
||||||
@ -215,6 +219,7 @@ bool ConfIndexer::createStemmingDatabases()
|
|||||||
m_db.createStemDb(*it);
|
m_db.createStemDb(*it);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m_db.close();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -245,6 +250,7 @@ bool ConfIndexer::createAspellDict()
|
|||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (!m_db.open(Rcl::Db::DbRO)) {
|
if (!m_db.open(Rcl::Db::DbRO)) {
|
||||||
|
LOGERR(("ConfIndexer::createAspellDict: could not open db\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -75,6 +75,30 @@ p_notlowerascii(unsigned int c)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Store one stem -> derived-terms association as a pseudo-document in the
 * stem database.
 *
 * The document is indexed under the single term `stem`, and its data is the
 * text record "parents=<deriv> <deriv> ... \n" (each derivative followed by
 * one blank). The document is written with replace_document() keyed on the
 * stem term.
 *
 * @param sdb    writable Xapian database receiving the association
 * @param stem   the stem, used both as the document's term and as the
 *               replace_document() key
 * @param derivs the terms which stem to `stem`
 * @return true if the document was stored, false if Xapian raised an error
 *         (the error is logged).
 */
static bool addAssoc(Xapian::WritableDatabase &sdb, const string& stem,
                     const list<string>& derivs)
{
    // The doc data is just parents=blank-separated-list. The record layout
    // (including the blank after the last entry) is kept exactly as-is:
    // the code reading these records back is not visible from here.
    string record = "parents=";
    for (list<string>::const_iterator it = derivs.begin();
         it != derivs.end(); ++it) {
        record += *it;
        record += " ";
    }
    record += "\n";
    LOGDEB2(("createStemDb: stmdoc data: [%s]\n", record.c_str()));

    // Keep every Xapian call inside the try block: this helper reports
    // failure through its return value, so no Xapian exception should
    // escape to the caller.
    try {
        Xapian::Document newdocument;
        newdocument.add_term(stem);
        newdocument.set_data(record);
        sdb.replace_document(stem, newdocument);
    } catch (const Xapian::Error &e) {
        // Log the actual Xapian message, like the other handlers in this
        // file do, instead of a bare "failed".
        LOGERR(("Db::createstemdb(addAssoc): replace failed: %s\n",
                e.get_msg().c_str()));
        return false;
    } catch (...) {
        LOGERR(("Db::createstemdb(addAssoc): replace failed\n"));
        return false;
    }
    return true;
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create database of stem to parents associations for a given language.
|
* Create database of stem to parents associations for a given language.
|
||||||
* We walk the list of all terms, stem them, and create another Xapian db
|
* We walk the list of all terms, stem them, and create another Xapian db
|
||||||
@ -100,41 +124,41 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
|||||||
int stemdiff=0; // Count of all different stems
|
int stemdiff=0; // Count of all different stems
|
||||||
int stemmultiple = 0; // Count of stems with multiple derivatives
|
int stemmultiple = 0; // Count of stems with multiple derivatives
|
||||||
try {
|
try {
|
||||||
Xapian::Stem stemmer(lang);
|
Xapian::Stem stemmer(lang);
|
||||||
Xapian::TermIterator it;
|
Xapian::TermIterator it;
|
||||||
for (it = xdb.allterms_begin();
|
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
|
||||||
it != xdb.allterms_end(); it++) {
|
// Deciding if we try to stem the term. If it has any
|
||||||
// Deciding if we try to stem the term. If it has any
|
// non-lowercase 7bit char (that is, numbers, capitals and
|
||||||
// non-lowercase 7bit char (that is, numbers, capitals and
|
// punctuation) dont. We're still sending all multibyte
|
||||||
// punctuation) dont. We're still sending all multibyte
|
// utf-8 chars to the stemmer, which is not too well
|
||||||
// utf-8 chars to the stemmer, which is not too well
|
// defined for xapian < 1.0, but seems to work anyway. We don't
|
||||||
// defined for xapian < 1.0, but seems to work anyway. We don't
|
// try to look for multibyte non alphabetic data.
|
||||||
// try to look for multibyte non alphabetic data.
|
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
||||||
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
++nostem;
|
||||||
++nostem;
|
LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
|
||||||
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
(*it).c_str(), *sit));
|
||||||
// (*it).c_str(), *sit));
|
continue;
|
||||||
continue;
|
}
|
||||||
}
|
string stem = stemmer(*it);
|
||||||
string stem = stemmer(*it);
|
LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
|
||||||
//cerr << "word " << *it << " stem " << stem << endl;
|
stem.c_str()));
|
||||||
if (stem == *it) {
|
if (stem == *it) {
|
||||||
++stemconst;
|
++stemconst;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
assocs.insert(pair<string,string>(stem, *it));
|
assocs.insert(pair<string,string>(stem, *it));
|
||||||
}
|
}
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
||||||
return false;
|
return false;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
||||||
lang.c_str()));
|
lang.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n",
|
LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n",
|
||||||
lang.c_str(), cron.secs()));
|
lang.c_str(), cron.secs()));
|
||||||
|
|
||||||
// Create xapian database for stem relations
|
// Create xapian database for stem relations
|
||||||
string stemdbdir = stemdbname(dbdir, lang);
|
string stemdbdir = stemdbname(dbdir, lang);
|
||||||
@ -144,69 +168,66 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
|||||||
string ermsg;
|
string ermsg;
|
||||||
Xapian::WritableDatabase sdb;
|
Xapian::WritableDatabase sdb;
|
||||||
try {
|
try {
|
||||||
sdb = Xapian::WritableDatabase(stemdbdir,
|
sdb = Xapian::WritableDatabase(stemdbdir,
|
||||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
ermsg = e.get_msg();
|
ermsg = e.get_msg();
|
||||||
} catch (const string &s) {
|
} catch (const string &s) {
|
||||||
ermsg = s;
|
ermsg = s;
|
||||||
} catch (const char *s) {
|
} catch (const char *s) {
|
||||||
ermsg = s;
|
ermsg = s;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
ermsg = "Caught unknown exception";
|
ermsg = "Caught unknown exception";
|
||||||
}
|
}
|
||||||
if (!ermsg.empty()) {
|
if (!ermsg.empty()) {
|
||||||
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
|
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
|
||||||
stemdbdir.c_str(), ermsg.c_str()));
|
stemdbdir.c_str(), ermsg.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enter pseud-docs in db. Walk the multimap, only enter
|
// Enter pseud-docs in db by walking the multimap.
|
||||||
// associations where there are several parent terms
|
|
||||||
string stem;
|
string stem;
|
||||||
list<string> derivs;
|
list<string> derivs;
|
||||||
for (multimap<string,string>::const_iterator it = assocs.begin();
|
for (multimap<string,string>::const_iterator it = assocs.begin();
|
||||||
it != assocs.end(); it++) {
|
it != assocs.end(); it++) {
|
||||||
if (stem == it->first) {
|
if (stem == it->first) {
|
||||||
// Staying with same stem
|
// Staying with same stem
|
||||||
derivs.push_back(it->second);
|
derivs.push_back(it->second);
|
||||||
// cerr << " " << it->second << endl;
|
// cerr << " " << it->second << endl;
|
||||||
} else {
|
} else {
|
||||||
// Changing stems
|
// Changing stems
|
||||||
++stemdiff;
|
++stemdiff;
|
||||||
// We need an entry even if there is only one derivative
|
LOGDEB2(("createStemDb: stem [%s]\n", stem.c_str()));
|
||||||
// so that it is possible to search by entering the stem
|
|
||||||
// even if it doesnt exist as a term
|
// We need an entry even if there is only one derivative
|
||||||
if (derivs.size() >= 1) {
|
// so that it is possible to search by entering the stem
|
||||||
// Previous stem has multiple derivatives. Enter in db
|
// even if it doesnt exist as a term
|
||||||
++stemmultiple;
|
if (!derivs.empty()) {
|
||||||
Xapian::Document newdocument;
|
|
||||||
newdocument.add_term(stem);
|
if (derivs.size() > 1)
|
||||||
// The doc data is just parents=blank-separated-list
|
++stemmultiple;
|
||||||
string record = "parents=";
|
|
||||||
for (list<string>::const_iterator it = derivs.begin();
|
if (!addAssoc(sdb, stem, derivs)) {
|
||||||
it != derivs.end(); it++) {
|
return false;
|
||||||
record += *it + " ";
|
}
|
||||||
}
|
derivs.clear();
|
||||||
record += "\n";
|
}
|
||||||
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
|
stem = it->first;
|
||||||
newdocument.set_data(record);
|
derivs.push_back(it->second);
|
||||||
try {
|
// cerr << "\n" << stem << " " << it->second;
|
||||||
sdb.replace_document(stem, newdocument);
|
}
|
||||||
//sdb.add_document(newdocument);
|
|
||||||
} catch (...) {
|
|
||||||
LOGERR(("Db::createstemdb: replace failed\n"));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
derivs.clear();
|
|
||||||
stem = it->first;
|
|
||||||
derivs.push_back(it->second);
|
|
||||||
// cerr << "\n" << stem << " " << it->second;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
if (!derivs.empty()) {
|
||||||
|
if (derivs.size() > 1)
|
||||||
|
++stemmultiple;
|
||||||
|
|
||||||
|
if (!addAssoc(sdb, stem, derivs)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n",
|
LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n",
|
||||||
lang.c_str(), cron.secs()));
|
lang.c_str(), cron.secs()));
|
||||||
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
||||||
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
||||||
wiper.do_it = false;
|
wiper.do_it = false;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user