Fixed stemming, which was completely broken in 1.13.

This commit is contained in:
Jean-Francois Dockes 2010-04-12 19:11:23 +02:00
parent f84c7685b0
commit fb9f128e26
2 changed files with 108 additions and 81 deletions

View File

@ -195,6 +195,10 @@ bool ConfIndexer::createStemmingDatabases()
{ {
string slangs; string slangs;
if (m_config->getConfParam("indexstemminglanguages", slangs)) { if (m_config->getConfParam("indexstemminglanguages", slangs)) {
if (!m_db.open(Rcl::Db::DbRO)) {
LOGERR(("ConfIndexer::createStemmingDb: could not open db\n"))
return false;
}
list<string> langs; list<string> langs;
stringToStrings(slangs, langs); stringToStrings(slangs, langs);
@ -215,6 +219,7 @@ bool ConfIndexer::createStemmingDatabases()
m_db.createStemDb(*it); m_db.createStemDb(*it);
} }
} }
m_db.close();
return true; return true;
} }
@ -245,6 +250,7 @@ bool ConfIndexer::createAspellDict()
return true; return true;
if (!m_db.open(Rcl::Db::DbRO)) { if (!m_db.open(Rcl::Db::DbRO)) {
LOGERR(("ConfIndexer::createAspellDict: could not open db\n"));
return false; return false;
} }

View File

@ -75,6 +75,30 @@ p_notlowerascii(unsigned int c)
return false; return false;
} }
/**
 * Store one stem-to-derivatives association in the stem database.
 *
 * A pseudo-document is built with `stem` as its only term. Its data record
 * is the text "parents=" followed by every entry of `derivs`, each followed
 * by a single blank, and terminated by a newline. The document is then
 * written with replace_document(), using the stem as the identifying term.
 *
 * @param sdb    writable Xapian database receiving the pseudo-document
 * @param stem   stem term the derivatives are filed under
 * @param derivs terms associated with `stem`
 * @return true if the document was stored, false if the write raised an
 *         exception (the failure is logged).
 */
static bool addAssoc(Xapian::WritableDatabase &sdb, const string& stem,
                     const list<string>& derivs)
{
    Xapian::Document newdocument;
    newdocument.add_term(stem);

    // The doc data is just parents=blank-separated-list
    string record = "parents=";
    for (list<string>::const_iterator it = derivs.begin();
         it != derivs.end(); ++it) {
        // Append in place: avoids building a temporary string per term.
        record += *it;
        record += " ";
    }
    record += "\n";
    LOGDEB2(("createStemDb: stmdoc data: [%s]\n", record.c_str()));
    newdocument.set_data(record);

    try {
        sdb.replace_document(stem, newdocument);
    } catch (const Xapian::Error &e) {
        // Report the Xapian error text, as createDb does for its own
        // Xapian failures, instead of dropping it in the catch-all.
        LOGERR(("Db::createstemdb(addAssoc): replace failed: %s\n",
                e.get_msg().c_str()));
        return false;
    } catch (...) {
        LOGERR(("Db::createstemdb(addAssoc): replace failed\n"));
        return false;
    }
    return true;
}
/** /**
* Create database of stem to parents associations for a given language. * Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db * We walk the list of all terms, stem them, and create another Xapian db
@ -100,41 +124,41 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
int stemdiff=0; // Count of all different stems int stemdiff=0; // Count of all different stems
int stemmultiple = 0; // Count of stems with multiple derivatives int stemmultiple = 0; // Count of stems with multiple derivatives
try { try {
Xapian::Stem stemmer(lang); Xapian::Stem stemmer(lang);
Xapian::TermIterator it; Xapian::TermIterator it;
for (it = xdb.allterms_begin(); for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
it != xdb.allterms_end(); it++) { // Deciding if we try to stem the term. If it has any
// Deciding if we try to stem the term. If it has any // non-lowercase 7bit char (that is, numbers, capitals and
// non-lowercase 7bit char (that is, numbers, capitals and // punctuation) dont. We're still sending all multibyte
// punctuation) dont. We're still sending all multibyte // utf-8 chars to the stemmer, which is not too well
// utf-8 chars to the stemmer, which is not too well // defined for xapian < 1.0, but seems to work anyway. We don't
// defined for xapian < 1.0, but seems to work anyway. We don't // try to look for multibyte non alphabetic data.
// try to look for multibyte non alphabetic data. string::iterator sit = (*it).begin(), eit = sit + (*it).length();
string::iterator sit = (*it).begin(), eit = sit + (*it).length(); if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) { ++nostem;
++nostem; LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
// LOGDEB(("stemskipped: [%s], because of 0x%x\n", (*it).c_str(), *sit));
// (*it).c_str(), *sit)); continue;
continue; }
} string stem = stemmer(*it);
string stem = stemmer(*it); LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
//cerr << "word " << *it << " stem " << stem << endl; stem.c_str()));
if (stem == *it) { if (stem == *it) {
++stemconst; ++stemconst;
continue; continue;
} }
assocs.insert(pair<string,string>(stem, *it)); assocs.insert(pair<string,string>(stem, *it));
} }
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str())); LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
return false; return false;
} catch (...) { } catch (...) {
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
lang.c_str())); lang.c_str()));
return false; return false;
} }
LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n", LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n",
lang.c_str(), cron.secs())); lang.c_str(), cron.secs()));
// Create xapian database for stem relations // Create xapian database for stem relations
string stemdbdir = stemdbname(dbdir, lang); string stemdbdir = stemdbname(dbdir, lang);
@ -144,69 +168,66 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
string ermsg; string ermsg;
Xapian::WritableDatabase sdb; Xapian::WritableDatabase sdb;
try { try {
sdb = Xapian::WritableDatabase(stemdbdir, sdb = Xapian::WritableDatabase(stemdbdir,
Xapian::DB_CREATE_OR_OVERWRITE); Xapian::DB_CREATE_OR_OVERWRITE);
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
ermsg = e.get_msg(); ermsg = e.get_msg();
} catch (const string &s) { } catch (const string &s) {
ermsg = s; ermsg = s;
} catch (const char *s) { } catch (const char *s) {
ermsg = s; ermsg = s;
} catch (...) { } catch (...) {
ermsg = "Caught unknown exception"; ermsg = "Caught unknown exception";
} }
if (!ermsg.empty()) { if (!ermsg.empty()) {
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n", LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
stemdbdir.c_str(), ermsg.c_str())); stemdbdir.c_str(), ermsg.c_str()));
return false; return false;
} }
// Enter pseud-docs in db. Walk the multimap, only enter // Enter pseud-docs in db by walking the multimap.
// associations where there are several parent terms
string stem; string stem;
list<string> derivs; list<string> derivs;
for (multimap<string,string>::const_iterator it = assocs.begin(); for (multimap<string,string>::const_iterator it = assocs.begin();
it != assocs.end(); it++) { it != assocs.end(); it++) {
if (stem == it->first) { if (stem == it->first) {
// Staying with same stem // Staying with same stem
derivs.push_back(it->second); derivs.push_back(it->second);
// cerr << " " << it->second << endl; // cerr << " " << it->second << endl;
} else { } else {
// Changing stems // Changing stems
++stemdiff; ++stemdiff;
// We need an entry even if there is only one derivative LOGDEB2(("createStemDb: stem [%s]\n", stem.c_str()));
// so that it is possible to search by entering the stem
// even if it doesnt exist as a term // We need an entry even if there is only one derivative
if (derivs.size() >= 1) { // so that it is possible to search by entering the stem
// Previous stem has multiple derivatives. Enter in db // even if it doesnt exist as a term
++stemmultiple; if (!derivs.empty()) {
Xapian::Document newdocument;
newdocument.add_term(stem); if (derivs.size() > 1)
// The doc data is just parents=blank-separated-list ++stemmultiple;
string record = "parents=";
for (list<string>::const_iterator it = derivs.begin(); if (!addAssoc(sdb, stem, derivs)) {
it != derivs.end(); it++) { return false;
record += *it + " "; }
} derivs.clear();
record += "\n"; }
LOGDEB1(("stemdocument data: %s\n", record.c_str())); stem = it->first;
newdocument.set_data(record); derivs.push_back(it->second);
try { // cerr << "\n" << stem << " " << it->second;
sdb.replace_document(stem, newdocument); }
//sdb.add_document(newdocument);
} catch (...) {
LOGERR(("Db::createstemdb: replace failed\n"));
return false;
}
}
derivs.clear();
stem = it->first;
derivs.push_back(it->second);
// cerr << "\n" << stem << " " << it->second;
}
} }
if (!derivs.empty()) {
if (derivs.size() > 1)
++stemmultiple;
if (!addAssoc(sdb, stem, derivs)) {
return false;
}
}
LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n", LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n",
lang.c_str(), cron.secs())); lang.c_str(), cron.secs()));
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst)); assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
wiper.do_it = false; wiper.do_it = false;