simplified stemdb-creating code

This commit is contained in:
Jean-Francois Dockes 2012-08-24 14:50:24 +02:00
parent ee9dbda9fc
commit 688121d2f7

View File

@ -130,22 +130,18 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
// Else, we add an entry to the multimap. // Else, we add an entry to the multimap.
// At the end, we only save stem-terms associations with several terms, the // At the end, we only save stem-terms associations with several terms, the
// others are not useful // others are not useful
// Note: a map<string, list<string> > would probably be more efficient // Note: a map<string, vector<string> > would probably be more efficient
multimap<string, string> assocs; map<string, vector<string> > assocs;
// Statistics // Statistics
int nostem=0; // Don't even try: not-alphanum (incomplete for now) int nostem=0; // Don't even try: not-alphanum (incomplete for now)
int stemconst=0; // Stem == term int stemconst=0; // Stem == term
int stemdiff=0; // Count of all different stems
int stemmultiple = 0; // Count of stems with multiple derivatives int stemmultiple = 0; // Count of stems with multiple derivatives
try { try {
Xapian::Stem stemmer(lang); Xapian::Stem stemmer(lang);
Xapian::TermIterator it; Xapian::TermIterator it;
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) { for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
// Deciding if we try to stem the term. // If the term has any non-lowercase 7bit char (that is,
// numbers, capitals and punctuation) dont stem.
// If it has any
// non-lowercase 7bit char (that is, numbers, capitals and
// punctuation) dont.
string::iterator sit = (*it).begin(), eit = sit + (*it).length(); string::iterator sit = (*it).begin(), eit = sit + (*it).length();
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) { if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
++nostem; ++nostem;
@ -158,10 +154,11 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
// We're still sending all other multibyte utf-8 chars to // We're still sending all other multibyte utf-8 chars to
// the stemmer, which is not too well defined for // the stemmer, which is not too well defined for
// xapian<1.0 (very obsolete now), but seems to work // xapian<1.0 (very obsolete now), but seems to work
// anyway. There shouldnt be too many in any case because // anyway. There shouldn't be too many in any case because
// accents are stripped at this point. Effect of stripping // accents are stripped at this point.
// accents on stemming unknown, hopefully none, there is // The effect of stripping accents on stemming is not good,
// nothing we can do about it. // (e.g. in French: partimes -> partim, parti^mes -> part)
// but fixing the issue would be complicated.
Utf8Iter utfit(*it); Utf8Iter utfit(*it);
if (TextSplit::isCJK(*utfit)) { if (TextSplit::isCJK(*utfit)) {
// LOGDEB(("stemskipped: Skipping CJK\n")); // LOGDEB(("stemskipped: Skipping CJK\n"));
@ -175,7 +172,7 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
++stemconst; ++stemconst;
continue; continue;
} }
assocs.insert(pair<string,string>(stem, *it)); assocs[stem].push_back(*it);
} }
} catch (const Xapian::Error &e) { } catch (const Xapian::Error &e) {
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str())); LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
@ -213,51 +210,25 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
return false; return false;
} }
// Enter pseudo-docs in db by walking the multimap. // Enter pseudo-docs in db by walking the map.
string stem; for (map<string, vector<string> >::const_iterator it = assocs.begin();
vector<string> derivs;
for (multimap<string,string>::const_iterator it = assocs.begin();
it != assocs.end(); it++) { it != assocs.end(); it++) {
if (stem == it->first) { LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str()));
// Staying with same stem // We need an entry even if there is only one derivative
derivs.push_back(it->second); // so that it is possible to search by entering the stem
// cerr << " " << it->second << endl; // even if it doesnt exist as a term
} else { if (it->second.size() > 1)
// Changing stems ++stemmultiple;
++stemdiff;
LOGDEB2(("createStemDb: stem [%s]\n", stem.c_str()));
// We need an entry even if there is only one derivative
// so that it is possible to search by entering the stem
// even if it doesnt exist as a term
if (!derivs.empty()) {
if (derivs.size() > 1)
++stemmultiple;
if (!addAssoc(sdb, stem, derivs)) { if (!addAssoc(sdb, it->first, it->second)) {
return false; return false;
} }
derivs.clear();
}
stem = it->first;
derivs.push_back(it->second);
// cerr << "\n" << stem << " " << it->second;
}
}
if (!derivs.empty()) {
if (derivs.size() > 1)
++stemmultiple;
if (!addAssoc(sdb, stem, derivs)) {
return false;
}
} }
LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n", LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n",
lang.c_str(), cron.secs())); lang.c_str(), cron.secs()));
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", LOGDEB(("Stem map size: %d mult %d const %d no %d \n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst)); assocs.size(), stemmultiple, stemconst, nostem));
wiper.do_it = false; wiper.do_it = false;
return true; return true;
} }