simplified stemdb-creating code
This commit is contained in:
parent
ee9dbda9fc
commit
688121d2f7
@ -130,22 +130,18 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
|||||||
// Else, we add an entry to the multimap.
|
// Else, we add an entry to the multimap.
|
||||||
// At the end, we only save stem-terms associations with several terms, the
|
// At the end, we only save stem-terms associations with several terms, the
|
||||||
// others are not useful
|
// others are not useful
|
||||||
// Note: a map<string, list<string> > would probably be more efficient
|
// Note: a map<string, vector<string> > would probably be more efficient
|
||||||
multimap<string, string> assocs;
|
map<string, vector<string> > assocs;
|
||||||
// Statistics
|
// Statistics
|
||||||
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
||||||
int stemconst=0; // Stem == term
|
int stemconst=0; // Stem == term
|
||||||
int stemdiff=0; // Count of all different stems
|
|
||||||
int stemmultiple = 0; // Count of stems with multiple derivatives
|
int stemmultiple = 0; // Count of stems with multiple derivatives
|
||||||
try {
|
try {
|
||||||
Xapian::Stem stemmer(lang);
|
Xapian::Stem stemmer(lang);
|
||||||
Xapian::TermIterator it;
|
Xapian::TermIterator it;
|
||||||
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
|
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
|
||||||
// Deciding if we try to stem the term.
|
// If the term has any non-lowercase 7bit char (that is,
|
||||||
|
// numbers, capitals and punctuation) dont stem.
|
||||||
// If it has any
|
|
||||||
// non-lowercase 7bit char (that is, numbers, capitals and
|
|
||||||
// punctuation) dont.
|
|
||||||
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
||||||
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
||||||
++nostem;
|
++nostem;
|
||||||
@ -158,10 +154,11 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
|||||||
// We're still sending all other multibyte utf-8 chars to
|
// We're still sending all other multibyte utf-8 chars to
|
||||||
// the stemmer, which is not too well defined for
|
// the stemmer, which is not too well defined for
|
||||||
// xapian<1.0 (very obsolete now), but seems to work
|
// xapian<1.0 (very obsolete now), but seems to work
|
||||||
// anyway. There shouldnt be too many in any case because
|
// anyway. There shouldn't be too many in any case because
|
||||||
// accents are stripped at this point. Effect of stripping
|
// accents are stripped at this point.
|
||||||
// accents on stemming unknown, hopefuly none, there is
|
// The effect of stripping accents on stemming is not good,
|
||||||
// nothing we can do about it.
|
// (e.g: in french partimes -> partim, parti^mes -> part)
|
||||||
|
// but fixing the issue would be complicated.
|
||||||
Utf8Iter utfit(*it);
|
Utf8Iter utfit(*it);
|
||||||
if (TextSplit::isCJK(*utfit)) {
|
if (TextSplit::isCJK(*utfit)) {
|
||||||
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
||||||
@ -175,7 +172,7 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
|||||||
++stemconst;
|
++stemconst;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
assocs.insert(pair<string,string>(stem, *it));
|
assocs[stem].push_back(*it);
|
||||||
}
|
}
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
||||||
@ -213,51 +210,25 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enter pseud-docs in db by walking the multimap.
|
// Enter pseud-docs in db by walking the map.
|
||||||
string stem;
|
for (map<string, vector<string> >::const_iterator it = assocs.begin();
|
||||||
vector<string> derivs;
|
|
||||||
for (multimap<string,string>::const_iterator it = assocs.begin();
|
|
||||||
it != assocs.end(); it++) {
|
it != assocs.end(); it++) {
|
||||||
if (stem == it->first) {
|
LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str()));
|
||||||
// Staying with same stem
|
// We need an entry even if there is only one derivative
|
||||||
derivs.push_back(it->second);
|
// so that it is possible to search by entering the stem
|
||||||
// cerr << " " << it->second << endl;
|
// even if it doesnt exist as a term
|
||||||
} else {
|
if (it->second.size() > 1)
|
||||||
// Changing stems
|
++stemmultiple;
|
||||||
++stemdiff;
|
|
||||||
LOGDEB2(("createStemDb: stem [%s]\n", stem.c_str()));
|
|
||||||
|
|
||||||
// We need an entry even if there is only one derivative
|
if (!addAssoc(sdb, it->first, it->second)) {
|
||||||
// so that it is possible to search by entering the stem
|
return false;
|
||||||
// even if it doesnt exist as a term
|
}
|
||||||
if (!derivs.empty()) {
|
|
||||||
|
|
||||||
if (derivs.size() > 1)
|
|
||||||
++stemmultiple;
|
|
||||||
|
|
||||||
if (!addAssoc(sdb, stem, derivs)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
derivs.clear();
|
|
||||||
}
|
|
||||||
stem = it->first;
|
|
||||||
derivs.push_back(it->second);
|
|
||||||
// cerr << "\n" << stem << " " << it->second;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!derivs.empty()) {
|
|
||||||
if (derivs.size() > 1)
|
|
||||||
++stemmultiple;
|
|
||||||
|
|
||||||
if (!addAssoc(sdb, stem, derivs)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n",
|
LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n",
|
||||||
lang.c_str(), cron.secs()));
|
lang.c_str(), cron.secs()));
|
||||||
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
|
LOGDEB(("Stem map size: %d mult %d const %d no %d \n",
|
||||||
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
assocs.size(), stemmultiple, stemconst, nostem));
|
||||||
wiper.do_it = false;
|
wiper.do_it = false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user