Don't send CJK terms to stemmers. Sending them didn't seem to hurt, but did not make sense.

This commit is contained in:
Jean-Francois Dockes 2012-03-22 15:09:40 +01:00
parent 988ec0f937
commit 14042528bd

View File

@ -31,6 +31,8 @@
#include "pathut.h"
#include "debuglog.h"
#include "smallut.h"
#include "utf8iter.h"
#include "textsplit.h"
using namespace std;
@ -139,12 +141,11 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
Xapian::Stem stemmer(lang);
Xapian::TermIterator it;
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
// Deciding if we try to stem the term. If it has any
// Deciding if we try to stem the term.
// If it has any
// non-lowercase 7bit char (that is, numbers, capitals and
// punctuation) dont. We're still sending all multibyte
// utf-8 chars to the stemmer, which is not too well
// defined for xapian < 1.0, but seems to work anyway. We don't
// try to look for multibyte non alphabetic data.
// punctuation) don't.
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
++nostem;
@ -152,6 +153,21 @@ bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
(*it).c_str(), *sit));
continue;
}
// Detect and skip CJK terms.
// We're still sending all other multibyte utf-8 chars to
// the stemmer, which is not too well defined for
// xapian<1.0 (very obsolete now), but seems to work
// anyway. There shouldn't be too many in any case because
// accents are stripped at this point. The effect of stripping
// accents on stemming is unknown, hopefully none; there is
// nothing we can do about it.
Utf8Iter utfit(*it);
if (TextSplit::isCJK(*utfit)) {
// LOGDEB(("stemskipped: Skipping CJK\n"));
continue;
}
string stem = stemmer(*it);
LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
stem.c_str()));