Don't unaccent Japanese + fix bug in unac/split ordering in searchdata

This commit is contained in:
dockes 2008-12-19 09:44:39 +00:00
parent 34b3cb3d1c
commit 0821f0cc29
3 changed files with 1332 additions and 3577 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.30 2008-12-17 14:26:09 dockes Exp $ (C) 2006 J.F.Dockes"; static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.31 2008-12-19 09:44:39 dockes Exp $ (C) 2006 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -229,7 +229,7 @@ public:
} }
private: private:
void stripExpandTerm(bool dont, const string& term, list<string>& exp, void expandTerm(bool dont, const string& term, list<string>& exp,
string& sterm); string& sterm);
// After splitting entry on whitespace: process non-phrase element // After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries); void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
@ -247,7 +247,7 @@ private:
vector<vector<string> > m_groups; vector<vector<string> > m_groups;
}; };
/** Unaccent and lowercase term, possibly expand stem and wildcards /** Expand stem and wildcards
* *
* @param nostemexp don't perform stem expansion. This is mainly used to * @param nostemexp don't perform stem expansion. This is mainly used to
* prevent stem expansion inside phrases (because the user probably * prevent stem expansion inside phrases (because the user probably
@ -257,24 +257,20 @@ private:
* capitalized term, or wildcard(s) * capitalized term, or wildcard(s)
* @param term input single word * @param term input single word
* @param exp output expansion list * @param exp output expansion list
* @param sterm output lower-cased+unaccented version of the input term * @param sterm output original input term if there were no wildcards
* (only for stem expansion, not wildcards)
*/ */
void StringToXapianQ::stripExpandTerm(bool nostemexp, void StringToXapianQ::expandTerm(bool nostemexp,
const string& term, const string& term,
list<string>& exp, list<string>& exp,
string &sterm) string &sterm)
{ {
LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n", LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n",
term.c_str(), m_stemlang.c_str(), nostemexp)); term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase(); sterm.erase();
exp.clear(); exp.clear();
if (term.empty()) { if (term.empty()) {
return; return;
} }
// term1 is lowercase and without diacritics
string term1;
dumb_string(term, term1);
bool haswild = term.find_first_of("*?[") != string::npos; bool haswild = term.find_first_of("*?[") != string::npos;
@ -299,16 +295,16 @@ void StringToXapianQ::stripExpandTerm(bool nostemexp,
if (nostemexp && !haswild) { if (nostemexp && !haswild) {
// Neither stemming nor wildcard expansion: just the word // Neither stemming nor wildcard expansion: just the word
sterm = term1; sterm = term;
exp.push_front(term1); exp.push_front(term);
exp.resize(1); exp.resize(1);
} else { } else {
list<TermMatchEntry> l; list<TermMatchEntry> l;
if (haswild) { if (haswild) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l); m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
} else { } else {
sterm = term1; sterm = term;
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l); m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
} }
for (list<TermMatchEntry>::const_iterator it = l.begin(); for (list<TermMatchEntry>::const_iterator it = l.begin();
it != l.end(); it++) { it != l.end(); it++) {
@ -365,7 +361,7 @@ void StringToXapianQ::processSimpleSpan(const string& span,
{ {
list<string> exp; list<string> exp;
string sterm; // dumb version of user term string sterm; // dumb version of user term
stripExpandTerm(false, span, exp, sterm); expandTerm(false, span, exp, sterm);
m_terms.insert(m_terms.end(), exp.begin(), exp.end()); m_terms.insert(m_terms.end(), exp.begin(), exp.end());
addPrefix(exp, m_prefix); addPrefix(exp, m_prefix);
// Push either term or OR of stem-expanded set // Push either term or OR of stem-expanded set
@ -409,7 +405,7 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
string sterm; string sterm;
list<string>exp; list<string>exp;
stripExpandTerm(nostemexp, *it, exp, sterm); expandTerm(nostemexp, *it, exp, sterm);
groups.push_back(vector<string>(exp.begin(), exp.end())); groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, m_prefix); addPrefix(exp, m_prefix);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
@ -448,7 +444,7 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
* @return the subquery count (either or'd stem-expanded terms or phrase word * @return the subquery count (either or'd stem-expanded terms or phrase word
* count) * count)
*/ */
bool StringToXapianQ::processUserString(const string &iq, bool StringToXapianQ::processUserString(const string &_iq,
string &ermsg, string &ermsg,
list<Xapian::Query> &pqueries, list<Xapian::Query> &pqueries,
const StopList& stops, const StopList& stops,
@ -456,11 +452,19 @@ bool StringToXapianQ::processUserString(const string &iq,
bool useNear bool useNear
) )
{ {
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str())); LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
ermsg.erase(); ermsg.erase();
m_terms.clear(); m_terms.clear();
m_groups.clear(); m_groups.clear();
// Unaccent/normalize the input first, so that it happens in the
// same order as when indexing: unac, then split. As the character
// count can change during normalisation, this is especially
// important for CJK, because the artificial CJK split is based on
// character counts
string iq;
dumb_string(_iq, iq);
// Simple whitespace-split input into user-level words and // Simple whitespace-split input into user-level words and
// double-quoted phrases: word1 word2 "this is a phrase". The text // double-quoted phrases: word1 word2 "this is a phrase". The text
// splitter may further still decide that the resulting "words" // splitter may further still decide that the resulting "words"

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ extern "C" {
#define UNAC_BLOCK_SHIFT 4 #define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1) #define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT) #define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 315 #define UNAC_BLOCK_COUNT 368
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT) #define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */ /* Generated by builder. Do not modify. End defines */
@ -478,6 +478,59 @@ extern unsigned short unac_data311[];
extern unsigned short unac_data312[]; extern unsigned short unac_data312[];
extern unsigned short unac_data313[]; extern unsigned short unac_data313[];
extern unsigned short unac_data314[]; extern unsigned short unac_data314[];
extern unsigned short unac_data315[];
extern unsigned short unac_data316[];
extern unsigned short unac_data317[];
extern unsigned short unac_data318[];
extern unsigned short unac_data319[];
extern unsigned short unac_data320[];
extern unsigned short unac_data321[];
extern unsigned short unac_data322[];
extern unsigned short unac_data323[];
extern unsigned short unac_data324[];
extern unsigned short unac_data325[];
extern unsigned short unac_data326[];
extern unsigned short unac_data327[];
extern unsigned short unac_data328[];
extern unsigned short unac_data329[];
extern unsigned short unac_data330[];
extern unsigned short unac_data331[];
extern unsigned short unac_data332[];
extern unsigned short unac_data333[];
extern unsigned short unac_data334[];
extern unsigned short unac_data335[];
extern unsigned short unac_data336[];
extern unsigned short unac_data337[];
extern unsigned short unac_data338[];
extern unsigned short unac_data339[];
extern unsigned short unac_data340[];
extern unsigned short unac_data341[];
extern unsigned short unac_data342[];
extern unsigned short unac_data343[];
extern unsigned short unac_data344[];
extern unsigned short unac_data345[];
extern unsigned short unac_data346[];
extern unsigned short unac_data347[];
extern unsigned short unac_data348[];
extern unsigned short unac_data349[];
extern unsigned short unac_data350[];
extern unsigned short unac_data351[];
extern unsigned short unac_data352[];
extern unsigned short unac_data353[];
extern unsigned short unac_data354[];
extern unsigned short unac_data355[];
extern unsigned short unac_data356[];
extern unsigned short unac_data357[];
extern unsigned short unac_data358[];
extern unsigned short unac_data359[];
extern unsigned short unac_data360[];
extern unsigned short unac_data361[];
extern unsigned short unac_data362[];
extern unsigned short unac_data363[];
extern unsigned short unac_data364[];
extern unsigned short unac_data365[];
extern unsigned short unac_data366[];
extern unsigned short unac_data367[];
/* Generated by builder. Do not modify. End declarations */ /* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus #ifdef __cplusplus