Don't unaccent Japanese + fix bug in unac/split ordering in searchdata

This commit is contained in:
dockes 2008-12-19 09:44:39 +00:00
parent 34b3cb3d1c
commit 0821f0cc29
3 changed files with 1332 additions and 3577 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.30 2008-12-17 14:26:09 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.31 2008-12-19 09:44:39 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -229,7 +229,7 @@ public:
}
private:
void stripExpandTerm(bool dont, const string& term, list<string>& exp,
void expandTerm(bool dont, const string& term, list<string>& exp,
string& sterm);
// After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
@ -247,7 +247,7 @@ private:
vector<vector<string> > m_groups;
};
/** Unaccent and lowercase term, possibly expand stem and wildcards
/** Expand stem and wildcards
*
* @param nostemexp don't perform stem expansion. This is mainly used to
* prevent stem expansion inside phrases (because the user probably
@ -257,24 +257,20 @@ private:
* capitalized term, or wildcard(s)
* @param term input single word
* @param exp output expansion list
* @param sterm output lower-cased+unaccented version of the input term
* (only for stem expansion, not wildcards)
* @param sterm output original input term if there were no wildcards
*/
void StringToXapianQ::stripExpandTerm(bool nostemexp,
void StringToXapianQ::expandTerm(bool nostemexp,
const string& term,
list<string>& exp,
string &sterm)
{
LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n",
LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n",
term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase();
exp.clear();
if (term.empty()) {
return;
}
// term1 is lowercase and without diacritics
string term1;
dumb_string(term, term1);
bool haswild = term.find_first_of("*?[") != string::npos;
@ -299,16 +295,16 @@ void StringToXapianQ::stripExpandTerm(bool nostemexp,
if (nostemexp && !haswild) {
// Neither stemming nor wildcard expansion: just the word
sterm = term1;
exp.push_front(term1);
sterm = term;
exp.push_front(term);
exp.resize(1);
} else {
list<TermMatchEntry> l;
if (haswild) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
} else {
sterm = term1;
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
sterm = term;
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
}
for (list<TermMatchEntry>::const_iterator it = l.begin();
it != l.end(); it++) {
@ -365,7 +361,7 @@ void StringToXapianQ::processSimpleSpan(const string& span,
{
list<string> exp;
string sterm; // dumb version of user term
stripExpandTerm(false, span, exp, sterm);
expandTerm(false, span, exp, sterm);
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
addPrefix(exp, m_prefix);
// Push either term or OR of stem-expanded set
@ -409,7 +405,7 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
string sterm;
list<string>exp;
stripExpandTerm(nostemexp, *it, exp, sterm);
expandTerm(nostemexp, *it, exp, sterm);
groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, m_prefix);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
@ -448,7 +444,7 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
* @return the subquery count (either or'd stem-expanded terms or phrase word
* count)
*/
bool StringToXapianQ::processUserString(const string &iq,
bool StringToXapianQ::processUserString(const string &_iq,
string &ermsg,
list<Xapian::Query> &pqueries,
const StopList& stops,
@ -456,11 +452,19 @@ bool StringToXapianQ::processUserString(const string &iq,
bool useNear
)
{
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
ermsg.erase();
m_terms.clear();
m_groups.clear();
// First unaccent/normalize the input: do it first so that it
// happens in the same order as when indexing: unac then split. As
// the character count can change during normalisation, this is
// especially important for CJK because the artificial CJK split is
// based on character counts.
string iq;
dumb_string(_iq, iq);
// Simple whitespace-split input into user-level words and
// double-quoted phrases: word1 word2 "this is a phrase". The text
// splitter may further still decide that the resulting "words"

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ extern "C" {
#define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 315
#define UNAC_BLOCK_COUNT 368
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */
@ -478,6 +478,59 @@ extern unsigned short unac_data311[];
extern unsigned short unac_data312[];
extern unsigned short unac_data313[];
extern unsigned short unac_data314[];
extern unsigned short unac_data315[];
extern unsigned short unac_data316[];
extern unsigned short unac_data317[];
extern unsigned short unac_data318[];
extern unsigned short unac_data319[];
extern unsigned short unac_data320[];
extern unsigned short unac_data321[];
extern unsigned short unac_data322[];
extern unsigned short unac_data323[];
extern unsigned short unac_data324[];
extern unsigned short unac_data325[];
extern unsigned short unac_data326[];
extern unsigned short unac_data327[];
extern unsigned short unac_data328[];
extern unsigned short unac_data329[];
extern unsigned short unac_data330[];
extern unsigned short unac_data331[];
extern unsigned short unac_data332[];
extern unsigned short unac_data333[];
extern unsigned short unac_data334[];
extern unsigned short unac_data335[];
extern unsigned short unac_data336[];
extern unsigned short unac_data337[];
extern unsigned short unac_data338[];
extern unsigned short unac_data339[];
extern unsigned short unac_data340[];
extern unsigned short unac_data341[];
extern unsigned short unac_data342[];
extern unsigned short unac_data343[];
extern unsigned short unac_data344[];
extern unsigned short unac_data345[];
extern unsigned short unac_data346[];
extern unsigned short unac_data347[];
extern unsigned short unac_data348[];
extern unsigned short unac_data349[];
extern unsigned short unac_data350[];
extern unsigned short unac_data351[];
extern unsigned short unac_data352[];
extern unsigned short unac_data353[];
extern unsigned short unac_data354[];
extern unsigned short unac_data355[];
extern unsigned short unac_data356[];
extern unsigned short unac_data357[];
extern unsigned short unac_data358[];
extern unsigned short unac_data359[];
extern unsigned short unac_data360[];
extern unsigned short unac_data361[];
extern unsigned short unac_data362[];
extern unsigned short unac_data363[];
extern unsigned short unac_data364[];
extern unsigned short unac_data365[];
extern unsigned short unac_data366[];
extern unsigned short unac_data367[];
/* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus