Don't unaccent Japanese + fix bug in unac/split ordering in searchdata
This commit is contained in:
parent
34b3cb3d1c
commit
0821f0cc29
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.30 2008-12-17 14:26:09 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.31 2008-12-19 09:44:39 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -229,7 +229,7 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void stripExpandTerm(bool dont, const string& term, list<string>& exp,
|
||||
void expandTerm(bool dont, const string& term, list<string>& exp,
|
||||
string& sterm);
|
||||
// After splitting entry on whitespace: process non-phrase element
|
||||
void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
|
||||
@ -247,7 +247,7 @@ private:
|
||||
vector<vector<string> > m_groups;
|
||||
};
|
||||
|
||||
/** Unaccent and lowercase term, possibly expand stem and wildcards
|
||||
/** Expand stem and wildcards
|
||||
*
|
||||
* @param nostemexp don't perform stem expansion. This is mainly used to
|
||||
* prevent stem expansion inside phrases (because the user probably
|
||||
@ -257,24 +257,20 @@ private:
|
||||
* capitalized term, or wildcard(s)
|
||||
* @param term input single word
|
||||
* @param exp output expansion list
|
||||
* @param sterm output lower-cased+unaccented version of the input term
|
||||
* (only for stem expansion, not wildcards)
|
||||
* @param sterm output original input term if there were no wildcards
|
||||
*/
|
||||
void StringToXapianQ::stripExpandTerm(bool nostemexp,
|
||||
void StringToXapianQ::expandTerm(bool nostemexp,
|
||||
const string& term,
|
||||
list<string>& exp,
|
||||
string &sterm)
|
||||
{
|
||||
LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n",
|
||||
LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n",
|
||||
term.c_str(), m_stemlang.c_str(), nostemexp));
|
||||
sterm.erase();
|
||||
exp.clear();
|
||||
if (term.empty()) {
|
||||
return;
|
||||
}
|
||||
// term1 is lowercase and without diacritics
|
||||
string term1;
|
||||
dumb_string(term, term1);
|
||||
|
||||
bool haswild = term.find_first_of("*?[") != string::npos;
|
||||
|
||||
@ -299,16 +295,16 @@ void StringToXapianQ::stripExpandTerm(bool nostemexp,
|
||||
|
||||
if (nostemexp && !haswild) {
|
||||
// Neither stemming nor wildcard expansion: just the word
|
||||
sterm = term1;
|
||||
exp.push_front(term1);
|
||||
sterm = term;
|
||||
exp.push_front(term);
|
||||
exp.resize(1);
|
||||
} else {
|
||||
list<TermMatchEntry> l;
|
||||
if (haswild) {
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
|
||||
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
|
||||
} else {
|
||||
sterm = term1;
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
|
||||
sterm = term;
|
||||
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
|
||||
}
|
||||
for (list<TermMatchEntry>::const_iterator it = l.begin();
|
||||
it != l.end(); it++) {
|
||||
@ -365,7 +361,7 @@ void StringToXapianQ::processSimpleSpan(const string& span,
|
||||
{
|
||||
list<string> exp;
|
||||
string sterm; // dumb version of user term
|
||||
stripExpandTerm(false, span, exp, sterm);
|
||||
expandTerm(false, span, exp, sterm);
|
||||
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
||||
addPrefix(exp, m_prefix);
|
||||
// Push either term or OR of stem-expanded set
|
||||
@ -409,7 +405,7 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
|
||||
|
||||
string sterm;
|
||||
list<string>exp;
|
||||
stripExpandTerm(nostemexp, *it, exp, sterm);
|
||||
expandTerm(nostemexp, *it, exp, sterm);
|
||||
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
||||
addPrefix(exp, m_prefix);
|
||||
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||
@ -448,7 +444,7 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
|
||||
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
||||
* count)
|
||||
*/
|
||||
bool StringToXapianQ::processUserString(const string &iq,
|
||||
bool StringToXapianQ::processUserString(const string &_iq,
|
||||
string &ermsg,
|
||||
list<Xapian::Query> &pqueries,
|
||||
const StopList& stops,
|
||||
@ -456,11 +452,19 @@ bool StringToXapianQ::processUserString(const string &iq,
|
||||
bool useNear
|
||||
)
|
||||
{
|
||||
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
||||
LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
|
||||
ermsg.erase();
|
||||
m_terms.clear();
|
||||
m_groups.clear();
|
||||
|
||||
// First unaccent/normalize the input: do it first so that it
|
||||
// happens in the same order as when indexing: unac then split. As
|
||||
// the character count can change during normalisation, this is
|
||||
// especially important for CJK because the artificial CJK split is
|
||||
// based on character counts
|
||||
string iq;
|
||||
dumb_string(_iq, iq);
|
||||
|
||||
// Simple whitespace-split input into user-level words and
|
||||
// double-quoted phrases: word1 word2 "this is a phrase". The text
|
||||
// splitter may further still decide that the resulting "words"
|
||||
|
||||
4812
src/unac/unac.c
4812
src/unac/unac.c
File diff suppressed because it is too large
Load Diff
@ -35,7 +35,7 @@ extern "C" {
|
||||
#define UNAC_BLOCK_SHIFT 4
|
||||
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
|
||||
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
|
||||
#define UNAC_BLOCK_COUNT 315
|
||||
#define UNAC_BLOCK_COUNT 368
|
||||
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
|
||||
/* Generated by builder. Do not modify. End defines */
|
||||
|
||||
@ -478,6 +478,59 @@ extern unsigned short unac_data311[];
|
||||
extern unsigned short unac_data312[];
|
||||
extern unsigned short unac_data313[];
|
||||
extern unsigned short unac_data314[];
|
||||
extern unsigned short unac_data315[];
|
||||
extern unsigned short unac_data316[];
|
||||
extern unsigned short unac_data317[];
|
||||
extern unsigned short unac_data318[];
|
||||
extern unsigned short unac_data319[];
|
||||
extern unsigned short unac_data320[];
|
||||
extern unsigned short unac_data321[];
|
||||
extern unsigned short unac_data322[];
|
||||
extern unsigned short unac_data323[];
|
||||
extern unsigned short unac_data324[];
|
||||
extern unsigned short unac_data325[];
|
||||
extern unsigned short unac_data326[];
|
||||
extern unsigned short unac_data327[];
|
||||
extern unsigned short unac_data328[];
|
||||
extern unsigned short unac_data329[];
|
||||
extern unsigned short unac_data330[];
|
||||
extern unsigned short unac_data331[];
|
||||
extern unsigned short unac_data332[];
|
||||
extern unsigned short unac_data333[];
|
||||
extern unsigned short unac_data334[];
|
||||
extern unsigned short unac_data335[];
|
||||
extern unsigned short unac_data336[];
|
||||
extern unsigned short unac_data337[];
|
||||
extern unsigned short unac_data338[];
|
||||
extern unsigned short unac_data339[];
|
||||
extern unsigned short unac_data340[];
|
||||
extern unsigned short unac_data341[];
|
||||
extern unsigned short unac_data342[];
|
||||
extern unsigned short unac_data343[];
|
||||
extern unsigned short unac_data344[];
|
||||
extern unsigned short unac_data345[];
|
||||
extern unsigned short unac_data346[];
|
||||
extern unsigned short unac_data347[];
|
||||
extern unsigned short unac_data348[];
|
||||
extern unsigned short unac_data349[];
|
||||
extern unsigned short unac_data350[];
|
||||
extern unsigned short unac_data351[];
|
||||
extern unsigned short unac_data352[];
|
||||
extern unsigned short unac_data353[];
|
||||
extern unsigned short unac_data354[];
|
||||
extern unsigned short unac_data355[];
|
||||
extern unsigned short unac_data356[];
|
||||
extern unsigned short unac_data357[];
|
||||
extern unsigned short unac_data358[];
|
||||
extern unsigned short unac_data359[];
|
||||
extern unsigned short unac_data360[];
|
||||
extern unsigned short unac_data361[];
|
||||
extern unsigned short unac_data362[];
|
||||
extern unsigned short unac_data363[];
|
||||
extern unsigned short unac_data364[];
|
||||
extern unsigned short unac_data365[];
|
||||
extern unsigned short unac_data366[];
|
||||
extern unsigned short unac_data367[];
|
||||
/* Generated by builder. Do not modify. End declarations */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user