reorganize code + add boost to phrase element to match boost of original user terms
This commit is contained in:
parent
64ef8d0b81
commit
5d27917c66
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.27 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.28 2008-12-15 09:24:24 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -43,6 +43,8 @@ namespace Rcl {
|
|||||||
typedef vector<SearchDataClause *>::iterator qlist_it_t;
|
typedef vector<SearchDataClause *>::iterator qlist_it_t;
|
||||||
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
|
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
|
||||||
|
|
||||||
|
static const int original_term_wqf_booster = 10;
|
||||||
|
|
||||||
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
||||||
{
|
{
|
||||||
Xapian::Query xq;
|
Xapian::Query xq;
|
||||||
@ -172,8 +174,10 @@ bool SearchData::getTerms(vector<string>& terms,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Splitter callback for breaking a user query string into simple
|
// Splitter callback for breaking a user string into simple terms and
|
||||||
// terms and phrases.
|
// phrases. This is for parts of the user entry which would appear as
|
||||||
|
// a single word because there is no white space inside, but are
|
||||||
|
// actually multiple terms to rcldb (ie term1,term2)
|
||||||
class wsQData : public TextSplitCB {
|
class wsQData : public TextSplitCB {
|
||||||
public:
|
public:
|
||||||
wsQData(const StopList &_stops)
|
wsQData(const StopList &_stops)
|
||||||
@ -191,32 +195,33 @@ class wsQData : public TextSplitCB {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
const StopList &stops;
|
const StopList &stops;
|
||||||
int alltermcount; // Count of terms including stopwords: this is
|
// Count of terms including stopwords: this is for adjusting
|
||||||
// for adjusting phrase/near slack
|
// phrase/near slack
|
||||||
|
int alltermcount;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
// A class used to translate a user compound string (*not* a query
|
||||||
* Translate a user compound string as may be entered in recoll's
|
// language string) as may be entered in any_terms/all_terms search
|
||||||
* search entry fields, ex: [term1 "a phrase" term3] into a xapian
|
// entry fields, ex: [term1 "a phrase" term3] into a xapian query
|
||||||
* query tree.
|
// tree.
|
||||||
* The object keeps track of the query terms and term groups while
|
// The object keeps track of the query terms and term groups while
|
||||||
* translating.
|
// translating.
|
||||||
*/
|
|
||||||
class StringToXapianQ {
|
class StringToXapianQ {
|
||||||
public:
|
public:
|
||||||
StringToXapianQ(Db& db, const string &stmlng, bool boostUser)
|
StringToXapianQ(Db& db, const string& prefix,
|
||||||
: m_db(db), m_stemlang(stmlng), m_doBoostUserTerms(boostUser)
|
const string &stmlng, bool boostUser)
|
||||||
|
: m_db(db), m_prefix(prefix), m_stemlang(stmlng),
|
||||||
|
m_doBoostUserTerms(boostUser)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
bool processUserString(const string &iq,
|
bool processUserString(const string &iq,
|
||||||
const string &prefix,
|
|
||||||
string &ermsg,
|
string &ermsg,
|
||||||
list<Xapian::Query> &pqueries,
|
list<Xapian::Query> &pqueries,
|
||||||
const StopList &stops,
|
const StopList &stops,
|
||||||
int slack = 0, bool useNear = false);
|
int slack = 0, bool useNear = false);
|
||||||
|
// After processing the string: return search terms and term
|
||||||
bool getTerms(vector<string>& terms,
|
// groups (ie: for highlighting)
|
||||||
vector<vector<string> >& groups)
|
bool getTerms(vector<string>& terms, vector<vector<string> >& groups)
|
||||||
{
|
{
|
||||||
terms.insert(terms.end(), m_terms.begin(), m_terms.end());
|
terms.insert(terms.end(), m_terms.begin(), m_terms.end());
|
||||||
groups.insert(groups.end(), m_groups.begin(), m_groups.end());
|
groups.insert(groups.end(), m_groups.begin(), m_groups.end());
|
||||||
@ -226,8 +231,15 @@ public:
|
|||||||
private:
|
private:
|
||||||
void stripExpandTerm(bool dont, const string& term, list<string>& exp,
|
void stripExpandTerm(bool dont, const string& term, list<string>& exp,
|
||||||
string& sterm);
|
string& sterm);
|
||||||
|
// After splitting entry on whitespace: process non-phrase element
|
||||||
|
void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
|
||||||
|
// Process phrase/near element
|
||||||
|
void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
|
||||||
|
list<Xapian::Query> &pqueries,
|
||||||
|
bool useNear, int slack);
|
||||||
|
|
||||||
Db& m_db;
|
Db& m_db;
|
||||||
|
const string& m_prefix;
|
||||||
const string& m_stemlang;
|
const string& m_stemlang;
|
||||||
bool m_doBoostUserTerms;
|
bool m_doBoostUserTerms;
|
||||||
// Single terms and phrases resulting from breaking up text;
|
// Single terms and phrases resulting from breaking up text;
|
||||||
@ -348,25 +360,100 @@ static void addPrefix(list<string>& terms, const string& prefix)
|
|||||||
it->insert(0, prefix);
|
it->insert(0, prefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void StringToXapianQ::processSimpleSpan(const string& span,
|
||||||
|
list<Xapian::Query> &pqueries)
|
||||||
|
{
|
||||||
|
list<string> exp;
|
||||||
|
string sterm; // dumb version of user term
|
||||||
|
stripExpandTerm(false, span, exp, sterm);
|
||||||
|
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
||||||
|
addPrefix(exp, m_prefix);
|
||||||
|
// Push either term or OR of stem-expanded set
|
||||||
|
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
|
||||||
|
|
||||||
|
// If sterm (simplified original user term) is not null, give it a
|
||||||
|
// relevance boost. We do this even if no expansion occurred (else
|
||||||
|
// the non-expanded terms in a term list would end-up with even
|
||||||
|
// less wqf). This does not happen if there are wildcards anywhere
|
||||||
|
// in the search.
|
||||||
|
if (m_doBoostUserTerms && !sterm.empty()) {
|
||||||
|
xq = Xapian::Query(Xapian::Query::OP_OR,
|
||||||
|
xq,
|
||||||
|
Xapian::Query(m_prefix+sterm,
|
||||||
|
original_term_wqf_booster));
|
||||||
|
}
|
||||||
|
pqueries.push_back(xq);
|
||||||
|
}
|
||||||
|
|
||||||
|
// User entry element had several terms: transform into a PHRASE or
|
||||||
|
// NEAR xapian query, the elements of which can themselves be OR
|
||||||
|
// queries if the terms get expanded by stemming or wildcards (we
|
||||||
|
// don't do stemming for PHRASE though)
|
||||||
|
void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
|
||||||
|
list<Xapian::Query> &pqueries,
|
||||||
|
bool useNear, int slack)
|
||||||
|
{
|
||||||
|
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
||||||
|
Xapian::Query::OP_PHRASE;
|
||||||
|
list<Xapian::Query> orqueries;
|
||||||
|
bool hadmultiple = false;
|
||||||
|
vector<vector<string> >groups;
|
||||||
|
|
||||||
|
// Go through the list and perform stem/wildcard expansion for each element
|
||||||
|
for (vector<string>::iterator it = splitData->terms.begin();
|
||||||
|
it != splitData->terms.end(); it++) {
|
||||||
|
// Adjust when we do stem expansion. Not inside phrases, and
|
||||||
|
// some versions of xapian will accept only one OR clause
|
||||||
|
// inside NEAR, all others must be leafs.
|
||||||
|
bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
|
||||||
|
|
||||||
|
string sterm;
|
||||||
|
list<string>exp;
|
||||||
|
stripExpandTerm(nostemexp, *it, exp, sterm);
|
||||||
|
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
||||||
|
addPrefix(exp, m_prefix);
|
||||||
|
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||||
|
exp.begin(), exp.end()));
|
||||||
|
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
||||||
|
if (exp.size() > 1)
|
||||||
|
hadmultiple = true;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
||||||
|
// For phrases, give a relevance boost like we do for original terms
|
||||||
|
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
|
||||||
|
splitData->alltermcount + slack);
|
||||||
|
if (op == Xapian::Query::OP_PHRASE)
|
||||||
|
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
|
||||||
|
original_term_wqf_booster);
|
||||||
|
pqueries.push_back(xq);
|
||||||
|
|
||||||
|
// Add all combinations of NEAR/PHRASE groups to the highlighting data.
|
||||||
|
vector<vector<string> > allcombs;
|
||||||
|
vector<string> comb;
|
||||||
|
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
|
||||||
|
m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Turn string into list of xapian queries. There is little
|
* Turn user entry string (NOT query language) into a list of xapian queries.
|
||||||
* interpretation done on the string (no +term -term or filename:term
|
* We just separate words and phrases, and do wildcard and stemp expansion,
|
||||||
* stuff). We just separate words and phrases, and interpret
|
*
|
||||||
* capitalized terms as wanting no stem expansion.
|
|
||||||
* The final list contains one query for each term or phrase
|
* The final list contains one query for each term or phrase
|
||||||
* - Elements corresponding to a stem-expanded part are an OP_OR
|
* - Elements corresponding to a stem-expanded part are an OP_OR
|
||||||
* composition of the stem-expanded terms (or a single term query).
|
* composition of the stem-expanded terms (or a single term query).
|
||||||
* - Elements corresponding to a phrase are an OP_PHRASE composition of the
|
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
|
||||||
* phrase terms (no stem expansion in this case)
|
* composition of the phrase terms (no stem expansion in this case)
|
||||||
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
||||||
* count)
|
* count)
|
||||||
*/
|
*/
|
||||||
bool StringToXapianQ::processUserString(const string &iq,
|
bool StringToXapianQ::processUserString(const string &iq,
|
||||||
const string &prefix,
|
|
||||||
string &ermsg,
|
string &ermsg,
|
||||||
list<Xapian::Query> &pqueries,
|
list<Xapian::Query> &pqueries,
|
||||||
const StopList& stops,
|
const StopList& stops,
|
||||||
int slack, bool useNear
|
int slack,
|
||||||
|
bool useNear
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
||||||
@ -374,11 +461,12 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
m_terms.clear();
|
m_terms.clear();
|
||||||
m_groups.clear();
|
m_groups.clear();
|
||||||
|
|
||||||
// Split input into user-level words and double-quoted phrases:
|
// Simple whitespace-split input into user-level words and
|
||||||
// word1 word2 "this is a phrase". The text splitter may still
|
// double-quoted phrases: word1 word2 "this is a phrase". The text
|
||||||
// decide that the resulting "words" are really phrases, this
|
// splitter may further still decide that the resulting "words"
|
||||||
// depends on separators: [paul@dom.net] would still be a word
|
// are really phrases, this depends on separators: [paul@dom.net]
|
||||||
// (span), but [about:me] will probably be handled as a phrase.
|
// would still be a word (span), but [about:me] will probably be
|
||||||
|
// handled as a phrase.
|
||||||
list<string> phrases;
|
list<string> phrases;
|
||||||
TextSplit::stringToStrings(iq, phrases);
|
TextSplit::stringToStrings(iq, phrases);
|
||||||
|
|
||||||
@ -387,10 +475,10 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
try {
|
try {
|
||||||
for (list<string>::iterator it = phrases.begin();
|
for (list<string>::iterator it = phrases.begin();
|
||||||
it != phrases.end(); it++) {
|
it != phrases.end(); it++) {
|
||||||
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
|
LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
|
||||||
|
|
||||||
// If there are multiple spans in this element, including
|
// If there are multiple spans in this element, including
|
||||||
// at least one composite, we have to do something
|
// at least one composite, we have to increase the slack
|
||||||
// else a phrase query including a span would fail.
|
// else a phrase query including a span would fail.
|
||||||
// Ex: "term0@term1 term2" is onlyspans-split as:
|
// Ex: "term0@term1 term2" is onlyspans-split as:
|
||||||
// 0 term0@term1 0 12
|
// 0 term0@term1 0 12
|
||||||
@ -419,81 +507,15 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
// used to: splitData = &splitDataW;
|
// used to: splitData = &splitDataW;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGDEB(("strToXapianQ: splitter term count: %d\n",
|
LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
|
||||||
splitData->terms.size()));
|
|
||||||
switch (splitData->terms.size()) {
|
switch (splitData->terms.size()) {
|
||||||
case 0: continue;// ??
|
case 0:
|
||||||
|
continue;// ??
|
||||||
case 1:
|
case 1:
|
||||||
// Just a term. Still may be expanded (by stem or
|
processSimpleSpan(splitData->terms.front(), pqueries);
|
||||||
// wildcard) to an OR list.
|
|
||||||
{
|
|
||||||
string term = splitData->terms.front();
|
|
||||||
list<string> exp;
|
|
||||||
string sterm; // dumb version of user term
|
|
||||||
stripExpandTerm(false, term, exp, sterm);
|
|
||||||
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
|
||||||
// Push either term or OR of stem-expanded set
|
|
||||||
addPrefix(exp, prefix);
|
|
||||||
Xapian::Query xq(Xapian::Query::OP_OR,
|
|
||||||
exp.begin(), exp.end());
|
|
||||||
|
|
||||||
// If sterm is not null, give a relevance boost to
|
|
||||||
// the original term. We do this even if no
|
|
||||||
// expansion occurred (else the non-expanded terms
|
|
||||||
// in a term list would end-up with even less
|
|
||||||
// wqf). This does not happen if there are
|
|
||||||
// wildcards anywhere in the search.
|
|
||||||
if (m_doBoostUserTerms && !sterm.empty()) {
|
|
||||||
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
|
|
||||||
Xapian::Query(prefix+sterm, 10));
|
|
||||||
}
|
|
||||||
pqueries.push_back(xq);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
// Element had several terms: transform into a PHRASE
|
processPhraseOrNear(splitData, pqueries, useNear, slack);
|
||||||
// or NEAR xapian query, the elements of which can
|
|
||||||
// themselves be OR queries if the terms get expanded
|
|
||||||
// by stemming or wildcards (we don't do stemming for
|
|
||||||
// PHRASE though)
|
|
||||||
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
|
||||||
Xapian::Query::OP_PHRASE;
|
|
||||||
list<Xapian::Query> orqueries;
|
|
||||||
bool hadmultiple = false;
|
|
||||||
vector<vector<string> >groups;
|
|
||||||
for (vector<string>::iterator it = splitData->terms.begin();
|
|
||||||
it != splitData->terms.end(); it++) {
|
|
||||||
// Some version of xapian will accept only one OR clause
|
|
||||||
// inside NEAR, all others must be leafs
|
|
||||||
bool nostemexp =
|
|
||||||
op == Xapian::Query::OP_PHRASE || hadmultiple;
|
|
||||||
|
|
||||||
string sterm;
|
|
||||||
list<string>exp;
|
|
||||||
stripExpandTerm(nostemexp, *it, exp, sterm);
|
|
||||||
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
|
||||||
addPrefix(exp, prefix);
|
|
||||||
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
|
||||||
exp.begin(), exp.end()));
|
|
||||||
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
|
||||||
if (exp.size() > 1)
|
|
||||||
hadmultiple = true;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
pqueries.push_back(Xapian::Query(op,
|
|
||||||
orqueries.begin(),
|
|
||||||
orqueries.end(),
|
|
||||||
splitData->alltermcount
|
|
||||||
+ slack));
|
|
||||||
// Add NEAR/PHRASE groups to the highlighting data. Must
|
|
||||||
// push all combinations
|
|
||||||
vector<vector<string> > allcombs;
|
|
||||||
vector<string> comb;
|
|
||||||
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
|
|
||||||
m_groups.insert(m_groups.end(), allcombs.begin(),
|
|
||||||
allcombs.end());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
@ -547,8 +569,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
||||||
(m_parentSearch == 0 && !m_haveWildCards);
|
(m_parentSearch == 0 && !m_haveWildCards);
|
||||||
|
|
||||||
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
|
StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm);
|
||||||
if (!tr.processUserString(m_text, prefix, m_reason, pqueries,
|
if (!tr.processUserString(m_text, m_reason, pqueries,
|
||||||
db.getStopList()))
|
db.getStopList()))
|
||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
@ -617,8 +639,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
|||||||
}
|
}
|
||||||
string s = string("\"") + m_text + string("\"");
|
string s = string("\"") + m_text + string("\"");
|
||||||
bool useNear = (m_tp == SCLT_NEAR);
|
bool useNear = (m_tp == SCLT_NEAR);
|
||||||
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
|
StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm);
|
||||||
if (!tr.processUserString(s, prefix, m_reason, pqueries, db.getStopList(),
|
if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(),
|
||||||
m_slack, useNear))
|
m_slack, useNear))
|
||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user