reorganize code + add boost to phrase element to match boost of original user terms

This commit is contained in:
dockes 2008-12-15 09:24:24 +00:00
parent 64ef8d0b81
commit 5d27917c66

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.27 2008-12-05 11:09:31 dockes Exp $ (C) 2006 J.F.Dockes";
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.28 2008-12-15 09:24:24 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -43,6 +43,8 @@ namespace Rcl {
typedef vector<SearchDataClause *>::iterator qlist_it_t;
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
static const int original_term_wqf_booster = 10;
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
Xapian::Query xq;
@ -172,8 +174,10 @@ bool SearchData::getTerms(vector<string>& terms,
return true;
}
// Splitter callback for breaking a user query string into simple
// terms and phrases.
// Splitter callback for breaking a user string into simple terms and
// phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are
// actually multiple terms to rcldb (ie term1,term2)
class wsQData : public TextSplitCB {
public:
wsQData(const StopList &_stops)
@ -191,32 +195,33 @@ class wsQData : public TextSplitCB {
return true;
}
const StopList &stops;
int alltermcount; // Count of terms including stopwords: this is
// for adjusting phrase/near slack
// Count of terms including stopwords: this is for adjusting
// phrase/near slack
int alltermcount;
};
/**
* Translate a user compound string as may be entered in recoll's
* search entry fields, ex: [term1 "a phrase" term3] into a xapian
* query tree.
* The object keeps track of the query terms and term groups while
* translating.
*/
// A class used to translate a user compound string (*not* a query
// language string) as may be entered in any_terms/all_terms search
// entry fields, ex: [term1 "a phrase" term3] into a xapian query
// tree.
// The object keeps track of the query terms and term groups while
// translating.
class StringToXapianQ {
public:
StringToXapianQ(Db& db, const string &stmlng, bool boostUser)
: m_db(db), m_stemlang(stmlng), m_doBoostUserTerms(boostUser)
StringToXapianQ(Db& db, const string& prefix,
const string &stmlng, bool boostUser)
: m_db(db), m_prefix(prefix), m_stemlang(stmlng),
m_doBoostUserTerms(boostUser)
{ }
bool processUserString(const string &iq,
const string &prefix,
string &ermsg,
list<Xapian::Query> &pqueries,
const StopList &stops,
int slack = 0, bool useNear = false);
bool getTerms(vector<string>& terms,
vector<vector<string> >& groups)
// After processing the string: return search terms and term
// groups (ie: for highlighting)
bool getTerms(vector<string>& terms, vector<vector<string> >& groups)
{
terms.insert(terms.end(), m_terms.begin(), m_terms.end());
groups.insert(groups.end(), m_groups.begin(), m_groups.end());
@ -226,8 +231,15 @@ public:
private:
void stripExpandTerm(bool dont, const string& term, list<string>& exp,
string& sterm);
// After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
// Process phrase/near element
void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
list<Xapian::Query> &pqueries,
bool useNear, int slack);
Db& m_db;
const string& m_prefix;
const string& m_stemlang;
bool m_doBoostUserTerms;
// Single terms and phrases resulting from breaking up text;
@ -348,25 +360,100 @@ static void addPrefix(list<string>& terms, const string& prefix)
it->insert(0, prefix);
}
void StringToXapianQ::processSimpleSpan(const string& span,
list<Xapian::Query> &pqueries)
{
list<string> exp;
string sterm; // dumb version of user term
stripExpandTerm(false, span, exp, sterm);
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
addPrefix(exp, m_prefix);
// Push either term or OR of stem-expanded set
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
// If sterm (simplified original user term) is not null, give it a
// relevance boost. We do this even if no expansion occurred (else
// the non-expanded terms in a term list would end-up with even
// less wqf). This does not happen if there are wildcards anywhere
// in the search.
if (m_doBoostUserTerms && !sterm.empty()) {
xq = Xapian::Query(Xapian::Query::OP_OR,
xq,
Xapian::Query(m_prefix+sterm,
original_term_wqf_booster));
}
pqueries.push_back(xq);
}
// User entry element had several terms: transform into a PHRASE or
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
list<Xapian::Query> &pqueries,
bool useNear, int slack)
{
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE;
list<Xapian::Query> orqueries;
bool hadmultiple = false;
vector<vector<string> >groups;
// Go through the list and perform stem/wildcard expansion for each element
for (vector<string>::iterator it = splitData->terms.begin();
it != splitData->terms.end(); it++) {
// Adjust when we do stem expansion. Not inside phrases, and
// some versions of xapian will accept only one OR clause
// inside NEAR, all others must be leafs.
bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
string sterm;
list<string>exp;
stripExpandTerm(nostemexp, *it, exp, sterm);
groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, m_prefix);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end()));
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
if (exp.size() > 1)
hadmultiple = true;
#endif
}
// Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
splitData->alltermcount + slack);
if (op == Xapian::Query::OP_PHRASE)
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
original_term_wqf_booster);
pqueries.push_back(xq);
// Add all combinations of NEAR/PHRASE groups to the highlighting data.
vector<vector<string> > allcombs;
vector<string> comb;
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
}
/**
* Turn string into list of xapian queries. There is little
* interpretation done on the string (no +term -term or filename:term
* stuff). We just separate words and phrases, and interpret
* capitalized terms as wanting no stem expansion.
* Turn user entry string (NOT query language) into a list of xapian queries.
* We just separate words and phrases, and do wildcard and stemp expansion,
*
* The final list contains one query for each term or phrase
* - Elements corresponding to a stem-expanded part are an OP_OR
* composition of the stem-expanded terms (or a single term query).
* - Elements corresponding to a phrase are an OP_PHRASE composition of the
* phrase terms (no stem expansion in this case)
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
* composition of the phrase terms (no stem expansion in this case)
* @return the subquery count (either or'd stem-expanded terms or phrase word
* count)
*/
bool StringToXapianQ::processUserString(const string &iq,
const string &prefix,
string &ermsg,
list<Xapian::Query> &pqueries,
const StopList& stops,
int slack, bool useNear
int slack,
bool useNear
)
{
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
@ -374,11 +461,12 @@ bool StringToXapianQ::processUserString(const string &iq,
m_terms.clear();
m_groups.clear();
// Split input into user-level words and double-quoted phrases:
// word1 word2 "this is a phrase". The text splitter may still
// decide that the resulting "words" are really phrases, this
// depends on separators: [paul@dom.net] would still be a word
// (span), but [about:me] will probably be handled as a phrase.
// Simple whitespace-split input into user-level words and
// double-quoted phrases: word1 word2 "this is a phrase". The text
// splitter may further still decide that the resulting "words"
// are really phrases, this depends on separators: [paul@dom.net]
// would still be a word (span), but [about:me] will probably be
// handled as a phrase.
list<string> phrases;
TextSplit::stringToStrings(iq, phrases);
@ -387,10 +475,10 @@ bool StringToXapianQ::processUserString(const string &iq,
try {
for (list<string>::iterator it = phrases.begin();
it != phrases.end(); it++) {
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
// If there are multiple spans in this element, including
// at least one composite, we have to do something
// at least one composite, we have to increase the slack
// else a phrase query including a span would fail.
// Ex: "term0@term1 term2" is onlyspans-split as:
// 0 term0@term1 0 12
@ -419,81 +507,15 @@ bool StringToXapianQ::processUserString(const string &iq,
// used to: splitData = &splitDataW;
}
LOGDEB(("strToXapianQ: splitter term count: %d\n",
splitData->terms.size()));
LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
switch (splitData->terms.size()) {
case 0: continue;// ??
case 0:
continue;// ??
case 1:
// Just a term. Still may be expanded (by stem or
// wildcard) to an OR list.
{
string term = splitData->terms.front();
list<string> exp;
string sterm; // dumb version of user term
stripExpandTerm(false, term, exp, sterm);
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
// Push either term or OR of stem-expanded set
addPrefix(exp, prefix);
Xapian::Query xq(Xapian::Query::OP_OR,
exp.begin(), exp.end());
// If sterm is not null, give a relevance boost to
// the original term. We do this even if no
// expansion occurred (else the non-expanded terms
// in a term list would end-up with even less
// wqf). This does not happen if there are
// wildcards anywhere in the search.
if (m_doBoostUserTerms && !sterm.empty()) {
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
Xapian::Query(prefix+sterm, 10));
}
pqueries.push_back(xq);
}
processSimpleSpan(splitData->terms.front(), pqueries);
break;
default:
// Element had several terms: transform into a PHRASE
// or NEAR xapian query, the elements of which can
// themselves be OR queries if the terms get expanded
// by stemming or wildcards (we don't do stemming for
// PHRASE though)
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE;
list<Xapian::Query> orqueries;
bool hadmultiple = false;
vector<vector<string> >groups;
for (vector<string>::iterator it = splitData->terms.begin();
it != splitData->terms.end(); it++) {
// Some version of xapian will accept only one OR clause
// inside NEAR, all others must be leafs
bool nostemexp =
op == Xapian::Query::OP_PHRASE || hadmultiple;
string sterm;
list<string>exp;
stripExpandTerm(nostemexp, *it, exp, sterm);
groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, prefix);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end()));
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
if (exp.size() > 1)
hadmultiple = true;
#endif
}
pqueries.push_back(Xapian::Query(op,
orqueries.begin(),
orqueries.end(),
splitData->alltermcount
+ slack));
// Add NEAR/PHRASE groups to the highlighting data. Must
// push all combinations
vector<vector<string> > allcombs;
vector<string> comb;
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
m_groups.insert(m_groups.end(), allcombs.begin(),
allcombs.end());
processPhraseOrNear(splitData, pqueries, useNear, slack);
}
}
} catch (const Xapian::Error &e) {
@ -547,8 +569,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
(m_parentSearch == 0 && !m_haveWildCards);
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(m_text, prefix, m_reason, pqueries,
StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(m_text, m_reason, pqueries,
db.getStopList()))
return false;
if (pqueries.empty()) {
@ -617,8 +639,8 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
}
string s = string("\"") + m_text + string("\"");
bool useNear = (m_tp == SCLT_NEAR);
StringToXapianQ tr(db, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(s, prefix, m_reason, pqueries, db.getStopList(),
StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(),
m_slack, useNear))
return false;
if (pqueries.empty()) {