better wildcards handling. Tuning of user term boosting
This commit is contained in:
parent
5a9b90d26c
commit
2c3dc3e54c
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.10 2007-01-19 10:23:26 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.11 2007-01-25 15:50:54 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -94,6 +94,8 @@ bool SearchData::addClause(SearchDataClause* cl)
|
||||
m_reason = "No Negative (AND_NOT) clauses allowed in OR queries";
|
||||
return false;
|
||||
}
|
||||
cl->setParent(this);
|
||||
m_haveWildCards = m_haveWildCards || cl->m_haveWildCards;
|
||||
m_query.push_back(cl);
|
||||
return true;
|
||||
}
|
||||
@ -142,17 +144,20 @@ class wsQData : public TextSplitCB {
|
||||
}
|
||||
};
|
||||
|
||||
/// Translate user string (ie: term1 "a phrase" term3) into a xapian
|
||||
/// query tree
|
||||
// This used to be a static function, but we couldn't just keep adding
|
||||
// parameters to the interface!
|
||||
/**
|
||||
* Translate a user compound string as may be entered in recoll's
|
||||
* search entry fields, ex: [term1 "a phrase" term3] into a xapian
|
||||
* query tree.
|
||||
* The object keeps track of the query terms and term groups while
|
||||
* translating.
|
||||
*/
|
||||
class StringToXapianQ {
|
||||
public:
|
||||
StringToXapianQ(Db& db, const string &stmlng)
|
||||
: m_db(db), m_stemlang(stmlng)
|
||||
StringToXapianQ(Db& db, const string &stmlng, bool boostUser)
|
||||
: m_db(db), m_stemlang(stmlng), m_doBoostUserTerms(boostUser)
|
||||
{ }
|
||||
|
||||
bool translate(const string &iq,
|
||||
bool processUserString(const string &iq,
|
||||
const string &prefix,
|
||||
string &ermsg,
|
||||
list<Xapian::Query> &pqueries,
|
||||
@ -167,12 +172,12 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void maybeStemExp(bool dont, const string& term, list<string>& exp,
|
||||
void stripExpandTerm(bool dont, const string& term, list<string>& exp,
|
||||
string& sterm);
|
||||
|
||||
Db& m_db;
|
||||
const string& m_stemlang;
|
||||
|
||||
bool m_doBoostUserTerms;
|
||||
// Single terms and phrases resulting from breaking up text;
|
||||
vector<string> m_terms;
|
||||
vector<vector<string> > m_groups;
|
||||
@ -181,31 +186,33 @@ private:
|
||||
/** Unaccent and lowercase term, possibly expand stem and wildcards
|
||||
*
|
||||
* @param nostemexp don't perform stem expansion. This is mainly used to
|
||||
* prevent stem expansion inside phrases. 2 other factors can turn
|
||||
* stem expansion off: a null stemlang, resulting from a global user
|
||||
* preference, or a capitalized term.
|
||||
* prevent stem expansion inside phrases (because the user probably
|
||||
* does not expect it). This does NOT prevent wild card expansion.
|
||||
* Other factors than nostemexp can prevent stem expansion:
|
||||
* a null stemlang, resulting from a global user preference, a
|
||||
* capitalized term, or wildcard(s)
|
||||
* @param term input single word
|
||||
* @param exp output expansion list
|
||||
* @param sterm output lower-cased+unaccented version of the input term
|
||||
* (only if stem expansion actually occured, else empty)
|
||||
* (only for stem expansion, not wildcards)
|
||||
*/
|
||||
void StringToXapianQ::maybeStemExp(bool nostemexp,
|
||||
const string& term,
|
||||
list<string>& exp,
|
||||
string &sterm)
|
||||
void StringToXapianQ::stripExpandTerm(bool nostemexp,
|
||||
const string& term,
|
||||
list<string>& exp,
|
||||
string &sterm)
|
||||
{
|
||||
LOGDEB2(("maybeStemExp: term [%s] stemlang [%s] nostemexp %d\n",
|
||||
LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n",
|
||||
term.c_str(), m_stemlang.c_str(), nostemexp));
|
||||
sterm.erase();
|
||||
exp.clear();
|
||||
if (term.empty()) {
|
||||
exp.clear();
|
||||
return;
|
||||
}
|
||||
// term1 is lowercase and without diacritics
|
||||
string term1;
|
||||
dumb_string(term, term1);
|
||||
|
||||
bool haswild = term.find_first_of("*?") != string::npos;
|
||||
bool haswild = term.find_first_of("*?[") != string::npos;
|
||||
|
||||
// No stemming if there are wildcards or prevented globally.
|
||||
if (haswild || m_stemlang.empty())
|
||||
@ -228,6 +235,7 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
|
||||
|
||||
if (nostemexp && !haswild) {
|
||||
// Neither stemming nor wildcard expansion: just the word
|
||||
sterm = term1;
|
||||
exp.push_front(term1);
|
||||
exp.resize(1);
|
||||
} else {
|
||||
@ -279,6 +287,7 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
|
||||
}
|
||||
}
|
||||
|
||||
/** Add prefix to all strings in list */
|
||||
static void addPrefix(list<string>& terms, const string& prefix)
|
||||
{
|
||||
if (prefix.empty())
|
||||
@ -300,23 +309,27 @@ static void addPrefix(list<string>& terms, const string& prefix)
|
||||
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
||||
* count)
|
||||
*/
|
||||
bool StringToXapianQ::translate(const string &iq,
|
||||
bool StringToXapianQ::processUserString(const string &iq,
|
||||
const string &prefix,
|
||||
string &ermsg,
|
||||
list<Xapian::Query> &pqueries,
|
||||
int slack, bool useNear)
|
||||
{
|
||||
LOGDEB2(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
||||
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
||||
ermsg.erase();
|
||||
m_terms.clear();
|
||||
m_groups.clear();
|
||||
|
||||
// Split into words and phrases (word1 word2 "this is a phrase"):
|
||||
// Split input into user-level words and double-quoted phrases:
|
||||
// word1 word2 "this is a phrase". The text splitter may still
|
||||
// decide that the resulting "words" are really phrases, this
|
||||
// depends on separators: [paul@dom.net] would still be a word
|
||||
// (span), but [about:me] will probably be handled as a phrase.
|
||||
list<string> phrases;
|
||||
stringToStrings(iq, phrases);
|
||||
|
||||
// Then process each word/phrase: split into terms and transform
|
||||
// into appropriate Xapian Query
|
||||
// Process each element: textsplit into terms, handle stem/wildcard
|
||||
// expansion and transform into an appropriate Xapian::Query
|
||||
try {
|
||||
for (list<string>::iterator it = phrases.begin();
|
||||
it != phrases.end(); it++) {
|
||||
@ -340,32 +353,43 @@ bool StringToXapianQ::translate(const string &iq,
|
||||
splitDataS.terms.size() != splitDataW.terms.size())
|
||||
splitData = &splitDataW;
|
||||
|
||||
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
|
||||
LOGDEB(("strToXapianQ: splitter term count: %d\n",
|
||||
splitData->terms.size()));
|
||||
switch(splitData->terms.size()) {
|
||||
switch (splitData->terms.size()) {
|
||||
case 0: continue;// ??
|
||||
case 1: // Not a real phrase: one term
|
||||
case 1:
|
||||
// Not a real phrase: one term. Still may be expanded
|
||||
// (stem or wildcard)
|
||||
{
|
||||
string term = splitData->terms.front();
|
||||
list<string> exp;
|
||||
string sterm;
|
||||
maybeStemExp(false, term, exp, sterm);
|
||||
stripExpandTerm(false, term, exp, sterm);
|
||||
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
||||
// Push either term or OR of stem-expanded set
|
||||
addPrefix(exp, prefix);
|
||||
Xapian::Query xq(Xapian::Query::OP_OR,
|
||||
exp.begin(), exp.end());
|
||||
// Give a relevance boost to the original term
|
||||
if (exp.size() > 1 && !sterm.empty()) {
|
||||
xq = Xapian::Query(Xapian::Query::OP_OR,
|
||||
xq, Xapian::Query(prefix+sterm, 10));
|
||||
|
||||
// If sterm is not null, give a relevance boost to
|
||||
// the original term. We do this even if no
|
||||
// expansion occurred (else the non-expanded terms
|
||||
// in a term list would end-up with even less
|
||||
// wqf). This does not happen if there are
|
||||
// wildcards anywhere in the search.
|
||||
if (m_doBoostUserTerms && !sterm.empty()) {
|
||||
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
|
||||
Xapian::Query(prefix+sterm, 10));
|
||||
}
|
||||
pqueries.push_back(xq);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
// Phrase/near
|
||||
// Phrase/near: transform into a PHRASE or NEAR xapian
|
||||
// query, the element of which can themselves be OR
|
||||
// queries if the terms get expanded by stemming or
|
||||
// wildcards (we don't do stemming for PHRASE though)
|
||||
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
||||
Xapian::Query::OP_PHRASE;
|
||||
list<Xapian::Query> orqueries;
|
||||
@ -380,8 +404,7 @@ bool StringToXapianQ::translate(const string &iq,
|
||||
true : false;
|
||||
string sterm;
|
||||
list<string>exp;
|
||||
maybeStemExp(nostemexp, *it, exp, sterm);
|
||||
|
||||
stripExpandTerm(nostemexp, *it, exp, sterm);
|
||||
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
||||
addPrefix(exp, prefix);
|
||||
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||
@ -466,8 +489,15 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
||||
if (!m_field.empty())
|
||||
prefix = fieldToPrefix(m_field);
|
||||
list<Xapian::Query> pqueries;
|
||||
StringToXapianQ tr(db, stemlang);
|
||||
if (!tr.translate(m_text, prefix, m_reason, pqueries))
|
||||
|
||||
// We normally boost the original term in the stem expansion list. Don't
|
||||
// do it if there are wildcards anywhere, this would skew the results.
|
||||
bool doBoostUserTerm =
|
||||
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
||||
(m_parentSearch == 0 && !m_haveWildCards);
|
||||
|
||||
StringToXapianQ tr(db, stemlang, doBoostUserTerm);
|
||||
if (!tr.processUserString(m_text, prefix, m_reason, pqueries))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||
@ -496,6 +526,7 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
||||
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
||||
const string& stemlang)
|
||||
{
|
||||
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
|
||||
m_terms.clear();
|
||||
m_groups.clear();
|
||||
|
||||
@ -509,12 +540,24 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
||||
if (!m_field.empty())
|
||||
prefix = fieldToPrefix(m_field);
|
||||
|
||||
// Use stringToXapianQueries to lowercase and simplify the phrase
|
||||
// terms etc. The result should be a single element list
|
||||
// We normally boost the original term in the stem expansion list. Don't
|
||||
// do it if there are wildcards anywhere, this would skew the results.
|
||||
bool doBoostUserTerm =
|
||||
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
||||
(m_parentSearch == 0 && !m_haveWildCards);
|
||||
|
||||
// We produce a single phrase out of the user entry (there should be
|
||||
// no dquotes in there), then use stringToXapianQueries() to
|
||||
// lowercase and simplify the phrase terms etc. This will result
|
||||
// into a single (complex) Xapian::Query.
|
||||
if (m_text.find_first_of("\"") != string::npos) {
|
||||
LOGDEB(("Double quotes inside phrase/near field\n"));
|
||||
return false;
|
||||
}
|
||||
string s = string("\"") + m_text + string("\"");
|
||||
bool useNear = m_tp == SCLT_NEAR;
|
||||
StringToXapianQ tr(db, stemlang);
|
||||
if (!tr.translate(s, prefix, m_reason, pqueries, m_slack, useNear))
|
||||
bool useNear = (m_tp == SCLT_NEAR);
|
||||
StringToXapianQ tr(db, stemlang, doBoostUserTerm);
|
||||
if (!tr.processUserString(s, prefix, m_reason, pqueries, m_slack, useNear))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _SEARCHDATA_H_INCLUDED_
|
||||
#define _SEARCHDATA_H_INCLUDED_
|
||||
/* @(#$Id: searchdata.h,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: searchdata.h,v 1.9 2007-01-25 15:50:54 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
/**
|
||||
* Structures to hold data coming almost directly from the gui
|
||||
@ -65,7 +65,7 @@ class SearchDataClause;
|
||||
*/
|
||||
class SearchData {
|
||||
public:
|
||||
SearchData(SClType tp) : m_tp(tp) {}
|
||||
SearchData(SClType tp) : m_tp(tp), m_haveWildCards(false) {}
|
||||
~SearchData() {erase();}
|
||||
|
||||
/** Make pristine */
|
||||
@ -74,6 +74,9 @@ public:
|
||||
/** Is there anything but a file name search in here ? */
|
||||
bool fileNameOnly();
|
||||
|
||||
/** Do we have wildcards anywhere apart from filename searches ? */
|
||||
bool haveWildCards() {return m_haveWildCards;}
|
||||
|
||||
/** Translate to Xapian query. rcldb knows about the void* */
|
||||
bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
|
||||
@ -110,7 +113,7 @@ private:
|
||||
// from rcldb after the Xapian::setQuery() call
|
||||
string m_description;
|
||||
string m_reason;
|
||||
|
||||
bool m_haveWildCards;
|
||||
/* Copyconst and assignment private and forbidden */
|
||||
SearchData(const SearchData &) {}
|
||||
SearchData& operator=(const SearchData&) {return *this;};
|
||||
@ -118,7 +121,7 @@ private:
|
||||
|
||||
class SearchDataClause {
|
||||
public:
|
||||
SearchDataClause(SClType tp) : m_tp(tp) {}
|
||||
SearchDataClause(SClType tp) : m_tp(tp), m_parentSearch(0) {}
|
||||
virtual ~SearchDataClause() {}
|
||||
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string&) = 0;
|
||||
@ -132,11 +135,14 @@ public:
|
||||
{return true;}
|
||||
virtual SClType getTp() {return m_tp;}
|
||||
|
||||
virtual void setParent(SearchData *p) {m_parentSearch = p;}
|
||||
friend class SearchData;
|
||||
|
||||
protected:
|
||||
string m_reason;
|
||||
string m_reason;
|
||||
SClType m_tp;
|
||||
SearchData *m_parentSearch;
|
||||
bool m_haveWildCards;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -147,7 +153,9 @@ class SearchDataClauseSimple : public SearchDataClause {
|
||||
public:
|
||||
SearchDataClauseSimple(SClType tp, const string& txt,
|
||||
const string& fld = "")
|
||||
: SearchDataClause(tp), m_text(txt), m_field(fld), m_slack(0) {}
|
||||
: SearchDataClause(tp), m_text(txt), m_field(fld), m_slack(0) {
|
||||
m_haveWildCards = (txt.find_first_of("*?[") != string::npos);
|
||||
}
|
||||
|
||||
virtual ~SearchDataClauseSimple() {}
|
||||
|
||||
@ -181,7 +189,10 @@ protected:
|
||||
class SearchDataClauseFilename : public SearchDataClauseSimple {
|
||||
public:
|
||||
SearchDataClauseFilename(const string& txt)
|
||||
: SearchDataClauseSimple(SCLT_FILENAME, txt) {}
|
||||
: SearchDataClauseSimple(SCLT_FILENAME, txt) {
|
||||
// File name searches don't count when looking for wild cards.
|
||||
m_haveWildCards = false;
|
||||
}
|
||||
virtual ~SearchDataClauseFilename() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
|
||||
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user