better wildcards handling. Tuning of user term boosting

This commit is contained in:
dockes 2007-01-25 15:50:54 +00:00
parent 5a9b90d26c
commit 2c3dc3e54c
2 changed files with 105 additions and 51 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.10 2007-01-19 10:23:26 dockes Exp $ (C) 2006 J.F.Dockes"; static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.11 2007-01-25 15:50:54 dockes Exp $ (C) 2006 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -94,6 +94,8 @@ bool SearchData::addClause(SearchDataClause* cl)
m_reason = "No Negative (AND_NOT) clauses allowed in OR queries"; m_reason = "No Negative (AND_NOT) clauses allowed in OR queries";
return false; return false;
} }
cl->setParent(this);
m_haveWildCards = m_haveWildCards || cl->m_haveWildCards;
m_query.push_back(cl); m_query.push_back(cl);
return true; return true;
} }
@ -142,17 +144,20 @@ class wsQData : public TextSplitCB {
} }
}; };
/// Translate user string (ie: term1 "a phrase" term3) into a xapian /**
/// query tree * Translate a user compound string as may be entered in recoll's
// This used to be a static function, but we couldn't just keep adding * search entry fields, ex: [term1 "a phrase" term3] into a xapian
// parameters to the interface! * query tree.
* The object keeps track of the query terms and term groups while
* translating.
*/
class StringToXapianQ { class StringToXapianQ {
public: public:
StringToXapianQ(Db& db, const string &stmlng) StringToXapianQ(Db& db, const string &stmlng, bool boostUser)
: m_db(db), m_stemlang(stmlng) : m_db(db), m_stemlang(stmlng), m_doBoostUserTerms(boostUser)
{ } { }
bool translate(const string &iq, bool processUserString(const string &iq,
const string &prefix, const string &prefix,
string &ermsg, string &ermsg,
list<Xapian::Query> &pqueries, list<Xapian::Query> &pqueries,
@ -167,12 +172,12 @@ public:
} }
private: private:
void maybeStemExp(bool dont, const string& term, list<string>& exp, void stripExpandTerm(bool dont, const string& term, list<string>& exp,
string& sterm); string& sterm);
Db& m_db; Db& m_db;
const string& m_stemlang; const string& m_stemlang;
bool m_doBoostUserTerms;
// Single terms and phrases resulting from breaking up text; // Single terms and phrases resulting from breaking up text;
vector<string> m_terms; vector<string> m_terms;
vector<vector<string> > m_groups; vector<vector<string> > m_groups;
@ -181,31 +186,33 @@ private:
/** Unaccent and lowercase term, possibly expand stem and wildcards /** Unaccent and lowercase term, possibly expand stem and wildcards
* *
* @param nostemexp don't perform stem expansion. This is mainly used to * @param nostemexp don't perform stem expansion. This is mainly used to
* prevent stem expansion inside phrases. 2 other factors can turn * prevent stem expansion inside phrases (because the user probably
* stem expansion off: a null stemlang, resulting from a global user * does not expect it). This does NOT prevent wild card expansion.
* preference, or a capitalized term. * Other factors than nostemexp can prevent stem expansion:
* a null stemlang, resulting from a global user preference, a
* capitalized term, or wildcard(s)
* @param term input single word * @param term input single word
* @param exp output expansion list * @param exp output expansion list
* @param sterm output lower-cased+unaccented version of the input term * @param sterm output lower-cased+unaccented version of the input term
* (only if stem expansion actually occured, else empty) * (only for stem expansion, not wildcards)
*/ */
void StringToXapianQ::maybeStemExp(bool nostemexp, void StringToXapianQ::stripExpandTerm(bool nostemexp,
const string& term, const string& term,
list<string>& exp, list<string>& exp,
string &sterm) string &sterm)
{ {
LOGDEB2(("maybeStemExp: term [%s] stemlang [%s] nostemexp %d\n", LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n",
term.c_str(), m_stemlang.c_str(), nostemexp)); term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase(); sterm.erase();
exp.clear();
if (term.empty()) { if (term.empty()) {
exp.clear();
return; return;
} }
// term1 is lowercase and without diacritics // term1 is lowercase and without diacritics
string term1; string term1;
dumb_string(term, term1); dumb_string(term, term1);
bool haswild = term.find_first_of("*?") != string::npos; bool haswild = term.find_first_of("*?[") != string::npos;
// No stemming if there are wildcards or prevented globally. // No stemming if there are wildcards or prevented globally.
if (haswild || m_stemlang.empty()) if (haswild || m_stemlang.empty())
@ -228,6 +235,7 @@ void StringToXapianQ::maybeStemExp(bool nostemexp,
if (nostemexp && !haswild) { if (nostemexp && !haswild) {
// Neither stemming nor wildcard expansion: just the word // Neither stemming nor wildcard expansion: just the word
sterm = term1;
exp.push_front(term1); exp.push_front(term1);
exp.resize(1); exp.resize(1);
} else { } else {
@ -279,6 +287,7 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
} }
} }
/** Add prefix to all strings in list */
static void addPrefix(list<string>& terms, const string& prefix) static void addPrefix(list<string>& terms, const string& prefix)
{ {
if (prefix.empty()) if (prefix.empty())
@ -300,23 +309,27 @@ static void addPrefix(list<string>& terms, const string& prefix)
* @return the subquery count (either or'd stem-expanded terms or phrase word * @return the subquery count (either or'd stem-expanded terms or phrase word
* count) * count)
*/ */
bool StringToXapianQ::translate(const string &iq, bool StringToXapianQ::processUserString(const string &iq,
const string &prefix, const string &prefix,
string &ermsg, string &ermsg,
list<Xapian::Query> &pqueries, list<Xapian::Query> &pqueries,
int slack, bool useNear) int slack, bool useNear)
{ {
LOGDEB2(("StringToXapianQ:: query string: [%s]\n", iq.c_str())); LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
ermsg.erase(); ermsg.erase();
m_terms.clear(); m_terms.clear();
m_groups.clear(); m_groups.clear();
// Split into words and phrases (word1 word2 "this is a phrase"): // Split input into user-level words and double-quoted phrases:
// word1 word2 "this is a phrase". The text splitter may still
// decide that the resulting "words" are really phrases, this
// depends on separators: [paul@dom.net] would still be a word
// (span), but [about:me] will probably be handled as a phrase.
list<string> phrases; list<string> phrases;
stringToStrings(iq, phrases); stringToStrings(iq, phrases);
// Then process each word/phrase: split into terms and transform // Process each element: textsplit into terms, handle stem/wildcard
// into appropriate Xapian Query // expansion and transform into an appropriate Xapian::Query
try { try {
for (list<string>::iterator it = phrases.begin(); for (list<string>::iterator it = phrases.begin();
it != phrases.end(); it++) { it != phrases.end(); it++) {
@ -340,32 +353,43 @@ bool StringToXapianQ::translate(const string &iq,
splitDataS.terms.size() != splitDataW.terms.size()) splitDataS.terms.size() != splitDataW.terms.size())
splitData = &splitDataW; splitData = &splitDataW;
LOGDEB1(("strToXapianQ: splitter term count: %d\n", LOGDEB(("strToXapianQ: splitter term count: %d\n",
splitData->terms.size())); splitData->terms.size()));
switch(splitData->terms.size()) { switch (splitData->terms.size()) {
case 0: continue;// ?? case 0: continue;// ??
case 1: // Not a real phrase: one term case 1:
// Not a real phrase: one term. Still may be expanded
// (stem or wildcard)
{ {
string term = splitData->terms.front(); string term = splitData->terms.front();
list<string> exp; list<string> exp;
string sterm; string sterm;
maybeStemExp(false, term, exp, sterm); stripExpandTerm(false, term, exp, sterm);
m_terms.insert(m_terms.end(), exp.begin(), exp.end()); m_terms.insert(m_terms.end(), exp.begin(), exp.end());
// Push either term or OR of stem-expanded set // Push either term or OR of stem-expanded set
addPrefix(exp, prefix); addPrefix(exp, prefix);
Xapian::Query xq(Xapian::Query::OP_OR, Xapian::Query xq(Xapian::Query::OP_OR,
exp.begin(), exp.end()); exp.begin(), exp.end());
// Give a relevance boost to the original term
if (exp.size() > 1 && !sterm.empty()) { // If sterm is not null, give a relevance boost to
xq = Xapian::Query(Xapian::Query::OP_OR, // the original term. We do this even if no
xq, Xapian::Query(prefix+sterm, 10)); // expansion occurred (else the non-expanded terms
// in a term list would end up with even less
// wqf). This does not happen if there are
// wildcards anywhere in the search.
if (m_doBoostUserTerms && !sterm.empty()) {
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
Xapian::Query(prefix+sterm, 10));
} }
pqueries.push_back(xq); pqueries.push_back(xq);
} }
break; break;
default: default:
// Phrase/near // Phrase/near: transform into a PHRASE or NEAR xapian
// query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or
// wildcards (we don't do stemming for PHRASE though)
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE; Xapian::Query::OP_PHRASE;
list<Xapian::Query> orqueries; list<Xapian::Query> orqueries;
@ -380,8 +404,7 @@ bool StringToXapianQ::translate(const string &iq,
true : false; true : false;
string sterm; string sterm;
list<string>exp; list<string>exp;
maybeStemExp(nostemexp, *it, exp, sterm); stripExpandTerm(nostemexp, *it, exp, sterm);
groups.push_back(vector<string>(exp.begin(), exp.end())); groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, prefix); addPrefix(exp, prefix);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
@ -466,8 +489,15 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
if (!m_field.empty()) if (!m_field.empty())
prefix = fieldToPrefix(m_field); prefix = fieldToPrefix(m_field);
list<Xapian::Query> pqueries; list<Xapian::Query> pqueries;
StringToXapianQ tr(db, stemlang);
if (!tr.translate(m_text, prefix, m_reason, pqueries)) // We normally boost the original term in the stem expansion list. Don't
// do it if there are wildcards anywhere, this would skew the results.
bool doBoostUserTerm =
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
(m_parentSearch == 0 && !m_haveWildCards);
StringToXapianQ tr(db, stemlang, doBoostUserTerm);
if (!tr.processUserString(m_text, prefix, m_reason, pqueries))
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n")); LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
@ -496,6 +526,7 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
const string& stemlang) const string& stemlang)
{ {
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
m_terms.clear(); m_terms.clear();
m_groups.clear(); m_groups.clear();
@ -509,12 +540,24 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
if (!m_field.empty()) if (!m_field.empty())
prefix = fieldToPrefix(m_field); prefix = fieldToPrefix(m_field);
// Use stringToXapianQueries to lowercase and simplify the phrase // We normally boost the original term in the stem expansion list. Don't
// terms etc. The result should be a single element list // do it if there are wildcards anywhere, this would skew the results.
bool doBoostUserTerm =
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
(m_parentSearch == 0 && !m_haveWildCards);
// We produce a single phrase out of the user entry (there should be
// no dquotes in there), then use stringToXapianQueries() to
// lowercase and simplify the phrase terms etc. This will result
// in a single (complex) Xapian::Query.
if (m_text.find_first_of("\"") != string::npos) {
LOGDEB(("Double quotes inside phrase/near field\n"));
return false;
}
string s = string("\"") + m_text + string("\""); string s = string("\"") + m_text + string("\"");
bool useNear = m_tp == SCLT_NEAR; bool useNear = (m_tp == SCLT_NEAR);
StringToXapianQ tr(db, stemlang); StringToXapianQ tr(db, stemlang, doBoostUserTerm);
if (!tr.translate(s, prefix, m_reason, pqueries, m_slack, useNear)) if (!tr.processUserString(s, prefix, m_reason, pqueries, m_slack, useNear))
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {
LOGERR(("SearchDataClauseDist: resolved to null query\n")); LOGERR(("SearchDataClauseDist: resolved to null query\n"));

View File

@ -16,7 +16,7 @@
*/ */
#ifndef _SEARCHDATA_H_INCLUDED_ #ifndef _SEARCHDATA_H_INCLUDED_
#define _SEARCHDATA_H_INCLUDED_ #define _SEARCHDATA_H_INCLUDED_
/* @(#$Id: searchdata.h,v 1.8 2007-01-17 13:53:41 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: searchdata.h,v 1.9 2007-01-25 15:50:54 dockes Exp $ (C) 2004 J.F.Dockes */
/** /**
* Structures to hold data coming almost directly from the gui * Structures to hold data coming almost directly from the gui
@ -65,7 +65,7 @@ class SearchDataClause;
*/ */
class SearchData { class SearchData {
public: public:
SearchData(SClType tp) : m_tp(tp) {} SearchData(SClType tp) : m_tp(tp), m_haveWildCards(false) {}
~SearchData() {erase();} ~SearchData() {erase();}
/** Make pristine */ /** Make pristine */
@ -74,6 +74,9 @@ public:
/** Is there anything but a file name search in here ? */ /** Is there anything but a file name search in here ? */
bool fileNameOnly(); bool fileNameOnly();
/** Do we have wildcards anywhere apart from filename searches ? */
bool haveWildCards() {return m_haveWildCards;}
/** Translate to Xapian query. rcldb knows about the void* */ /** Translate to Xapian query. rcldb knows about the void* */
bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang); bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
@ -110,7 +113,7 @@ private:
// from rcldb after the Xapian::setQuery() call // from rcldb after the Xapian::setQuery() call
string m_description; string m_description;
string m_reason; string m_reason;
bool m_haveWildCards;
/* Copyconst and assignment private and forbidden */ /* Copyconst and assignment private and forbidden */
SearchData(const SearchData &) {} SearchData(const SearchData &) {}
SearchData& operator=(const SearchData&) {return *this;}; SearchData& operator=(const SearchData&) {return *this;};
@ -118,7 +121,7 @@ private:
class SearchDataClause { class SearchDataClause {
public: public:
SearchDataClause(SClType tp) : m_tp(tp) {} SearchDataClause(SClType tp) : m_tp(tp), m_parentSearch(0) {}
virtual ~SearchDataClause() {} virtual ~SearchDataClause() {}
virtual bool toNativeQuery(Rcl::Db &db, void *, const string&) = 0; virtual bool toNativeQuery(Rcl::Db &db, void *, const string&) = 0;
@ -132,11 +135,14 @@ public:
{return true;} {return true;}
virtual SClType getTp() {return m_tp;} virtual SClType getTp() {return m_tp;}
virtual void setParent(SearchData *p) {m_parentSearch = p;}
friend class SearchData; friend class SearchData;
protected: protected:
string m_reason; string m_reason;
SClType m_tp; SClType m_tp;
SearchData *m_parentSearch;
bool m_haveWildCards;
}; };
/** /**
@ -147,7 +153,9 @@ class SearchDataClauseSimple : public SearchDataClause {
public: public:
SearchDataClauseSimple(SClType tp, const string& txt, SearchDataClauseSimple(SClType tp, const string& txt,
const string& fld = "") const string& fld = "")
: SearchDataClause(tp), m_text(txt), m_field(fld), m_slack(0) {} : SearchDataClause(tp), m_text(txt), m_field(fld), m_slack(0) {
m_haveWildCards = (txt.find_first_of("*?[") != string::npos);
}
virtual ~SearchDataClauseSimple() {} virtual ~SearchDataClauseSimple() {}
@ -181,7 +189,10 @@ protected:
class SearchDataClauseFilename : public SearchDataClauseSimple { class SearchDataClauseFilename : public SearchDataClauseSimple {
public: public:
SearchDataClauseFilename(const string& txt) SearchDataClauseFilename(const string& txt)
: SearchDataClauseSimple(SCLT_FILENAME, txt) {} : SearchDataClauseSimple(SCLT_FILENAME, txt) {
// File name searches don't count when looking for wild cards.
m_haveWildCards = false;
}
virtual ~SearchDataClauseFilename() {} virtual ~SearchDataClauseFilename() {}
virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang); virtual bool toNativeQuery(Rcl::Db &db, void *, const string& stemlang);
}; };