Term expansion: handle field issues inside Rcl::Db::termMatch, ensuring that the field name is taken into account for all expansions. This makes File Name searches and filename: query language searches behave the same, and improves overall consistency.

This commit is contained in:
dockes 2009-12-07 13:27:57 +00:00
parent e932144440
commit bab030f846
3 changed files with 48 additions and 43 deletions

View File

@ -1329,11 +1329,7 @@ bool Db::purgeFile(const string &udi, bool *existed)
// File name wild card expansion. This is a specialisation of termMatch // File name wild card expansion. This is a specialisation of termMatch
bool Db::filenameWildExp(const string& fnexp, list<string>& names) bool Db::filenameWildExp(const string& fnexp, list<string>& names)
{ {
string pattern; string pattern = fnexp;
if (!unacmaybefold(fnexp, pattern, "UTF-8", true)) {
LOGERR(("Db::filenameWildExp: unac error for [%s]\n", fnexp.c_str()));
return false;
}
names.clear(); names.clear();
// If pattern is not quoted, and has no wildcards, we add * at // If pattern is not quoted, and has no wildcards, we add * at
@ -1350,12 +1346,12 @@ bool Db::filenameWildExp(const string& fnexp, list<string>& names)
return false; return false;
for (list<TermMatchEntry>::const_iterator it = entries.begin(); for (list<TermMatchEntry>::const_iterator it = entries.begin();
it != entries.end(); it++) it != entries.end(); it++)
names.push_back("XSFN"+it->term); names.push_back(it->term);
if (names.empty()) { if (names.empty()) {
// Build an impossible query: we know it's impossible because we // Build an impossible query: we know it's impossible because we
// control the prefixes! // control the prefixes!
names.push_back("XIMPOSSIBLE"); names.push_back("XNONENoMatchingTerms");
} }
return true; return true;
} }
@ -1398,6 +1394,16 @@ bool Db::stemExpand(const string &lang, const string &term,
return true; return true;
} }
/** Add prefix to all strings in list */
static void addPrefix(list<TermMatchEntry>& terms, const string& prefix)
{
if (prefix.empty())
return;
for (list<TermMatchEntry>::iterator it = terms.begin();
it != terms.end(); it++)
it->term.insert(0, prefix);
}
// Characters that can begin a wildcard or regexp expression. We use skipto // Characters that can begin a wildcard or regexp expression. We use skipto
// to begin the allterms search with terms that begin with the portion of // to begin the allterms search with terms that begin with the portion of
// the input string prior to these chars. // the input string prior to these chars.
@ -1409,7 +1415,9 @@ bool Db::termMatch(MatchType typ, const string &lang,
const string &root, const string &root,
list<TermMatchEntry>& res, list<TermMatchEntry>& res,
int max, int max,
const string& field) const string& field,
string *prefixp
)
{ {
if (!m_ndb || !m_ndb->m_isopen) if (!m_ndb || !m_ndb->m_isopen)
return false; return false;
@ -1428,6 +1436,12 @@ bool Db::termMatch(MatchType typ, const string &lang,
string prefix; string prefix;
if (!field.empty()) { if (!field.empty()) {
(void)fieldToPrefix(field, prefix); (void)fieldToPrefix(field, prefix);
if (prefix.empty()) {
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
field.c_str()));
}
if (prefixp)
*prefixp = prefix;
} }
if (typ == ET_STEM) { if (typ == ET_STEM) {
@ -1443,6 +1457,8 @@ bool Db::termMatch(MatchType typ, const string &lang,
return false; return false;
LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str())); LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
} }
if (!prefix.empty())
addPrefix(res, prefix);
} else { } else {
regex_t reg; regex_t reg;
int errcode; int errcode;
@ -1493,7 +1509,7 @@ bool Db::termMatch(MatchType typ, const string &lang,
continue; continue;
} }
// Do we want stem expansion here? We don't do it for now // Do we want stem expansion here? We don't do it for now
res.push_back(TermMatchEntry(term, it.get_termfreq())); res.push_back(TermMatchEntry(*it, it.get_termfreq()));
++n; ++n;
} }
m_reason.erase(); m_reason.erase();

View File

@ -158,7 +158,9 @@ class Db {
enum MatchType {ET_WILD, ET_REGEXP, ET_STEM}; enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
bool termMatch(MatchType typ, const string &lang, const string &s, bool termMatch(MatchType typ, const string &lang, const string &s,
list<TermMatchEntry>& result, int max = -1, list<TermMatchEntry>& result, int max = -1,
const string& field = ""); const string& field = "",
string *prefix = 0
);
/** Special filename wildcard to XSFN terms expansion. /** Special filename wildcard to XSFN terms expansion.
internal/searchdata use only */ internal/searchdata use only */

View File

@ -240,9 +240,9 @@ class wsQData : public TextSplitCB {
// translating. // translating.
class StringToXapianQ { class StringToXapianQ {
public: public:
StringToXapianQ(Db& db, const string& prefix, StringToXapianQ(Db& db, const string& field,
const string &stmlng, bool boostUser) const string &stmlng, bool boostUser)
: m_db(db), m_prefix(prefix), m_stemlang(stmlng), : m_db(db), m_field(field), m_stemlang(stmlng),
m_doBoostUserTerms(boostUser) m_doBoostUserTerms(boostUser)
{ } { }
@ -267,7 +267,7 @@ public:
private: private:
void expandTerm(bool dont, const string& term, list<string>& exp, void expandTerm(bool dont, const string& term, list<string>& exp,
string& sterm); string& sterm, string *prefix);
// After splitting entry on whitespace: process non-phrase element // After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries); void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
// Process phrase/near element // Process phrase/near element
@ -276,7 +276,7 @@ private:
bool useNear, int slack); bool useNear, int slack);
Db& m_db; Db& m_db;
const string& m_prefix; const string& m_field;
const string& m_stemlang; const string& m_stemlang;
bool m_doBoostUserTerms; bool m_doBoostUserTerms;
// Single terms and phrases resulting from breaking up text; // Single terms and phrases resulting from breaking up text;
@ -309,9 +309,9 @@ static void listVector(const string& what, const vector<string>&l)
* @param sterm output original input term if there were no wildcards * @param sterm output original input term if there were no wildcards
*/ */
void StringToXapianQ::expandTerm(bool nostemexp, void StringToXapianQ::expandTerm(bool nostemexp,
const string& term, const string& term,
list<string>& exp, list<string>& exp,
string &sterm) string &sterm, string *prefix)
{ {
LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n", LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n",
term.c_str(), m_stemlang.c_str(), nostemexp)); term.c_str(), m_stemlang.c_str(), nostemexp));
@ -336,11 +336,13 @@ void StringToXapianQ::expandTerm(bool nostemexp,
} else { } else {
list<TermMatchEntry> l; list<TermMatchEntry> l;
if (haswild) { if (haswild) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l); m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l, -1, m_field,
prefix);
} else { } else {
sterm = term; sterm = term;
m_uterms.push_back(sterm); m_uterms.push_back(sterm);
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l); m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l, -1, m_field,
prefix);
} }
for (list<TermMatchEntry>::const_iterator it = l.begin(); for (list<TermMatchEntry>::const_iterator it = l.begin();
it != l.end(); it++) { it != l.end(); it++) {
@ -384,23 +386,14 @@ void multiply_groups(vector<vector<string> >::const_iterator vvit,
} }
} }
/**
 * Prepend a prefix to every string of a list, in place.
 *
 * @param terms  strings to modify.
 * @param prefix text inserted at the front of each string; when it is
 *               empty the list is left untouched.
 */
static void addPrefix(list<string>& terms, const string& prefix)
{
    if (!prefix.empty()) {
        list<string>::iterator cur = terms.begin();
        while (cur != terms.end()) {
            cur->insert(0, prefix);
            ++cur;
        }
    }
}
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp, void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
list<Xapian::Query> &pqueries) list<Xapian::Query> &pqueries)
{ {
list<string> exp; list<string> exp;
string sterm; // dumb version of user term string sterm; // dumb version of user term
expandTerm(nostemexp, span, exp, sterm); string prefix;
expandTerm(nostemexp, span, exp, sterm, &prefix);
m_terms.insert(m_terms.end(), exp.begin(), exp.end()); m_terms.insert(m_terms.end(), exp.begin(), exp.end());
addPrefix(exp, m_prefix);
// Push either term or OR of stem-expanded set // Push either term or OR of stem-expanded set
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end()); Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
@ -412,7 +405,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
if (m_doBoostUserTerms && !sterm.empty()) { if (m_doBoostUserTerms && !sterm.empty()) {
xq = Xapian::Query(Xapian::Query::OP_OR, xq = Xapian::Query(Xapian::Query::OP_OR,
xq, xq,
Xapian::Query(m_prefix+sterm, Xapian::Query(prefix+sterm,
original_term_wqf_booster)); original_term_wqf_booster));
} }
pqueries.push_back(xq); pqueries.push_back(xq);
@ -443,9 +436,9 @@ void StringToXapianQ::processPhraseOrNear(wsQData *splitData,
string sterm; string sterm;
list<string>exp; list<string>exp;
expandTerm(nostemexp, *it, exp, sterm); string prefix;
expandTerm(nostemexp, *it, exp, sterm, &prefix);
groups.push_back(vector<string>(exp.begin(), exp.end())); groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, m_prefix);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end())); exp.begin(), exp.end()));
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF #ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
@ -597,9 +590,6 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp)); LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
return false; return false;
} }
string prefix;
if (!m_field.empty())
db.fieldToPrefix(m_field, prefix);
list<Xapian::Query> pqueries; list<Xapian::Query> pqueries;
// We normally boost the original term in the stem expansion list. Don't // We normally boost the original term in the stem expansion list. Don't
@ -608,7 +598,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
(m_parentSearch && !m_parentSearch->haveWildCards()) || (m_parentSearch && !m_parentSearch->haveWildCards()) ||
(m_parentSearch == 0 && !m_haveWildCards); (m_parentSearch == 0 && !m_haveWildCards);
StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm); StringToXapianQ tr(db, m_field, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(m_text, m_reason, pqueries, if (!tr.processUserString(m_text, m_reason, pqueries,
db.getStopList())) db.getStopList()))
return false; return false;
@ -623,7 +613,8 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
return true; return true;
} }
// Translate a FILENAME search clause. // Translate a FILENAME search clause. Actually this is now mostly
// a "filename" field search.
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
const string&) const string&)
{ {
@ -660,10 +651,6 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
list<Xapian::Query> pqueries; list<Xapian::Query> pqueries;
Xapian::Query nq; Xapian::Query nq;
string prefix;
if (!m_field.empty())
db.fieldToPrefix(m_field, prefix);
// We normally boost the original term in the stem expansion list. Don't // We normally boost the original term in the stem expansion list. Don't
// do it if there are wildcards anywhere, this would skew the results. // do it if there are wildcards anywhere, this would skew the results.
bool doBoostUserTerm = bool doBoostUserTerm =
@ -680,7 +667,7 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
} }
string s = string("\"") + m_text + string("\""); string s = string("\"") + m_text + string("\"");
bool useNear = (m_tp == SCLT_NEAR); bool useNear = (m_tp == SCLT_NEAR);
StringToXapianQ tr(db, prefix, l_stemlang, doBoostUserTerm); StringToXapianQ tr(db, m_field, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(), if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(),
m_slack, useNear)) m_slack, useNear))
return false; return false;