Search: allow setting weights on terms, e.g. "important"2.5

This commit is contained in:
Jean-Francois Dockes 2011-05-30 14:03:01 +02:00
parent 72fe512c5a
commit 91f277ec26
5 changed files with 70 additions and 23 deletions

View File

@ -84,17 +84,16 @@ void WasaQuery::describe(string &desc) const
desc.erase(desc.length() - 1); desc.erase(desc.length() - 1);
desc += ")"; desc += ")";
if (m_modifiers != 0) { if (m_modifiers != 0) {
if (m_modifiers & WQM_BOOST) desc += "BOOST|"; if (m_modifiers & WQM_BOOST) desc += "BOOST|";
if (m_modifiers & WQM_CASESENS) desc += "CASESENS|"; if (m_modifiers & WQM_CASESENS) desc += "CASESENS|";
if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|"; if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|";
if (m_modifiers & WQM_FUZZY) desc += "FUZZY|";
if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|"; if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|";
if (m_modifiers & WQM_BOOST) desc += "BOOST|"; if (m_modifiers & WQM_PHRASESLACK) desc += "PHRASESLACK|";
if (m_modifiers & WQM_PROX) desc += "PROX|"; if (m_modifiers & WQM_PROX) desc += "PROX|";
if (m_modifiers & WQM_REGEX) desc += "REGEX|";
if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|"; if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|";
if (m_modifiers & WQM_WORDS) desc += "WORDS|"; if (m_modifiers & WQM_WORDS) desc += "WORDS|";
if (m_modifiers & WQM_PHRASESLACK) desc += "PHRASESLACK|";
if (m_modifiers & WQM_REGEX) desc += "REGEX|";
if (m_modifiers & WQM_FUZZY) desc += "FUZZY|";
if (desc.length() > 0 && desc[desc.length()-1] == '|') if (desc.length() > 0 && desc[desc.length()-1] == '|')
desc = desc.substr(0, desc.length()-1); desc = desc.substr(0, desc.length()-1);
} }
@ -132,7 +131,7 @@ static const char * parserExpr =
"(\"" //9 "(\"" //9
"([^\"]+)" //10 "A quoted term" "([^\"]+)" //10 "A quoted term"
"\")" "\")"
"([a-zA-Z0-9]*)" //11 modifiers "([bcCdDeflLoprsw.0-9]*)" //11 modifiers
"|" "|"
"([^[:space:]\"]+)" //12 ANormalTerm "([^[:space:]\"]+)" //12 ANormalTerm
")" ")"
@ -152,7 +151,7 @@ static const char *matchNames[] = {
/* 8*/ "", /* 8*/ "",
/* 9*/ "", /* 9*/ "",
/*10*/ "QUOTEDTERM", /*10*/ "QUOTEDTERM",
/*11*/ "MODIIFIERS", /*11*/ "MODIFIERS",
/*12*/ "TERM", /*12*/ "TERM",
}; };
#define NMATCH (sizeof(matchNames) / sizeof(char *)) #define NMATCH (sizeof(matchNames) / sizeof(char *))
@ -328,12 +327,18 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
unsigned int mods = 0; unsigned int mods = 0;
for (unsigned int i = 0; i < strlen(match); i++) { for (unsigned int i = 0; i < strlen(match); i++) {
switch (match[i]) { switch (match[i]) {
case 'b': mods |= WasaQuery::WQM_BOOST; break; case 'b':
mods |= WasaQuery::WQM_BOOST;
nclause->m_weight = 10.0;
break;
case 'c': break; case 'c': break;
case 'C': mods |= WasaQuery::WQM_CASESENS; break; case 'C': mods |= WasaQuery::WQM_CASESENS; break;
case 'd': break; case 'd': break;
case 'D': mods |= WasaQuery::WQM_DIACSENS; break; case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
case 'e': mods |= WasaQuery::WQM_CASESENS | WasaQuery::WQM_DIACSENS | WasaQuery::WQM_NOSTEM; break; case 'e': mods |= WasaQuery::WQM_CASESENS |
WasaQuery::WQM_DIACSENS |
WasaQuery::WQM_NOSTEM;
break;
case 'f': mods |= WasaQuery::WQM_FUZZY; break; case 'f': mods |= WasaQuery::WQM_FUZZY; break;
case 'l': mods |= WasaQuery::WQM_NOSTEM; break; case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
case 'L': break; case 'L': break;
@ -342,6 +347,19 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
case 'r': mods |= WasaQuery::WQM_REGEX; break; case 'r': mods |= WasaQuery::WQM_REGEX; break;
case 's': mods |= WasaQuery::WQM_SLOPPY; break; case 's': mods |= WasaQuery::WQM_SLOPPY; break;
case 'w': mods |= WasaQuery::WQM_WORDS; break; case 'w': mods |= WasaQuery::WQM_WORDS; break;
case '.':case '0':case '1':case '2':case '3':case '4':
case '5':case '6':case '7':case '8':case '9':
{
int n;
float factor;
if (sscanf(match+i, "%f %n", &factor, &n)) {
nclause->m_weight = factor;
DPRINT((stderr, "Got factor %.2f len %d\n",
factor, n));
}
if (n)
i += n-1;
}
} }
} }
nclause->m_modifiers = WasaQuery::Modifier(mods); nclause->m_modifiers = WasaQuery::Modifier(mods);

View File

@ -63,7 +63,7 @@ public:
typedef vector<WasaQuery*> subqlist_t; typedef vector<WasaQuery*> subqlist_t;
WasaQuery() WasaQuery()
: m_op(OP_NULL), m_modifiers(0) : m_op(OP_NULL), m_modifiers(0), m_weight(1.0)
{} {}
~WasaQuery(); ~WasaQuery();
@ -86,6 +86,7 @@ public:
vector<WasaQuery*> m_subs; vector<WasaQuery*> m_subs;
unsigned int m_modifiers; unsigned int m_modifiers;
float m_weight;
}; };
/** /**

View File

@ -101,7 +101,8 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
// Filtering on location // Filtering on location
if (!stringicmp("dir", (*it)->m_fieldspec)) { if (!stringicmp("dir", (*it)->m_fieldspec)) {
sdata->setTopdir((*it)->m_value, (*it)->m_op == WasaQuery::OP_EXCL); sdata->setTopdir((*it)->m_value, (*it)->m_op == WasaQuery::OP_EXCL,
(*it)->m_weight);
continue; continue;
} }
@ -174,6 +175,8 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
if (mods & WasaQuery::WQM_NOSTEM) { if (mods & WasaQuery::WQM_NOSTEM) {
nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING); nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING);
} }
if ((*it)->m_weight != 1.0)
nclause->setWeight((*it)->m_weight);
sdata->addClause(nclause); sdata->addClause(nclause);
} }
break; break;
@ -203,6 +206,8 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
} }
if ((*it)->m_modifiers & WasaQuery::WQM_NOSTEM) if ((*it)->m_modifiers & WasaQuery::WQM_NOSTEM)
nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING); nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING);
if ((*it)->m_weight != 1.0)
nclause->setWeight((*it)->m_weight);
sdata->addClause(nclause); sdata->addClause(nclause);
break; break;

View File

@ -278,7 +278,8 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq); xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
} }
// Add the directory filtering clause // Add the directory filtering clause. This is a phrase of terms
// prefixed with the pathelt prefix XP
if (!m_topdir.empty()) { if (!m_topdir.empty()) {
vector<string> vpath; vector<string> vpath;
stringToTokens(m_topdir, vpath, "/"); stringToTokens(m_topdir, vpath, "/");
@ -288,10 +289,21 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
it != vpath.end(); it++){ it != vpath.end(); it++){
pvpath.push_back(pathelt_prefix + *it); pvpath.push_back(pathelt_prefix + *it);
} }
xq = Xapian::Query(m_topdirexcl ? Xapian::Query::op tdop;
Xapian::Query::OP_AND_NOT:Xapian::Query::OP_FILTER, if (m_topdirweight == 1.0) {
xq, Xapian::Query(Xapian::Query::OP_PHRASE, tdop = m_topdirexcl ?
pvpath.begin(), pvpath.end())); Xapian::Query::OP_AND_NOT : Xapian::Query::OP_FILTER;
} else {
tdop = m_topdirexcl ?
Xapian::Query::OP_AND_NOT : Xapian::Query::OP_AND_MAYBE;
}
Xapian::Query tdq = Xapian::Query(Xapian::Query::OP_PHRASE,
pvpath.begin(), pvpath.end());
if (m_topdirweight != 1.0)
tdq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT,
tdq, m_topdirweight);
xq = Xapian::Query(tdop, xq, tdq);
} }
*((Xapian::Query *)d) = xq; *((Xapian::Query *)d) = xq;
@ -847,8 +859,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
(m_parentSearch == 0 && !m_haveWildCards); (m_parentSearch == 0 && !m_haveWildCards);
StringToXapianQ tr(db, m_field, l_stemlang, doBoostUserTerm); StringToXapianQ tr(db, m_field, l_stemlang, doBoostUserTerm);
if (!tr.processUserString(m_text, m_reason, pqueries, if (!tr.processUserString(m_text, m_reason, pqueries, db.getStopList()))
db.getStopList()))
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n")); LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
@ -858,6 +869,9 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
tr.getUTerms(m_uterms); tr.getUTerms(m_uterms);
//listVector("SearchDataClauseSimple: Uterms: ", m_uterms); //listVector("SearchDataClauseSimple: Uterms: ", m_uterms);
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end()); *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true; return true;
} }
@ -887,6 +901,9 @@ bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
more.end()); more.end());
*qp = qp->empty() ? tq : Xapian::Query(Xapian::Query::OP_AND, *qp, tq); *qp = qp->empty() ? tq : Xapian::Query(Xapian::Query::OP_AND, *qp, tq);
} }
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true; return true;
} }
@ -932,6 +949,9 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
tr.getTerms(m_terms, m_groups); tr.getTerms(m_terms, m_groups);
tr.getUTerms(m_uterms); tr.getUTerms(m_uterms);
*qp = *pqueries.begin(); *qp = *pqueries.begin();
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true; return true;
} }

View File

@ -73,8 +73,8 @@ class SearchDataClause;
class SearchData { class SearchData {
public: public:
SearchData(SClType tp) SearchData(SClType tp)
: m_tp(tp), m_topdirexcl(false), m_haveDates(false), : m_tp(tp), m_topdirexcl(false), m_topdirweight(1.0),
m_haveWildCards(false) m_haveDates(false), m_haveWildCards(false)
{ {
if (m_tp != SCLT_OR && m_tp != SCLT_AND) if (m_tp != SCLT_OR && m_tp != SCLT_AND)
m_tp = SCLT_OR; m_tp = SCLT_OR;
@ -104,10 +104,11 @@ public:
bool maybeAddAutoPhrase(); bool maybeAddAutoPhrase();
/** Set/get top subdirectory for filtering results */ /** Set/get top subdirectory for filtering results */
void setTopdir(const string& t, bool excl = false) void setTopdir(const string& t, bool excl = false, float w = 1.0)
{ {
m_topdir = t; m_topdir = t;
m_topdirexcl = excl; m_topdirexcl = excl;
m_topdirweight = w;
} }
/** Set date span for filtering results */ /** Set date span for filtering results */
@ -147,6 +148,7 @@ private:
vector<string> m_nfiletypes; // Unwanted file types vector<string> m_nfiletypes; // Unwanted file types
string m_topdir; // Restrict to subtree. string m_topdir; // Restrict to subtree.
bool m_topdirexcl; // Invert meaning bool m_topdirexcl; // Invert meaning
float m_topdirweight; // affect weight instead of filter
bool m_haveDates; bool m_haveDates;
DateInterval m_dates; // Restrict to date interval DateInterval m_dates; // Restrict to date interval
// Printable expanded version of the complete query, retrieved/set // Printable expanded version of the complete query, retrieved/set
@ -167,7 +169,7 @@ public:
SearchDataClause(SClType tp) SearchDataClause(SClType tp)
: m_tp(tp), m_parentSearch(0), m_haveWildCards(0), : m_tp(tp), m_parentSearch(0), m_haveWildCards(0),
m_modifiers(SDCM_NONE) m_modifiers(SDCM_NONE), m_weight(1.0)
{} {}
virtual ~SearchDataClause() {} virtual ~SearchDataClause() {}
virtual bool toNativeQuery(Rcl::Db &db, void *, const string&) = 0; virtual bool toNativeQuery(Rcl::Db &db, void *, const string&) = 0;
@ -180,7 +182,7 @@ public:
SClType getTp() {return m_tp;} SClType getTp() {return m_tp;}
void setParent(SearchData *p) {m_parentSearch = p;} void setParent(SearchData *p) {m_parentSearch = p;}
virtual void setModifiers(Modifier mod) {m_modifiers = mod;} virtual void setModifiers(Modifier mod) {m_modifiers = mod;}
virtual void setWeight(float w) {m_weight = w;}
friend class SearchData; friend class SearchData;
protected: protected:
@ -189,6 +191,7 @@ protected:
SearchData *m_parentSearch; SearchData *m_parentSearch;
bool m_haveWildCards; bool m_haveWildCards;
Modifier m_modifiers; Modifier m_modifiers;
float m_weight;
private: private:
SearchDataClause(const SearchDataClause&) {} SearchDataClause(const SearchDataClause&) {}
SearchDataClause& operator=(const SearchDataClause&) { SearchDataClause& operator=(const SearchDataClause&) {