Rearranged the query text splitting a bit, and arranged to generate an error when an excessively long term results in a null query (but not when there are other usable terms).
This commit is contained in:
parent
94b94593e3
commit
b53686a084
@ -319,7 +319,7 @@ private:
|
|||||||
* "Simple" data clause with user-entered query text. This can include
|
* "Simple" data clause with user-entered query text. This can include
|
||||||
* multiple phrases and words, but no specified distance.
|
* multiple phrases and words, but no specified distance.
|
||||||
*/
|
*/
|
||||||
class TextSplitQ;
|
class TermProcQ;
|
||||||
class SearchDataClauseSimple : public SearchDataClause {
|
class SearchDataClauseSimple : public SearchDataClause {
|
||||||
public:
|
public:
|
||||||
SearchDataClauseSimple(SClType tp, const std::string& txt,
|
SearchDataClauseSimple(SClType tp, const std::string& txt,
|
||||||
@ -375,7 +375,7 @@ protected:
|
|||||||
void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span,
|
void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span,
|
||||||
int mods, void *pq);
|
int mods, void *pq);
|
||||||
// Process phrase/near element
|
// Process phrase/near element
|
||||||
void processPhraseOrNear(Rcl::Db &db, string& ermsg, TextSplitQ *splitData,
|
void processPhraseOrNear(Rcl::Db &db, string& ermsg, TermProcQ *splitData,
|
||||||
int mods, void *pq, bool useNear, int slack);
|
int mods, void *pq, bool useNear, int slack);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -288,48 +288,50 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Splitter callback for breaking a user string into simple terms and
|
// Splitter for breaking a user string into simple terms and
|
||||||
// phrases. This is for parts of the user entry which would appear as
|
// phrases. This is for parts of the user entry which would appear as
|
||||||
// a single word because there is no white space inside, but are
|
// a single word because there is no white space inside, but are
|
||||||
// actually multiple terms to rcldb (ie term1,term2)
|
// actually multiple terms to rcldb (ie term1,term2). Still, most of
|
||||||
|
// the time, the result of our splitting will be a single term.
|
||||||
class TextSplitQ : public TextSplitP {
|
class TextSplitQ : public TextSplitP {
|
||||||
public:
|
public:
|
||||||
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
|
TextSplitQ(Flags flags, TermProc *prc)
|
||||||
: TextSplitP(prc, flags),
|
: TextSplitP(prc, flags), m_nostemexp(false) {
|
||||||
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
|
}
|
||||||
{}
|
|
||||||
|
|
||||||
bool takeword(const std::string &term, int pos, int bs, int be)
|
bool takeword(const std::string &term, int pos, int bs, int be) {
|
||||||
{
|
|
||||||
// Check if the first letter is a majuscule in which
|
// Check if the first letter is a majuscule in which
|
||||||
// case we do not want to do stem expansion. Need to do this
|
// case we do not want to do stem expansion. Need to do this
|
||||||
// before unac of course...
|
// before unac of course...
|
||||||
curnostemexp = unaciscapital(term);
|
m_nostemexp = unaciscapital(term);
|
||||||
|
|
||||||
return TextSplitP::takeword(term, pos, bs, be);
|
return TextSplitP::takeword(term, pos, bs, be);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool curnostemexp;
|
bool nostemexp() const {
|
||||||
vector<string> terms;
|
return m_nostemexp;
|
||||||
vector<bool> nostemexps;
|
}
|
||||||
const StopList &stops;
|
private:
|
||||||
// Count of terms including stopwords: this is for adjusting
|
bool m_nostemexp;
|
||||||
// phrase/near slack
|
|
||||||
int alltermcount;
|
|
||||||
int lastpos;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class TermProcQ : public TermProc {
|
class TermProcQ : public TermProc {
|
||||||
public:
|
public:
|
||||||
TermProcQ() : TermProc(0), m_ts(0) {}
|
TermProcQ() : TermProc(0), m_alltermcount(0), m_lastpos(0), m_ts(0) {}
|
||||||
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
|
|
||||||
|
// We need a ref to the splitter (only it knows about orig term
|
||||||
|
// capitalization for controlling stemming). The ref can't be set
|
||||||
|
// in the constructor because the splitter is not built yet when
|
||||||
|
// we are born (chicken and egg).
|
||||||
|
void setTSQ(const TextSplitQ *ts) {
|
||||||
|
m_ts = ts;
|
||||||
|
}
|
||||||
|
|
||||||
bool takeword(const std::string &term, int pos, int bs, int be)
|
bool takeword(const std::string &term, int pos, int bs, int be) {
|
||||||
{
|
m_alltermcount++;
|
||||||
m_ts->alltermcount++;
|
if (m_lastpos < pos)
|
||||||
if (m_ts->lastpos < pos)
|
m_lastpos = pos;
|
||||||
m_ts->lastpos = pos;
|
bool noexpand = be ? m_ts->nostemexp() : true;
|
||||||
bool noexpand = be ? m_ts->curnostemexp : true;
|
|
||||||
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
|
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
|
||||||
term.c_str(), pos, noexpand));
|
term.c_str(), pos, noexpand));
|
||||||
if (m_terms[pos].size() < term.size()) {
|
if (m_terms[pos].size() < term.size()) {
|
||||||
@ -338,17 +340,36 @@ public:
|
|||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
bool flush()
|
|
||||||
{
|
bool flush() {
|
||||||
for (map<int, string>::const_iterator it = m_terms.begin();
|
for (map<int, string>::const_iterator it = m_terms.begin();
|
||||||
it != m_terms.end(); it++) {
|
it != m_terms.end(); it++) {
|
||||||
m_ts->terms.push_back(it->second);
|
m_vterms.push_back(it->second);
|
||||||
m_ts->nostemexps.push_back(m_nste[it->first]);
|
m_vnostemexps.push_back(m_nste[it->first]);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int alltermcount() const {
|
||||||
|
return m_alltermcount;
|
||||||
|
}
|
||||||
|
int lastpos() const {
|
||||||
|
return m_lastpos;
|
||||||
|
}
|
||||||
|
const vector<string>& terms() {
|
||||||
|
return m_vterms;
|
||||||
|
}
|
||||||
|
const vector<bool>& nostemexps() {
|
||||||
|
return m_vnostemexps;
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
TextSplitQ *m_ts;
|
// Count of terms including stopwords: this is for adjusting
|
||||||
|
// phrase/near slack
|
||||||
|
int m_alltermcount;
|
||||||
|
int m_lastpos;
|
||||||
|
const TextSplitQ *m_ts;
|
||||||
|
vector<string> m_vterms;
|
||||||
|
vector<bool> m_vnostemexps;
|
||||||
map<int, string> m_terms;
|
map<int, string> m_terms;
|
||||||
map<int, bool> m_nste;
|
map<int, bool> m_nste;
|
||||||
};
|
};
|
||||||
@ -588,7 +609,7 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
|
|||||||
// queries if the terms get expanded by stemming or wildcards (we
|
// queries if the terms get expanded by stemming or wildcards (we
|
||||||
// don't do stemming for PHRASE though)
|
// don't do stemming for PHRASE though)
|
||||||
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
||||||
TextSplitQ *splitData,
|
TermProcQ *splitData,
|
||||||
int mods, void *pq,
|
int mods, void *pq,
|
||||||
bool useNear, int slack)
|
bool useNear, int slack)
|
||||||
{
|
{
|
||||||
@ -613,9 +634,9 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Go through the list and perform stem/wildcard expansion for each element
|
// Go through the list and perform stem/wildcard expansion for each element
|
||||||
vector<bool>::iterator nxit = splitData->nostemexps.begin();
|
vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
|
||||||
for (vector<string>::iterator it = splitData->terms.begin();
|
for (vector<string>::const_iterator it = splitData->terms().begin();
|
||||||
it != splitData->terms.end(); it++, nxit++) {
|
it != splitData->terms().end(); it++, nxit++) {
|
||||||
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
|
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
|
||||||
// Adjust when we do stem expansion. Not if disabled by
|
// Adjust when we do stem expansion. Not if disabled by
|
||||||
// caller, not inside phrases, and some versions of xapian
|
// caller, not inside phrases, and some versions of xapian
|
||||||
@ -660,9 +681,9 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
|||||||
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
||||||
// For phrases, give a relevance boost like we do for original terms
|
// For phrases, give a relevance boost like we do for original terms
|
||||||
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
||||||
splitData->alltermcount, splitData->lastpos));
|
splitData->alltermcount(), splitData->lastpos()));
|
||||||
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
|
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
|
||||||
splitData->lastpos + 1 + slack);
|
splitData->lastpos() + 1 + slack);
|
||||||
if (op == Xapian::Query::OP_PHRASE)
|
if (op == Xapian::Query::OP_PHRASE)
|
||||||
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
|
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
|
||||||
original_term_wqf_booster);
|
original_term_wqf_booster);
|
||||||
@ -772,6 +793,7 @@ bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
|
|||||||
// and the last position
|
// and the last position
|
||||||
|
|
||||||
// The term processing pipeline:
|
// The term processing pipeline:
|
||||||
|
// split -> [unac/case ->] stops -> store terms
|
||||||
TermProcQ tpq;
|
TermProcQ tpq;
|
||||||
TermProc *nxt = &tpq;
|
TermProc *nxt = &tpq;
|
||||||
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
||||||
@ -783,28 +805,28 @@ bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
|
|||||||
|
|
||||||
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||||
TextSplit::TXTS_KEEPWILD),
|
TextSplit::TXTS_KEEPWILD),
|
||||||
stops, nxt);
|
nxt);
|
||||||
tpq.setTSQ(&splitter);
|
tpq.setTSQ(&splitter);
|
||||||
splitter.text_to_words(*it);
|
splitter.text_to_words(*it);
|
||||||
|
|
||||||
slack += splitter.lastpos - splitter.terms.size() + 1;
|
slack += tpq.lastpos() - tpq.terms().size() + 1;
|
||||||
|
|
||||||
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
|
LOGDEB0(("strToXapianQ: termcount: %d\n", tpq.terms().size()));
|
||||||
switch (splitter.terms.size() + terminc) {
|
switch (tpq.terms().size() + terminc) {
|
||||||
case 0:
|
case 0:
|
||||||
continue;// ??
|
continue;// ??
|
||||||
case 1: {
|
case 1: {
|
||||||
int lmods = mods;
|
int lmods = mods;
|
||||||
if (splitter.nostemexps.front())
|
if (tpq.nostemexps().front())
|
||||||
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
||||||
m_hldata.ugroups.push_back(splitter.terms);
|
m_hldata.ugroups.push_back(tpq.terms());
|
||||||
processSimpleSpan(db, ermsg, splitter.terms.front(),
|
processSimpleSpan(db, ermsg, tpq.terms().front(),
|
||||||
lmods, &pqueries);
|
lmods, &pqueries);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
m_hldata.ugroups.push_back(splitter.terms);
|
m_hldata.ugroups.push_back(tpq.terms());
|
||||||
processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
|
processPhraseOrNear(db, ermsg, &tpq, mods, &pqueries,
|
||||||
useNear, slack);
|
useNear, slack);
|
||||||
}
|
}
|
||||||
if (m_curcl >= getMaxCl()) {
|
if (m_curcl >= getMaxCl()) {
|
||||||
@ -846,6 +868,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
|
|||||||
case SCLT_OR: op = Xapian::Query::OP_OR; break;
|
case SCLT_OR: op = Xapian::Query::OP_OR; break;
|
||||||
default:
|
default:
|
||||||
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
||||||
|
m_reason = "Internal error";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -854,7 +877,9 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
|
|||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
||||||
return true;
|
m_reason = string("Resolved to null query. Term too long ? : [" +
|
||||||
|
m_text + string("]"));
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
|
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
|
||||||
@ -970,7 +995,9 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
|
|||||||
return false;
|
return false;
|
||||||
if (pqueries.empty()) {
|
if (pqueries.empty()) {
|
||||||
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
||||||
return true;
|
m_reason = string("Resolved to null query. Term too long ? : [" +
|
||||||
|
m_text + string("]"));
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
*qp = *pqueries.begin();
|
*qp = *pqueries.begin();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user