Rearranged the query text splitting a bit, and arranged to generate an error when an excessively long term results in a null query (but not when there are other usable terms).

This commit is contained in:
Jean-Francois Dockes 2015-06-09 19:44:19 +02:00
parent 94b94593e3
commit b53686a084
2 changed files with 76 additions and 49 deletions

View File

@ -319,7 +319,7 @@ private:
* "Simple" data clause with user-entered query text. This can include * "Simple" data clause with user-entered query text. This can include
* multiple phrases and words, but no specified distance. * multiple phrases and words, but no specified distance.
*/ */
class TextSplitQ; class TermProcQ;
class SearchDataClauseSimple : public SearchDataClause { class SearchDataClauseSimple : public SearchDataClause {
public: public:
SearchDataClauseSimple(SClType tp, const std::string& txt, SearchDataClauseSimple(SClType tp, const std::string& txt,
@ -375,7 +375,7 @@ protected:
void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span, void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span,
int mods, void *pq); int mods, void *pq);
// Process phrase/near element // Process phrase/near element
void processPhraseOrNear(Rcl::Db &db, string& ermsg, TextSplitQ *splitData, void processPhraseOrNear(Rcl::Db &db, string& ermsg, TermProcQ *splitData,
int mods, void *pq, bool useNear, int slack); int mods, void *pq, bool useNear, int slack);
}; };

View File

@ -288,48 +288,50 @@ bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
return true; return true;
} }
// Splitter callback for breaking a user string into simple terms and // Splitter for breaking a user string into simple terms and
// phrases. This is for parts of the user entry which would appear as // phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are // a single word because there is no white space inside, but are
// actually multiple terms to rcldb (ie term1,term2) // actually multiple terms to rcldb (ie term1,term2). Still, most of
// the time, the result of our splitting will be a single term.
class TextSplitQ : public TextSplitP { class TextSplitQ : public TextSplitP {
public: public:
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc) TextSplitQ(Flags flags, TermProc *prc)
: TextSplitP(prc, flags), : TextSplitP(prc, flags), m_nostemexp(false) {
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0) }
{}
bool takeword(const std::string &term, int pos, int bs, int be) bool takeword(const std::string &term, int pos, int bs, int be) {
{
// Check if the first letter is a majuscule in which // Check if the first letter is a majuscule in which
// case we do not want to do stem expansion. Need to do this // case we do not want to do stem expansion. Need to do this
// before unac of course... // before unac of course...
curnostemexp = unaciscapital(term); m_nostemexp = unaciscapital(term);
return TextSplitP::takeword(term, pos, bs, be); return TextSplitP::takeword(term, pos, bs, be);
} }
bool curnostemexp; bool nostemexp() const {
vector<string> terms; return m_nostemexp;
vector<bool> nostemexps; }
const StopList &stops; private:
// Count of terms including stopwords: this is for adjusting bool m_nostemexp;
// phrase/near slack
int alltermcount;
int lastpos;
}; };
class TermProcQ : public TermProc { class TermProcQ : public TermProc {
public: public:
TermProcQ() : TermProc(0), m_ts(0) {} TermProcQ() : TermProc(0), m_alltermcount(0), m_lastpos(0), m_ts(0) {}
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
// We need a ref to the splitter (only it knows about orig term
// capitalization for controlling stemming). The ref can't be set
// in the constructor because the splitter is not built yet when
// we are born (chicken and egg).
void setTSQ(const TextSplitQ *ts) {
m_ts = ts;
}
bool takeword(const std::string &term, int pos, int bs, int be) bool takeword(const std::string &term, int pos, int bs, int be) {
{ m_alltermcount++;
m_ts->alltermcount++; if (m_lastpos < pos)
if (m_ts->lastpos < pos) m_lastpos = pos;
m_ts->lastpos = pos; bool noexpand = be ? m_ts->nostemexp() : true;
bool noexpand = be ? m_ts->curnostemexp : true;
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
term.c_str(), pos, noexpand)); term.c_str(), pos, noexpand));
if (m_terms[pos].size() < term.size()) { if (m_terms[pos].size() < term.size()) {
@ -338,17 +340,36 @@ public:
} }
return true; return true;
} }
bool flush()
{ bool flush() {
for (map<int, string>::const_iterator it = m_terms.begin(); for (map<int, string>::const_iterator it = m_terms.begin();
it != m_terms.end(); it++) { it != m_terms.end(); it++) {
m_ts->terms.push_back(it->second); m_vterms.push_back(it->second);
m_ts->nostemexps.push_back(m_nste[it->first]); m_vnostemexps.push_back(m_nste[it->first]);
} }
return true; return true;
} }
int alltermcount() const {
return m_alltermcount;
}
int lastpos() const {
return m_lastpos;
}
const vector<string>& terms() {
return m_vterms;
}
const vector<bool>& nostemexps() {
return m_vnostemexps;
}
private: private:
TextSplitQ *m_ts; // Count of terms including stopwords: this is for adjusting
// phrase/near slack
int m_alltermcount;
int m_lastpos;
const TextSplitQ *m_ts;
vector<string> m_vterms;
vector<bool> m_vnostemexps;
map<int, string> m_terms; map<int, string> m_terms;
map<int, bool> m_nste; map<int, bool> m_nste;
}; };
@ -588,7 +609,7 @@ void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
// queries if the terms get expanded by stemming or wildcards (we // queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though) // don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
TextSplitQ *splitData, TermProcQ *splitData,
int mods, void *pq, int mods, void *pq,
bool useNear, int slack) bool useNear, int slack)
{ {
@ -613,9 +634,9 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
} }
// Go through the list and perform stem/wildcard expansion for each element // Go through the list and perform stem/wildcard expansion for each element
vector<bool>::iterator nxit = splitData->nostemexps.begin(); vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
for (vector<string>::iterator it = splitData->terms.begin(); for (vector<string>::const_iterator it = splitData->terms().begin();
it != splitData->terms.end(); it++, nxit++) { it != splitData->terms().end(); it++, nxit++) {
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str())); LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
// Adjust when we do stem expansion. Not if disabled by // Adjust when we do stem expansion. Not if disabled by
// caller, not inside phrases, and some versions of xapian // caller, not inside phrases, and some versions of xapian
@ -660,9 +681,9 @@ void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
// Generate an appropriate PHRASE/NEAR query with adjusted slack // Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms // For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
splitData->alltermcount, splitData->lastpos)); splitData->alltermcount(), splitData->lastpos()));
Xapian::Query xq(op, orqueries.begin(), orqueries.end(), Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
splitData->lastpos + 1 + slack); splitData->lastpos() + 1 + slack);
if (op == Xapian::Query::OP_PHRASE) if (op == Xapian::Query::OP_PHRASE)
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
original_term_wqf_booster); original_term_wqf_booster);
@ -772,6 +793,7 @@ bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
// and the last position // and the last position
// The term processing pipeline: // The term processing pipeline:
// split -> [unac/case ->] stops -> store terms
TermProcQ tpq; TermProcQ tpq;
TermProc *nxt = &tpq; TermProc *nxt = &tpq;
TermProcStop tpstop(nxt, stops); nxt = &tpstop; TermProcStop tpstop(nxt, stops); nxt = &tpstop;
@ -783,28 +805,28 @@ bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD), TextSplit::TXTS_KEEPWILD),
stops, nxt); nxt);
tpq.setTSQ(&splitter); tpq.setTSQ(&splitter);
splitter.text_to_words(*it); splitter.text_to_words(*it);
slack += splitter.lastpos - splitter.terms.size() + 1; slack += tpq.lastpos() - tpq.terms().size() + 1;
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size())); LOGDEB0(("strToXapianQ: termcount: %d\n", tpq.terms().size()));
switch (splitter.terms.size() + terminc) { switch (tpq.terms().size() + terminc) {
case 0: case 0:
continue;// ?? continue;// ??
case 1: { case 1: {
int lmods = mods; int lmods = mods;
if (splitter.nostemexps.front()) if (tpq.nostemexps().front())
lmods |= SearchDataClause::SDCM_NOSTEMMING; lmods |= SearchDataClause::SDCM_NOSTEMMING;
m_hldata.ugroups.push_back(splitter.terms); m_hldata.ugroups.push_back(tpq.terms());
processSimpleSpan(db, ermsg, splitter.terms.front(), processSimpleSpan(db, ermsg, tpq.terms().front(),
lmods, &pqueries); lmods, &pqueries);
} }
break; break;
default: default:
m_hldata.ugroups.push_back(splitter.terms); m_hldata.ugroups.push_back(tpq.terms());
processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries, processPhraseOrNear(db, ermsg, &tpq, mods, &pqueries,
useNear, slack); useNear, slack);
} }
if (m_curcl >= getMaxCl()) { if (m_curcl >= getMaxCl()) {
@ -846,6 +868,7 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
case SCLT_OR: op = Xapian::Query::OP_OR; break; case SCLT_OR: op = Xapian::Query::OP_OR; break;
default: default:
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp)); LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
m_reason = "Internal error";
return false; return false;
} }
@ -854,7 +877,9 @@ bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n")); LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
return true; m_reason = string("Resolved to null query. Term too long ? : [" +
m_text + string("]"));
return false;
} }
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end()); *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
@ -970,7 +995,9 @@ bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
return false; return false;
if (pqueries.empty()) { if (pqueries.empty()) {
LOGERR(("SearchDataClauseDist: resolved to null query\n")); LOGERR(("SearchDataClauseDist: resolved to null query\n"));
return true; m_reason = string("Resolved to null query. Term too long ? : [" +
m_text + string("]"));
return false;
} }
*qp = *pqueries.begin(); *qp = *pqueries.begin();