Fixed the autophrase fix: the query tree needs to be fully flattened for it to work
This commit is contained in:
parent
3e9581345d
commit
2a69d30701
@ -25,6 +25,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
#include <iostream>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
#include "xapian.h"
|
#include "xapian.h"
|
||||||
@ -69,7 +70,7 @@ SearchData::~SearchData()
|
|||||||
{
|
{
|
||||||
LOGDEB0(("SearchData::~SearchData\n"));
|
LOGDEB0(("SearchData::~SearchData\n"));
|
||||||
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
||||||
delete *it;
|
delete *it;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is called by the GUI simple search if the option is set: add
|
// This is called by the GUI simple search if the option is set: add
|
||||||
@ -79,9 +80,13 @@ SearchData::~SearchData()
|
|||||||
bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
||||||
{
|
{
|
||||||
LOGDEB0(("SearchData::maybeAddAutoPhrase()\n"));
|
LOGDEB0(("SearchData::maybeAddAutoPhrase()\n"));
|
||||||
|
// cerr << "BEFORE SIMPLIFY\n"; dump(cerr);
|
||||||
|
simplify();
|
||||||
|
// cerr << "AFTER SIMPLIFY\n"; dump(cerr);
|
||||||
|
|
||||||
if (!m_query.size()) {
|
if (!m_query.size()) {
|
||||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: empty query\n"));
|
LOGDEB2(("SearchData::maybeAddAutoPhrase: empty query\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
string field;
|
string field;
|
||||||
@ -89,53 +94,39 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
|||||||
// Walk the clause list. If we find any non simple clause or different
|
// Walk the clause list. If we find any non simple clause or different
|
||||||
// field names, bail out.
|
// field names, bail out.
|
||||||
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
|
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
|
||||||
SClType tp = (*it)->m_tp;
|
SClType tp = (*it)->m_tp;
|
||||||
SearchDataClauseSimple *clp = 0;
|
if (tp != SCLT_AND && tp != SCLT_OR) {
|
||||||
if (tp == SCLT_SUB) {
|
|
||||||
// The query language parser produces subqueries for simple terms
|
|
||||||
SearchDataClauseSub *subclp =
|
|
||||||
dynamic_cast<SearchDataClauseSub*>(*it);
|
|
||||||
if (subclp == 0) {
|
|
||||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: "
|
|
||||||
"dyncast to clauseSub failed\n"));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!subclp->getSub()->singleSimple()) {
|
|
||||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: !pureSingle\n"));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
clp = dynamic_cast<SearchDataClauseSimple*>(
|
|
||||||
*(subclp->getSub()->m_query.begin()));
|
|
||||||
} else if (tp != SCLT_AND && tp != SCLT_OR) {
|
|
||||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: wrong tp %d\n", tp));
|
LOGDEB2(("SearchData::maybeAddAutoPhrase: wrong tp %d\n", tp));
|
||||||
return false;
|
return false;
|
||||||
} else {
|
}
|
||||||
clp = dynamic_cast<SearchDataClauseSimple*>(*it);
|
SearchDataClauseSimple *clp =
|
||||||
|
dynamic_cast<SearchDataClauseSimple*>(*it);
|
||||||
|
if (clp == 0) {
|
||||||
|
LOGDEB2(("SearchData::maybeAddAutoPhrase: dyncast failed\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (it == m_query.begin()) {
|
||||||
|
field = clp->getfield();
|
||||||
|
} else {
|
||||||
|
if (clp->getfield().compare(field)) {
|
||||||
|
LOGDEB2(("SearchData::maybeAddAutoPhrase: diff. fields\n"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (clp == 0) {
|
|
||||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: dyncast failed\n"));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (it == m_query.begin()) {
|
|
||||||
field = clp->getfield();
|
|
||||||
} else {
|
|
||||||
if (clp->getfield().compare(field)) {
|
|
||||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: diff. fields\n"));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If there are wildcards or quotes in there, bail out
|
// If there are wildcards or quotes in there, bail out
|
||||||
if (clp->gettext().find_first_of("\"*[?") != string::npos) {
|
if (clp->gettext().find_first_of("\"*[?") != string::npos) {
|
||||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n"));
|
LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Do a simple word-split here, don't bother with the full-blown
|
|
||||||
// textsplit. The autophrase thing is just "best effort", it's
|
// Do a simple word-split here, not the full-blown
|
||||||
// normal that it won't work in strange cases.
|
// textsplit. Spans of stopwords should not be trimmed later
|
||||||
vector<string> wl;
|
// in this function, they will be properly split when the
|
||||||
stringToStrings(clp->gettext(), wl);
|
// phrase gets processed by toNativeQuery() later on.
|
||||||
words.insert(words.end(), wl.begin(), wl.end());
|
vector<string> wl;
|
||||||
|
stringToStrings(clp->gettext(), wl);
|
||||||
|
words.insert(words.end(), wl.begin(), wl.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -144,27 +135,27 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
|||||||
int slack = 0;
|
int slack = 0;
|
||||||
int doccnt = db.docCnt();
|
int doccnt = db.docCnt();
|
||||||
if (!doccnt)
|
if (!doccnt)
|
||||||
doccnt = 1;
|
doccnt = 1;
|
||||||
string swords;
|
string swords;
|
||||||
for (vector<string>::iterator it = words.begin();
|
for (vector<string>::iterator it = words.begin();
|
||||||
it != words.end(); it++) {
|
it != words.end(); it++) {
|
||||||
double freq = double(db.termDocCnt(*it)) / doccnt;
|
double freq = double(db.termDocCnt(*it)) / doccnt;
|
||||||
if (freq < freqThreshold) {
|
if (freq < freqThreshold) {
|
||||||
if (!swords.empty())
|
if (!swords.empty())
|
||||||
swords.append(1, ' ');
|
swords.append(1, ' ');
|
||||||
swords += *it;
|
swords += *it;
|
||||||
} else {
|
} else {
|
||||||
LOGDEB0(("Autophrase: [%s] too frequent (%.2f %%)\n",
|
LOGDEB0(("SearchData::Autophrase: [%s] too frequent (%.2f %%)\n",
|
||||||
it->c_str(), 100 * freq));
|
it->c_str(), 100 * freq));
|
||||||
slack++;
|
slack++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// We can't make a phrase with a single word :)
|
// We can't make a phrase with a single word :)
|
||||||
int nwords = TextSplit::countWords(swords);
|
int nwords = TextSplit::countWords(swords);
|
||||||
if (nwords <= 1) {
|
if (nwords <= 1) {
|
||||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: ended with 1 word\n"));
|
LOGDEB2(("SearchData::maybeAddAutoPhrase: ended with 1 word\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Increase the slack: we want to be a little more laxist than for
|
// Increase the slack: we want to be a little more laxist than for
|
||||||
@ -172,7 +163,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
|||||||
slack += 1 + nwords / 3;
|
slack += 1 + nwords / 3;
|
||||||
|
|
||||||
m_autophrase = RefCntr<SearchDataClauseDist>(
|
m_autophrase = RefCntr<SearchDataClauseDist>(
|
||||||
new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field));
|
new SearchDataClauseDist(SCLT_PHRASE, swords, slack, field));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -180,9 +171,9 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
|||||||
bool SearchData::addClause(SearchDataClause* cl)
|
bool SearchData::addClause(SearchDataClause* cl)
|
||||||
{
|
{
|
||||||
if (m_tp == SCLT_OR && cl->getexclude()) {
|
if (m_tp == SCLT_OR && cl->getexclude()) {
|
||||||
LOGERR(("SearchData::addClause: cant add EXCL to OR list\n"));
|
LOGERR(("SearchData::addClause: cant add EXCL to OR list\n"));
|
||||||
m_reason = "No Negative (AND_NOT) clauses allowed in OR queries";
|
m_reason = "No Negative (AND_NOT) clauses allowed in OR queries";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
cl->setParent(this);
|
cl->setParent(this);
|
||||||
m_haveWildCards = m_haveWildCards || cl->m_haveWildCards;
|
m_haveWildCards = m_haveWildCards || cl->m_haveWildCards;
|
||||||
@ -194,11 +185,70 @@ bool SearchData::addClause(SearchDataClause* cl)
|
|||||||
bool SearchData::fileNameOnly()
|
bool SearchData::fileNameOnly()
|
||||||
{
|
{
|
||||||
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
||||||
if (!(*it)->isFileName())
|
if (!(*it)->isFileName())
|
||||||
return false;
|
return false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SearchData::simplify()
|
||||||
|
{
|
||||||
|
for (unsigned int i = 0; i < m_query.size(); i++) {
|
||||||
|
if (m_query[i]->m_tp != SCLT_SUB)
|
||||||
|
continue;
|
||||||
|
//C[est ce dyncast qui crashe??
|
||||||
|
SearchDataClauseSub *clsubp =
|
||||||
|
dynamic_cast<SearchDataClauseSub*>(m_query[i]);
|
||||||
|
if (clsubp == 0) {
|
||||||
|
// ??
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (clsubp->getSub()->m_tp != m_tp)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
clsubp->getSub()->simplify();
|
||||||
|
|
||||||
|
// If this subquery has special attributes, it's not a
|
||||||
|
// candidate for collapsing
|
||||||
|
if (!clsubp->getSub()->m_filetypes.empty() ||
|
||||||
|
!clsubp->getSub()->m_nfiletypes.empty() ||
|
||||||
|
clsubp->getSub()->m_haveDates ||
|
||||||
|
clsubp->getSub()->m_maxSize != size_t(-1) ||
|
||||||
|
clsubp->getSub()->m_minSize != size_t(-1) ||
|
||||||
|
clsubp->getSub()->m_haveWildCards)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
bool allsametp = true;
|
||||||
|
for (qlist_it_t it1 = clsubp->getSub()->m_query.begin();
|
||||||
|
it1 != clsubp->getSub()->m_query.end(); it1++) {
|
||||||
|
// We want all AND or OR clause, and same as our conjunction
|
||||||
|
if (((*it1)->getTp() != SCLT_AND && (*it1)->getTp() != SCLT_OR) ||
|
||||||
|
(*it1)->getTp() != m_tp) {
|
||||||
|
allsametp = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!allsametp)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// All ok: delete the clause_sub, and insert the queries from
|
||||||
|
// its searchdata in its place
|
||||||
|
m_query.erase(m_query.begin() + i);
|
||||||
|
m_query.insert(m_query.begin() + i,
|
||||||
|
clsubp->getSub()->m_query.begin(),
|
||||||
|
clsubp->getSub()->m_query.end());
|
||||||
|
for (unsigned int j = i;
|
||||||
|
j < i + clsubp->getSub()->m_query.size(); j++) {
|
||||||
|
m_query[j]->setParent(this);
|
||||||
|
}
|
||||||
|
i += clsubp->getSub()->m_query.size() - 1;
|
||||||
|
|
||||||
|
// We don't want the clauses to be deleted when the parent is, as we
|
||||||
|
// know own them.
|
||||||
|
clsubp->getSub()->m_query.clear();
|
||||||
|
delete clsubp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool SearchData::singleSimple()
|
bool SearchData::singleSimple()
|
||||||
{
|
{
|
||||||
if (m_query.size() != 1 || !m_filetypes.empty() || !m_nfiletypes.empty() ||
|
if (m_query.size() != 1 || !m_filetypes.empty() || !m_nfiletypes.empty() ||
|
||||||
@ -206,8 +256,9 @@ bool SearchData::singleSimple()
|
|||||||
m_haveWildCards)
|
m_haveWildCards)
|
||||||
return false;
|
return false;
|
||||||
SearchDataClause *clp = *m_query.begin();
|
SearchDataClause *clp = *m_query.begin();
|
||||||
if (clp->getTp() != SCLT_AND && clp->getTp() != SCLT_OR)
|
if (clp->getTp() != SCLT_AND && clp->getTp() != SCLT_OR) {
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -215,8 +266,76 @@ bool SearchData::singleSimple()
|
|||||||
// Collect the highlight data for the whole query: every clause, in
// list order, is asked to add its own terms to hld.
void SearchData::getTerms(HighlightData &hld) const
{
    qlist_cit_t cl = m_query.begin();
    while (cl != m_query.end()) {
        (*cl)->getTerms(hld);
        ++cl;
    }
}
|
||||||
|
|
||||||
|
void SearchData::dump(ostream& o) const
|
||||||
|
{
|
||||||
|
o << "SearchData: " << " qs " << int(m_query.size()) <<
|
||||||
|
" ft " << m_filetypes.size() << " nft " << m_nfiletypes.size() <<
|
||||||
|
" hd " << m_haveDates << " maxs " << int(m_maxSize) << " mins " <<
|
||||||
|
int(m_minSize) << " wc " << m_haveWildCards << "\n";
|
||||||
|
for (std::vector<SearchDataClause*>::const_iterator it =
|
||||||
|
m_query.begin(); it != m_query.end(); it++) {
|
||||||
|
(*it)->dump(o);
|
||||||
|
o << "\n";
|
||||||
|
}
|
||||||
|
o << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
void SearchDataClause::dump(ostream& o) const
|
||||||
|
{
|
||||||
|
o << "SearchDataClause??";
|
||||||
|
}
|
||||||
|
|
||||||
|
void SearchDataClauseSimple::dump(ostream& o) const
|
||||||
|
{
|
||||||
|
o << "ClauseSimple: ";
|
||||||
|
if (m_exclude)
|
||||||
|
o << "- ";
|
||||||
|
o << "[" ;
|
||||||
|
if (!m_field.empty())
|
||||||
|
o << m_field << " : ";
|
||||||
|
o << m_text << "]";
|
||||||
|
}
|
||||||
|
|
||||||
|
void SearchDataClauseFilename::dump(ostream& o) const
|
||||||
|
{
|
||||||
|
o << "ClauseFN: ";
|
||||||
|
if (m_exclude)
|
||||||
|
o << " - ";
|
||||||
|
o << "[" << m_text << "]";
|
||||||
|
}
|
||||||
|
|
||||||
|
void SearchDataClausePath::dump(ostream& o) const
|
||||||
|
{
|
||||||
|
o << "ClausePath: ";
|
||||||
|
if (m_exclude)
|
||||||
|
o << " - ";
|
||||||
|
o << "[" << m_text << "]";
|
||||||
|
}
|
||||||
|
|
||||||
|
void SearchDataClauseDist::dump(ostream& o) const
|
||||||
|
{
|
||||||
|
if (m_tp == SCLT_NEAR)
|
||||||
|
o << "ClauseDist: NEAR: ";
|
||||||
|
else
|
||||||
|
o << "ClauseDist: PHRA: ";
|
||||||
|
|
||||||
|
if (m_exclude)
|
||||||
|
o << " - ";
|
||||||
|
o << "[";
|
||||||
|
if (!m_field.empty())
|
||||||
|
o << m_field << " : ";
|
||||||
|
o << m_text << "]";
|
||||||
|
}
|
||||||
|
|
||||||
|
void SearchDataClauseSub::dump(ostream& o) const
|
||||||
|
{
|
||||||
|
o << "ClauseSub {\n";
|
||||||
|
m_sub.getconstptr()->dump(o);
|
||||||
|
o << "}";
|
||||||
|
}
|
||||||
|
|
||||||
} // Namespace Rcl
|
} // Namespace Rcl
|
||||||
|
|||||||
@ -26,6 +26,7 @@
|
|||||||
*/
|
*/
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <ostream>
|
||||||
|
|
||||||
#include "rcldb.h"
|
#include "rcldb.h"
|
||||||
#include "refcntr.h"
|
#include "refcntr.h"
|
||||||
@ -161,6 +162,7 @@ public:
|
|||||||
int getMaxExp() {return m_maxexp;}
|
int getMaxExp() {return m_maxexp;}
|
||||||
int getMaxCl() {return m_maxcl;}
|
int getMaxCl() {return m_maxcl;}
|
||||||
int getSoftMaxExp() {return m_softmaxexpand;}
|
int getSoftMaxExp() {return m_softmaxexpand;}
|
||||||
|
void dump(ostream& o) const;
|
||||||
|
|
||||||
friend class ::AdvSearch;
|
friend class ::AdvSearch;
|
||||||
|
|
||||||
@ -207,6 +209,11 @@ private:
|
|||||||
// value during "find-as-you-type" operations from the GUI
|
// value during "find-as-you-type" operations from the GUI
|
||||||
int m_softmaxexpand;
|
int m_softmaxexpand;
|
||||||
|
|
||||||
|
// Collapse bogus subqueries generated by the query parser, mostly
|
||||||
|
// so that we can check if this is an autophrase candidate (else
|
||||||
|
// Xapian will do it anyway)
|
||||||
|
void simplify();
|
||||||
|
|
||||||
bool expandFileTypes(Rcl::Db &db, std::vector<std::string>& exptps);
|
bool expandFileTypes(Rcl::Db &db, std::vector<std::string>& exptps);
|
||||||
bool clausesToQuery(Rcl::Db &db, SClType tp,
|
bool clausesToQuery(Rcl::Db &db, SClType tp,
|
||||||
std::vector<SearchDataClause*>& query,
|
std::vector<SearchDataClause*>& query,
|
||||||
@ -297,6 +304,8 @@ public:
|
|||||||
virtual Relation getrel() {
|
virtual Relation getrel() {
|
||||||
return m_rel;
|
return m_rel;
|
||||||
}
|
}
|
||||||
|
virtual void dump(ostream& o) const;
|
||||||
|
|
||||||
friend class SearchData;
|
friend class SearchData;
|
||||||
protected:
|
protected:
|
||||||
std::string m_reason;
|
std::string m_reason;
|
||||||
@ -361,6 +370,8 @@ public:
|
|||||||
virtual void setfield(const string& field) {
|
virtual void setfield(const string& field) {
|
||||||
m_field = field;
|
m_field = field;
|
||||||
}
|
}
|
||||||
|
virtual void dump(ostream& o) const;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
std::string m_text; // Raw user entry text.
|
std::string m_text; // Raw user entry text.
|
||||||
std::string m_field; // Field specification if any
|
std::string m_field; // Field specification if any
|
||||||
@ -382,6 +393,7 @@ protected:
|
|||||||
int mods, void *pq, bool useNear, int slack);
|
int mods, void *pq, bool useNear, int slack);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filename search clause. This is special because term expansion is only
|
* Filename search clause. This is special because term expansion is only
|
||||||
* performed against the unsplit file name terms.
|
* performed against the unsplit file name terms.
|
||||||
@ -404,6 +416,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||||
|
virtual void dump(ostream& o) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -441,7 +454,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||||
|
virtual void dump(ostream& o) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -468,6 +481,7 @@ public:
|
|||||||
virtual void setslack(int slack) {
|
virtual void setslack(int slack) {
|
||||||
m_slack = slack;
|
m_slack = slack;
|
||||||
}
|
}
|
||||||
|
virtual void dump(ostream& o) const;
|
||||||
private:
|
private:
|
||||||
int m_slack;
|
int m_slack;
|
||||||
};
|
};
|
||||||
@ -494,6 +508,8 @@ public:
|
|||||||
virtual RefCntr<SearchData> getSub() {
|
virtual RefCntr<SearchData> getSub() {
|
||||||
return m_sub;
|
return m_sub;
|
||||||
}
|
}
|
||||||
|
virtual void dump(ostream& o) const;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
RefCntr<SearchData> m_sub;
|
RefCntr<SearchData> m_sub;
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user