Fixed the autophrase fix: need a full tree flatten to work
This commit is contained in:
parent
3e9581345d
commit
2a69d30701
@ -25,6 +25,7 @@
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
#include "xapian.h"
|
||||
@ -79,6 +80,10 @@ SearchData::~SearchData()
|
||||
bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
||||
{
|
||||
LOGDEB0(("SearchData::maybeAddAutoPhrase()\n"));
|
||||
// cerr << "BEFORE SIMPLIFY\n"; dump(cerr);
|
||||
simplify();
|
||||
// cerr << "AFTER SIMPLIFY\n"; dump(cerr);
|
||||
|
||||
if (!m_query.size()) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: empty query\n"));
|
||||
return false;
|
||||
@ -90,28 +95,12 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
||||
// field names, bail out.
|
||||
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
|
||||
SClType tp = (*it)->m_tp;
|
||||
SearchDataClauseSimple *clp = 0;
|
||||
if (tp == SCLT_SUB) {
|
||||
// The query language parser produces subqueries for simple terms
|
||||
SearchDataClauseSub *subclp =
|
||||
dynamic_cast<SearchDataClauseSub*>(*it);
|
||||
if (subclp == 0) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: "
|
||||
"dyncast to clauseSub failed\n"));
|
||||
return false;
|
||||
}
|
||||
if (!subclp->getSub()->singleSimple()) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: !pureSingle\n"));
|
||||
return false;
|
||||
}
|
||||
clp = dynamic_cast<SearchDataClauseSimple*>(
|
||||
*(subclp->getSub()->m_query.begin()));
|
||||
} else if (tp != SCLT_AND && tp != SCLT_OR) {
|
||||
if (tp != SCLT_AND && tp != SCLT_OR) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: wrong tp %d\n", tp));
|
||||
return false;
|
||||
} else {
|
||||
clp = dynamic_cast<SearchDataClauseSimple*>(*it);
|
||||
}
|
||||
SearchDataClauseSimple *clp =
|
||||
dynamic_cast<SearchDataClauseSimple*>(*it);
|
||||
if (clp == 0) {
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: dyncast failed\n"));
|
||||
return false;
|
||||
@ -130,9 +119,11 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
||||
LOGDEB2(("SearchData::maybeAddAutoPhrase: wildcards\n"));
|
||||
return false;
|
||||
}
|
||||
// Do a simple word-split here, don't bother with the full-blown
|
||||
// textsplit. The autophrase thing is just "best effort", it's
|
||||
// normal that it won't work in strange cases.
|
||||
|
||||
// Do a simple word-split here, not the full-blown
|
||||
// textsplit. Spans of stopwords should not be trimmed later
|
||||
// in this function, they will be properly split when the
|
||||
// phrase gets processed by toNativeQuery() later on.
|
||||
vector<string> wl;
|
||||
stringToStrings(clp->gettext(), wl);
|
||||
words.insert(words.end(), wl.begin(), wl.end());
|
||||
@ -154,7 +145,7 @@ bool SearchData::maybeAddAutoPhrase(Rcl::Db& db, double freqThreshold)
|
||||
swords.append(1, ' ');
|
||||
swords += *it;
|
||||
} else {
|
||||
LOGDEB0(("Autophrase: [%s] too frequent (%.2f %%)\n",
|
||||
LOGDEB0(("SearchData::Autophrase: [%s] too frequent (%.2f %%)\n",
|
||||
it->c_str(), 100 * freq));
|
||||
slack++;
|
||||
}
|
||||
@ -199,6 +190,65 @@ bool SearchData::fileNameOnly()
|
||||
return true;
|
||||
}
|
||||
|
||||
void SearchData::simplify()
|
||||
{
|
||||
for (unsigned int i = 0; i < m_query.size(); i++) {
|
||||
if (m_query[i]->m_tp != SCLT_SUB)
|
||||
continue;
|
||||
//C[est ce dyncast qui crashe??
|
||||
SearchDataClauseSub *clsubp =
|
||||
dynamic_cast<SearchDataClauseSub*>(m_query[i]);
|
||||
if (clsubp == 0) {
|
||||
// ??
|
||||
continue;
|
||||
}
|
||||
if (clsubp->getSub()->m_tp != m_tp)
|
||||
continue;
|
||||
|
||||
clsubp->getSub()->simplify();
|
||||
|
||||
// If this subquery has special attributes, it's not a
|
||||
// candidate for collapsing
|
||||
if (!clsubp->getSub()->m_filetypes.empty() ||
|
||||
!clsubp->getSub()->m_nfiletypes.empty() ||
|
||||
clsubp->getSub()->m_haveDates ||
|
||||
clsubp->getSub()->m_maxSize != size_t(-1) ||
|
||||
clsubp->getSub()->m_minSize != size_t(-1) ||
|
||||
clsubp->getSub()->m_haveWildCards)
|
||||
continue;
|
||||
|
||||
bool allsametp = true;
|
||||
for (qlist_it_t it1 = clsubp->getSub()->m_query.begin();
|
||||
it1 != clsubp->getSub()->m_query.end(); it1++) {
|
||||
// We want all AND or OR clause, and same as our conjunction
|
||||
if (((*it1)->getTp() != SCLT_AND && (*it1)->getTp() != SCLT_OR) ||
|
||||
(*it1)->getTp() != m_tp) {
|
||||
allsametp = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!allsametp)
|
||||
continue;
|
||||
|
||||
// All ok: delete the clause_sub, and insert the queries from
|
||||
// its searchdata in its place
|
||||
m_query.erase(m_query.begin() + i);
|
||||
m_query.insert(m_query.begin() + i,
|
||||
clsubp->getSub()->m_query.begin(),
|
||||
clsubp->getSub()->m_query.end());
|
||||
for (unsigned int j = i;
|
||||
j < i + clsubp->getSub()->m_query.size(); j++) {
|
||||
m_query[j]->setParent(this);
|
||||
}
|
||||
i += clsubp->getSub()->m_query.size() - 1;
|
||||
|
||||
// We don't want the clauses to be deleted when the parent is, as we
|
||||
// know own them.
|
||||
clsubp->getSub()->m_query.clear();
|
||||
delete clsubp;
|
||||
}
|
||||
}
|
||||
|
||||
bool SearchData::singleSimple()
|
||||
{
|
||||
if (m_query.size() != 1 || !m_filetypes.empty() || !m_nfiletypes.empty() ||
|
||||
@ -206,8 +256,9 @@ bool SearchData::singleSimple()
|
||||
m_haveWildCards)
|
||||
return false;
|
||||
SearchDataClause *clp = *m_query.begin();
|
||||
if (clp->getTp() != SCLT_AND && clp->getTp() != SCLT_OR)
|
||||
if (clp->getTp() != SCLT_AND && clp->getTp() != SCLT_OR) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -219,4 +270,72 @@ void SearchData::getTerms(HighlightData &hld) const
|
||||
return;
|
||||
}
|
||||
|
||||
// Debugging aid: write a one-line summary of this SearchData (clause
// count, file type list sizes, date/size/wildcard settings) to 'o', then
// dump each clause on its own line, and finish with an empty line.
void SearchData::dump(ostream& o) const
{
    o << "SearchData: " << " qs " << int(m_query.size());
    o << " ft " << m_filetypes.size();
    o << " nft " << m_nfiletypes.size();
    o << " hd " << m_haveDates;
    // The size limits are printed after conversion to int.
    o << " maxs " << int(m_maxSize);
    o << " mins " << int(m_minSize);
    o << " wc " << m_haveWildCards << "\n";

    typedef std::vector<SearchDataClause*>::const_iterator clause_cit_t;
    const clause_cit_t last = m_query.end();
    for (clause_cit_t cl = m_query.begin(); cl != last; cl++) {
        (*cl)->dump(o);
        o << "\n";
    }

    o << "\n";
}
|
||||
|
||||
// Debugging aid: writes only a fixed placeholder label to 'o'. The other
// clause classes in this file provide their own dump().
void SearchDataClause::dump(ostream& o) const
{
    o << "SearchDataClause??";
}
|
||||
|
||||
// Debugging aid: write this clause to 'o' as
//   ClauseSimple: [- ][field : text]
// where the "- " marker is present only for an excluding clause and the
// "field : " prefix only when a field is set.
void SearchDataClauseSimple::dump(ostream& o) const
{
    o << "ClauseSimple: " << (m_exclude ? "- " : "") << "[";
    if (!m_field.empty()) {
        o << m_field << " : ";
    }
    o << m_text;
    o << "]";
}
|
||||
|
||||
// Debugging aid: write this clause to 'o' as "ClauseFN: [text]", with a
// " - " marker before the bracket when the clause is an excluding one.
void SearchDataClauseFilename::dump(ostream& o) const
{
    o << "ClauseFN: " << (m_exclude ? " - " : "");
    o << "[";
    o << m_text;
    o << "]";
}
|
||||
|
||||
// Debugging aid: write this clause to 'o' as "ClausePath: [text]", with a
// " - " marker before the bracket when the clause is an excluding one.
void SearchDataClausePath::dump(ostream& o) const
{
    o << "ClausePath: " << (m_exclude ? " - " : "");
    o << "[";
    o << m_text;
    o << "]";
}
|
||||
|
||||
// Debugging aid: write this clause to 'o'. The label says NEAR when the
// clause type is SCLT_NEAR and PHRA for any other type; it is followed by
// an optional " - " exclusion marker and "[field : text]", the field part
// being present only when a field is set.
void SearchDataClauseDist::dump(ostream& o) const
{
    o << (m_tp == SCLT_NEAR ? "ClauseDist: NEAR: " : "ClauseDist: PHRA: ");
    o << (m_exclude ? " - " : "") << "[";
    if (!m_field.empty()) {
        o << m_field << " : ";
    }
    o << m_text;
    o << "]";
}
|
||||
|
||||
// Debugging aid: write the wrapped sub-query to 'o', enclosed between
// "ClauseSub {" and "}".
void SearchDataClauseSub::dump(ostream& o) const
{
    const SearchData *wrapped = m_sub.getconstptr();
    o << "ClauseSub {\n";
    wrapped->dump(o);
    o << "}";
}
|
||||
|
||||
} // Namespace Rcl
|
||||
|
||||
@ -26,6 +26,7 @@
|
||||
*/
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <ostream>
|
||||
|
||||
#include "rcldb.h"
|
||||
#include "refcntr.h"
|
||||
@ -161,6 +162,7 @@ public:
|
||||
int getMaxExp() {return m_maxexp;}
|
||||
int getMaxCl() {return m_maxcl;}
|
||||
int getSoftMaxExp() {return m_softmaxexpand;}
|
||||
void dump(ostream& o) const;
|
||||
|
||||
friend class ::AdvSearch;
|
||||
|
||||
@ -207,6 +209,11 @@ private:
|
||||
// value during "find-as-you-type" operations from the GUI
|
||||
int m_softmaxexpand;
|
||||
|
||||
// Collapse bogus subqueries generated by the query parser, mostly
|
||||
// so that we can check if this is an autophrase candidate (else
|
||||
// Xapian will do it anyway)
|
||||
void simplify();
|
||||
|
||||
bool expandFileTypes(Rcl::Db &db, std::vector<std::string>& exptps);
|
||||
bool clausesToQuery(Rcl::Db &db, SClType tp,
|
||||
std::vector<SearchDataClause*>& query,
|
||||
@ -297,6 +304,8 @@ public:
|
||||
virtual Relation getrel() {
|
||||
return m_rel;
|
||||
}
|
||||
virtual void dump(ostream& o) const;
|
||||
|
||||
friend class SearchData;
|
||||
protected:
|
||||
std::string m_reason;
|
||||
@ -361,6 +370,8 @@ public:
|
||||
virtual void setfield(const string& field) {
|
||||
m_field = field;
|
||||
}
|
||||
virtual void dump(ostream& o) const;
|
||||
|
||||
protected:
|
||||
std::string m_text; // Raw user entry text.
|
||||
std::string m_field; // Field specification if any
|
||||
@ -382,6 +393,7 @@ protected:
|
||||
int mods, void *pq, bool useNear, int slack);
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Filename search clause. This is special because term expansion is only
|
||||
* performed against the unsplit file name terms.
|
||||
@ -404,6 +416,7 @@ public:
|
||||
}
|
||||
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||
virtual void dump(ostream& o) const;
|
||||
};
|
||||
|
||||
|
||||
@ -441,7 +454,7 @@ public:
|
||||
}
|
||||
|
||||
virtual bool toNativeQuery(Rcl::Db &, void *);
|
||||
|
||||
virtual void dump(ostream& o) const;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -468,6 +481,7 @@ public:
|
||||
virtual void setslack(int slack) {
|
||||
m_slack = slack;
|
||||
}
|
||||
virtual void dump(ostream& o) const;
|
||||
private:
|
||||
int m_slack;
|
||||
};
|
||||
@ -494,6 +508,8 @@ public:
|
||||
virtual RefCntr<SearchData> getSub() {
|
||||
return m_sub;
|
||||
}
|
||||
virtual void dump(ostream& o) const;
|
||||
|
||||
protected:
|
||||
RefCntr<SearchData> m_sub;
|
||||
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user