Converted query language parser from the old regexp jungle to bison. Allow using parentheses for clearer syntax.

This commit is contained in:
Jean-Francois Dockes 2015-01-29 16:15:17 +01:00
parent 88bccb47b3
commit 3fb7183eae
18 changed files with 1765 additions and 1862 deletions
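Below is a minimal sketch (not part of the commit) showing how the new bison-based parser can be driven through its public entry point; the helper name buildQuery and the "english" stemming language are assumptions made for illustration, while the wasaStringToRcl() signature matches the declaration in wasatorcl.h further down.

// Hedged illustration only: assumes a recoll build tree; buildQuery() and
// the "english" stem language are invented for this example.
#include <iostream>
#include <string>
#include "rclconfig.h"
#include "searchdata.h"
#include "wasatorcl.h"

Rcl::SearchData *buildQuery(const RclConfig *config, const std::string& input)
{
    std::string reason;
    // Parentheses can now be used for grouping, e.g.:
    //   (mime:text/plain OR mime:text/html) -dir:/tmp "some phrase"
    Rcl::SearchData *sd = wasaStringToRcl(config, "english", input, reason);
    if (!sd)
        std::cerr << "Query parse failed: " << reason << std::endl;
    return sd;
}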


@ -15,6 +15,7 @@ QTGUI = @QTGUI@
RCLLIBVERSION=@RCLLIBVERSION@
all: configure mk/sysconf
${MAKE} -C query wasaparse.tab.cpp
(cd lib; sh mkMake)
${MAKE} -C lib
${MAKE} -C index depend recollindex
@ -59,6 +60,7 @@ clean:
# Note: we don't remove the top Makefile, to keep the "clean" targets
# available but a "Make" won't work without a configure anyway
distclean: clean
${MAKE} -C query distclean
-${MAKE} -C desktop/unity-lens-recoll distclean
-${MAKE} -C python/recoll distclean
rm -f mk/sysconf mk/localdefs sampleconf/recoll.conf \


@ -34,7 +34,6 @@ using namespace std;
#include "pathut.h"
#include "searchdata.h"
#include "rclquery.h"
#include "wasastringtoquery.h"
#include "wasatorcl.h"
#include "kio_recoll.h"
#include "docseqdb.h"


@ -38,7 +38,6 @@ using namespace std;
#include "pathut.h"
#include "searchdata.h"
#include "rclquery.h"
#include "wasastringtoquery.h"
#include "wasatorcl.h"
#include "kio_recoll.h"
#include "docseqdb.h"


@ -42,8 +42,8 @@ ${depth}/query/plaintorich.cpp \
${depth}/query/recollq.cpp \
${depth}/query/reslistpager.cpp \
${depth}/query/sortseq.cpp \
${depth}/query/wasastringtoquery.cpp \
${depth}/query/wasatorcl.cpp \
${depth}/query/wasaparse.cpp \
${depth}/query/wasaparse.tab.cpp \
${depth}/rcldb/daterange.cpp \
${depth}/rcldb/expansiondbs.cpp \
${depth}/rcldb/rclabstract.cpp \
@ -53,6 +53,7 @@ ${depth}/rcldb/rcldups.cpp \
${depth}/rcldb/rclquery.cpp \
${depth}/rcldb/rclterms.cpp \
${depth}/rcldb/searchdata.cpp \
${depth}/rcldb/searchdatatox.cpp \
${depth}/rcldb/searchdataxml.cpp \
${depth}/rcldb/stemdb.cpp \
${depth}/rcldb/stoplist.cpp \


@ -37,7 +37,6 @@
#include "pathut.h"
#include "rclinit.h"
#include "debuglog.h"
#include "wasastringtoquery.h"
#include "wasatorcl.h"
#include "internfile.h"
#include "wipedir.h"


@ -32,7 +32,6 @@ using namespace std;
#include "searchdata.h"
#include "rclquery.h"
#include "pathut.h"
#include "wasastringtoquery.h"
#include "wasatorcl.h"
#include "debuglog.h"
#include "pathut.h"


@ -4,8 +4,12 @@ include $(depth)/mk/sysconf
PROGS = xadump recollq #trhist qtry qxtry
SRCS = xadump.cpp
all: depend librecoll $(PROGS)
all: wasaparse.tab.cpp depend librecoll $(PROGS)
wasaparse.tab.cpp : wasaparse.y
bison wasaparse.y
mv -f wasaparse.tab.c wasaparse.tab.cpp
XADUMP_OBJS= xadump.o
xadump : $(XADUMP_OBJS)
$(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \
@ -39,3 +43,7 @@ trwasastrtoq.o : wasastringtoquery.cpp wasastringtoquery.h
include $(depth)/mk/commontargets
include alldeps
distclean::
-rm -f location.hh position.hh stack.hh \
wasaparse.tab.c wasaparse.tab.cpp wasaparse.tab.h


@ -36,7 +36,6 @@ using namespace std;
#include "pathut.h"
#include "rclinit.h"
#include "debuglog.h"
#include "wasastringtoquery.h"
#include "wasatorcl.h"
#include "internfile.h"
#include "wipedir.h"

src/query/wasaparse.cpp (new file, 235 lines)

@ -0,0 +1,235 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <iostream>
#include "wasatorcl.h"
#include "wasaparserdriver.h"
#include "searchdata.h"
#include "debuglog.h"
#define YYDEBUG 1
#include "wasaparse.tab.h"
using namespace std;
using namespace Rcl;
void
yy::parser::error (const location_type& l, const std::string& m)
{
d->setreason(m);
}
SearchData *wasaStringToRcl(const RclConfig *config,
const std::string& stemlang,
const std::string& query, string &reason,
const std::string& autosuffs)
{
WasaParserDriver d(config, stemlang, autosuffs);
SearchData *sd = d.parse(query);
if (!sd)
reason = d.getreason();
return sd;
}
SearchData *WasaParserDriver::parse(const std::string& in)
{
m_input = in;
m_index = 0;
delete m_result;
m_result = 0;
m_returns = stack<int>();
yy::parser parser(this);
parser.set_debug_level(0);
if (parser.parse() != 0) {
delete m_result;
m_result = 0;
}
return m_result;
}
int WasaParserDriver::GETCHAR()
{
if (!m_returns.empty()) {
int c = m_returns.top();
m_returns.pop();
return c;
}
if (m_index < m_input.size())
return m_input[m_index++];
return 0;
}
void WasaParserDriver::UNGETCHAR(int c)
{
m_returns.push(c);
}
// Add clause to query, handling special pseudo-clauses for size/date
// etc. (mostly determined on field name).
bool WasaParserDriver::addClause(SearchData *sd,
SearchDataClauseSimple* cl)
{
if (cl->getfield().empty()) {
// Simple clause with empty field spec.
// Possibly change terms found in the "autosuffs" list into "ext"
// field queries
if (!m_autosuffs.empty()) {
vector<string> asfv;
if (stringToStrings(m_autosuffs, asfv)) {
if (find_if(asfv.begin(), asfv.end(),
StringIcmpPred(cl->gettext())) != asfv.end()) {
cl->setfield("ext");
cl->addModifier(SearchDataClause::SDCM_NOSTEMMING);
}
}
}
return sd->addClause(cl);
}
const string& fld = cl->getfield();
// MIME types and categories
if (!stringicmp("mime", fld) ||!stringicmp("format", fld)) {
if (cl->getexclude()) {
sd->remFiletype(cl->gettext());
} else {
sd->addFiletype(cl->gettext());
}
delete cl;
return true;
}
if (!stringicmp("rclcat", fld) || !stringicmp("type", fld)) {
vector<string> mtypes;
if (m_config && m_config->getMimeCatTypes(cl->gettext(), mtypes)) {
for (vector<string>::iterator mit = mtypes.begin();
mit != mtypes.end(); mit++) {
if (cl->getexclude()) {
sd->remFiletype(*mit);
} else {
sd->addFiletype(*mit);
}
}
}
delete cl;
return true;
}
// Handle "date" spec
if (!stringicmp("date", fld)) {
DateInterval di;
if (!parsedateinterval(cl->gettext(), &di)) {
LOGERR(("Bad date interval format: %s\n",
cl->gettext().c_str()));
m_reason = "Bad date interval format";
delete cl;
return false;
}
LOGDEB(("addClause:: date span: %d-%d-%d/%d-%d-%d\n",
di.y1,di.m1,di.d1, di.y2,di.m2,di.d2));
sd->setDateSpan(&di);
delete cl;
return true;
}
// Handle "size" spec
if (!stringicmp("size", fld)) {
char *cp;
size_t size = strtoll(cl->gettext().c_str(), &cp, 10);
if (*cp != 0) {
switch (*cp) {
case 'k': case 'K': size *= 1E3;break;
case 'm': case 'M': size *= 1E6;break;
case 'g': case 'G': size *= 1E9;break;
case 't': case 'T': size *= 1E12;break;
default:
m_reason = string("Bad multiplier suffix: ") + *cp;
delete cl;
return false;
}
}
SearchDataClause::Relation rel = cl->getrel();
delete cl;
switch (rel) {
case SearchDataClause::REL_EQUALS:
sd->setMaxSize(size);
sd->setMinSize(size);
break;
case SearchDataClause::REL_LT:
case SearchDataClause::REL_LTE:
sd->setMaxSize(size);
break;
case SearchDataClause::REL_GT:
case SearchDataClause::REL_GTE:
sd->setMinSize(size);
break;
default:
m_reason = "Bad relation operator with size query. Use > < or =";
return false;
}
return true;
}
if (!stringicmp("dir", fld)) {
// dir filtering special case
SearchDataClausePath *nclause =
new SearchDataClausePath(cl->gettext(), cl->getexclude());
delete cl;
sd->addClause(nclause);
}
if (cl->getTp() == SCLT_OR || cl->getTp() == SCLT_AND) {
// If this is a normal clause and the term has commas or
// slashes inside, take it as a list, turn the slashes/commas
// to spaces, leave unquoted. Otherwise, this would end up as
// a phrase query. This is a handy way to enter multiple terms
// to be searched inside a field. We interpret ',' as AND, and
// '/' as OR. No mixes allowed and ',' wins.
SClType tp = SCLT_FILENAME;// impossible value
string ns = neutchars(cl->gettext(), ",");
if (ns.compare(cl->gettext())) {
// had ','
tp = SCLT_AND;
} else {
ns = neutchars(cl->gettext(), "/");
if (ns.compare(cl->gettext())) {
// had not ',' but has '/'
tp = SCLT_OR;
}
}
if (tp != SCLT_FILENAME) {
SearchDataClauseSimple *ncl =
new SearchDataClauseSimple(tp, ns, fld);
delete cl;
return sd->addClause(ncl);
}
}
return sd->addClause(cl);
}

src/query/wasaparse.y (new file, 415 lines)

@ -0,0 +1,415 @@
%{
#define YYDEBUG 1
#include <stdio.h>
#include <iostream>
#include <string>
#include "searchdata.h"
#include "wasaparserdriver.h"
#include "wasaparse.tab.h"
using namespace std;
int yylex(yy::parser::semantic_type *, WasaParserDriver *);
void yyerror(char const *);
static void qualify(Rcl::SearchDataClauseDist *, const string &);
static void addSubQuery(WasaParserDriver *d,
Rcl::SearchData *sd, Rcl::SearchData *sq)
{
sd->addClause(new Rcl::SearchDataClauseSub(RefCntr<Rcl::SearchData>(sq)));
}
%}
%skeleton "lalr1.cc"
%defines
%error-verbose
%parse-param {WasaParserDriver* d}
%lex-param {WasaParserDriver* d}
%union {
std::string *str;
Rcl::SearchDataClauseSimple *cl;
Rcl::SearchData *sd;
}
%destructor {delete $$;} <str>
%type <cl> qualquote
%type <cl> fieldexpr
%type <cl> term
%type <sd> query
%type <str> complexfieldname
/* Non operator tokens need precedence because of the possibility of
concatenation which needs to have lower prec than OR */
%left <str> WORD
%left <str> QUOTED
%left <str> QUALIFIERS
%left AND UCONCAT
%left OR
%token EQUALS CONTAINS SMALLEREQ SMALLER GREATEREQ GREATER
%%
topquery: query
{
d->m_result = $1;
}
query:
query query %prec UCONCAT
{
//cerr << "q: query query" << endl;
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
addSubQuery(d, sd, $1);
addSubQuery(d, sd, $2);
$$ = sd;
}
| query AND query
{
//cerr << "q: query AND query" << endl;
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
addSubQuery(d, sd, $1);
addSubQuery(d, sd, $3);
$$ = sd;
}
| query OR query
{
//cerr << "q: query OR query" << endl;
Rcl::SearchData *top = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, d->m_stemlang);
addSubQuery(d, sd, $1);
addSubQuery(d, sd, $3);
addSubQuery(d, top, sd);
$$ = top;
}
| '(' query ')'
{
//cerr << "q: ( query )" << endl;
$$ = $2;
}
|
fieldexpr %prec UCONCAT
{
//cerr << "q: fieldexpr" << endl;
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
d->addClause(sd, $1);
$$ = sd;
}
;
fieldexpr: term
{
// cerr << "fe: simple fieldexpr: " << $1->gettext() << endl;
$$ = $1;
}
| complexfieldname EQUALS term
{
// cerr << "fe: " << *$1 << " = " << $3->gettext() << endl;
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_EQUALS);
$$ = $3;
delete $1;
}
| complexfieldname CONTAINS term
{
// cerr << "fe: " << *$1 << " : " << $3->gettext() << endl;
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
$$ = $3;
delete $1;
}
| complexfieldname SMALLER term
{
// cerr << "fe: " << *$1 << " < " << $3->gettext() << endl;
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_LT);
$$ = $3;
delete $1;
}
| complexfieldname SMALLEREQ term
{
// cerr << "fe: " << *$1 << " <= " << $3->gettext() << endl;
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_LTE);
$$ = $3;
delete $1;
}
| complexfieldname GREATER term
{
// cerr << "fe: " << *$1 << " > " << $3->gettext() << endl;
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_GT);
$$ = $3;
delete $1;
}
| complexfieldname GREATEREQ term
{
// cerr << "fe: " << *$1 << " >= " << $3->gettext() << endl;
$3->setfield(*$1);
$3->setrel(Rcl::SearchDataClause::REL_GTE);
$$ = $3;
delete $1;
}
| '-' fieldexpr
{
// cerr << "fe: - fieldexpr[" << $2->gettext() << "]" << endl;
$2->setexclude(true);
$$ = $2;
}
;
/* Deal with field names like dc:title */
complexfieldname:
WORD
{
// cerr << "cfn: WORD" << endl;
$$ = $1;
}
|
complexfieldname CONTAINS WORD
{
// cerr << "cfn: complexfieldname ':' WORD" << endl;
$$ = new string(*$1 + string(":") + *$3);
delete $1;
delete $3;
}
term:
WORD
{
//cerr << "term[" << *$1 << "]" << endl;
$$ = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1);
delete $1;
}
| qualquote
{
$$ = $1;
}
qualquote:
QUOTED
{
// cerr << "QUOTED[" << *$1 << "]" << endl;
$$ = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
delete $1;
}
| QUOTED QUALIFIERS
{
// cerr << "QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl;
Rcl::SearchDataClauseDist *cl =
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
qualify(cl, *$2);
$$ = cl;
delete $1;
delete $2;
}
%%
#include <ctype.h>
// Look for an integer just after index cur. If found, store its value in *pval and return the index of its last digit; otherwise return cur unchanged.
static unsigned int qualGetInt(const string& q, unsigned int cur, int *pval)
{
unsigned int ncur = cur;
if (cur < q.size() - 1) {
char *endptr;
int val = strtol(&q[cur + 1], &endptr, 10);
if (endptr != &q[cur + 1]) {
ncur += endptr - &q[cur + 1];
*pval = val;
}
}
return ncur;
}
static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
{
// cerr << "qualify(" << cl << ", " << quals << ")" << endl;
for (unsigned int i = 0; i < quals.length(); i++) {
//fprintf(stderr, "qual char %c\n", quals[i]);
switch (quals[i]) {
case 'b':
cl->setWeight(10.0);
break;
case 'c': break;
case 'C':
cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
break;
case 'd': break;
case 'D':
cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
break;
case 'e':
cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
break;
case 'l':
cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
break;
case 'L': break;
case 'o':
{
int slack = 10;
i = qualGetInt(quals, i, &slack);
cl->setslack(slack);
//cerr << "set slack " << cl->getslack() << " done" << endl;
}
break;
case 'p':
cl->setTp(Rcl::SCLT_NEAR);
if (cl->getslack() == 0) {
cl->setslack(10);
//cerr << "set slack " << cl->getslack() << " done" << endl;
}
break;
case '.':case '0':case '1':case '2':case '3':case '4':
case '5':case '6':case '7':case '8':case '9':
{
int n = 0;
float factor = 1.0;
if (sscanf(&(quals[i]), "%f %n", &factor, &n)) {
if (factor != 1.0) {
cl->setWeight(factor);
}
}
if (n > 0)
i += n - 1;
}
default:
break;
}
}
}
// specialstartchars are special only at the beginning of a token
// (e.g. doctor-who is a term, not 2 terms separated by '-')
static const string specialstartchars("-");
// specialinchars are special everywhere except inside a quoted string
static const string specialinchars(":=<>()");
// Called with the first dquote already read
static int parseString(WasaParserDriver *d, yy::parser::semantic_type *yylval)
{
string* value = new string();
d->qualifiers().clear();
int c;
while ((c = d->GETCHAR())) {
switch (c) {
case '\\':
/* Escape: get next char */
c = d->GETCHAR();
if (c == 0) {
value->push_back(c);
goto out;
}
value->push_back(c);
break;
case '"':
/* End of string. Look for qualifiers */
while ((c = d->GETCHAR()) && !isspace(c))
d->qualifiers().push_back(c);
goto out;
default:
value->push_back(c);
}
}
out:
//cerr << "GOT QUOTED ["<<value<<"] quals [" << d->qualifiers() << "]" << endl;
yylval->str = value;
return yy::parser::token::QUOTED;
}
int yylex(yy::parser::semantic_type *yylval, WasaParserDriver *d)
{
if (!d->qualifiers().empty()) {
yylval->str = new string();
yylval->str->swap(d->qualifiers());
return yy::parser::token::QUALIFIERS;
}
int c;
/* Skip white space. */
while ((c = d->GETCHAR()) && isspace(c))
continue;
if (c == 0)
return 0;
if (specialstartchars.find_first_of(c) != string::npos) {
//cerr << "yylex: return " << c << endl;
return c;
}
// field-term relations
switch (c) {
case '=': return yy::parser::token::EQUALS;
case ':': return yy::parser::token::CONTAINS;
case '<': {
int c1 = d->GETCHAR();
if (c1 == '=') {
return yy::parser::token::SMALLEREQ;
} else {
d->UNGETCHAR(c1);
return yy::parser::token::SMALLER;
}
}
case '>': {
int c1 = d->GETCHAR();
if (c1 == '=') {
return yy::parser::token::GREATEREQ;
} else {
d->UNGETCHAR(c1);
return yy::parser::token::GREATER;
}
}
case '(': case ')':
return c;
}
if (c == '"')
return parseString(d, yylval);
d->UNGETCHAR(c);
// Other chars start a term or field name or reserved word
string* word = new string();
while ((c = d->GETCHAR())) {
if (isspace(c)) {
//cerr << "Word broken by whitespace" << endl;
break;
} else if (specialinchars.find_first_of(c) != string::npos) {
//cerr << "Word broken by special char" << endl;
d->UNGETCHAR(c);
break;
} else if (c == 0) {
//cerr << "Word broken by EOF" << endl;
break;
} else {
word->push_back(c);
}
}
if (!word->compare("AND") || !word->compare("&&")) {
delete word;
return yy::parser::token::AND;
} else if (!word->compare("OR") || !word->compare("||")) {
delete word;
return yy::parser::token::OR;
}
// cerr << "Got word [" << word << "]" << endl;
yylval->str = word;
return yy::parser::token::WORD;
}
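For reference, a few query strings that the grammar and the qualify() code above accept (illustrative only, not part of the commit; the "--" annotations are commentary, not query syntax):

(author:smith OR author:jones) -dir:/tmp      -- parenthesized grouping, negated dir filter
mime:application/pdf size>100k size<1m        -- pseudo-fields handled by addClause()
"exact phrase"C                               -- 'C' qualifier: case-sensitive phrase
"words with slack"o3                          -- 'o' plus integer: phrase slack of 3
"nearby words"p                               -- 'p' qualifier: NEAR search, default slack 10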


@ -0,0 +1,81 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _WASAPARSERDRIVER_H_INCLUDED_
#define _WASAPARSERDRIVER_H_INCLUDED_
#include <string>
#include <stack>
class WasaParserDriver;
namespace Rcl {
class SearchData;
class SearchDataClauseSimple;
}
namespace yy {
class parser;
}
class RclConfig;
class WasaParserDriver {
public:
WasaParserDriver(const RclConfig *c, const std::string sl,
const std::string& as)
: m_stemlang(sl), m_autosuffs(as), m_config(c),
m_index(0), m_result(0) {}
Rcl::SearchData *parse(const std::string&);
bool addClause(Rcl::SearchData *sd, Rcl::SearchDataClauseSimple* cl);
int GETCHAR();
void UNGETCHAR(int c);
std::string& qualifiers() {
return m_qualifiers;
}
void setreason(const std::string& reason) {
m_reason = reason;
}
const std::string& getreason() const {
return m_reason;
}
private:
friend class yy::parser;
std::string m_stemlang;
std::string m_autosuffs;
const RclConfig *m_config;
std::string m_input;
unsigned int m_index;
std::stack<int> m_returns;
Rcl::SearchData *m_result;
std::string m_reason;
// Let the quoted string reader store qualifiers in there, simpler
// than handling this in the parser, because their nature is
// determined by the absence of white space after the closing
// dquote. e.g "some term"abc. We could avoid this by making white
// space a token.
std::string m_qualifiers;
};
#endif /* _WASAPARSERDRIVER_H_INCLUDED_ */


@ -1,515 +0,0 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef TEST_WASASTRINGTOQUERY
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <regex.h>
#include "smallut.h"
#include "wasastringtoquery.h"
#undef DEB_WASASTRINGTOQ
#ifdef DEB_WASASTRINGTOQ
#define DPRINT(X) fprintf X
#define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());}
#else
#define DPRINT(X)
#define DUMPQ(Q)
#endif
WasaQuery::~WasaQuery()
{
for (vector<WasaQuery*>::iterator it = m_subs.begin();
it != m_subs.end(); it++) {
delete *it;
}
m_subs.clear();
}
static const char* reltosrel(WasaQuery::Rel rel)
{
switch (rel) {
case WasaQuery::REL_EQUALS: return "=";
case WasaQuery::REL_CONTAINS: return ":";
case WasaQuery::REL_LT: return "<";
case WasaQuery::REL_LTE: return "<=";
case WasaQuery::REL_GT: return ">";
case WasaQuery::REL_GTE: return ">=";
default: return "?";
}
}
void WasaQuery::describe(string &desc) const
{
desc += "(";
string fieldspec = m_fieldspec.empty() ? string() : m_fieldspec +
reltosrel(m_rel);
switch (m_op) {
case OP_NULL:
desc += "NULL";
break;
case OP_LEAF:
if (m_exclude)
desc += "NOT (";
desc += fieldspec + m_value;
if (m_exclude)
desc += ")";
break;
case OP_OR:
case OP_AND:
for (vector<WasaQuery *>::const_iterator it = m_subs.begin();
it != m_subs.end(); it++) {
(*it)->describe(desc);
vector<WasaQuery *>::const_iterator it1 = it;
it1++;
if (it1 != m_subs.end())
desc += m_op == OP_OR ? "OR ": "AND ";
}
break;
}
if (desc[desc.length() - 1] == ' ')
desc.erase(desc.length() - 1);
desc += ")";
if (m_modifiers != 0) {
if (m_modifiers & WQM_BOOST) desc += "BOOST|";
if (m_modifiers & WQM_CASESENS) desc += "CASESENS|";
if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|";
if (m_modifiers & WQM_FUZZY) desc += "FUZZY|";
if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|";
if (m_modifiers & WQM_PHRASESLACK) {
char buf[100];
sprintf(buf, "%d", m_slack);
desc += "PHRASESLACK(" + string(buf) + string(")|");
}
if (m_modifiers & WQM_PROX) desc += "PROX|";
if (m_modifiers & WQM_REGEX) desc += "REGEX|";
if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|";
if (m_modifiers & WQM_WORDS) desc += "WORDS|";
if (desc.length() > 0 && desc[desc.length()-1] == '|')
desc.erase(desc.length()-1);
}
desc += " ";
}
// The string query parser code:
/* Shamelessly lifted from Beagle:
* This is our regular Expression Pattern:
* we expect something like this:
* -key:"Value String"modifiers
* key:Value
* or
* Value
*/
/* The master regular expression used to parse a query string
* Sub-expressions in parenthesis are numbered from 1. Each opening
parenthesis increases the index, but we're not interested in all of them.
* Deviations from standard:
* Relation: the standard-conformant line read as (release<1.16):
"(:|=|<|>|<=|>=)" //7 Relation
but we are not actually making use of the relation type
(interpreting all as ":"), and this can produce unexpected results
as a (ie pasted) search for nonexfield=value will silently drop
the nonexfield part, while the user probably was not aware of
triggering a field search (expecting just ':' to do this).
*/
static const char * parserExpr =
"(OR|\\|\\|)[[:space:]]*" //1 OR,||
"|"
"(AND|&&)[[:space:]]*" // 2 AND,&& (ignored, default)
"|"
"(" //3
"([+-])?" //4 Force or exclude indicator
"(" //5
"([[:alpha:]][[:alnum:]:]*)" //6 Field spec: ie: "dc:title:letitre"
"[[:space:]]*"
"(:|=|>|<)" //7 Relation
"[[:space:]]*)?"
"(" //8
"(\"" //9
"([^\"]+)" //10 "A quoted term"
"\")"
"([bcCdDeflLoprsw.0-9]*)" //11 modifiers
"|"
"([^[:space:]\"]+)" //12 ANormalTerm
")"
")[[:space:]]*"
;
// For debugging the parser. But see also NMATCH
static const char *matchNames[] = {
/* 0*/ "",
/* 1*/ "OR",
/* 2*/ "AND",
/* 3*/ "",
/* 4*/ "+-",
/* 5*/ "",
/* 6*/ "FIELD",
/* 7*/ "RELATION",
/* 8*/ "",
/* 9*/ "",
/*10*/ "QUOTEDTERM",
/*11*/ "MODIFIERS",
/*12*/ "TERM",
};
#define NMATCH (sizeof(matchNames) / sizeof(char *))
// Symbolic names for the interesting submatch indices
enum SbMatchIdx {SMI_OR=1, SMI_AND=2, SMI_PM=4, SMI_FIELD=6, SMI_REL=7,
SMI_QUOTED=10, SMI_MODIF=11, SMI_TERM=12};
static const int maxmatchlen = 1024;
static const int errbuflen = 300;
class StringToWasaQuery::Internal {
public:
Internal()
: m_rxneedsfree(false)
{}
~Internal()
{
if (m_rxneedsfree)
regfree(&m_rx);
}
bool checkSubMatch(int i, char *match, string& reason)
{
if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1) {
//DPRINT((stderr, "checkSubMatch: no match: i %d rm_so %d\n",
//i, m_pmatch[i].rm_so));
return false;
}
if (m_pmatch[i].rm_eo - m_pmatch[i].rm_so <= 0) {
// weird and fatal
reason = "Internal regular expression handling error";
return false;
}
//DPRINT((stderr, "checkSubMatch: so %d eo %d\n", m_pmatch[i].rm_so,
//m_pmatch[i].rm_eo));
memcpy(match, m_cp + m_pmatch[i].rm_so,
m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
return true;
}
WasaQuery *stringToQuery(const string& str, string& reason);
friend class StringToWasaQuery;
private:
const char *m_cp;
regex_t m_rx;
bool m_rxneedsfree;
regmatch_t m_pmatch[NMATCH];
};
StringToWasaQuery::StringToWasaQuery()
: internal(new Internal)
{
}
StringToWasaQuery::~StringToWasaQuery()
{
delete internal;
}
WasaQuery *
StringToWasaQuery::stringToQuery(const string& str, string& reason)
{
if (internal == 0)
return 0;
WasaQuery *wq = internal->stringToQuery(str, reason);
DUMPQ(wq);
return wq;
}
WasaQuery *
StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
{
if (m_rxneedsfree)
regfree(&m_rx);
char errbuf[errbuflen+1];
int errcode;
if ((errcode = regcomp(&m_rx, parserExpr, REG_EXTENDED)) != 0) {
regerror(errcode, &m_rx, errbuf, errbuflen);
reason = errbuf;
return 0;
}
m_rxneedsfree = true;
const char *cpe;
m_cp = str.c_str();
cpe = str.c_str() + str.length();
WasaQuery *query = new WasaQuery;
query->m_op = WasaQuery::OP_AND;
WasaQuery *orChain = 0;
bool prev_or = false;
// Loop on repeated regexp matches on the main string.
for (int loop = 0;;loop++) {
if ((errcode = regexec(&m_rx, m_cp, NMATCH, m_pmatch, 0))) {
regerror(errcode, &m_rx, errbuf, errbuflen);
reason = errbuf;
return 0;
}
if (m_pmatch[0].rm_eo <= 0) {
// weird and fatal
reason = "Internal regular expression handling error";
return 0;
}
#ifdef DEB_WASASTRINGTOQ
DPRINT((stderr, "Next part:\n"));
for (unsigned int i = 0; i < NMATCH; i++) {
if (m_pmatch[i].rm_so == -1) continue;
char match[maxmatchlen+1];
memcpy(match, m_cp + m_pmatch[i].rm_so,
m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
if (matchNames[i][0])
DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match,
(int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo));
}
#endif
char match[maxmatchlen+1];
if (checkSubMatch(SMI_OR, match, reason)) {
if (prev_or) {
// Bad syntax
reason = "Bad syntax: consecutive OR";
return 0;
}
if (orChain == 0) {
// First OR seen: start OR subclause.
if ((orChain = new WasaQuery()) == 0) {
reason = "Out of memory";
return 0;
}
orChain->m_op = WasaQuery::OP_OR;
}
// For the first OR, we need to transfer the previous
// query from the main vector to the OR subquery
if (orChain->m_subs.empty() && !query->m_subs.empty()) {
orChain->m_subs.push_back(query->m_subs.back());
query->m_subs.pop_back();
}
prev_or = true;
} else if (checkSubMatch(SMI_AND, match, reason)) {
// Do nothing, AND is the default. We might want to check for
// errors like consecutive ANDs, or OR AND
} else {
WasaQuery *nclause = new WasaQuery;
if (nclause == 0) {
reason = "Out of memory";
return 0;
}
// Check for quoted or unquoted value
unsigned int mods = 0;
if (checkSubMatch(SMI_QUOTED, match, reason)) {
nclause->m_value = match;
mods |= WasaQuery::WQM_QUOTED;
} else if (checkSubMatch(SMI_TERM, match, reason)) {
nclause->m_value = match;
}
if (nclause->m_value.empty()) {
// Isolated +- or fieldname: without a value. Ignore until
// told otherwise.
DPRINT((stderr, "Clause with empty value, skipping\n"));
delete nclause;
goto nextfield;
}
if (checkSubMatch(SMI_MODIF, match, reason)) {
DPRINT((stderr, "Got modifiers: [%s]\n", match));
for (unsigned int i = 0; i < strlen(match); i++) {
switch (match[i]) {
case 'b':
mods |= WasaQuery::WQM_BOOST;
nclause->m_weight = 10.0;
break;
case 'c': break;
case 'C': mods |= WasaQuery::WQM_CASESENS; break;
case 'd': break;
case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
case 'e': mods |= WasaQuery::WQM_CASESENS |
WasaQuery::WQM_DIACSENS |
WasaQuery::WQM_NOSTEM;
break;
case 'f': mods |= WasaQuery::WQM_FUZZY; break;
case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
case 'L': break;
case 'o':
mods |= WasaQuery::WQM_PHRASESLACK;
// Default slack if specified only by 'o' is 10.
nclause->m_slack = 10;
if (i < strlen(match) - 1) {
char *endptr;
int slack = strtol(match+i+1, &endptr, 10);
if (endptr != match+i+1) {
i += endptr - (match+i+1);
nclause->m_slack = slack;
}
}
break;
case 'p':
mods |= WasaQuery::WQM_PROX;
nclause->m_slack = 10;
break;
case 'r': mods |= WasaQuery::WQM_REGEX; break;
case 's': mods |= WasaQuery::WQM_SLOPPY; break;
case 'w': mods |= WasaQuery::WQM_WORDS; break;
case '.':case '0':case '1':case '2':case '3':case '4':
case '5':case '6':case '7':case '8':case '9':
{
int n;
float factor;
if (sscanf(match+i, "%f %n", &factor, &n)) {
nclause->m_weight = factor;
DPRINT((stderr, "Got factor %.2f len %d\n",
factor, n));
}
if (n)
i += n-1;
}
}
}
}
nclause->m_modifiers = WasaQuery::Modifier(mods);
// Field indicator ?
if (checkSubMatch(SMI_FIELD, match, reason)) {
// We used to check for special fields indicating sorting
// etc. here but this went away from the spec. See 1.4
// if it comes back
nclause->m_fieldspec = match;
if (checkSubMatch(SMI_REL, match, reason)) {
switch (match[0]) {
case '=':nclause->m_rel = WasaQuery::REL_EQUALS;break;
case ':':nclause->m_rel = WasaQuery::REL_CONTAINS;break;
case '<':
if (match[1] == '=')
nclause->m_rel = WasaQuery::REL_LTE;
else
nclause->m_rel = WasaQuery::REL_LT;
break;
case '>':
if (match[1] == '=')
nclause->m_rel = WasaQuery::REL_GTE;
else
nclause->m_rel = WasaQuery::REL_GT;
break;
default:
nclause->m_rel = WasaQuery::REL_CONTAINS;
}
} else {
// ?? If field matched we should have a relation
nclause->m_rel = WasaQuery::REL_CONTAINS;
}
}
nclause->m_op = WasaQuery::OP_LEAF;
// +- indicator ?
if (checkSubMatch(SMI_PM, match, reason) && match[0] == '-') {
nclause->m_exclude = true;
} else {
nclause->m_exclude = false;
}
if (prev_or) {
// The precedent token was an OR, add new clause to or chain
//DPRINT((stderr, "Adding to OR chain\n"));
orChain->m_subs.push_back(nclause);
} else {
if (orChain) {
// Getting out of OR. Add the OR subquery to the main one
//DPRINT((stderr, "Adding OR chain to main\n"));
query->m_subs.push_back(orChain);
orChain = 0;
}
//DPRINT((stderr, "Adding to main chain\n"));
// Add new clause to main query
query->m_subs.push_back(nclause);
}
prev_or = false;
}
nextfield:
// Advance current string position. We checked earlier that
// the increment is strictly positive, so we won't loop
// forever
m_cp += m_pmatch[0].rm_eo;
if (m_cp >= cpe)
break;
}
if (orChain) {
// Getting out of OR. Add the OR subquery to the main one
DPRINT((stderr, "Adding OR chain to main.Before: \n"));
DUMPQ(query);
DUMPQ(orChain);
query->m_subs.push_back(orChain);
}
regfree(&m_rx);
m_rxneedsfree = false;
return query;
}
#else // TEST
#include <stdio.h>
#include <stdlib.h>
#include "wasastringtoquery.h"
static char *thisprog;
int main(int argc, char **argv)
{
thisprog = argv[0];
argc--; argv++;
if (argc != 1) {
fprintf(stderr, "need one arg\n");
exit(1);
}
const string str = *argv++;argc--;
string reason;
StringToWasaQuery qparser;
WasaQuery *q = qparser.stringToQuery(str, reason);
if (q == 0) {
fprintf(stderr, "stringToQuery failed: %s\n", reason.c_str());
exit(1);
}
string desc;
q->describe(desc);
fprintf(stderr, "Finally: %s\n", desc.c_str());
exit(0);
}
#endif // TEST_WASASTRINGTOQUERY


@ -1,112 +0,0 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
#define _WASASTRINGTOQUERY_H_INCLUDED_
#include <string>
#include <vector>
using std::string;
using std::vector;
/* Note: Xesam used to be named wasabi. We changed the references to wasabi in
the comments, but not the code */
/**
* A simple class to represent a parsed Xesam user language element.
* Can hold one leaf element or an array of subqueries to be joined by AND/OR
*
* The complete query is represented by a top WasaQuery holding a
* chain of ANDed subclauses. Some of the subclauses may be themselves
* OR'ed lists (it doesn't go deeper). Entries in the AND list may be
* negated (AND NOT).
*
* For LEAF elements, the value can hold one or several words. In the
* latter case, it should be interpreted as a phrase (comes from a
* user-entered "quoted string"), except if the modifier flags say otherwise.
*
* Some fields only make sense either for compound or LEAF queries. This
* is commented for each. We should subclass really.
*
* Note that wasaStringToQuery supposedly parses the whole Xesam
* User Search Language v 0.95, but that some elements are dropped or
* ignored during the translation to a native Recoll query in wasaToRcl
*/
class WasaQuery {
public:
/** Type of this element: leaf or AND/OR chain */
enum Op {OP_NULL, OP_LEAF, OP_OR, OP_AND};
/** Relation to be searched between field and value. Recoll actually only
supports "contain" except for a size field */
enum Rel {REL_NULL, REL_EQUALS, REL_CONTAINS, REL_LT, REL_LTE,
REL_GT, REL_GTE};
/** Modifiers for terms: case/diacritics handling,
stemming control... */
enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4,
WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20,
WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100,
WQM_FUZZY = 0x200, WQM_QUOTED = 0x400};
typedef vector<WasaQuery*> subqlist_t;
WasaQuery()
: m_op(OP_NULL), m_rel(REL_NULL), m_exclude(false),
m_modifiers(0), m_slack(0), m_weight(1.0)
{}
~WasaQuery();
/** Get string describing the query tree from this point */
void describe(string &desc) const;
/** Op to be performed on either value (may be LEAF or EXCL, or subqs */
WasaQuery::Op m_op;
/** Field specification if any (ie: title, author ...) Only OPT_LEAF */
string m_fieldspec;
/** Relation between field and value: =, :, <,>,<=, >= */
WasaQuery::Rel m_rel;
/* Negating flag */
bool m_exclude;
/* String value. Valid for op == OP_LEAF or EXCL */
string m_value;
/** Subqueries. Valid for conjunctions */
vector<WasaQuery*> m_subs;
unsigned int m_modifiers;
int m_slack;
float m_weight;
};
/**
* Wasabi query string parser class. Could be a simple function
* really, but there might be some parser initialization work done in
* the constructor.
*/
class StringToWasaQuery {
public:
StringToWasaQuery();
~StringToWasaQuery();
WasaQuery *stringToQuery(const string& str, string& reason);
class Internal;
private:
Internal *internal;
};
#endif /* _WASASTRINGTOQUERY_H_INCLUDED_ */


@ -1,286 +0,0 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <cstdio>
#include <string>
#include <list>
#include <algorithm>
using std::string;
using std::list;
#include "rclconfig.h"
#include "wasastringtoquery.h"
#include "rcldb.h"
#include "searchdata.h"
#include "wasatorcl.h"
#include "debuglog.h"
#include "smallut.h"
#include "rclconfig.h"
#include "refcntr.h"
#include "textsplit.h"
static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config,
const string& stemlang,
WasaQuery *wasa,
const string& autosuffs, string& reason)
{
if (wasa == 0) {
reason = "NULL query";
return 0;
}
if (wasa->m_op != WasaQuery::OP_AND && wasa->m_op != WasaQuery::OP_OR) {
reason = "Top query neither AND nor OR ?";
LOGERR(("wasaQueryToRcl: top query neither AND nor OR!\n"));
return 0;
}
Rcl::SearchData *sdata = new
Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND :
Rcl::SCLT_OR, stemlang);
LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ?
"AND" : "OR"));
WasaQuery::subqlist_t::iterator it;
Rcl::SearchDataClause *nclause;
// Walk the list of clauses. Some pseudo-field types need special
// processing, which results in setting data in the top struct
// instead of adding a clause. We check for these first
for (it = wasa->m_subs.begin(); it != wasa->m_subs.end(); it++) {
if (!stringicmp("mime", (*it)->m_fieldspec) ||
!stringicmp("format", (*it)->m_fieldspec)) {
if ((*it)->m_op == WasaQuery::OP_LEAF) {
if ((*it)->m_exclude) {
sdata->remFiletype((*it)->m_value);
} else {
sdata->addFiletype((*it)->m_value);
}
} else {
reason = "internal error: mime clause not leaf??";
return 0;
}
continue;
}
// Xesam uses "type", we also support "rclcat", for broad
// categories like "audio", "presentation", etc.
if (!stringicmp("rclcat", (*it)->m_fieldspec) ||
!stringicmp("type", (*it)->m_fieldspec)) {
if ((*it)->m_op != WasaQuery::OP_LEAF) {
reason = "internal error: rclcat/type clause not leaf??";
return 0;
}
vector<string> mtypes;
if (config && config->getMimeCatTypes((*it)->m_value, mtypes)
&& !mtypes.empty()) {
for (vector<string>::iterator mit = mtypes.begin();
mit != mtypes.end(); mit++) {
if ((*it)->m_exclude) {
sdata->remFiletype(*mit);
} else {
sdata->addFiletype(*mit);
}
}
} else {
reason = "Unknown rclcat/type value: no mime types found";
return 0;
}
continue;
}
// Handle "date" spec
if (!stringicmp("date", (*it)->m_fieldspec)) {
if ((*it)->m_op != WasaQuery::OP_LEAF) {
reason = "Negative date filtering not supported";
return 0;
}
DateInterval di;
if (!parsedateinterval((*it)->m_value, &di)) {
LOGERR(("wasaQueryToRcl: bad date interval format\n"));
reason = "Bad date interval format";
return 0;
}
LOGDEB(("wasaQueryToRcl:: date span: %d-%d-%d/%d-%d-%d\n",
di.y1,di.m1,di.d1, di.y2,di.m2,di.d2));
sdata->setDateSpan(&di);
continue;
}
// Handle "size" spec
if (!stringicmp("size", (*it)->m_fieldspec)) {
if ((*it)->m_op != WasaQuery::OP_LEAF) {
reason = "Negative size filtering not supported";
return 0;
}
char *cp;
size_t size = strtoll((*it)->m_value.c_str(), &cp, 10);
if (*cp != 0) {
switch (*cp) {
case 'k': case 'K': size *= 1E3;break;
case 'm': case 'M': size *= 1E6;break;
case 'g': case 'G': size *= 1E9;break;
case 't': case 'T': size *= 1E12;break;
default:
reason = string("Bad multiplier suffix: ") + *cp;
return 0;
}
}
switch ((*it)->m_rel) {
case WasaQuery::REL_EQUALS:
sdata->setMaxSize(size);
sdata->setMinSize(size);
break;
case WasaQuery::REL_LT:
case WasaQuery::REL_LTE:
sdata->setMaxSize(size);
break;
case WasaQuery::REL_GT:
case WasaQuery::REL_GTE:
sdata->setMinSize(size);
break;
default:
reason = "Bad relation operator with size query. Use > < or =";
return 0;
}
continue;
}
// "Regular" processing follows:
unsigned int mods = (unsigned int)(*it)->m_modifiers;
LOGDEB0(("wasaQueryToRcl: clause modifiers 0x%x\n", mods));
nclause = 0;
switch ((*it)->m_op) {
case WasaQuery::OP_NULL:
case WasaQuery::OP_AND:
default:
reason = "Found bad NULL or AND query type in list";
LOGERR(("wasaQueryToRcl: found bad NULL or AND q type in list\n"));
continue;
case WasaQuery::OP_LEAF: {
LOGDEB0(("wasaQueryToRcl: leaf clause [%s:%s] slack %d excl %d\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
(*it)->m_slack, (*it)->m_exclude));
// Change terms found in the "autosuffs" list into "ext"
// field queries
if ((*it)->m_fieldspec.empty() && !autosuffs.empty()) {
vector<string> asfv;
if (stringToStrings(autosuffs, asfv)) {
if (find_if(asfv.begin(), asfv.end(),
StringIcmpPred((*it)->m_value)) != asfv.end()) {
(*it)->m_fieldspec = "ext";
(*it)->m_modifiers |= WasaQuery::WQM_NOSTEM;
}
}
}
if (!stringicmp("dir", (*it)->m_fieldspec)) {
// dir filtering special case
nclause = new Rcl::SearchDataClausePath((*it)->m_value,
(*it)->m_exclude);
} else {
if ((*it)->m_exclude && wasa->m_op != WasaQuery::OP_AND) {
LOGERR(("wasaQueryToRcl: excl clause inside OR list!\n"));
continue;
}
if (mods & WasaQuery::WQM_QUOTED) {
Rcl::SClType tp = (mods & WasaQuery::WQM_PROX) ?
Rcl::SCLT_NEAR :
Rcl::SCLT_PHRASE;
nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
(*it)->m_slack,
(*it)->m_fieldspec);
} else {
// If term has commas or slashes inside, take it
// as a list, turn the slashes/commas to spaces,
// leave unquoted. Otherwise, this would end up as
// a phrase query. This is a handy way to enter
// multiple terms to be searched inside a
// field. We interpret ',' as AND, and '/' as
// OR. No mixes allowed and ',' wins.
Rcl::SClType tp = (*it)->m_exclude ? Rcl::SCLT_OR:
Rcl::SCLT_AND;
string ns = neutchars((*it)->m_value, ",");
if (ns.compare((*it)->m_value)) {
// had ','
tp = Rcl::SCLT_AND;
} else {
ns = neutchars((*it)->m_value, "/");
if (ns.compare((*it)->m_value)) {
tp = Rcl::SCLT_OR;
}
}
nclause = new Rcl::SearchDataClauseSimple(tp, ns,
(*it)->m_fieldspec);
}
nclause->setexclude((*it)->m_exclude);
}
if (nclause == 0) {
reason = "Out of memory";
LOGERR(("wasaQueryToRcl: out of memory\n"));
return 0;
}
}
break;
case WasaQuery::OP_OR:
LOGDEB2(("wasaQueryToRcl: OR clause [%s]:[%s]\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
// Create a subquery.
Rcl::SearchData *sub =
wasaQueryToRcl(config, stemlang, *it, autosuffs, reason);
if (sub == 0) {
continue;
}
nclause =
new Rcl::SearchDataClauseSub(RefCntr<Rcl::SearchData>(sub));
if (nclause == 0) {
LOGERR(("wasaQueryToRcl: out of memory\n"));
reason = "Out of memory";
return 0;
}
}
if (mods & WasaQuery::WQM_NOSTEM)
nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
if (mods & WasaQuery::WQM_DIACSENS)
nclause->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
if (mods & WasaQuery::WQM_CASESENS)
nclause->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
if ((*it)->m_weight != 1.0)
nclause->setWeight((*it)->m_weight);
sdata->addClause(nclause);
}
return sdata;
}
Rcl::SearchData *wasaStringToRcl(const RclConfig *config, const string& stemlang,
const string &qs, string &reason,
const string& autosuffs)
{
StringToWasaQuery parser;
WasaQuery *wq = parser.stringToQuery(qs, reason);
if (wq == 0)
return 0;
return wasaQueryToRcl(config, stemlang, wq, autosuffs, reason);
}


@ -17,15 +17,18 @@
#ifndef _WASATORCL_H_INCLUDED_
#define _WASATORCL_H_INCLUDED_
#include <string>
using std::string;
#include "rcldb.h"
#include "searchdata.h"
namespace Rcl {
class SearchData;
}
class RclConfig;
extern Rcl::SearchData *wasaStringToRcl(const RclConfig *, const string& stemlang,
const string& query, string &reason,
const string& autosuffs = string());
extern Rcl::SearchData *wasaStringToRcl(const RclConfig *,
const std::string& stemlang,
const std::string& query,
std::string &reason,
const std::string& autosuffs = "");
#endif /* _WASATORCL_H_INCLUDED_ */


@ -52,8 +52,6 @@ namespace Rcl {
typedef vector<SearchDataClause *>::iterator qlist_it_t;
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
static const int original_term_wqf_booster = 10;
void SearchData::commoninit()
{
m_haveDates = false;
@ -74,241 +72,6 @@ SearchData::~SearchData()
delete *it;
}
// Expand categories and mime type wild card exps Categories are
// expanded against the configuration, mimetypes against the index
// (for wildcards).
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
{
const RclConfig *cfg = db.getConf();
if (!cfg) {
LOGFATAL(("Db::expandFileTypes: null configuration!!\n"));
return false;
}
vector<string> exptps;
for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
if (cfg->isMimeCategory(*it)) {
vector<string>tps;
cfg->getMimeCatTypes(*it, tps);
exptps.insert(exptps.end(), tps.begin(), tps.end());
} else {
TermMatchResult res;
string mt = stringtolower((const string&)*it);
// We set casesens|diacsens to get an equivalent of ixTermMatch()
db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
mt, res, -1, "mtype");
if (res.entries.empty()) {
exptps.push_back(it->c_str());
} else {
for (vector<TermMatchEntry>::const_iterator rit =
res.entries.begin(); rit != res.entries.end(); rit++) {
exptps.push_back(strip_prefix(rit->term));
}
}
}
}
sort(exptps.begin(), exptps.end());
exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());
tps = exptps;
return true;
}
static const char *maxXapClauseMsg =
"Maximum Xapian query size exceeded. Increase maxXapianClauses "
"in the configuration. ";
static const char *maxXapClauseCaseDiacMsg =
"Or try to use case (C) or diacritics (D) sensitivity qualifiers, or less "
"wildcards ?"
;
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
vector<SearchDataClause*>& query,
string& reason, void *d)
{
Xapian::Query xq;
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
Xapian::Query nq;
if (!(*it)->toNativeQuery(db, &nq)) {
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
(*it)->getReason().c_str()));
reason += (*it)->getReason() + " ";
return false;
}
if (nq.empty()) {
LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
continue;
}
// If this structure is an AND list, must use AND_NOT for excl clauses.
// Else this is an OR list, and there can't be excl clauses (checked by
// addClause())
Xapian::Query::op op;
if (tp == SCLT_AND) {
if ((*it)->getexclude()) {
op = Xapian::Query::OP_AND_NOT;
} else {
op = Xapian::Query::OP_AND;
}
} else {
op = Xapian::Query::OP_OR;
}
if (xq.empty()) {
if (op == Xapian::Query::OP_AND_NOT)
xq = Xapian::Query(op, Xapian::Query::MatchAll, nq);
else
xq = nq;
} else {
xq = Xapian::Query(op, xq, nq);
}
if (int(xq.get_length()) >= getMaxCl()) {
LOGERR(("%s\n", maxXapClauseMsg));
m_reason += maxXapClauseMsg;
if (!o_index_stripchars)
m_reason += maxXapClauseCaseDiacMsg;
return false;
}
}
LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length()));
if (xq.empty())
xq = Xapian::Query::MatchAll;
*((Xapian::Query *)d) = xq;
return true;
}
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
m_reason.erase();
db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);
// Walk the clause list translating each in turn and building the
// Xapian query tree
Xapian::Query xq;
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
m_reason.c_str()));
return false;
}
if (m_haveDates) {
// If one of the extremities is unset, compute db extremas
if (m_dates.y1 == 0 || m_dates.y2 == 0) {
int minyear = 1970, maxyear = 2100;
if (!db.maxYearSpan(&minyear, &maxyear)) {
LOGERR(("Can't retrieve index min/max dates\n"));
//whatever, go on.
}
if (m_dates.y1 == 0) {
m_dates.y1 = minyear;
m_dates.m1 = 1;
m_dates.d1 = 1;
}
if (m_dates.y2 == 0) {
m_dates.y2 = maxyear;
m_dates.m2 = 12;
m_dates.d2 = 31;
}
}
LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n",
m_dates.y1, m_dates.m1, m_dates.d1,
m_dates.y2, m_dates.m2, m_dates.d2));
Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
m_dates.y2, m_dates.m2, m_dates.d2);
if (dq.empty()) {
LOGINFO(("Db::toNativeQuery: date filter is empty\n"));
}
// If no probabilistic query is provided then promote the daterange
// filter to be THE query instead of filtering an empty query.
if (xq.empty()) {
LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
xq = dq;
} else {
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
}
}
if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) {
Xapian::Query sq;
char min[50], max[50];
sprintf(min, "%lld", (long long)m_minSize);
sprintf(max, "%lld", (long long)m_maxSize);
if (m_minSize == size_t(-1)) {
string value(max);
leftzeropad(value, 12);
sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value);
} else if (m_maxSize == size_t(-1)) {
string value(min);
leftzeropad(value, 12);
sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);
} else {
string minvalue(min);
leftzeropad(minvalue, 12);
string maxvalue(max);
leftzeropad(maxvalue, 12);
sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE,
minvalue, maxvalue);
}
// If no probabilistic query is provided then promote the
// filter to be THE query instead of filtering an empty query.
if (xq.empty()) {
LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
xq = sq;
} else {
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
}
}
// Add the autophrase if any
if (m_autophrase.isNotNull()) {
Xapian::Query apq;
if (m_autophrase->toNativeQuery(db, &apq)) {
xq = xq.empty() ? apq :
Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
}
}
// Add the file type filtering clause if any
if (!m_filetypes.empty()) {
expandFileTypes(db, m_filetypes);
Xapian::Query tq;
for (vector<string>::iterator it = m_filetypes.begin();
it != m_filetypes.end(); it++) {
string term = wrap_prefix(mimetype_prefix) + *it;
LOGDEB0(("Adding file type term: [%s]\n", term.c_str()));
tq = tq.empty() ? Xapian::Query(term) :
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
}
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
}
// Add the neg file type filtering clause if any
if (!m_nfiletypes.empty()) {
expandFileTypes(db, m_nfiletypes);
Xapian::Query tq;
for (vector<string>::iterator it = m_nfiletypes.begin();
it != m_nfiletypes.end(); it++) {
string term = wrap_prefix(mimetype_prefix) + *it;
LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str()));
tq = tq.empty() ? Xapian::Query(term) :
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
}
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
}
*((Xapian::Query *)d) = xq;
return true;
}
// This is called by the GUI simple search if the option is set: add
// (OR) phrase to a query (if it is simple enough) so that results
// where the search terms are close and in order will come up on top.
@ -428,695 +191,4 @@ void SearchData::getTerms(HighlightData &hld) const
return;
}
// Splitter callback for breaking a user string into simple terms and
// phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are
// actually multiple terms to rcldb (ie term1,term2)
class TextSplitQ : public TextSplitP {
public:
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
: TextSplitP(prc, flags),
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
{}
bool takeword(const std::string &term, int pos, int bs, int be)
{
// Check if the first letter is a majuscule in which
// case we do not want to do stem expansion. Need to do this
// before unac of course...
curnostemexp = unaciscapital(term);
return TextSplitP::takeword(term, pos, bs, be);
}
bool curnostemexp;
vector<string> terms;
vector<bool> nostemexps;
const StopList &stops;
// Count of terms including stopwords: this is for adjusting
// phrase/near slack
int alltermcount;
int lastpos;
};
class TermProcQ : public TermProc {
public:
TermProcQ() : TermProc(0), m_ts(0) {}
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
bool takeword(const std::string &term, int pos, int bs, int be)
{
m_ts->alltermcount++;
if (m_ts->lastpos < pos)
m_ts->lastpos = pos;
bool noexpand = be ? m_ts->curnostemexp : true;
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
term.c_str(), pos, noexpand));
if (m_terms[pos].size() < term.size()) {
m_terms[pos] = term;
m_nste[pos] = noexpand;
}
return true;
}
bool flush()
{
for (map<int, string>::const_iterator it = m_terms.begin();
it != m_terms.end(); it++) {
m_ts->terms.push_back(it->second);
m_ts->nostemexps.push_back(m_nste[it->first]);
}
return true;
}
private:
TextSplitQ *m_ts;
map<int, string> m_terms;
map<int, bool> m_nste;
};
#if 1
static void listVector(const string& what, const vector<string>&l)
{
string a;
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
a = a + *it + " ";
}
LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
}
#endif
/** Expand term into term list, using appropriate mode: stem, wildcards,
* diacritics...
*
* @param mods stem expansion, case and diacritics sensitivity control.
* @param term input single word
* @param oexp output expansion list
* @param sterm output original input term if there were no wildcards
* @param prefix field prefix in index. We could recompute it, but the caller
* has it already. Used in the simple case where there is nothing to expand,
* and we just return the prefixed term (else Db::termMatch deals with it).
*/
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
string& ermsg, int mods,
const string& term,
vector<string>& oexp, string &sterm,
const string& prefix)
{
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
sterm.clear();
oexp.clear();
if (term.empty())
return true;
bool maxexpissoft = false;
int maxexpand = getSoftMaxExp();
if (maxexpand != -1) {
maxexpissoft = true;
} else {
maxexpand = getMaxExp();
}
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
// If there are no wildcards, add term to the list of user-entered terms
if (!haswild) {
m_hldata.uterms.insert(term);
sterm = term;
}
// No stem expansion if there are wildcards or if prevented by caller
bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
if (haswild || getStemLang().empty()) {
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
nostemexp = true;
}
// noexpansion can be modified further down by possible case/diac expansion
bool noexpansion = nostemexp && !haswild;
int termmatchsens = 0;
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
bool case_sensitive = (mods & SDCM_CASESENS) != 0;
if (o_index_stripchars) {
diac_sensitive = case_sensitive = false;
} else {
// If we are working with a raw index, apply the rules for case and
// diacritics sensitivity.
// If any character has a diacritic, we become
// diacritic-sensitive. Note that the way that the test is
// performed (conversion+comparison) will automatically ignore
// accented characters which are actually a separate letter
if (getAutoDiac() && unachasaccents(term)) {
LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
diac_sensitive = true;
}
// If any character apart the first is uppercase, we become
// case-sensitive. The first character is reserved for
// turning off stemming. You need to use a query language
// modifier to search for Floor in a case-sensitive way.
Utf8Iter it(term);
it++;
if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
case_sensitive = true;
}
// If we are sensitive to case or diacritics turn stemming off
if (diac_sensitive || case_sensitive) {
LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n"));
nostemexp = true;
}
if (!case_sensitive || !diac_sensitive)
noexpansion = false;
}
if (case_sensitive)
termmatchsens |= Db::ET_CASESENS;
if (diac_sensitive)
termmatchsens |= Db::ET_DIACSENS;
if (noexpansion) {
oexp.push_back(prefix + term);
m_hldata.terms[term] = term;
LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str()));
return true;
}
Db::MatchType mtyp = haswild ? Db::ET_WILD :
nostemexp ? Db::ET_NONE : Db::ET_STEM;
TermMatchResult res;
if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
m_field)) {
// Let it go through
}
// Term match entries to vector of terms
if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
ermsg = "Maximum term expansion size exceeded."
" Maybe use case/diacritics sensitivity or increase maxTermExpand.";
return false;
}
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
it != res.entries.end(); it++) {
oexp.push_back(it->term);
}
// If the term does not exist at all in the db, the return from
// termMatch() is going to be empty, which is not what we want (we
// would then compute an empty Xapian query)
if (oexp.empty())
oexp.push_back(prefix + term);
// Remember the uterm-to-expansion links
for (vector<string>::const_iterator it = oexp.begin();
it != oexp.end(); it++) {
m_hldata.terms[strip_prefix(*it)] = term;
}
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
return true;
}
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
void multiply_groups(vector<vector<string> >::const_iterator vvit,
vector<vector<string> >::const_iterator vvend,
vector<string>& comb,
vector<vector<string> >&allcombs)
{
// Remember my string vector and compute next, for recursive calls.
vector<vector<string> >::const_iterator myvit = vvit++;
// Walk the string vector I'm called upon and, for each string,
// add it to the current result, and call myself recursively on the
// next string vector. The last call (last element of the vector of
// vectors) adds the elementary result to the output.
// Walk my string vector
for (vector<string>::const_iterator strit = (*myvit).begin();
strit != (*myvit).end(); strit++) {
// Add my current value to the string vector we're building
comb.push_back(*strit);
if (vvit == vvend) {
// Last call: store current result
allcombs.push_back(comb);
} else {
// Call recursively on next string vector
multiply_groups(vvit, vvend, comb, allcombs);
}
// Pop the value I just added (make room for the next element in my
// vector)
comb.pop_back();
}
}
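// Illustrative example (added for clarity, not part of the original code):
// with the two groups {easy, easily} and {cooking}, multiply_groups()
// produces every combination, which is what the highlighter needs:
//
//     vector<string> g1, g2;
//     g1.push_back("easy"); g1.push_back("easily");
//     g2.push_back("cooking");
//     vector<vector<string> > groups;
//     groups.push_back(g1); groups.push_back(g2);
//     vector<string> comb;
//     vector<vector<string> > allcombs;
//     multiply_groups(groups.begin(), groups.end(), comb, allcombs);
//     // allcombs now holds {easy, cooking} and {easily, cooking}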
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
const string& span,
int mods, void * pq)
{
vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
span.c_str(), (unsigned int)mods));
vector<string> exp;
string sterm; // dumb version of user term
string prefix;
const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
prefix = wrap_prefix(ftp->pfx);
}
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
return;
// Set up the highlight data. No prefix should go in there
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
m_hldata.slacks.push_back(0);
m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
}
// Push either term or OR of stem-expanded set
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
m_curcl += exp.size();
// If sterm (simplified original user term) is not null, give it a
// relevance boost. We do this even if no expansion occurred (else
// the non-expanded terms in a term list would end up with even
// less wqf). This does not happen if there are wildcards anywhere
// in the search.
// We normally boost the original term in the stem expansion list. Don't
// do it if there are wildcards anywhere; this would skew the results.
bool doBoostUserTerm =
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
(m_parentSearch == 0 && !m_haveWildCards);
if (doBoostUserTerm && !sterm.empty()) {
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
Xapian::Query(prefix+sterm,
original_term_wqf_booster));
}
pqueries.push_back(xq);
}
// User entry element had several terms: transform into a PHRASE or
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
TextSplitQ *splitData,
int mods, void *pq,
bool useNear, int slack)
{
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE;
vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
bool hadmultiple = false;
#endif
vector<vector<string> >groups;
string prefix;
const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
prefix = wrap_prefix(ftp->pfx);
}
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
slack++;
}
// Go through the list and perform stem/wildcard expansion for each element
vector<bool>::iterator nxit = splitData->nostemexps.begin();
for (vector<string>::iterator it = splitData->terms.begin();
it != splitData->terms.end(); it++, nxit++) {
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
// Adjust when we do stem expansion. Not if disabled by
// caller, not inside phrases, and some versions of xapian
// will accept only one OR clause inside NEAR.
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|| hadmultiple
#endif // single OR inside NEAR
;
int lmods = mods;
if (nostemexp)
lmods |= SearchDataClause::SDCM_NOSTEMMING;
string sterm;
vector<string> exp;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
return;
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
listVector("", exp);
// groups is used for highlighting, we don't want prefixes in there.
vector<string> noprefs;
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
noprefs.push_back(it->substr(prefix.size()));
}
groups.push_back(noprefs);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end()));
m_curcl += exp.size();
if (m_curcl >= getMaxCl())
return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
if (exp.size() > 1)
hadmultiple = true;
#endif
}
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
slack++;
}
// Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
splitData->alltermcount, splitData->lastpos));
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
splitData->lastpos + 1 + slack);
if (op == Xapian::Query::OP_PHRASE)
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
original_term_wqf_booster);
pqueries.push_back(xq);
// Add all combinations of NEAR/PHRASE groups to the highlighting data.
vector<vector<string> > allcombs;
vector<string> comb;
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
// Insert the search groups and slacks in the highlight data, with
// a reference to the user entry that generated them:
m_hldata.groups.insert(m_hldata.groups.end(),
allcombs.begin(), allcombs.end());
m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
m_hldata.ugroups.size() - 1);
}
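// Illustrative sketch (not part of the original code): for a NEAR clause on
// [easy cooking] with stem expansion enabled, each element above becomes an
// OR over its expansions, roughly:
//
//     OP_NEAR(OP_OR("easy", "easily"),
//             OP_OR("cooking", "cook", "cooked"))
//
// For a PHRASE clause the elements are not stem-expanded (only wildcards can
// expand them), so the sub-queries are usually single terms. The expansions
// shown (easily, cook, cooked) are made up; real ones depend on the index
// and the stemming language.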
// Trim a leading ^ or trailing $ from the string and convert them to anchor flags
static int stringToMods(string& s)
{
int mods = 0;
// Check for an anchored search
trimstring(s);
if (s.length() > 0 && s[0] == '^') {
mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
s.erase(0, 1);
}
if (s.length() > 0 && s[s.length()-1] == '$') {
mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
s.erase(s.length()-1);
}
return mods;
}
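// Illustrative example (not part of the original code):
//
//     string s("^floor$");
//     int mods = stringToMods(s);
//     // mods == (Rcl::SearchDataClause::SDCM_ANCHORSTART |
//     //          Rcl::SearchDataClause::SDCM_ANCHOREND)
//     // s == "floor"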
/**
* Turn user entry string (NOT query language) into a list of xapian queries.
* We just separate words and phrases, and do wildcard and stem expansion.
*
* This is used to process data entered into an OR/AND/NEAR/PHRASE field of
* the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
* entry).
*
* This appears awful, and it would seem that the split into
* terms/phrases should be performed in the upper layer so that we
* only receive pure term or pure near/phrase elements here, but in
* fact there are things that would appear like terms to naive code,
* and which may actually be turned into phrases (ie: tom:jerry),
* in a manner which intimately depends on the index implementation,
* so that it makes sense to process this here.
*
* The final list contains one query for each term or phrase
* - Elements corresponding to a stem-expanded part are an OP_OR
* composition of the stem-expanded terms (or a single term query).
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
* composition of the phrase terms (no stem expansion in this case)
* @return the subquery count (either or'd stem-expanded terms or phrase word
* count)
*/
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
string &ermsg, void *pq,
int slack, bool useNear)
{
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
int mods = m_modifiers;
LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
"slack %d near %d\n",
iq.c_str(), m_field.c_str(), mods, slack, useNear));
ermsg.erase();
m_curcl = 0;
const StopList stops = db.getStopList();
// Simple whitespace-split input into user-level words and
// double-quoted phrases: word1 word2 "this is a phrase".
//
// The text splitter may further decide that the resulting
// "words" are really phrases; this depends on separators:
// [paul@dom.net] would still be a word (span), but [about:me]
// will probably be handled as a phrase.
vector<string> phrases;
TextSplit::stringToStrings(iq, phrases);
// Process each element: textsplit into terms, handle stem/wildcard
// expansion and transform into an appropriate Xapian::Query
try {
for (vector<string>::iterator it = phrases.begin();
it != phrases.end(); it++) {
LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
// Anchoring modifiers
int amods = stringToMods(*it);
int terminc = amods != 0 ? 1 : 0;
mods |= amods;
// If there are multiple spans in this element, including
// at least one composite, we have to increase the slack
// else a phrase query including a span would fail.
// Ex: "term0@term1 term2" is onlyspans-split as:
// 0 term0@term1 0 12
// 2 term2 13 18
// The position of term2 is 2, not 1, so a phrase search
// would fail.
// We used to do word split, searching for
// "term0 term1 term2" instead, which may have worse
// performance, but will succeed.
// We now adjust the phrase/near slack by comparing the term count
// and the last position
// The term processing pipeline:
TermProcQ tpq;
TermProc *nxt = &tpq;
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
//tpcommon.onlygrams(true);
TermProcPrep tpprep(nxt);
if (o_index_stripchars)
nxt = &tpprep;
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),
stops, nxt);
tpq.setTSQ(&splitter);
splitter.text_to_words(*it);
slack += splitter.lastpos - splitter.terms.size() + 1;
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
switch (splitter.terms.size() + terminc) {
case 0:
continue;// ??
case 1: {
int lmods = mods;
if (splitter.nostemexps.front())
lmods |= SearchDataClause::SDCM_NOSTEMMING;
m_hldata.ugroups.push_back(splitter.terms);
processSimpleSpan(db, ermsg, splitter.terms.front(),
lmods, &pqueries);
}
break;
default:
m_hldata.ugroups.push_back(splitter.terms);
processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
useNear, slack);
}
if (m_curcl >= getMaxCl()) {
ermsg = maxXapClauseMsg;
if (!o_index_stripchars)
ermsg += maxXapClauseCaseDiacMsg;
break;
}
}
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
} catch (const string &s) {
ermsg = s;
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
if (!ermsg.empty()) {
LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str()));
return false;
}
return true;
}
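// Illustrative sketch (not part of the original code): for a user entry like
// [word1 "two words" other], the initial split yields three elements; the
// single-span ones go through processSimpleSpan() and the quoted one through
// processPhraseOrNear(), so pqueries ends up with three Xapian::Query
// objects (assuming none is removed by the stop list):
//
//     vector<string> phrases;
//     TextSplit::stringToStrings("word1 \"two words\" other", phrases);
//     // phrases == {"word1", "two words", "other"}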
// Translate a simple OR or AND search clause.
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
{
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
getStemLang().c_str()));
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
Xapian::Query::op op;
switch (m_tp) {
case SCLT_AND: op = Xapian::Query::OP_AND; break;
case SCLT_OR: op = Xapian::Query::OP_OR; break;
default:
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
return false;
}
vector<Xapian::Query> pqueries;
if (!processUserString(db, m_text, m_reason, &pqueries))
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
return true;
}
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true;
}
// Translate a FILENAME search clause. This always comes
// from a "filename" search from the gui or recollq. A query language
// "filename:"-prefixed field will not go through here, but through
// the generic field-processing code.
//
// We do not split the entry any more (we used to do some crazy thing
// about expanding multiple fragments in the past). We just take the
// value, blanks and all, and expand it against the indexed unsplit
// file names.
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
{
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
int maxexp = getSoftMaxExp();
if (maxexp == -1)
maxexp = getMaxExp();
vector<string> names;
db.filenameWildExp(m_text, names, maxexp);
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true;
}
// Translate a dir: path filtering clause. See comments in .h
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
{
LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
if (m_text.empty()) {
LOGERR(("SearchDataClausePath: empty path??\n"));
m_reason = "Empty path ?";
return false;
}
vector<Xapian::Query> orqueries;
if (m_text[0] == '/')
orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
else
m_text = path_tildexpand(m_text);
vector<string> vpath;
stringToTokens(m_text, vpath, "/");
for (vector<string>::const_iterator pit = vpath.begin();
pit != vpath.end(); pit++){
string sterm;
vector<string> exp;
if (!expandTerm(db, m_reason,
SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
return false;
}
LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
listVector("", exp);
if (exp.size() == 1)
orqueries.push_back(Xapian::Query(exp[0]));
else
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end()));
m_curcl += exp.size();
if (m_curcl >= getMaxCl())
return false;
}
*qp = Xapian::Query(Xapian::Query::OP_PHRASE,
orqueries.begin(), orqueries.end());
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true;
}
// Translate NEAR or PHRASE clause.
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
{
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
vector<Xapian::Query> pqueries;
Xapian::Query nq;
// We produce a single phrase out of the user entry, then use
// stringToXapianQueries() to lowercase and simplify the phrase
// terms etc. This will result in a single (complex)
// Xapian::Query.
if (m_text.find('\"') != string::npos) {
m_text = neutchars(m_text, "\"");
}
string s = cstr_dquote + m_text + cstr_dquote;
bool useNear = (m_tp == SCLT_NEAR);
if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
return true;
}
*qp = *pqueries.begin();
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true;
}
} // Namespace Rcl

View File

@ -102,7 +102,7 @@ public:
bool toNativeQuery(Rcl::Db &db, void *);
/** We become the owner of cl and will delete it */
bool addClause(SearchDataClause *cl);
bool addClause(SearchDataClause* cl);
/** If this is a simple query (one field only, no distance clauses),
* add phrase made of query terms to query, so that docs containing the
@ -164,7 +164,7 @@ public:
private:
// Combine type. Only SCLT_AND or SCLT_OR here
SClType m_tp;
// Complex query descriptor
// The clauses
std::vector<SearchDataClause*> m_query;
// Restricted set of filetypes if not empty.
std::vector<std::string> m_filetypes;
@ -173,14 +173,18 @@ private:
// Autophrase if set. Can't be part of the normal chain because
// it uses OP_AND_MAYBE
RefCntr<SearchDataClauseDist> m_autophrase;
//
// Special stuff produced by input which looks like a clause but means
// something else (date and size specs)
bool m_haveDates;
DateInterval m_dates; // Restrict to date interval
size_t m_maxSize;
size_t m_minSize;
// Printable expanded version of the complete query, retrieved/set
// from rcldb after the Xapian::setQuery() call
std::string m_description;
// Error diag
std::string m_reason;
bool m_haveWildCards;
std::string m_stemlang;
@ -215,10 +219,12 @@ class SearchDataClause {
public:
enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16};
enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
SearchDataClause(SClType tp)
: m_tp(tp), m_parentSearch(0), m_haveWildCards(0),
m_modifiers(SDCM_NONE), m_weight(1.0), m_exclude(false)
m_modifiers(SDCM_NONE), m_weight(1.0), m_exclude(false),
m_rel(REL_CONTAINS)
{}
virtual ~SearchDataClause() {}
virtual bool toNativeQuery(Rcl::Db &db, void *) = 0;
@ -230,6 +236,9 @@ public:
{
return m_tp;
}
void setTp(SClType tp) {
m_tp = tp;
}
void setParent(SearchData *p)
{
m_parentSearch = p;
@ -279,7 +288,12 @@ public:
{
m_exclude = onoff;
}
virtual void setrel(Relation rel) {
m_rel = rel;
}
virtual Relation getrel() {
return m_rel;
}
friend class SearchData;
protected:
std::string m_reason;
@ -289,6 +303,8 @@ protected:
Modifier m_modifiers;
float m_weight;
bool m_exclude;
Relation m_rel;
private:
SearchDataClause(const SearchDataClause&)
{
@ -339,13 +355,15 @@ public:
{
return m_field;
}
virtual void setfield(const string& field) {
m_field = field;
}
protected:
std::string m_text; // Raw user entry text.
std::string m_field; // Field specification if any
HighlightData m_hldata;
// Current count of Xapian clauses, to check against expansion limit
int m_curcl;
bool processUserString(Rcl::Db &db, const string &iq,
std::string &ermsg,
void* pq, int slack = 0, bool useNear = false);
@ -444,6 +462,9 @@ public:
{
return m_slack;
}
virtual void setslack(int slack) {
m_slack = slack;
}
private:
int m_slack;
};

983
src/rcldb/searchdatatox.cpp Normal file
View File

@ -0,0 +1,983 @@
/* Copyright (C) 2006 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
// Handle translation from rcl's SearchData structures to Xapian Queries
#include "autoconfig.h"
#include <stdio.h>
#include <string>
#include <vector>
#include <algorithm>
#include <sstream>
using namespace std;
#include "xapian.h"
#include "cstr.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "searchdata.h"
#include "debuglog.h"
#include "smallut.h"
#include "textsplit.h"
#include "unacpp.h"
#include "utf8iter.h"
#include "stoplist.h"
#include "rclconfig.h"
#include "termproc.h"
#include "synfamily.h"
#include "stemdb.h"
#include "expansiondbs.h"
#include "base64.h"
#include "daterange.h"
namespace Rcl {
typedef vector<SearchDataClause *>::iterator qlist_it_t;
static const int original_term_wqf_booster = 10;
// Expand categories and mime type wildcard expressions. Categories are
// expanded against the configuration, mime types against the index
// (for wildcards).
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
{
const RclConfig *cfg = db.getConf();
if (!cfg) {
LOGFATAL(("Db::expandFileTypes: null configuration!!\n"));
return false;
}
vector<string> exptps;
for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
if (cfg->isMimeCategory(*it)) {
vector<string>tps;
cfg->getMimeCatTypes(*it, tps);
exptps.insert(exptps.end(), tps.begin(), tps.end());
} else {
TermMatchResult res;
string mt = stringtolower((const string&)*it);
// We set casesens|diacsens to get an equivalent of ixTermMatch()
db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
mt, res, -1, "mtype");
if (res.entries.empty()) {
exptps.push_back(it->c_str());
} else {
for (vector<TermMatchEntry>::const_iterator rit =
res.entries.begin(); rit != res.entries.end(); rit++) {
exptps.push_back(strip_prefix(rit->term));
}
}
}
}
sort(exptps.begin(), exptps.end());
exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());
tps = exptps;
return true;
}
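// Illustrative example (not in the original source): toNativeQuery() below
// calls expandFileTypes(db, m_filetypes). Assuming the user selected a
// (hypothetical) "presentation" mime category plus a "text/*" wildcard, the
// vector could be transformed roughly as:
//
//     // before: {"presentation", "text/*"}
//     // after:  {"application/vnd.oasis.opendocument.presentation",
//     //          "text/html", "text/plain", ...}
//
// Category names come from the configuration (isMimeCategory() /
// getMimeCatTypes()); wildcards are expanded against the "mtype" terms
// actually present in the index, so the result varies with the index.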
static const char *maxXapClauseMsg =
"Maximum Xapian query size exceeded. Increase maxXapianClauses "
"in the configuration. ";
static const char *maxXapClauseCaseDiacMsg =
"Or try to use case (C) or diacritics (D) sensitivity qualifiers, or less "
"wildcards ?"
;
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
vector<SearchDataClause*>& query,
string& reason, void *d)
{
Xapian::Query xq;
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
Xapian::Query nq;
if (!(*it)->toNativeQuery(db, &nq)) {
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
(*it)->getReason().c_str()));
reason += (*it)->getReason() + " ";
return false;
}
if (nq.empty()) {
LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
continue;
}
// If this structure is an AND list, must use AND_NOT for excl clauses.
// Else this is an OR list, and there can't be excl clauses (checked by
// addClause())
Xapian::Query::op op;
if (tp == SCLT_AND) {
if ((*it)->getexclude()) {
op = Xapian::Query::OP_AND_NOT;
} else {
op = Xapian::Query::OP_AND;
}
} else {
op = Xapian::Query::OP_OR;
}
if (xq.empty()) {
if (op == Xapian::Query::OP_AND_NOT)
xq = Xapian::Query(op, Xapian::Query::MatchAll, nq);
else
xq = nq;
} else {
xq = Xapian::Query(op, xq, nq);
}
if (int(xq.get_length()) >= getMaxCl()) {
LOGERR(("%s\n", maxXapClauseMsg));
m_reason += maxXapClauseMsg;
if (!o_index_stripchars)
m_reason += maxXapClauseCaseDiacMsg;
return false;
}
}
LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length()));
if (xq.empty())
xq = Xapian::Query::MatchAll;
*((Xapian::Query *)d) = xq;
return true;
}
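// Illustrative note (not part of the original code): in an AND list an
// excluded clause is attached with OP_AND_NOT. If it happens to be the first
// non-empty clause, MatchAll supplies the left-hand side:
//
//     xq = Xapian::Query(Xapian::Query::OP_AND_NOT,
//                        Xapian::Query::MatchAll, nq);
//
// Later clauses then chain onto xq with OP_AND / OP_AND_NOT (or OP_OR for an
// OR list).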
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
m_reason.erase();
db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);
// Walk the clause list translating each in turn and building the
// Xapian query tree
Xapian::Query xq;
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
m_reason.c_str()));
return false;
}
if (m_haveDates) {
// If one of the extremities is unset, compute db extremas
if (m_dates.y1 == 0 || m_dates.y2 == 0) {
int minyear = 1970, maxyear = 2100;
if (!db.maxYearSpan(&minyear, &maxyear)) {
LOGERR(("Can't retrieve index min/max dates\n"));
//whatever, go on.
}
if (m_dates.y1 == 0) {
m_dates.y1 = minyear;
m_dates.m1 = 1;
m_dates.d1 = 1;
}
if (m_dates.y2 == 0) {
m_dates.y2 = maxyear;
m_dates.m2 = 12;
m_dates.d2 = 31;
}
}
LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n",
m_dates.y1, m_dates.m1, m_dates.d1,
m_dates.y2, m_dates.m2, m_dates.d2));
Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
m_dates.y2, m_dates.m2, m_dates.d2);
if (dq.empty()) {
LOGINFO(("Db::toNativeQuery: date filter is empty\n"));
}
// If no probabilistic query is provided then promote the daterange
// filter to be THE query instead of filtering an empty query.
if (xq.empty()) {
LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
xq = dq;
} else {
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
}
}
if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) {
Xapian::Query sq;
char min[50], max[50];
sprintf(min, "%lld", (long long)m_minSize);
sprintf(max, "%lld", (long long)m_maxSize);
if (m_minSize == size_t(-1)) {
string value(max);
leftzeropad(value, 12);
sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value);
} else if (m_maxSize == size_t(-1)) {
string value(min);
leftzeropad(value, 12);
sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);
} else {
string minvalue(min);
leftzeropad(minvalue, 12);
string maxvalue(max);
leftzeropad(maxvalue, 12);
sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE,
minvalue, maxvalue);
}
// If no probabilistic query is provided then promote the
// filter to be THE query instead of filtering an empty query.
if (xq.empty()) {
LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
xq = sq;
} else {
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
}
}
// Add the autophrase if any
if (m_autophrase.isNotNull()) {
Xapian::Query apq;
if (m_autophrase->toNativeQuery(db, &apq)) {
xq = xq.empty() ? apq :
Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
}
}
// Add the file type filtering clause if any
if (!m_filetypes.empty()) {
expandFileTypes(db, m_filetypes);
Xapian::Query tq;
for (vector<string>::iterator it = m_filetypes.begin();
it != m_filetypes.end(); it++) {
string term = wrap_prefix(mimetype_prefix) + *it;
LOGDEB0(("Adding file type term: [%s]\n", term.c_str()));
tq = tq.empty() ? Xapian::Query(term) :
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
}
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
}
// Add the neg file type filtering clause if any
if (!m_nfiletypes.empty()) {
expandFileTypes(db, m_nfiletypes);
Xapian::Query tq;
for (vector<string>::iterator it = m_nfiletypes.begin();
it != m_nfiletypes.end(); it++) {
string term = wrap_prefix(mimetype_prefix) + *it;
LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str()));
tq = tq.empty() ? Xapian::Query(term) :
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
}
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
}
*((Xapian::Query *)d) = xq;
return true;
}
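// Illustrative example (not in the original source): the size limits above
// are turned into 12-digit zero-padded strings so that Xapian's string
// comparison on VALUE_SIZE orders numerically. For m_minSize == 1500:
//
//     char min[50];
//     sprintf(min, "%lld", (long long)1500);
//     string value(min);       // "1500"
//     leftzeropad(value, 12);  // "000000001500"
//     Xapian::Query sq(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);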
// Splitter callback for breaking a user string into simple terms and
// phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are
// actually multiple terms to rcldb (e.g. term1,term2)
class TextSplitQ : public TextSplitP {
public:
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
: TextSplitP(prc, flags),
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
{}
bool takeword(const std::string &term, int pos, int bs, int be)
{
// Check if the first letter is uppercase, in which
// case we do not want to do stem expansion. This needs to
// happen before unac of course...
curnostemexp = unaciscapital(term);
return TextSplitP::takeword(term, pos, bs, be);
}
bool curnostemexp;
vector<string> terms;
vector<bool> nostemexps;
const StopList &stops;
// Count of terms including stopwords: this is for adjusting
// phrase/near slack
int alltermcount;
int lastpos;
};
class TermProcQ : public TermProc {
public:
TermProcQ() : TermProc(0), m_ts(0) {}
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
bool takeword(const std::string &term, int pos, int bs, int be)
{
m_ts->alltermcount++;
if (m_ts->lastpos < pos)
m_ts->lastpos = pos;
bool noexpand = be ? m_ts->curnostemexp : true;
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
term.c_str(), pos, noexpand));
if (m_terms[pos].size() < term.size()) {
m_terms[pos] = term;
m_nste[pos] = noexpand;
}
return true;
}
bool flush()
{
for (map<int, string>::const_iterator it = m_terms.begin();
it != m_terms.end(); it++) {
m_ts->terms.push_back(it->second);
m_ts->nostemexps.push_back(m_nste[it->first]);
}
return true;
}
private:
TextSplitQ *m_ts;
map<int, string> m_terms;
map<int, bool> m_nste;
};
#if 1
static void listVector(const string& what, const vector<string>&l)
{
string a;
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
a = a + *it + " ";
}
LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
}
#endif
/** Expand term into term list, using appropriate mode: stem, wildcards,
* diacritics...
*
* @param mods stem expansion, case and diacritics sensitivity control.
* @param term input single word
* @param oexp output expansion list
* @param sterm output original input term if there were no wildcards
* @param prefix field prefix in index. We could recompute it, but the caller
* has it already. Used in the simple case where there is nothing to expand,
* and we just return the prefixed term (else Db::termMatch deals with it).
*/
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
string& ermsg, int mods,
const string& term,
vector<string>& oexp, string &sterm,
const string& prefix)
{
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
sterm.clear();
oexp.clear();
if (term.empty())
return true;
bool maxexpissoft = false;
int maxexpand = getSoftMaxExp();
if (maxexpand != -1) {
maxexpissoft = true;
} else {
maxexpand = getMaxExp();
}
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
// If there are no wildcards, add term to the list of user-entered terms
if (!haswild) {
m_hldata.uterms.insert(term);
sterm = term;
}
// No stem expansion if there are wildcards or if prevented by caller
bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
if (haswild || getStemLang().empty()) {
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
nostemexp = true;
}
// noexpansion can be modified further down by possible case/diac expansion
bool noexpansion = nostemexp && !haswild;
int termmatchsens = 0;
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
bool case_sensitive = (mods & SDCM_CASESENS) != 0;
if (o_index_stripchars) {
diac_sensitive = case_sensitive = false;
} else {
// If we are working with a raw index, apply the rules for case and
// diacritics sensitivity.
// If any character has a diacritic, we become
// diacritic-sensitive. Note that the way that the test is
// performed (conversion+comparison) will automatically ignore
// accented characters which are actually separate letters
if (getAutoDiac() && unachasaccents(term)) {
LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
diac_sensitive = true;
}
// If any character apart from the first is uppercase, we become
// case-sensitive. The first character is reserved for
// turning off stemming. You need to use a query language
// modifier to search for Floor in a case-sensitive way.
Utf8Iter it(term);
it++;
if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
case_sensitive = true;
}
// If we are sensitive to case or diacritics turn stemming off
if (diac_sensitive || case_sensitive) {
LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n"));
nostemexp = true;
}
if (!case_sensitive || !diac_sensitive)
noexpansion = false;
}
if (case_sensitive)
termmatchsens |= Db::ET_CASESENS;
if (diac_sensitive)
termmatchsens |= Db::ET_DIACSENS;
if (noexpansion) {
oexp.push_back(prefix + term);
m_hldata.terms[term] = term;
LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str()));
return true;
}
Db::MatchType mtyp = haswild ? Db::ET_WILD :
nostemexp ? Db::ET_NONE : Db::ET_STEM;
TermMatchResult res;
if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
m_field)) {
// Let it go through
}
// Term match entries to vector of terms
if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
ermsg = "Maximum term expansion size exceeded."
" Maybe use case/diacritics sensitivity or increase maxTermExpand.";
return false;
}
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
it != res.entries.end(); it++) {
oexp.push_back(it->term);
}
// If the term does not exist at all in the db, the return from
// termMatch() is going to be empty, which is not what we want (we
// would then compute an empty Xapian query)
if (oexp.empty())
oexp.push_back(prefix + term);
// Remember the uterm-to-expansion links
for (vector<string>::const_iterator it = oexp.begin();
it != oexp.end(); it++) {
m_hldata.terms[strip_prefix(*it)] = term;
}
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
return true;
}
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
void multiply_groups(vector<vector<string> >::const_iterator vvit,
vector<vector<string> >::const_iterator vvend,
vector<string>& comb,
vector<vector<string> >&allcombs)
{
// Remember my string vector and compute next, for recursive calls.
vector<vector<string> >::const_iterator myvit = vvit++;
// Walk the string vector I'm called upon and, for each string,
// add it to the current result, and call myself recursively on the
// next string vector. The last call (last element of the vector of
// vectors) adds the elementary result to the output.
// Walk my string vector
for (vector<string>::const_iterator strit = (*myvit).begin();
strit != (*myvit).end(); strit++) {
// Add my current value to the string vector we're building
comb.push_back(*strit);
if (vvit == vvend) {
// Last call: store current result
allcombs.push_back(comb);
} else {
// Call recursively on next string vector
multiply_groups(vvit, vvend, comb, allcombs);
}
// Pop the value I just added (make room for the next element in my
// vector)
comb.pop_back();
}
}
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
const string& span,
int mods, void * pq)
{
vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
span.c_str(), (unsigned int)mods));
vector<string> exp;
string sterm; // dumb version of user term
string prefix;
const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
prefix = wrap_prefix(ftp->pfx);
}
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
return;
// Set up the highlight data. No prefix should go in there
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
m_hldata.slacks.push_back(0);
m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
}
// Push either term or OR of stem-expanded set
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
m_curcl += exp.size();
// If sterm (simplified original user term) is not null, give it a
// relevance boost. We do this even if no expansion occurred (else
// the non-expanded terms in a term list would end up with even
// less wqf). This does not happen if there are wildcards anywhere
// in the search.
// We normally boost the original term in the stem expansion list. Don't
// do it if there are wildcards anywhere; this would skew the results.
bool doBoostUserTerm =
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
(m_parentSearch == 0 && !m_haveWildCards);
if (doBoostUserTerm && !sterm.empty()) {
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
Xapian::Query(prefix+sterm,
original_term_wqf_booster));
}
pqueries.push_back(xq);
}
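// Illustrative sketch (not part of the original code): for the user term
// [floor] with a stem expansion of {floor, floors, flooring} (made-up
// values), the query pushed above is roughly:
//
//     OP_OR(OP_OR("floor", "floors", "flooring"),
//           Xapian::Query("floor", original_term_wqf_booster))
//
// i.e. the expansion OR-ed with the original term carrying a boosted
// within-query frequency, so that exact matches rank higher.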
// User entry element had several terms: transform into a PHRASE or
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
TextSplitQ *splitData,
int mods, void *pq,
bool useNear, int slack)
{
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE;
vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
bool hadmultiple = false;
#endif
vector<vector<string> >groups;
string prefix;
const FieldTraits *ftp;
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
prefix = wrap_prefix(ftp->pfx);
}
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
slack++;
}
// Go through the list and perform stem/wildcard expansion for each element
vector<bool>::iterator nxit = splitData->nostemexps.begin();
for (vector<string>::iterator it = splitData->terms.begin();
it != splitData->terms.end(); it++, nxit++) {
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
// Adjust when we do stem expansion. Not if disabled by
// caller, not inside phrases, and some versions of xapian
// will accept only one OR clause inside NEAR.
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|| hadmultiple
#endif // single OR inside NEAR
;
int lmods = mods;
if (nostemexp)
lmods |= SearchDataClause::SDCM_NOSTEMMING;
string sterm;
vector<string> exp;
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
return;
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
listVector("", exp);
// groups is used for highlighting, we don't want prefixes in there.
vector<string> noprefs;
for (vector<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
noprefs.push_back(it->substr(prefix.size()));
}
groups.push_back(noprefs);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end()));
m_curcl += exp.size();
if (m_curcl >= getMaxCl())
return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
if (exp.size() > 1)
hadmultiple = true;
#endif
}
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
slack++;
}
// Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
splitData->alltermcount, splitData->lastpos));
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
splitData->lastpos + 1 + slack);
if (op == Xapian::Query::OP_PHRASE)
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
original_term_wqf_booster);
pqueries.push_back(xq);
// Add all combinations of NEAR/PHRASE groups to the highlighting data.
vector<vector<string> > allcombs;
vector<string> comb;
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
// Insert the search groups and slacks in the highlight data, with
// a reference to the user entry that generated them:
m_hldata.groups.insert(m_hldata.groups.end(),
allcombs.begin(), allcombs.end());
m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
m_hldata.ugroups.size() - 1);
}
// Trim a leading ^ or trailing $ from the string and convert them to anchor flags
static int stringToMods(string& s)
{
int mods = 0;
// Check for an anchored search
trimstring(s);
if (s.length() > 0 && s[0] == '^') {
mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
s.erase(0, 1);
}
if (s.length() > 0 && s[s.length()-1] == '$') {
mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
s.erase(s.length()-1);
}
return mods;
}
/**
* Turn user entry string (NOT query language) into a list of xapian queries.
* We just separate words and phrases, and do wildcard and stem expansion.
*
* This is used to process data entered into an OR/AND/NEAR/PHRASE field of
* the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
* entry).
*
* This appears awful, and it would seem that the split into
* terms/phrases should be performed in the upper layer so that we
* only receive pure term or pure near/phrase elements here, but in
* fact there are things that would appear like terms to naive code,
* and which may actually be turned into phrases (ie: tom:jerry),
* in a manner which intimately depends on the index implementation,
* so that it makes sense to process this here.
*
* The final list contains one query for each term or phrase
* - Elements corresponding to a stem-expanded part are an OP_OR
* composition of the stem-expanded terms (or a single term query).
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
* composition of the phrase terms (no stem expansion in this case)
* @return the subquery count (either or'd stem-expanded terms or phrase word
* count)
*/
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
string &ermsg, void *pq,
int slack, bool useNear)
{
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
int mods = m_modifiers;
LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
"slack %d near %d\n",
iq.c_str(), m_field.c_str(), mods, slack, useNear));
ermsg.erase();
m_curcl = 0;
const StopList stops = db.getStopList();
// Simple whitespace-split input into user-level words and
// double-quoted phrases: word1 word2 "this is a phrase".
//
// The text splitter may further decide that the resulting
// "words" are really phrases; this depends on separators:
// [paul@dom.net] would still be a word (span), but [about:me]
// will probably be handled as a phrase.
vector<string> phrases;
TextSplit::stringToStrings(iq, phrases);
// Process each element: textsplit into terms, handle stem/wildcard
// expansion and transform into an appropriate Xapian::Query
try {
for (vector<string>::iterator it = phrases.begin();
it != phrases.end(); it++) {
LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
// Anchoring modifiers
int amods = stringToMods(*it);
int terminc = amods != 0 ? 1 : 0;
mods |= amods;
// If there are multiple spans in this element, including
// at least one composite, we have to increase the slack
// else a phrase query including a span would fail.
// Ex: "term0@term1 term2" is onlyspans-split as:
// 0 term0@term1 0 12
// 2 term2 13 18
// The position of term2 is 2, not 1, so a phrase search
// would fail.
// We used to do word split, searching for
// "term0 term1 term2" instead, which may have worse
// performance, but will succeed.
// We now adjust the phrase/near slack by comparing the term count
// and the last position
// The term processing pipeline:
TermProcQ tpq;
TermProc *nxt = &tpq;
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
//tpcommon.onlygrams(true);
TermProcPrep tpprep(nxt);
if (o_index_stripchars)
nxt = &tpprep;
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),
stops, nxt);
tpq.setTSQ(&splitter);
splitter.text_to_words(*it);
slack += splitter.lastpos - splitter.terms.size() + 1;
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
switch (splitter.terms.size() + terminc) {
case 0:
continue;// ??
case 1: {
int lmods = mods;
if (splitter.nostemexps.front())
lmods |= SearchDataClause::SDCM_NOSTEMMING;
m_hldata.ugroups.push_back(splitter.terms);
processSimpleSpan(db, ermsg, splitter.terms.front(),
lmods, &pqueries);
}
break;
default:
m_hldata.ugroups.push_back(splitter.terms);
processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
useNear, slack);
}
if (m_curcl >= getMaxCl()) {
ermsg = maxXapClauseMsg;
if (!o_index_stripchars)
ermsg += maxXapClauseCaseDiacMsg;
break;
}
}
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
} catch (const string &s) {
ermsg = s;
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
if (!ermsg.empty()) {
LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str()));
return false;
}
return true;
}
// Translate a simple OR or AND search clause.
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
{
LOGDEB(("SearchDataClauseSimple::toNativeQuery: fld [%s] val [%s] "
"stemlang [%s]\n", m_field.c_str(), m_text.c_str(),
getStemLang().c_str()));
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
Xapian::Query::op op;
switch (m_tp) {
case SCLT_AND: op = Xapian::Query::OP_AND; break;
case SCLT_OR: op = Xapian::Query::OP_OR; break;
default:
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
return false;
}
vector<Xapian::Query> pqueries;
if (!processUserString(db, m_text, m_reason, &pqueries))
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
return true;
}
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true;
}
// Translate a FILENAME search clause. This always comes
// from a "filename" search from the gui or recollq. A query language
// "filename:"-prefixed field will not go through here, but through
// the generic field-processing code.
//
// We do not split the entry any more (we used to do some crazy thing
// about expanding multiple fragments in the past). We just take the
// value, blanks and all, and expand it against the indexed unsplit
// file names.
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
{
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
int maxexp = getSoftMaxExp();
if (maxexp == -1)
maxexp = getMaxExp();
vector<string> names;
db.filenameWildExp(m_text, names, maxexp);
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true;
}
// Translate a dir: path filtering clause. See comments in .h
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
{
LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
if (m_text.empty()) {
LOGERR(("SearchDataClausePath: empty path??\n"));
m_reason = "Empty path ?";
return false;
}
vector<Xapian::Query> orqueries;
if (m_text[0] == '/')
orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
else
m_text = path_tildexpand(m_text);
vector<string> vpath;
stringToTokens(m_text, vpath, "/");
for (vector<string>::const_iterator pit = vpath.begin();
pit != vpath.end(); pit++){
string sterm;
vector<string> exp;
if (!expandTerm(db, m_reason,
SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
return false;
}
LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
listVector("", exp);
if (exp.size() == 1)
orqueries.push_back(Xapian::Query(exp[0]));
else
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
exp.begin(), exp.end()));
m_curcl += exp.size();
if (m_curcl >= getMaxCl())
return false;
}
*qp = Xapian::Query(Xapian::Query::OP_PHRASE,
orqueries.begin(), orqueries.end());
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true;
}
// Translate NEAR or PHRASE clause.
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
{
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
Xapian::Query *qp = (Xapian::Query *)p;
*qp = Xapian::Query();
vector<Xapian::Query> pqueries;
Xapian::Query nq;
// We produce a single phrase out of the user entry, then use
// stringToXapianQueries() to lowercase and simplify the phrase
// terms etc. This will result in a single (complex)
// Xapian::Query.
if (m_text.find('\"') != string::npos) {
m_text = neutchars(m_text, "\"");
}
string s = cstr_dquote + m_text + cstr_dquote;
bool useNear = (m_tp == SCLT_NEAR);
if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
return true;
}
*qp = *pqueries.begin();
if (m_weight != 1.0) {
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
}
return true;
}
} // Namespace Rcl