Converted query language parser from the old regexp jungle to bison. Allow using parentheses for clearer syntax.
This commit is contained in:
parent
88bccb47b3
commit
3fb7183eae
@ -15,6 +15,7 @@ QTGUI = @QTGUI@
|
||||
RCLLIBVERSION=@RCLLIBVERSION@
|
||||
|
||||
all: configure mk/sysconf
|
||||
${MAKE} -C query wasaparse.tab.cpp
|
||||
(cd lib; sh mkMake)
|
||||
${MAKE} -C lib
|
||||
${MAKE} -C index depend recollindex
|
||||
@ -59,6 +60,7 @@ clean:
|
||||
# Note: we don't remove the top Makefile, to keep the "clean" targets
|
||||
# available but a "Make" won't work without a configure anyway
|
||||
distclean: clean
|
||||
${MAKE} -C query distclean
|
||||
-${MAKE} -C desktop/unity-lens-recoll distclean
|
||||
-${MAKE} -C python/recoll distclean
|
||||
rm -f mk/sysconf mk/localdefs sampleconf/recoll.conf \
|
||||
|
||||
@ -34,7 +34,6 @@ using namespace std;
|
||||
#include "pathut.h"
|
||||
#include "searchdata.h"
|
||||
#include "rclquery.h"
|
||||
#include "wasastringtoquery.h"
|
||||
#include "wasatorcl.h"
|
||||
#include "kio_recoll.h"
|
||||
#include "docseqdb.h"
|
||||
|
||||
@ -38,7 +38,6 @@ using namespace std;
|
||||
#include "pathut.h"
|
||||
#include "searchdata.h"
|
||||
#include "rclquery.h"
|
||||
#include "wasastringtoquery.h"
|
||||
#include "wasatorcl.h"
|
||||
#include "kio_recoll.h"
|
||||
#include "docseqdb.h"
|
||||
|
||||
@ -42,8 +42,8 @@ ${depth}/query/plaintorich.cpp \
|
||||
${depth}/query/recollq.cpp \
|
||||
${depth}/query/reslistpager.cpp \
|
||||
${depth}/query/sortseq.cpp \
|
||||
${depth}/query/wasastringtoquery.cpp \
|
||||
${depth}/query/wasatorcl.cpp \
|
||||
${depth}/query/wasaparse.cpp \
|
||||
${depth}/query/wasaparse.tab.cpp \
|
||||
${depth}/rcldb/daterange.cpp \
|
||||
${depth}/rcldb/expansiondbs.cpp \
|
||||
${depth}/rcldb/rclabstract.cpp \
|
||||
@ -53,6 +53,7 @@ ${depth}/rcldb/rcldups.cpp \
|
||||
${depth}/rcldb/rclquery.cpp \
|
||||
${depth}/rcldb/rclterms.cpp \
|
||||
${depth}/rcldb/searchdata.cpp \
|
||||
${depth}/rcldb/searchdatatox.cpp \
|
||||
${depth}/rcldb/searchdataxml.cpp \
|
||||
${depth}/rcldb/stemdb.cpp \
|
||||
${depth}/rcldb/stoplist.cpp \
|
||||
|
||||
@ -37,7 +37,6 @@
|
||||
#include "pathut.h"
|
||||
#include "rclinit.h"
|
||||
#include "debuglog.h"
|
||||
#include "wasastringtoquery.h"
|
||||
#include "wasatorcl.h"
|
||||
#include "internfile.h"
|
||||
#include "wipedir.h"
|
||||
|
||||
@ -32,7 +32,6 @@ using namespace std;
|
||||
#include "searchdata.h"
|
||||
#include "rclquery.h"
|
||||
#include "pathut.h"
|
||||
#include "wasastringtoquery.h"
|
||||
#include "wasatorcl.h"
|
||||
#include "debuglog.h"
|
||||
#include "pathut.h"
|
||||
|
||||
@ -4,8 +4,12 @@ include $(depth)/mk/sysconf
|
||||
PROGS = xadump recollq #trhist qtry qxtry
|
||||
SRCS = xadump.cpp
|
||||
|
||||
all: depend librecoll $(PROGS)
|
||||
all: wasaparse.tab.cpp depend librecoll $(PROGS)
|
||||
|
||||
wasaparse.tab.cpp : wasaparse.y
|
||||
bison wasaparse.y
|
||||
mv -f wasaparse.tab.c wasaparse.tab.cpp
|
||||
|
||||
XADUMP_OBJS= xadump.o
|
||||
xadump : $(XADUMP_OBJS)
|
||||
$(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \
|
||||
@ -39,3 +43,7 @@ trwasastrtoq.o : wasastringtoquery.cpp wasastringtoquery.h
|
||||
include $(depth)/mk/commontargets
|
||||
|
||||
include alldeps
|
||||
|
||||
distclean::
|
||||
-rm -f location.hh position.hh stack.hh \
|
||||
wasaparse.tab.c wasaparse.tab.cpp wasaparse.tab.h
|
||||
|
||||
@ -36,7 +36,6 @@ using namespace std;
|
||||
#include "pathut.h"
|
||||
#include "rclinit.h"
|
||||
#include "debuglog.h"
|
||||
#include "wasastringtoquery.h"
|
||||
#include "wasatorcl.h"
|
||||
#include "internfile.h"
|
||||
#include "wipedir.h"
|
||||
|
||||
235
src/query/wasaparse.cpp
Normal file
235
src/query/wasaparse.cpp
Normal file
@ -0,0 +1,235 @@
|
||||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "wasatorcl.h"
|
||||
#include "wasaparserdriver.h"
|
||||
#include "searchdata.h"
|
||||
#include "debuglog.h"
|
||||
|
||||
#define YYDEBUG 1
|
||||
|
||||
#include "wasaparse.tab.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace Rcl;
|
||||
|
||||
|
||||
void
|
||||
yy::parser::error (const location_type& l, const std::string& m)
|
||||
{
|
||||
d->setreason(m);
|
||||
}
|
||||
|
||||
|
||||
SearchData *wasaStringToRcl(const RclConfig *config,
|
||||
const std::string& stemlang,
|
||||
const std::string& query, string &reason,
|
||||
const std::string& autosuffs)
|
||||
{
|
||||
WasaParserDriver d(config, stemlang, autosuffs);
|
||||
SearchData *sd = d.parse(query);
|
||||
if (!sd)
|
||||
reason = d.getreason();
|
||||
return sd;
|
||||
}
|
||||
|
||||
SearchData *WasaParserDriver::parse(const std::string& in)
|
||||
{
|
||||
m_input = in;
|
||||
m_index = 0;
|
||||
delete m_result;
|
||||
m_result = 0;
|
||||
m_returns = stack<int>();
|
||||
|
||||
yy::parser parser(this);
|
||||
parser.set_debug_level(0);
|
||||
|
||||
if (parser.parse() != 0) {
|
||||
delete m_result;
|
||||
m_result = 0;
|
||||
}
|
||||
|
||||
return m_result;
|
||||
}
|
||||
|
||||
int WasaParserDriver::GETCHAR()
|
||||
{
|
||||
if (!m_returns.empty()) {
|
||||
int c = m_returns.top();
|
||||
m_returns.pop();
|
||||
return c;
|
||||
}
|
||||
if (m_index < m_input.size())
|
||||
return m_input[m_index++];
|
||||
return 0;
|
||||
}
|
||||
void WasaParserDriver::UNGETCHAR(int c)
|
||||
{
|
||||
m_returns.push(c);
|
||||
}
|
||||
|
||||
// Add clause to query, handling special pseudo-clauses for size/date
|
||||
// etc. (mostly determined on field name).
|
||||
bool WasaParserDriver::addClause(SearchData *sd,
|
||||
SearchDataClauseSimple* cl)
|
||||
{
|
||||
if (cl->getfield().empty()) {
|
||||
// Simple clause with empty field spec.
|
||||
// Possibly change terms found in the "autosuffs" list into "ext"
|
||||
// field queries
|
||||
if (!m_autosuffs.empty()) {
|
||||
vector<string> asfv;
|
||||
if (stringToStrings(m_autosuffs, asfv)) {
|
||||
if (find_if(asfv.begin(), asfv.end(),
|
||||
StringIcmpPred(cl->gettext())) != asfv.end()) {
|
||||
cl->setfield("ext");
|
||||
cl->addModifier(SearchDataClause::SDCM_NOSTEMMING);
|
||||
}
|
||||
}
|
||||
}
|
||||
return sd->addClause(cl);
|
||||
}
|
||||
|
||||
|
||||
const string& fld = cl->getfield();
|
||||
|
||||
// MIME types and categories
|
||||
if (!stringicmp("mime", fld) ||!stringicmp("format", fld)) {
|
||||
if (cl->getexclude()) {
|
||||
sd->remFiletype(cl->gettext());
|
||||
} else {
|
||||
sd->addFiletype(cl->gettext());
|
||||
}
|
||||
delete cl;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!stringicmp("rclcat", fld) || !stringicmp("type", fld)) {
|
||||
vector<string> mtypes;
|
||||
if (m_config && m_config->getMimeCatTypes(cl->gettext(), mtypes)) {
|
||||
for (vector<string>::iterator mit = mtypes.begin();
|
||||
mit != mtypes.end(); mit++) {
|
||||
if (cl->getexclude()) {
|
||||
sd->remFiletype(*mit);
|
||||
} else {
|
||||
sd->addFiletype(*mit);
|
||||
}
|
||||
}
|
||||
}
|
||||
delete cl;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Handle "date" spec
|
||||
if (!stringicmp("date", fld)) {
|
||||
DateInterval di;
|
||||
if (!parsedateinterval(cl->gettext(), &di)) {
|
||||
LOGERR(("Bad date interval format: %s\n",
|
||||
cl->gettext().c_str()));
|
||||
m_reason = "Bad date interval format";
|
||||
delete cl;
|
||||
return false;
|
||||
}
|
||||
LOGDEB(("addClause:: date span: %d-%d-%d/%d-%d-%d\n",
|
||||
di.y1,di.m1,di.d1, di.y2,di.m2,di.d2));
|
||||
sd->setDateSpan(&di);
|
||||
delete cl;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Handle "size" spec
|
||||
if (!stringicmp("size", fld)) {
|
||||
char *cp;
|
||||
size_t size = strtoll(cl->gettext().c_str(), &cp, 10);
|
||||
if (*cp != 0) {
|
||||
switch (*cp) {
|
||||
case 'k': case 'K': size *= 1E3;break;
|
||||
case 'm': case 'M': size *= 1E6;break;
|
||||
case 'g': case 'G': size *= 1E9;break;
|
||||
case 't': case 'T': size *= 1E12;break;
|
||||
default:
|
||||
m_reason = string("Bad multiplier suffix: ") + *cp;
|
||||
delete cl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
SearchDataClause::Relation rel = cl->getrel();
|
||||
|
||||
delete cl;
|
||||
|
||||
switch (rel) {
|
||||
case SearchDataClause::REL_EQUALS:
|
||||
sd->setMaxSize(size);
|
||||
sd->setMinSize(size);
|
||||
break;
|
||||
case SearchDataClause::REL_LT:
|
||||
case SearchDataClause::REL_LTE:
|
||||
sd->setMaxSize(size);
|
||||
break;
|
||||
case SearchDataClause::REL_GT:
|
||||
case SearchDataClause::REL_GTE:
|
||||
sd->setMinSize(size);
|
||||
break;
|
||||
default:
|
||||
m_reason = "Bad relation operator with size query. Use > < or =";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!stringicmp("dir", fld)) {
|
||||
// dir filtering special case
|
||||
SearchDataClausePath *nclause =
|
||||
new SearchDataClausePath(cl->gettext(), cl->getexclude());
|
||||
delete cl;
|
||||
sd->addClause(nclause);
|
||||
}
|
||||
|
||||
if (cl->getTp() == SCLT_OR || cl->getTp() == SCLT_AND) {
|
||||
// If this is a normal clause and the term has commas or
|
||||
// slashes inside, take it as a list, turn the slashes/commas
|
||||
// to spaces, leave unquoted. Otherwise, this would end up as
|
||||
// a phrase query. This is a handy way to enter multiple terms
|
||||
// to be searched inside a field. We interpret ',' as AND, and
|
||||
// '/' as OR. No mixes allowed and ',' wins.
|
||||
SClType tp = SCLT_FILENAME;// impossible value
|
||||
string ns = neutchars(cl->gettext(), ",");
|
||||
if (ns.compare(cl->gettext())) {
|
||||
// had ','
|
||||
tp = SCLT_AND;
|
||||
} else {
|
||||
ns = neutchars(cl->gettext(), "/");
|
||||
if (ns.compare(cl->gettext())) {
|
||||
// had not ',' but has '/'
|
||||
tp = SCLT_OR;
|
||||
}
|
||||
}
|
||||
|
||||
if (tp != SCLT_FILENAME) {
|
||||
SearchDataClauseSimple *ncl =
|
||||
new SearchDataClauseSimple(tp, ns, fld);
|
||||
delete cl;
|
||||
return sd->addClause(ncl);
|
||||
}
|
||||
}
|
||||
return sd->addClause(cl);
|
||||
}
|
||||
|
||||
415
src/query/wasaparse.y
Normal file
415
src/query/wasaparse.y
Normal file
@ -0,0 +1,415 @@
|
||||
%{
|
||||
#define YYDEBUG 1
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "searchdata.h"
|
||||
#include "wasaparserdriver.h"
|
||||
#include "wasaparse.tab.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int yylex(yy::parser::semantic_type *, WasaParserDriver *);
|
||||
void yyerror(char const *);
|
||||
static void qualify(Rcl::SearchDataClauseDist *, const string &);
|
||||
|
||||
static void addSubQuery(WasaParserDriver *d,
|
||||
Rcl::SearchData *sd, Rcl::SearchData *sq)
|
||||
{
|
||||
sd->addClause(new Rcl::SearchDataClauseSub(RefCntr<Rcl::SearchData>(sq)));
|
||||
}
|
||||
|
||||
%}
|
||||
|
||||
%skeleton "lalr1.cc"
|
||||
%defines
|
||||
%error-verbose
|
||||
|
||||
%parse-param {WasaParserDriver* d}
|
||||
%lex-param {WasaParserDriver* d}
|
||||
|
||||
%union {
|
||||
std::string *str;
|
||||
Rcl::SearchDataClauseSimple *cl;
|
||||
Rcl::SearchData *sd;
|
||||
}
|
||||
%destructor {delete $$;} <str>
|
||||
|
||||
%type <cl> qualquote
|
||||
%type <cl> fieldexpr
|
||||
%type <cl> term
|
||||
%type <sd> query
|
||||
%type <str> complexfieldname
|
||||
|
||||
/* Non operator tokens need precedence because of the possibility of
|
||||
concatenation which needs to have lower prec than OR */
|
||||
%left <str> WORD
|
||||
%left <str> QUOTED
|
||||
%left <str> QUALIFIERS
|
||||
%left AND UCONCAT
|
||||
%left OR
|
||||
|
||||
%token EQUALS CONTAINS SMALLEREQ SMALLER GREATEREQ GREATER
|
||||
|
||||
%%
|
||||
|
||||
topquery: query
|
||||
{
|
||||
d->m_result = $1;
|
||||
}
|
||||
|
||||
query:
|
||||
query query %prec UCONCAT
|
||||
{
|
||||
//cerr << "q: query query" << endl;
|
||||
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
|
||||
addSubQuery(d, sd, $1);
|
||||
addSubQuery(d, sd, $2);
|
||||
$$ = sd;
|
||||
}
|
||||
| query AND query
|
||||
{
|
||||
//cerr << "q: query AND query" << endl;
|
||||
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
|
||||
addSubQuery(d, sd, $1);
|
||||
addSubQuery(d, sd, $3);
|
||||
$$ = sd;
|
||||
}
|
||||
| query OR query
|
||||
{
|
||||
//cerr << "q: query OR query" << endl;
|
||||
Rcl::SearchData *top = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
|
||||
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, d->m_stemlang);
|
||||
addSubQuery(d, sd, $1);
|
||||
addSubQuery(d, sd, $3);
|
||||
addSubQuery(d, top, sd);
|
||||
$$ = top;
|
||||
}
|
||||
| '(' query ')'
|
||||
{
|
||||
//cerr << "q: ( query )" << endl;
|
||||
$$ = $2;
|
||||
}
|
||||
|
|
||||
fieldexpr %prec UCONCAT
|
||||
{
|
||||
//cerr << "q: fieldexpr" << endl;
|
||||
Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
|
||||
d->addClause(sd, $1);
|
||||
$$ = sd;
|
||||
}
|
||||
;
|
||||
|
||||
fieldexpr: term
|
||||
{
|
||||
// cerr << "fe: simple fieldexpr: " << $1->gettext() << endl;
|
||||
$$ = $1;
|
||||
}
|
||||
| complexfieldname EQUALS term
|
||||
{
|
||||
// cerr << "fe: " << *$1 << " = " << $3->gettext() << endl;
|
||||
$3->setfield(*$1);
|
||||
$3->setrel(Rcl::SearchDataClause::REL_EQUALS);
|
||||
$$ = $3;
|
||||
delete $1;
|
||||
}
|
||||
| complexfieldname CONTAINS term
|
||||
{
|
||||
// cerr << "fe: " << *$1 << " : " << $3->gettext() << endl;
|
||||
$3->setfield(*$1);
|
||||
$3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
|
||||
$$ = $3;
|
||||
delete $1;
|
||||
}
|
||||
| complexfieldname SMALLER term
|
||||
{
|
||||
// cerr << "fe: " << *$1 << " < " << $3->gettext() << endl;
|
||||
$3->setfield(*$1);
|
||||
$3->setrel(Rcl::SearchDataClause::REL_LT);
|
||||
$$ = $3;
|
||||
delete $1;
|
||||
}
|
||||
| complexfieldname SMALLEREQ term
|
||||
{
|
||||
// cerr << "fe: " << *$1 << " <= " << $3->gettext() << endl;
|
||||
$3->setfield(*$1);
|
||||
$3->setrel(Rcl::SearchDataClause::REL_LTE);
|
||||
$$ = $3;
|
||||
delete $1;
|
||||
}
|
||||
| complexfieldname GREATER term
|
||||
{
|
||||
// cerr << "fe: " << *$1 << " > " << $3->gettext() << endl;
|
||||
$3->setfield(*$1);
|
||||
$3->setrel(Rcl::SearchDataClause::REL_GT);
|
||||
$$ = $3;
|
||||
delete $1;
|
||||
}
|
||||
| complexfieldname GREATEREQ term
|
||||
{
|
||||
// cerr << "fe: " << *$1 << " >= " << $3->gettext() << endl;
|
||||
$3->setfield(*$1);
|
||||
$3->setrel(Rcl::SearchDataClause::REL_GTE);
|
||||
$$ = $3;
|
||||
delete $1;
|
||||
}
|
||||
| '-' fieldexpr
|
||||
{
|
||||
// cerr << "fe: - fieldexpr[" << $2->gettext() << "]" << endl;
|
||||
$2->setexclude(true);
|
||||
$$ = $2;
|
||||
}
|
||||
;
|
||||
|
||||
/* Deal with field names like dc:title */
|
||||
complexfieldname:
|
||||
WORD
|
||||
{
|
||||
// cerr << "cfn: WORD" << endl;
|
||||
$$ = $1;
|
||||
}
|
||||
|
|
||||
complexfieldname CONTAINS WORD
|
||||
{
|
||||
// cerr << "cfn: complexfieldname ':' WORD" << endl;
|
||||
$$ = new string(*$1 + string(":") + *$3);
|
||||
delete $1;
|
||||
delete $3;
|
||||
}
|
||||
|
||||
term:
|
||||
WORD
|
||||
{
|
||||
//cerr << "term[" << *$1 << "]" << endl;
|
||||
$$ = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1);
|
||||
delete $1;
|
||||
}
|
||||
| qualquote
|
||||
{
|
||||
$$ = $1;
|
||||
}
|
||||
|
||||
qualquote:
|
||||
QUOTED
|
||||
{
|
||||
// cerr << "QUOTED[" << *$1 << "]" << endl;
|
||||
$$ = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
|
||||
delete $1;
|
||||
}
|
||||
| QUOTED QUALIFIERS
|
||||
{
|
||||
// cerr << "QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl;
|
||||
Rcl::SearchDataClauseDist *cl =
|
||||
new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
|
||||
qualify(cl, *$2);
|
||||
$$ = cl;
|
||||
delete $1;
|
||||
delete $2;
|
||||
}
|
||||
|
||||
|
||||
%%
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
// Look for int at index, skip and return new index found? value.
|
||||
static unsigned int qualGetInt(const string& q, unsigned int cur, int *pval)
|
||||
{
|
||||
unsigned int ncur = cur;
|
||||
if (cur < q.size() - 1) {
|
||||
char *endptr;
|
||||
int val = strtol(&q[cur + 1], &endptr, 10);
|
||||
if (endptr != &q[cur + 1]) {
|
||||
ncur += endptr - &q[cur + 1];
|
||||
*pval = val;
|
||||
}
|
||||
}
|
||||
return ncur;
|
||||
}
|
||||
|
||||
static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
|
||||
{
|
||||
// cerr << "qualify(" << cl << ", " << quals << ")" << endl;
|
||||
for (unsigned int i = 0; i < quals.length(); i++) {
|
||||
//fprintf(stderr, "qual char %c\n", quals[i]);
|
||||
switch (quals[i]) {
|
||||
case 'b':
|
||||
cl->setWeight(10.0);
|
||||
break;
|
||||
case 'c': break;
|
||||
case 'C':
|
||||
cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
|
||||
break;
|
||||
case 'd': break;
|
||||
case 'D':
|
||||
cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
|
||||
break;
|
||||
case 'e':
|
||||
cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
|
||||
cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
|
||||
cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
|
||||
break;
|
||||
case 'l':
|
||||
cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
|
||||
break;
|
||||
case 'L': break;
|
||||
case 'o':
|
||||
{
|
||||
int slack = 10;
|
||||
i = qualGetInt(quals, i, &slack);
|
||||
cl->setslack(slack);
|
||||
//cerr << "set slack " << cl->getslack() << " done" << endl;
|
||||
}
|
||||
break;
|
||||
case 'p':
|
||||
cl->setTp(Rcl::SCLT_NEAR);
|
||||
if (cl->getslack() == 0) {
|
||||
cl->setslack(10);
|
||||
//cerr << "set slack " << cl->getslack() << " done" << endl;
|
||||
}
|
||||
break;
|
||||
case '.':case '0':case '1':case '2':case '3':case '4':
|
||||
case '5':case '6':case '7':case '8':case '9':
|
||||
{
|
||||
int n = 0;
|
||||
float factor = 1.0;
|
||||
if (sscanf(&(quals[i]), "%f %n", &factor, &n)) {
|
||||
if (factor != 1.0) {
|
||||
cl->setWeight(factor);
|
||||
}
|
||||
}
|
||||
if (n > 0)
|
||||
i += n - 1;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// specialstartchars are special only at the beginning of a token
|
||||
// (e.g. doctor-who is a term, not 2 terms separated by '-')
|
||||
static const string specialstartchars("-");
|
||||
// specialinchars are special everywhere except inside a quoted string
|
||||
static const string specialinchars(":=<>()");
|
||||
|
||||
// Called with the first dquote already read
|
||||
static int parseString(WasaParserDriver *d, yy::parser::semantic_type *yylval)
|
||||
{
|
||||
string* value = new string();
|
||||
d->qualifiers().clear();
|
||||
int c;
|
||||
while ((c = d->GETCHAR())) {
|
||||
switch (c) {
|
||||
case '\\':
|
||||
/* Escape: get next char */
|
||||
c = d->GETCHAR();
|
||||
if (c == 0) {
|
||||
value->push_back(c);
|
||||
goto out;
|
||||
}
|
||||
value->push_back(c);
|
||||
break;
|
||||
case '"':
|
||||
/* End of string. Look for qualifiers */
|
||||
while ((c = d->GETCHAR()) && !isspace(c))
|
||||
d->qualifiers().push_back(c);
|
||||
goto out;
|
||||
default:
|
||||
value->push_back(c);
|
||||
}
|
||||
}
|
||||
out:
|
||||
//cerr << "GOT QUOTED ["<<value<<"] quals [" << d->qualifiers() << "]" << endl;
|
||||
yylval->str = value;
|
||||
return yy::parser::token::QUOTED;
|
||||
}
|
||||
|
||||
|
||||
int yylex(yy::parser::semantic_type *yylval, WasaParserDriver *d)
|
||||
{
|
||||
if (!d->qualifiers().empty()) {
|
||||
yylval->str = new string();
|
||||
yylval->str->swap(d->qualifiers());
|
||||
return yy::parser::token::QUALIFIERS;
|
||||
}
|
||||
|
||||
int c;
|
||||
|
||||
/* Skip white space. */
|
||||
while ((c = d->GETCHAR()) && isspace(c))
|
||||
continue;
|
||||
|
||||
if (c == 0)
|
||||
return 0;
|
||||
|
||||
if (specialstartchars.find_first_of(c) != string::npos) {
|
||||
//cerr << "yylex: return " << c << endl;
|
||||
return c;
|
||||
}
|
||||
|
||||
// field-term relations
|
||||
switch (c) {
|
||||
case '=': return yy::parser::token::EQUALS;
|
||||
case ':': return yy::parser::token::CONTAINS;
|
||||
case '<': {
|
||||
int c1 = d->GETCHAR();
|
||||
if (c1 == '=') {
|
||||
return yy::parser::token::SMALLEREQ;
|
||||
} else {
|
||||
d->UNGETCHAR(c1);
|
||||
return yy::parser::token::SMALLER;
|
||||
}
|
||||
}
|
||||
case '>': {
|
||||
int c1 = d->GETCHAR();
|
||||
if (c1 == '=') {
|
||||
return yy::parser::token::GREATEREQ;
|
||||
} else {
|
||||
d->UNGETCHAR(c1);
|
||||
return yy::parser::token::GREATER;
|
||||
}
|
||||
}
|
||||
case '(': case ')':
|
||||
return c;
|
||||
}
|
||||
|
||||
if (c == '"')
|
||||
return parseString(d, yylval);
|
||||
|
||||
d->UNGETCHAR(c);
|
||||
|
||||
// Other chars start a term or field name or reserved word
|
||||
string* word = new string();
|
||||
while ((c = d->GETCHAR())) {
|
||||
if (isspace(c)) {
|
||||
//cerr << "Word broken by whitespace" << endl;
|
||||
break;
|
||||
} else if (specialinchars.find_first_of(c) != string::npos) {
|
||||
//cerr << "Word broken by special char" << endl;
|
||||
d->UNGETCHAR(c);
|
||||
break;
|
||||
} else if (c == 0) {
|
||||
//cerr << "Word broken by EOF" << endl;
|
||||
break;
|
||||
} else {
|
||||
word->push_back(c);
|
||||
}
|
||||
}
|
||||
|
||||
if (!word->compare("AND") || !word->compare("&&")) {
|
||||
delete word;
|
||||
return yy::parser::token::AND;
|
||||
} else if (!word->compare("OR") || !word->compare("||")) {
|
||||
delete word;
|
||||
return yy::parser::token::OR;
|
||||
}
|
||||
|
||||
// cerr << "Got word [" << word << "]" << endl;
|
||||
yylval->str = word;
|
||||
return yy::parser::token::WORD;
|
||||
}
|
||||
81
src/query/wasaparserdriver.h
Normal file
81
src/query/wasaparserdriver.h
Normal file
@ -0,0 +1,81 @@
|
||||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _WASAPARSERDRIVER_H_INCLUDED_
|
||||
#define _WASAPARSERDRIVER_H_INCLUDED_
|
||||
|
||||
#include <string>
|
||||
#include <stack>
|
||||
|
||||
class WasaParserDriver;
|
||||
namespace Rcl {
|
||||
class SearchData;
|
||||
class SearchDataClauseSimple;
|
||||
}
|
||||
namespace yy {
|
||||
class parser;
|
||||
}
|
||||
|
||||
class RclConfig;
|
||||
|
||||
class WasaParserDriver {
|
||||
public:
|
||||
|
||||
WasaParserDriver(const RclConfig *c, const std::string sl,
|
||||
const std::string& as)
|
||||
: m_stemlang(sl), m_autosuffs(as), m_config(c),
|
||||
m_index(0), m_result(0) {}
|
||||
|
||||
Rcl::SearchData *parse(const std::string&);
|
||||
bool addClause(Rcl::SearchData *sd, Rcl::SearchDataClauseSimple* cl);
|
||||
|
||||
int GETCHAR();
|
||||
void UNGETCHAR(int c);
|
||||
|
||||
std::string& qualifiers() {
|
||||
return m_qualifiers;
|
||||
}
|
||||
void setreason(const std::string& reason) {
|
||||
m_reason = reason;
|
||||
}
|
||||
const std::string& getreason() const {
|
||||
return m_reason;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class yy::parser;
|
||||
|
||||
std::string m_stemlang;
|
||||
std::string m_autosuffs;
|
||||
const RclConfig *m_config;
|
||||
|
||||
std::string m_input;
|
||||
unsigned int m_index;
|
||||
std::stack<int> m_returns;
|
||||
Rcl::SearchData *m_result;
|
||||
|
||||
std::string m_reason;
|
||||
|
||||
// Let the quoted string reader store qualifiers in there, simpler
|
||||
// than handling this in the parser, because their nature is
|
||||
// determined by the absence of white space after the closing
|
||||
// dquote. e.g "some term"abc. We could avoid this by making white
|
||||
// space a token.
|
||||
std::string m_qualifiers;
|
||||
};
|
||||
|
||||
|
||||
#endif /* _WASAPARSERDRIVER_H_INCLUDED_ */
|
||||
@ -1,515 +0,0 @@
|
||||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef TEST_WASASTRINGTOQUERY
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <regex.h>
|
||||
|
||||
#include "smallut.h"
|
||||
#include "wasastringtoquery.h"
|
||||
|
||||
#undef DEB_WASASTRINGTOQ
|
||||
#ifdef DEB_WASASTRINGTOQ
|
||||
#define DPRINT(X) fprintf X
|
||||
#define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());}
|
||||
#else
|
||||
#define DPRINT(X)
|
||||
#define DUMPQ(Q)
|
||||
#endif
|
||||
|
||||
WasaQuery::~WasaQuery()
|
||||
{
|
||||
for (vector<WasaQuery*>::iterator it = m_subs.begin();
|
||||
it != m_subs.end(); it++) {
|
||||
delete *it;
|
||||
}
|
||||
m_subs.clear();
|
||||
}
|
||||
|
||||
static const char* reltosrel(WasaQuery::Rel rel)
|
||||
{
|
||||
switch (rel) {
|
||||
case WasaQuery::REL_EQUALS: return "=";
|
||||
case WasaQuery::REL_CONTAINS: return ":";
|
||||
case WasaQuery::REL_LT: return "<";
|
||||
case WasaQuery::REL_LTE: return "<=";
|
||||
case WasaQuery::REL_GT: return ">";
|
||||
case WasaQuery::REL_GTE: return ">=";
|
||||
default: return "?";
|
||||
}
|
||||
}
|
||||
|
||||
void WasaQuery::describe(string &desc) const
|
||||
{
|
||||
desc += "(";
|
||||
string fieldspec = m_fieldspec.empty() ? string() : m_fieldspec +
|
||||
reltosrel(m_rel);
|
||||
switch (m_op) {
|
||||
case OP_NULL:
|
||||
desc += "NULL";
|
||||
break;
|
||||
case OP_LEAF:
|
||||
if (m_exclude)
|
||||
desc += "NOT (";
|
||||
desc += fieldspec + m_value;
|
||||
if (m_exclude)
|
||||
desc += ")";
|
||||
break;
|
||||
case OP_OR:
|
||||
case OP_AND:
|
||||
for (vector<WasaQuery *>::const_iterator it = m_subs.begin();
|
||||
it != m_subs.end(); it++) {
|
||||
(*it)->describe(desc);
|
||||
vector<WasaQuery *>::const_iterator it1 = it;
|
||||
it1++;
|
||||
if (it1 != m_subs.end())
|
||||
desc += m_op == OP_OR ? "OR ": "AND ";
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (desc[desc.length() - 1] == ' ')
|
||||
desc.erase(desc.length() - 1);
|
||||
desc += ")";
|
||||
if (m_modifiers != 0) {
|
||||
if (m_modifiers & WQM_BOOST) desc += "BOOST|";
|
||||
if (m_modifiers & WQM_CASESENS) desc += "CASESENS|";
|
||||
if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|";
|
||||
if (m_modifiers & WQM_FUZZY) desc += "FUZZY|";
|
||||
if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|";
|
||||
if (m_modifiers & WQM_PHRASESLACK) {
|
||||
char buf[100];
|
||||
sprintf(buf, "%d", m_slack);
|
||||
desc += "PHRASESLACK(" + string(buf) + string(")|");
|
||||
}
|
||||
if (m_modifiers & WQM_PROX) desc += "PROX|";
|
||||
if (m_modifiers & WQM_REGEX) desc += "REGEX|";
|
||||
if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|";
|
||||
if (m_modifiers & WQM_WORDS) desc += "WORDS|";
|
||||
|
||||
if (desc.length() > 0 && desc[desc.length()-1] == '|')
|
||||
desc.erase(desc.length()-1);
|
||||
}
|
||||
desc += " ";
|
||||
}
|
||||
|
||||
// The string query parser code:
|
||||
|
||||
/* Shamelessly lifted from Beagle:
|
||||
* This is our regular Expression Pattern:
|
||||
* we expect something like this:
|
||||
* -key:"Value String"modifiers
|
||||
* key:Value
|
||||
* or
|
||||
* Value
|
||||
*/
|
||||
|
||||
/* The master regular expression used to parse a query string
|
||||
* Sub-expressions in parenthesis are numbered from 1. Each opening
|
||||
* parenthesis increases the index, but we're not interested in all
|
||||
* Deviations from standard:
|
||||
* Relation: the standard-conformant line read as (release<1.16):
|
||||
"(:|=|<|>|<=|>=)" //7 Relation
|
||||
but we are not actually making use of the relation type
|
||||
(interpreting all as ":"), and this can product unexpected results
|
||||
as a (ie pasted) search for nonexfield=value will silently drop
|
||||
the nonexfield part, while the user probably was not aware of
|
||||
triggering a field search (expecting just ':' to do this).
|
||||
*/
|
||||
static const char * parserExpr =
|
||||
"(OR|\\|\\|)[[:space:]]*" //1 OR,||
|
||||
"|"
|
||||
"(AND|&&)[[:space:]]*" // 2 AND,&& (ignored, default)
|
||||
"|"
|
||||
"(" //3
|
||||
"([+-])?" //4 Force or exclude indicator
|
||||
"(" //5
|
||||
"([[:alpha:]][[:alnum:]:]*)" //6 Field spec: ie: "dc:title:letitre"
|
||||
"[[:space:]]*"
|
||||
"(:|=|>|<)" //7 Relation
|
||||
"[[:space:]]*)?"
|
||||
"(" //8
|
||||
"(\"" //9
|
||||
"([^\"]+)" //10 "A quoted term"
|
||||
"\")"
|
||||
"([bcCdDeflLoprsw.0-9]*)" //11 modifiers
|
||||
"|"
|
||||
"([^[:space:]\"]+)" //12 ANormalTerm
|
||||
")"
|
||||
")[[:space:]]*"
|
||||
;
|
||||
|
||||
// For debugging the parser. But see also NMATCH
|
||||
static const char *matchNames[] = {
|
||||
/* 0*/ "",
|
||||
/* 1*/ "OR",
|
||||
/* 2*/ "AND",
|
||||
/* 3*/ "",
|
||||
/* 4*/ "+-",
|
||||
/* 5*/ "",
|
||||
/* 6*/ "FIELD",
|
||||
/* 7*/ "RELATION",
|
||||
/* 8*/ "",
|
||||
/* 9*/ "",
|
||||
/*10*/ "QUOTEDTERM",
|
||||
/*11*/ "MODIFIERS",
|
||||
/*12*/ "TERM",
|
||||
};
|
||||
#define NMATCH (sizeof(matchNames) / sizeof(char *))
|
||||
|
||||
// Symbolic names for the interesting submatch indices
|
||||
enum SbMatchIdx {SMI_OR=1, SMI_AND=2, SMI_PM=4, SMI_FIELD=6, SMI_REL=7,
|
||||
SMI_QUOTED=10, SMI_MODIF=11, SMI_TERM=12};
|
||||
|
||||
static const int maxmatchlen = 1024;
|
||||
static const int errbuflen = 300;
|
||||
|
||||
class StringToWasaQuery::Internal {
|
||||
public:
|
||||
Internal()
|
||||
: m_rxneedsfree(false)
|
||||
{}
|
||||
~Internal()
|
||||
{
|
||||
if (m_rxneedsfree)
|
||||
regfree(&m_rx);
|
||||
}
|
||||
bool checkSubMatch(int i, char *match, string& reason)
|
||||
{
|
||||
if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1) {
|
||||
//DPRINT((stderr, "checkSubMatch: no match: i %d rm_so %d\n",
|
||||
//i, m_pmatch[i].rm_so));
|
||||
return false;
|
||||
}
|
||||
if (m_pmatch[i].rm_eo - m_pmatch[i].rm_so <= 0) {
|
||||
// weird and fatal
|
||||
reason = "Internal regular expression handling error";
|
||||
return false;
|
||||
}
|
||||
//DPRINT((stderr, "checkSubMatch: so %d eo %d\n", m_pmatch[i].rm_so,
|
||||
//m_pmatch[i].rm_eo));
|
||||
memcpy(match, m_cp + m_pmatch[i].rm_so,
|
||||
m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
|
||||
match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
WasaQuery *stringToQuery(const string& str, string& reason);
|
||||
|
||||
friend class StringToWasaQuery;
|
||||
private:
|
||||
const char *m_cp;
|
||||
regex_t m_rx;
|
||||
bool m_rxneedsfree;
|
||||
regmatch_t m_pmatch[NMATCH];
|
||||
};
|
||||
|
||||
StringToWasaQuery::StringToWasaQuery()
|
||||
: internal(new Internal)
|
||||
{
|
||||
}
|
||||
|
||||
StringToWasaQuery::~StringToWasaQuery()
|
||||
{
|
||||
delete internal;
|
||||
}
|
||||
|
||||
WasaQuery *
|
||||
StringToWasaQuery::stringToQuery(const string& str, string& reason)
|
||||
{
|
||||
if (internal == 0)
|
||||
return 0;
|
||||
WasaQuery *wq = internal->stringToQuery(str, reason);
|
||||
DUMPQ(wq);
|
||||
return wq;
|
||||
}
|
||||
|
||||
WasaQuery *
|
||||
StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
||||
{
|
||||
if (m_rxneedsfree)
|
||||
regfree(&m_rx);
|
||||
|
||||
char errbuf[errbuflen+1];
|
||||
int errcode;
|
||||
if ((errcode = regcomp(&m_rx, parserExpr, REG_EXTENDED)) != 0) {
|
||||
regerror(errcode, &m_rx, errbuf, errbuflen);
|
||||
reason = errbuf;
|
||||
return 0;
|
||||
}
|
||||
m_rxneedsfree = true;
|
||||
|
||||
const char *cpe;
|
||||
m_cp = str.c_str();
|
||||
cpe = str.c_str() + str.length();
|
||||
|
||||
WasaQuery *query = new WasaQuery;
|
||||
query->m_op = WasaQuery::OP_AND;
|
||||
WasaQuery *orChain = 0;
|
||||
bool prev_or = false;
|
||||
|
||||
// Loop on repeated regexp matches on the main string.
|
||||
for (int loop = 0;;loop++) {
|
||||
if ((errcode = regexec(&m_rx, m_cp, NMATCH, m_pmatch, 0))) {
|
||||
regerror(errcode, &m_rx, errbuf, errbuflen);
|
||||
reason = errbuf;
|
||||
return 0;
|
||||
}
|
||||
if (m_pmatch[0].rm_eo <= 0) {
|
||||
// weird and fatal
|
||||
reason = "Internal regular expression handling error";
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef DEB_WASASTRINGTOQ
|
||||
DPRINT((stderr, "Next part:\n"));
|
||||
for (unsigned int i = 0; i < NMATCH; i++) {
|
||||
if (m_pmatch[i].rm_so == -1) continue;
|
||||
char match[maxmatchlen+1];
|
||||
memcpy(match, m_cp + m_pmatch[i].rm_so,
|
||||
m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
|
||||
match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0;
|
||||
if (matchNames[i][0])
|
||||
DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match,
|
||||
(int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo));
|
||||
}
|
||||
#endif
|
||||
|
||||
char match[maxmatchlen+1];
|
||||
if (checkSubMatch(SMI_OR, match, reason)) {
|
||||
if (prev_or) {
|
||||
// Bad syntax
|
||||
reason = "Bad syntax: consecutive OR";
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (orChain == 0) {
|
||||
// Fist OR seen: start OR subclause.
|
||||
if ((orChain = new WasaQuery()) == 0) {
|
||||
reason = "Out of memory";
|
||||
return 0;
|
||||
}
|
||||
orChain->m_op = WasaQuery::OP_OR;
|
||||
}
|
||||
|
||||
// For the first OR, we need to transfer the previous
|
||||
// query from the main vector to the OR subquery
|
||||
if (orChain->m_subs.empty() && !query->m_subs.empty()) {
|
||||
orChain->m_subs.push_back(query->m_subs.back());
|
||||
query->m_subs.pop_back();
|
||||
}
|
||||
prev_or = true;
|
||||
|
||||
} else if (checkSubMatch(SMI_AND, match, reason)) {
|
||||
// Do nothing, AND is the default. We might want to check for
|
||||
// errors like consecutive ANDs, or OR AND
|
||||
|
||||
} else {
|
||||
|
||||
WasaQuery *nclause = new WasaQuery;
|
||||
if (nclause == 0) {
|
||||
reason = "Out of memory";
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Check for quoted or unquoted value
|
||||
unsigned int mods = 0;
|
||||
if (checkSubMatch(SMI_QUOTED, match, reason)) {
|
||||
nclause->m_value = match;
|
||||
mods |= WasaQuery::WQM_QUOTED;
|
||||
} else if (checkSubMatch(SMI_TERM, match, reason)) {
|
||||
nclause->m_value = match;
|
||||
}
|
||||
|
||||
if (nclause->m_value.empty()) {
|
||||
// Isolated +- or fieldname: without a value. Ignore until
|
||||
// told otherwise.
|
||||
DPRINT((stderr, "Clause with empty value, skipping\n"));
|
||||
delete nclause;
|
||||
goto nextfield;
|
||||
}
|
||||
|
||||
if (checkSubMatch(SMI_MODIF, match, reason)) {
|
||||
DPRINT((stderr, "Got modifiers: [%s]\n", match));
|
||||
for (unsigned int i = 0; i < strlen(match); i++) {
|
||||
switch (match[i]) {
|
||||
case 'b':
|
||||
mods |= WasaQuery::WQM_BOOST;
|
||||
nclause->m_weight = 10.0;
|
||||
break;
|
||||
case 'c': break;
|
||||
case 'C': mods |= WasaQuery::WQM_CASESENS; break;
|
||||
case 'd': break;
|
||||
case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
|
||||
case 'e': mods |= WasaQuery::WQM_CASESENS |
|
||||
WasaQuery::WQM_DIACSENS |
|
||||
WasaQuery::WQM_NOSTEM;
|
||||
break;
|
||||
case 'f': mods |= WasaQuery::WQM_FUZZY; break;
|
||||
case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
|
||||
case 'L': break;
|
||||
case 'o':
|
||||
mods |= WasaQuery::WQM_PHRASESLACK;
|
||||
// Default slack if specified only by 'o' is 10.
|
||||
nclause->m_slack = 10;
|
||||
if (i < strlen(match) - 1) {
|
||||
char *endptr;
|
||||
int slack = strtol(match+i+1, &endptr, 10);
|
||||
if (endptr != match+i+1) {
|
||||
i += endptr - (match+i+1);
|
||||
nclause->m_slack = slack;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'p':
|
||||
mods |= WasaQuery::WQM_PROX;
|
||||
nclause->m_slack = 10;
|
||||
break;
|
||||
case 'r': mods |= WasaQuery::WQM_REGEX; break;
|
||||
case 's': mods |= WasaQuery::WQM_SLOPPY; break;
|
||||
case 'w': mods |= WasaQuery::WQM_WORDS; break;
|
||||
case '.':case '0':case '1':case '2':case '3':case '4':
|
||||
case '5':case '6':case '7':case '8':case '9':
|
||||
{
|
||||
int n;
|
||||
float factor;
|
||||
if (sscanf(match+i, "%f %n", &factor, &n)) {
|
||||
nclause->m_weight = factor;
|
||||
DPRINT((stderr, "Got factor %.2f len %d\n",
|
||||
factor, n));
|
||||
}
|
||||
if (n)
|
||||
i += n-1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
nclause->m_modifiers = WasaQuery::Modifier(mods);
|
||||
|
||||
// Field indicator ?
|
||||
if (checkSubMatch(SMI_FIELD, match, reason)) {
|
||||
// We used Check for special fields indicating sorting
|
||||
// etc. here but this went away from the spec. See 1.4
|
||||
// if it comes back
|
||||
nclause->m_fieldspec = match;
|
||||
if (checkSubMatch(SMI_REL, match, reason)) {
|
||||
switch (match[0]) {
|
||||
case '=':nclause->m_rel = WasaQuery::REL_EQUALS;break;
|
||||
case ':':nclause->m_rel = WasaQuery::REL_CONTAINS;break;
|
||||
case '<':
|
||||
if (match[1] == '=')
|
||||
nclause->m_rel = WasaQuery::REL_LTE;
|
||||
else
|
||||
nclause->m_rel = WasaQuery::REL_LT;
|
||||
break;
|
||||
case '>':
|
||||
if (match[1] == '=')
|
||||
nclause->m_rel = WasaQuery::REL_GTE;
|
||||
else
|
||||
nclause->m_rel = WasaQuery::REL_GT;
|
||||
break;
|
||||
default:
|
||||
nclause->m_rel = WasaQuery::REL_CONTAINS;
|
||||
}
|
||||
} else {
|
||||
// ?? If field matched we should have a relation
|
||||
nclause->m_rel = WasaQuery::REL_CONTAINS;
|
||||
}
|
||||
}
|
||||
|
||||
nclause->m_op = WasaQuery::OP_LEAF;
|
||||
// +- indicator ?
|
||||
if (checkSubMatch(SMI_PM, match, reason) && match[0] == '-') {
|
||||
nclause->m_exclude = true;
|
||||
} else {
|
||||
nclause->m_exclude = false;
|
||||
}
|
||||
|
||||
if (prev_or) {
|
||||
// The precedent token was an OR, add new clause to or chain
|
||||
//DPRINT((stderr, "Adding to OR chain\n"));
|
||||
orChain->m_subs.push_back(nclause);
|
||||
} else {
|
||||
if (orChain) {
|
||||
// Getting out of OR. Add the OR subquery to the main one
|
||||
//DPRINT((stderr, "Adding OR chain to main\n"));
|
||||
query->m_subs.push_back(orChain);
|
||||
orChain = 0;
|
||||
}
|
||||
//DPRINT((stderr, "Adding to main chain\n"));
|
||||
// Add new clause to main query
|
||||
query->m_subs.push_back(nclause);
|
||||
}
|
||||
|
||||
prev_or = false;
|
||||
}
|
||||
|
||||
nextfield:
|
||||
// Advance current string position. We checked earlier that
|
||||
// the increment is strictly positive, so we won't loop
|
||||
// forever
|
||||
m_cp += m_pmatch[0].rm_eo;
|
||||
if (m_cp >= cpe)
|
||||
break;
|
||||
}
|
||||
|
||||
if (orChain) {
|
||||
// Getting out of OR. Add the OR subquery to the main one
|
||||
DPRINT((stderr, "Adding OR chain to main.Before: \n"));
|
||||
DUMPQ(query);
|
||||
DUMPQ(orChain);
|
||||
query->m_subs.push_back(orChain);
|
||||
}
|
||||
|
||||
regfree(&m_rx);
|
||||
m_rxneedsfree = false;
|
||||
return query;
|
||||
}
|
||||
|
||||
#else // TEST
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "wasastringtoquery.h"
|
||||
|
||||
static char *thisprog;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
thisprog = argv[0];
|
||||
argc--; argv++;
|
||||
|
||||
if (argc != 1) {
|
||||
fprintf(stderr, "need one arg\n");
|
||||
exit(1);
|
||||
}
|
||||
const string str = *argv++;argc--;
|
||||
string reason;
|
||||
StringToWasaQuery qparser;
|
||||
WasaQuery *q = qparser.stringToQuery(str, reason);
|
||||
if (q == 0) {
|
||||
fprintf(stderr, "stringToQuery failed: %s\n", reason.c_str());
|
||||
exit(1);
|
||||
}
|
||||
string desc;
|
||||
q->describe(desc);
|
||||
fprintf(stderr, "Finally: %s\n", desc.c_str());
|
||||
exit(0);
|
||||
}
|
||||
|
||||
#endif // TEST_WASASTRINGTOQUERY
|
||||
@ -1,112 +0,0 @@
|
||||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
|
||||
#define _WASASTRINGTOQUERY_H_INCLUDED_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using std::string;
|
||||
using std::vector;
|
||||
/* Note: Xesam used to be named wasabi. We changed the references to wasabi in
|
||||
the comments, but not the code */
|
||||
|
||||
/**
|
||||
* A simple class to represent a parsed Xesam user language element.
|
||||
* Can hold one leaf element or an array of subqueries to be joined by AND/OR
|
||||
*
|
||||
* The complete query is represented by a top WasaQuery holding a
|
||||
* chain of ANDed subclauses. Some of the subclauses may be themselves
|
||||
* OR'ed lists (it doesn't go deeper). Entries in the AND list may be
|
||||
* negated (AND NOT).
|
||||
*
|
||||
* For LEAF elements, the value can hold one or several words. In the
|
||||
* latter case, it should be interpreted as a phrase (comes from a
|
||||
* user-entered "quoted string"), except if the modifier flags say otherwise.
|
||||
*
|
||||
* Some fields only make sense either for compound or LEAF queries. This
|
||||
* is commented for each. We should subclass really.
|
||||
*
|
||||
* Note that wasaStringToQuery supposedly parses the whole Xesam
|
||||
* User Search Language v 0.95, but that some elements are dropped or
|
||||
* ignored during the translation to a native Recoll query in wasaToRcl
|
||||
*/
|
||||
class WasaQuery {
|
||||
public:
|
||||
/** Type of this element: leaf or AND/OR chain */
|
||||
enum Op {OP_NULL, OP_LEAF, OP_OR, OP_AND};
|
||||
/** Relation to be searched between field and value. Recoll actually only
|
||||
supports "contain" except for a size field */
|
||||
enum Rel {REL_NULL, REL_EQUALS, REL_CONTAINS, REL_LT, REL_LTE,
|
||||
REL_GT, REL_GTE};
|
||||
/** Modifiers for terms: case/diacritics handling,
|
||||
stemming control... */
|
||||
enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4,
|
||||
WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20,
|
||||
WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100,
|
||||
WQM_FUZZY = 0x200, WQM_QUOTED = 0x400};
|
||||
|
||||
typedef vector<WasaQuery*> subqlist_t;
|
||||
|
||||
WasaQuery()
|
||||
: m_op(OP_NULL), m_rel(REL_NULL), m_exclude(false),
|
||||
m_modifiers(0), m_slack(0), m_weight(1.0)
|
||||
{}
|
||||
|
||||
~WasaQuery();
|
||||
|
||||
/** Get string describing the query tree from this point */
|
||||
void describe(string &desc) const;
|
||||
|
||||
/** Op to be performed on either value (may be LEAF or EXCL, or subqs */
|
||||
WasaQuery::Op m_op;
|
||||
|
||||
/** Field specification if any (ie: title, author ...) Only OPT_LEAF */
|
||||
string m_fieldspec;
|
||||
/** Relation between field and value: =, :, <,>,<=, >= */
|
||||
WasaQuery::Rel m_rel;
|
||||
|
||||
/* Negating flag */
|
||||
bool m_exclude;
|
||||
|
||||
/* String value. Valid for op == OP_LEAF or EXCL */
|
||||
string m_value;
|
||||
|
||||
/** Subqueries. Valid for conjunctions */
|
||||
vector<WasaQuery*> m_subs;
|
||||
|
||||
unsigned int m_modifiers;
|
||||
int m_slack;
|
||||
float m_weight;
|
||||
};
|
||||
|
||||
/**
|
||||
* Wasabi query string parser class. Could be a simple function
|
||||
* really, but there might be some parser initialization work done in
|
||||
* the constructor.
|
||||
*/
|
||||
class StringToWasaQuery {
|
||||
public:
|
||||
StringToWasaQuery();
|
||||
~StringToWasaQuery();
|
||||
WasaQuery *stringToQuery(const string& str, string& reason);
|
||||
class Internal;
|
||||
private:
|
||||
Internal *internal;
|
||||
};
|
||||
|
||||
#endif /* _WASASTRINGTOQUERY_H_INCLUDED_ */
|
||||
@ -1,286 +0,0 @@
|
||||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <list>
|
||||
#include <algorithm>
|
||||
using std::string;
|
||||
using std::list;
|
||||
|
||||
#include "rclconfig.h"
|
||||
#include "wasastringtoquery.h"
|
||||
#include "rcldb.h"
|
||||
#include "searchdata.h"
|
||||
#include "wasatorcl.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
#include "rclconfig.h"
|
||||
#include "refcntr.h"
|
||||
#include "textsplit.h"
|
||||
|
||||
static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config,
|
||||
const string& stemlang,
|
||||
WasaQuery *wasa,
|
||||
const string& autosuffs, string& reason)
|
||||
{
|
||||
if (wasa == 0) {
|
||||
reason = "NULL query";
|
||||
return 0;
|
||||
}
|
||||
if (wasa->m_op != WasaQuery::OP_AND && wasa->m_op != WasaQuery::OP_OR) {
|
||||
reason = "Top query neither AND nor OR ?";
|
||||
LOGERR(("wasaQueryToRcl: top query neither AND nor OR!\n"));
|
||||
return 0;
|
||||
}
|
||||
|
||||
Rcl::SearchData *sdata = new
|
||||
Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND :
|
||||
Rcl::SCLT_OR, stemlang);
|
||||
LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ?
|
||||
"AND" : "OR"));
|
||||
|
||||
WasaQuery::subqlist_t::iterator it;
|
||||
Rcl::SearchDataClause *nclause;
|
||||
|
||||
// Walk the list of clauses. Some pseudo-field types need special
|
||||
// processing, which results in setting data in the top struct
|
||||
// instead of adding a clause. We check for these first
|
||||
for (it = wasa->m_subs.begin(); it != wasa->m_subs.end(); it++) {
|
||||
|
||||
if (!stringicmp("mime", (*it)->m_fieldspec) ||
|
||||
!stringicmp("format", (*it)->m_fieldspec)) {
|
||||
if ((*it)->m_op == WasaQuery::OP_LEAF) {
|
||||
if ((*it)->m_exclude) {
|
||||
sdata->remFiletype((*it)->m_value);
|
||||
} else {
|
||||
sdata->addFiletype((*it)->m_value);
|
||||
}
|
||||
} else {
|
||||
reason = "internal error: mime clause not leaf??";
|
||||
return 0;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Xesam uses "type", we also support "rclcat", for broad
|
||||
// categories like "audio", "presentation", etc.
|
||||
if (!stringicmp("rclcat", (*it)->m_fieldspec) ||
|
||||
!stringicmp("type", (*it)->m_fieldspec)) {
|
||||
if ((*it)->m_op != WasaQuery::OP_LEAF) {
|
||||
reason = "internal error: rclcat/type clause not leaf??";
|
||||
return 0;
|
||||
}
|
||||
vector<string> mtypes;
|
||||
if (config && config->getMimeCatTypes((*it)->m_value, mtypes)
|
||||
&& !mtypes.empty()) {
|
||||
for (vector<string>::iterator mit = mtypes.begin();
|
||||
mit != mtypes.end(); mit++) {
|
||||
if ((*it)->m_exclude) {
|
||||
sdata->remFiletype(*mit);
|
||||
} else {
|
||||
sdata->addFiletype(*mit);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
reason = "Unknown rclcat/type value: no mime types found";
|
||||
return 0;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle "date" spec
|
||||
if (!stringicmp("date", (*it)->m_fieldspec)) {
|
||||
if ((*it)->m_op != WasaQuery::OP_LEAF) {
|
||||
reason = "Negative date filtering not supported";
|
||||
return 0;
|
||||
}
|
||||
DateInterval di;
|
||||
if (!parsedateinterval((*it)->m_value, &di)) {
|
||||
LOGERR(("wasaQueryToRcl: bad date interval format\n"));
|
||||
reason = "Bad date interval format";
|
||||
return 0;
|
||||
}
|
||||
LOGDEB(("wasaQueryToRcl:: date span: %d-%d-%d/%d-%d-%d\n",
|
||||
di.y1,di.m1,di.d1, di.y2,di.m2,di.d2));
|
||||
sdata->setDateSpan(&di);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle "size" spec
|
||||
if (!stringicmp("size", (*it)->m_fieldspec)) {
|
||||
if ((*it)->m_op != WasaQuery::OP_LEAF) {
|
||||
reason = "Negative size filtering not supported";
|
||||
return 0;
|
||||
}
|
||||
char *cp;
|
||||
size_t size = strtoll((*it)->m_value.c_str(), &cp, 10);
|
||||
if (*cp != 0) {
|
||||
switch (*cp) {
|
||||
case 'k': case 'K': size *= 1E3;break;
|
||||
case 'm': case 'M': size *= 1E6;break;
|
||||
case 'g': case 'G': size *= 1E9;break;
|
||||
case 't': case 'T': size *= 1E12;break;
|
||||
default:
|
||||
reason = string("Bad multiplier suffix: ") + *cp;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
switch ((*it)->m_rel) {
|
||||
case WasaQuery::REL_EQUALS:
|
||||
sdata->setMaxSize(size);
|
||||
sdata->setMinSize(size);
|
||||
break;
|
||||
case WasaQuery::REL_LT:
|
||||
case WasaQuery::REL_LTE:
|
||||
sdata->setMaxSize(size);
|
||||
break;
|
||||
case WasaQuery::REL_GT:
|
||||
case WasaQuery::REL_GTE:
|
||||
sdata->setMinSize(size);
|
||||
break;
|
||||
default:
|
||||
reason = "Bad relation operator with size query. Use > < or =";
|
||||
return 0;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// "Regular" processing follows:
|
||||
unsigned int mods = (unsigned int)(*it)->m_modifiers;
|
||||
LOGDEB0(("wasaQueryToRcl: clause modifiers 0x%x\n", mods));
|
||||
nclause = 0;
|
||||
|
||||
switch ((*it)->m_op) {
|
||||
case WasaQuery::OP_NULL:
|
||||
case WasaQuery::OP_AND:
|
||||
default:
|
||||
reason = "Found bad NULL or AND query type in list";
|
||||
LOGERR(("wasaQueryToRcl: found bad NULL or AND q type in list\n"));
|
||||
continue;
|
||||
|
||||
case WasaQuery::OP_LEAF: {
|
||||
LOGDEB0(("wasaQueryToRcl: leaf clause [%s:%s] slack %d excl %d\n",
|
||||
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
|
||||
(*it)->m_slack, (*it)->m_exclude));
|
||||
|
||||
// Change terms found in the "autosuffs" list into "ext"
|
||||
// field queries
|
||||
if ((*it)->m_fieldspec.empty() && !autosuffs.empty()) {
|
||||
vector<string> asfv;
|
||||
if (stringToStrings(autosuffs, asfv)) {
|
||||
if (find_if(asfv.begin(), asfv.end(),
|
||||
StringIcmpPred((*it)->m_value)) != asfv.end()) {
|
||||
(*it)->m_fieldspec = "ext";
|
||||
(*it)->m_modifiers |= WasaQuery::WQM_NOSTEM;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!stringicmp("dir", (*it)->m_fieldspec)) {
|
||||
// dir filtering special case
|
||||
nclause = new Rcl::SearchDataClausePath((*it)->m_value,
|
||||
(*it)->m_exclude);
|
||||
} else {
|
||||
if ((*it)->m_exclude && wasa->m_op != WasaQuery::OP_AND) {
|
||||
LOGERR(("wasaQueryToRcl: excl clause inside OR list!\n"));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mods & WasaQuery::WQM_QUOTED) {
|
||||
Rcl::SClType tp = (mods & WasaQuery::WQM_PROX) ?
|
||||
Rcl::SCLT_NEAR :
|
||||
Rcl::SCLT_PHRASE;
|
||||
nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
|
||||
(*it)->m_slack,
|
||||
(*it)->m_fieldspec);
|
||||
} else {
|
||||
// If term has commas or slashes inside, take it
|
||||
// as a list, turn the slashes/commas to spaces,
|
||||
// leave unquoted. Otherwise, this would end up as
|
||||
// a phrase query. This is a handy way to enter
|
||||
// multiple terms to be searched inside a
|
||||
// field. We interpret ',' as AND, and '/' as
|
||||
// OR. No mixes allowed and ',' wins.
|
||||
Rcl::SClType tp = (*it)->m_exclude ? Rcl::SCLT_OR:
|
||||
Rcl::SCLT_AND;
|
||||
string ns = neutchars((*it)->m_value, ",");
|
||||
if (ns.compare((*it)->m_value)) {
|
||||
// had ','
|
||||
tp = Rcl::SCLT_AND;
|
||||
} else {
|
||||
ns = neutchars((*it)->m_value, "/");
|
||||
if (ns.compare((*it)->m_value)) {
|
||||
tp = Rcl::SCLT_OR;
|
||||
}
|
||||
}
|
||||
nclause = new Rcl::SearchDataClauseSimple(tp, ns,
|
||||
(*it)->m_fieldspec);
|
||||
}
|
||||
nclause->setexclude((*it)->m_exclude);
|
||||
}
|
||||
|
||||
if (nclause == 0) {
|
||||
reason = "Out of memory";
|
||||
LOGERR(("wasaQueryToRcl: out of memory\n"));
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case WasaQuery::OP_OR:
|
||||
LOGDEB2(("wasaQueryToRcl: OR clause [%s]:[%s]\n",
|
||||
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
|
||||
// Create a subquery.
|
||||
Rcl::SearchData *sub =
|
||||
wasaQueryToRcl(config, stemlang, *it, autosuffs, reason);
|
||||
if (sub == 0) {
|
||||
continue;
|
||||
}
|
||||
nclause =
|
||||
new Rcl::SearchDataClauseSub(RefCntr<Rcl::SearchData>(sub));
|
||||
if (nclause == 0) {
|
||||
LOGERR(("wasaQueryToRcl: out of memory\n"));
|
||||
reason = "Out of memory";
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (mods & WasaQuery::WQM_NOSTEM)
|
||||
nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
|
||||
if (mods & WasaQuery::WQM_DIACSENS)
|
||||
nclause->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
|
||||
if (mods & WasaQuery::WQM_CASESENS)
|
||||
nclause->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
|
||||
if ((*it)->m_weight != 1.0)
|
||||
nclause->setWeight((*it)->m_weight);
|
||||
sdata->addClause(nclause);
|
||||
}
|
||||
|
||||
return sdata;
|
||||
}
|
||||
|
||||
Rcl::SearchData *wasaStringToRcl(const RclConfig *config, const string& stemlang,
|
||||
const string &qs, string &reason,
|
||||
const string& autosuffs)
|
||||
{
|
||||
StringToWasaQuery parser;
|
||||
WasaQuery *wq = parser.stringToQuery(qs, reason);
|
||||
if (wq == 0)
|
||||
return 0;
|
||||
return wasaQueryToRcl(config, stemlang, wq, autosuffs, reason);
|
||||
}
|
||||
@ -17,15 +17,18 @@
|
||||
|
||||
#ifndef _WASATORCL_H_INCLUDED_
|
||||
#define _WASATORCL_H_INCLUDED_
|
||||
|
||||
#include <string>
|
||||
using std::string;
|
||||
|
||||
#include "rcldb.h"
|
||||
#include "searchdata.h"
|
||||
|
||||
namespace Rcl {
|
||||
class SearchData;
|
||||
}
|
||||
class RclConfig;
|
||||
|
||||
extern Rcl::SearchData *wasaStringToRcl(const RclConfig *, const string& stemlang,
|
||||
const string& query, string &reason,
|
||||
const string& autosuffs = string());
|
||||
extern Rcl::SearchData *wasaStringToRcl(const RclConfig *,
|
||||
const std::string& stemlang,
|
||||
const std::string& query,
|
||||
std::string &reason,
|
||||
const std::string& autosuffs = "");
|
||||
|
||||
#endif /* _WASATORCL_H_INCLUDED_ */
|
||||
|
||||
@ -52,8 +52,6 @@ namespace Rcl {
|
||||
typedef vector<SearchDataClause *>::iterator qlist_it_t;
|
||||
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
|
||||
|
||||
static const int original_term_wqf_booster = 10;
|
||||
|
||||
void SearchData::commoninit()
|
||||
{
|
||||
m_haveDates = false;
|
||||
@ -74,241 +72,6 @@ SearchData::~SearchData()
|
||||
delete *it;
|
||||
}
|
||||
|
||||
// Expand categories and mime type wild card exps Categories are
|
||||
// expanded against the configuration, mimetypes against the index
|
||||
// (for wildcards).
|
||||
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
|
||||
{
|
||||
const RclConfig *cfg = db.getConf();
|
||||
if (!cfg) {
|
||||
LOGFATAL(("Db::expandFileTypes: null configuration!!\n"));
|
||||
return false;
|
||||
}
|
||||
vector<string> exptps;
|
||||
|
||||
for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
|
||||
if (cfg->isMimeCategory(*it)) {
|
||||
vector<string>tps;
|
||||
cfg->getMimeCatTypes(*it, tps);
|
||||
exptps.insert(exptps.end(), tps.begin(), tps.end());
|
||||
} else {
|
||||
TermMatchResult res;
|
||||
string mt = stringtolower((const string&)*it);
|
||||
// We set casesens|diacsens to get an equivalent of ixTermMatch()
|
||||
db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
|
||||
mt, res, -1, "mtype");
|
||||
if (res.entries.empty()) {
|
||||
exptps.push_back(it->c_str());
|
||||
} else {
|
||||
for (vector<TermMatchEntry>::const_iterator rit =
|
||||
res.entries.begin(); rit != res.entries.end(); rit++) {
|
||||
exptps.push_back(strip_prefix(rit->term));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sort(exptps.begin(), exptps.end());
|
||||
exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());
|
||||
|
||||
tps = exptps;
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char *maxXapClauseMsg =
|
||||
"Maximum Xapian query size exceeded. Increase maxXapianClauses "
|
||||
"in the configuration. ";
|
||||
static const char *maxXapClauseCaseDiacMsg =
|
||||
"Or try to use case (C) or diacritics (D) sensitivity qualifiers, or less "
|
||||
"wildcards ?"
|
||||
;
|
||||
|
||||
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
|
||||
vector<SearchDataClause*>& query,
|
||||
string& reason, void *d)
|
||||
{
|
||||
Xapian::Query xq;
|
||||
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
|
||||
Xapian::Query nq;
|
||||
if (!(*it)->toNativeQuery(db, &nq)) {
|
||||
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
|
||||
(*it)->getReason().c_str()));
|
||||
reason += (*it)->getReason() + " ";
|
||||
return false;
|
||||
}
|
||||
if (nq.empty()) {
|
||||
LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
|
||||
continue;
|
||||
}
|
||||
// If this structure is an AND list, must use AND_NOT for excl clauses.
|
||||
// Else this is an OR list, and there can't be excl clauses (checked by
|
||||
// addClause())
|
||||
Xapian::Query::op op;
|
||||
if (tp == SCLT_AND) {
|
||||
if ((*it)->getexclude()) {
|
||||
op = Xapian::Query::OP_AND_NOT;
|
||||
} else {
|
||||
op = Xapian::Query::OP_AND;
|
||||
}
|
||||
} else {
|
||||
op = Xapian::Query::OP_OR;
|
||||
}
|
||||
if (xq.empty()) {
|
||||
if (op == Xapian::Query::OP_AND_NOT)
|
||||
xq = Xapian::Query(op, Xapian::Query::MatchAll, nq);
|
||||
else
|
||||
xq = nq;
|
||||
} else {
|
||||
xq = Xapian::Query(op, xq, nq);
|
||||
}
|
||||
if (int(xq.get_length()) >= getMaxCl()) {
|
||||
LOGERR(("%s\n", maxXapClauseMsg));
|
||||
m_reason += maxXapClauseMsg;
|
||||
if (!o_index_stripchars)
|
||||
m_reason += maxXapClauseCaseDiacMsg;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length()));
|
||||
|
||||
if (xq.empty())
|
||||
xq = Xapian::Query::MatchAll;
|
||||
|
||||
*((Xapian::Query *)d) = xq;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
||||
{
|
||||
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
|
||||
m_reason.erase();
|
||||
|
||||
db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
|
||||
db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);
|
||||
|
||||
// Walk the clause list translating each in turn and building the
|
||||
// Xapian query tree
|
||||
Xapian::Query xq;
|
||||
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
|
||||
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
|
||||
m_reason.c_str()));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (m_haveDates) {
|
||||
// If one of the extremities is unset, compute db extremas
|
||||
if (m_dates.y1 == 0 || m_dates.y2 == 0) {
|
||||
int minyear = 1970, maxyear = 2100;
|
||||
if (!db.maxYearSpan(&minyear, &maxyear)) {
|
||||
LOGERR(("Can't retrieve index min/max dates\n"));
|
||||
//whatever, go on.
|
||||
}
|
||||
|
||||
if (m_dates.y1 == 0) {
|
||||
m_dates.y1 = minyear;
|
||||
m_dates.m1 = 1;
|
||||
m_dates.d1 = 1;
|
||||
}
|
||||
if (m_dates.y2 == 0) {
|
||||
m_dates.y2 = maxyear;
|
||||
m_dates.m2 = 12;
|
||||
m_dates.d2 = 31;
|
||||
}
|
||||
}
|
||||
LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n",
|
||||
m_dates.y1, m_dates.m1, m_dates.d1,
|
||||
m_dates.y2, m_dates.m2, m_dates.d2));
|
||||
Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
|
||||
m_dates.y2, m_dates.m2, m_dates.d2);
|
||||
if (dq.empty()) {
|
||||
LOGINFO(("Db::toNativeQuery: date filter is empty\n"));
|
||||
}
|
||||
// If no probabilistic query is provided then promote the daterange
|
||||
// filter to be THE query instead of filtering an empty query.
|
||||
if (xq.empty()) {
|
||||
LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
|
||||
xq = dq;
|
||||
} else {
|
||||
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) {
|
||||
Xapian::Query sq;
|
||||
char min[50], max[50];
|
||||
sprintf(min, "%lld", (long long)m_minSize);
|
||||
sprintf(max, "%lld", (long long)m_maxSize);
|
||||
if (m_minSize == size_t(-1)) {
|
||||
string value(max);
|
||||
leftzeropad(value, 12);
|
||||
sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value);
|
||||
} else if (m_maxSize == size_t(-1)) {
|
||||
string value(min);
|
||||
leftzeropad(value, 12);
|
||||
sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);
|
||||
} else {
|
||||
string minvalue(min);
|
||||
leftzeropad(minvalue, 12);
|
||||
string maxvalue(max);
|
||||
leftzeropad(maxvalue, 12);
|
||||
sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE,
|
||||
minvalue, maxvalue);
|
||||
}
|
||||
|
||||
// If no probabilistic query is provided then promote the
|
||||
// filter to be THE query instead of filtering an empty query.
|
||||
if (xq.empty()) {
|
||||
LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
|
||||
xq = sq;
|
||||
} else {
|
||||
xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
|
||||
}
|
||||
}
|
||||
|
||||
// Add the autophrase if any
|
||||
if (m_autophrase.isNotNull()) {
|
||||
Xapian::Query apq;
|
||||
if (m_autophrase->toNativeQuery(db, &apq)) {
|
||||
xq = xq.empty() ? apq :
|
||||
Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
|
||||
}
|
||||
}
|
||||
|
||||
// Add the file type filtering clause if any
|
||||
if (!m_filetypes.empty()) {
|
||||
expandFileTypes(db, m_filetypes);
|
||||
|
||||
Xapian::Query tq;
|
||||
for (vector<string>::iterator it = m_filetypes.begin();
|
||||
it != m_filetypes.end(); it++) {
|
||||
string term = wrap_prefix(mimetype_prefix) + *it;
|
||||
LOGDEB0(("Adding file type term: [%s]\n", term.c_str()));
|
||||
tq = tq.empty() ? Xapian::Query(term) :
|
||||
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
|
||||
}
|
||||
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
|
||||
}
|
||||
|
||||
// Add the neg file type filtering clause if any
|
||||
if (!m_nfiletypes.empty()) {
|
||||
expandFileTypes(db, m_nfiletypes);
|
||||
|
||||
Xapian::Query tq;
|
||||
for (vector<string>::iterator it = m_nfiletypes.begin();
|
||||
it != m_nfiletypes.end(); it++) {
|
||||
string term = wrap_prefix(mimetype_prefix) + *it;
|
||||
LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str()));
|
||||
tq = tq.empty() ? Xapian::Query(term) :
|
||||
Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
|
||||
}
|
||||
xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
|
||||
}
|
||||
|
||||
*((Xapian::Query *)d) = xq;
|
||||
return true;
|
||||
}
|
||||
|
||||
// This is called by the GUI simple search if the option is set: add
|
||||
// (OR) phrase to a query (if it is simple enough) so that results
|
||||
// where the search terms are close and in order will come up on top.
|
||||
@ -428,695 +191,4 @@ void SearchData::getTerms(HighlightData &hld) const
|
||||
return;
|
||||
}
|
||||
|
||||
// Splitter callback for breaking a user string into simple terms and
|
||||
// phrases. This is for parts of the user entry which would appear as
|
||||
// a single word because there is no white space inside, but are
|
||||
// actually multiple terms to rcldb (ie term1,term2)
|
||||
class TextSplitQ : public TextSplitP {
public:
    TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
        : TextSplitP(prc, flags),
          curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
    {}

    // Called by the splitter for each word. We only note whether the
    // word starts with an uppercase letter (which will turn off stem
    // expansion for it), then forward to the base class / term
    // processor chain.
    bool takeword(const std::string &term, int pos, int bs, int be)
    {
        // Check if the first letter is a majuscule in which
        // case we do not want to do stem expansion. Need to do this
        // before unac of course...
        curnostemexp = unaciscapital(term);

        return TextSplitP::takeword(term, pos, bs, be);
    }

    // Capitalization flag for the word currently going through the
    // pipeline.
    bool curnostemexp;
    // Output: terms and their matching no-stem-expansion flags,
    // filled in by TermProcQ::flush() at the end of the pipeline.
    vector<string> terms;
    vector<bool> nostemexps;
    const StopList &stops;
    // Count of terms including stopwords: this is for adjusting
    // phrase/near slack
    int alltermcount;
    // Highest word position seen (maintained by TermProcQ::takeword).
    int lastpos;
};
|
||||
|
||||
// Final element of the term processing chain: accumulates terms by
// position (keeping the longest term seen at each position), then
// flush() copies them, in position order, to the parent TextSplitQ's
// terms/nostemexps vectors.
class TermProcQ : public TermProc {
public:
    TermProcQ() : TermProc(0), m_ts(0) {}
    // Set the TextSplitQ which receives the final term lists.
    void setTSQ(TextSplitQ *ts) {m_ts = ts;}

    bool takeword(const std::string &term, int pos, int bs, int be)
    {
        // Keep the splitter's bookkeeping (total term count and
        // highest position) up to date: used for slack adjustment.
        m_ts->alltermcount++;
        if (m_ts->lastpos < pos)
            m_ts->lastpos = pos;
        // be == 0 marks a term with no byte extent: never stem-expand
        // those; otherwise use the capitalization flag from TextSplitQ.
        bool noexpand = be ? m_ts->curnostemexp : true;
        LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
                 term.c_str(), pos, noexpand));
        // At a given position, keep only the longest term produced.
        if (m_terms[pos].size() < term.size()) {
            m_terms[pos] = term;
            m_nste[pos] = noexpand;
        }
        return true;
    }
    // Copy accumulated terms and flags to the TextSplitQ, in
    // ascending position order (map iteration order).
    bool flush()
    {
        for (map<int, string>::const_iterator it = m_terms.begin();
             it != m_terms.end(); it++) {
            m_ts->terms.push_back(it->second);
            m_ts->nostemexps.push_back(m_nste[it->first]);
        }
        return true;
    }
private:
    TextSplitQ *m_ts;
    // Position -> term / no-stem-expansion flag.
    map<int, string> m_terms;
    map<int, bool> m_nste;
};
|
||||
|
||||
|
||||
#if 1
|
||||
static void listVector(const string& what, const vector<string>&l)
|
||||
{
|
||||
string a;
|
||||
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
|
||||
a = a + *it + " ";
|
||||
}
|
||||
LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
|
||||
}
|
||||
#endif
|
||||
|
||||
/** Expand term into term list, using appropriate mode: stem, wildcards,
|
||||
* diacritics...
|
||||
*
|
||||
* @param mods stem expansion, case and diacritics sensitivity control.
|
||||
* @param term input single word
|
||||
* @param oexp output expansion list
|
||||
* @param sterm output original input term if there were no wildcards
|
||||
* @param prefix field prefix in index. We could recompute it, but the caller
|
||||
* has it already. Used in the simple case where there is nothing to expand,
|
||||
* and we just return the prefixed term (else Db::termMatch deals with it).
|
||||
*/
|
||||
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
                                        string& ermsg, int mods,
                                        const string& term,
                                        vector<string>& oexp, string &sterm,
                                        const string& prefix)
{
    LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
             mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
    sterm.clear();
    oexp.clear();
    if (term.empty())
        return true;

    // Choose the expansion limit: the soft limit if set (expansion is
    // then silently truncated), else the hard one (exceeding it is an
    // error, see below).
    bool maxexpissoft = false;
    int maxexpand = getSoftMaxExp();
    if (maxexpand != -1) {
        maxexpissoft = true;
    } else {
        maxexpand = getMaxExp();
    }

    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;

    // If there are no wildcards, add term to the list of user-entered terms
    if (!haswild) {
        m_hldata.uterms.insert(term);
        sterm = term;
    }
    // No stem expansion if there are wildcards or if prevented by caller
    bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
    if (haswild || getStemLang().empty()) {
        LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
        nostemexp = true;
    }

    // noexpansion can be modified further down by possible case/diac expansion
    bool noexpansion = nostemexp && !haswild;

    int termmatchsens = 0;

    bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
    bool case_sensitive = (mods & SDCM_CASESENS) != 0;

    if (o_index_stripchars) {
        // Stripped index: sensitivity flags are meaningless there.
        diac_sensitive = case_sensitive = false;
    } else {
        // If we are working with a raw index, apply the rules for case and
        // diacritics sensitivity.

        // If any character has a diacritic, we become
        // diacritic-sensitive. Note that the way that the test is
        // performed (conversion+comparison) will automatically ignore
        // accented characters which are actually a separate letter
        if (getAutoDiac() && unachasaccents(term)) {
            LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
            diac_sensitive = true;
        }

        // If any character apart the first is uppercase, we become
        // case-sensitive. The first character is reserved for
        // turning off stemming. You need to use a query language
        // modifier to search for Floor in a case-sensitive way.
        Utf8Iter it(term);
        it++;
        if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
            LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
            case_sensitive = true;
        }

        // If we are sensitive to case or diacritics turn stemming off
        if (diac_sensitive || case_sensitive) {
            LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n"));
            nostemexp = true;
        }

        // Any remaining insensitivity means the term list must still
        // be expanded (case and/or diacritics variants).
        if (!case_sensitive || !diac_sensitive)
            noexpansion = false;
    }

    // Translate sensitivity flags for Db::termMatch().
    if (case_sensitive)
        termmatchsens |= Db::ET_CASESENS;
    if (diac_sensitive)
        termmatchsens |= Db::ET_DIACSENS;

    // Fast path: nothing to expand, just output the prefixed term.
    if (noexpansion) {
        oexp.push_back(prefix + term);
        m_hldata.terms[term] = term;
        LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str()));
        return true;
    }

    Db::MatchType mtyp = haswild ? Db::ET_WILD :
        nostemexp ? Db::ET_NONE : Db::ET_STEM;
    TermMatchResult res;
    if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
                      m_field)) {
        // Let it go through
    }

    // Term match entries to vector of terms
    if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
        ermsg = "Maximum term expansion size exceeded."
            " Maybe use case/diacritics sensitivity or increase maxTermExpand.";
        return false;
    }
    for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
         it != res.entries.end(); it++) {
        oexp.push_back(it->term);
    }
    // If the term does not exist at all in the db, the return from
    // termMatch() is going to be empty, which is not what we want (we
    // would then compute an empty Xapian query)
    if (oexp.empty())
        oexp.push_back(prefix + term);

    // Remember the uterm-to-expansion links
    for (vector<string>::const_iterator it = oexp.begin();
         it != oexp.end(); it++) {
        m_hldata.terms[strip_prefix(*it)] = term;
    }
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
    return true;
}
|
||||
|
||||
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
|
||||
void multiply_groups(vector<vector<string> >::const_iterator vvit,
                     vector<vector<string> >::const_iterator vvend,
                     vector<string>& comb,
                     vector<vector<string> >&allcombs)
{
    // The group to expand at this recursion level. After this, vvit
    // designates the next group, for the recursive calls below.
    vector<vector<string> >::const_iterator curgroup = vvit++;

    // For each element of the current group: tentatively append it to
    // the combination under construction, then either record the
    // complete combination (last group) or recurse on the remaining
    // groups, and finally undo the tentative append.
    for (vector<string>::const_iterator elt = curgroup->begin();
         elt != curgroup->end(); elt++) {

        comb.push_back(*elt);

        if (vvit == vvend) {
            // No more groups: the combination is complete, store it.
            allcombs.push_back(comb);
        } else {
            // Expand the following groups with the current prefix.
            multiply_groups(vvit, vvend, comb, allcombs);
        }

        // Make room for the next element of this group.
        comb.pop_back();
    }
}
|
||||
|
||||
// Process a single-word user entry element: expand it (stem,
// wildcards, case/diacritics) and push the resulting OR query onto
// the output vector. Errors are reported through ermsg.
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
                                               const string& span,
                                               int mods, void * pq)
{
    vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
    LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
             span.c_str(), (unsigned int)mods));
    vector<string> exp;
    string sterm; // dumb version of user term

    // Compute the index term prefix for the field, if any.
    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
        prefix = wrap_prefix(ftp->pfx);
    }

    if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
        return;

    // Set up the highlight data. No prefix should go in there
    for (vector<string>::const_iterator it = exp.begin();
         it != exp.end(); it++) {
        m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
        m_hldata.slacks.push_back(0);
        m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
    }

    // Push either term or OR of stem-expanded set
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
    m_curcl += exp.size();

    // If sterm (simplified original user term) is not null, give it a
    // relevance boost. We do this even if no expansion occurred (else
    // the non-expanded terms in a term list would end-up with even
    // less wqf). This does not happen if there are wildcards anywhere
    // in the search.
    // We normally boost the original term in the stem expansion list. Don't
    // do it if there are wildcards anywhere, this would skew the results.
    bool doBoostUserTerm =
        (m_parentSearch && !m_parentSearch->haveWildCards()) ||
        (m_parentSearch == 0 && !m_haveWildCards);
    if (doBoostUserTerm && !sterm.empty()) {
        xq = Xapian::Query(Xapian::Query::OP_OR, xq,
                           Xapian::Query(prefix+sterm,
                                         original_term_wqf_booster));
    }
    pqueries.push_back(xq);
}
|
||||
|
||||
// User entry element had several terms: transform into a PHRASE or
|
||||
// NEAR xapian query, the elements of which can themselves be OR
|
||||
// queries if the terms get expanded by stemming or wildcards (we
|
||||
// don't do stemming for PHRASE though)
|
||||
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
                                                 TextSplitQ *splitData,
                                                 int mods, void *pq,
                                                 bool useNear, int slack)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
        Xapian::Query::OP_PHRASE;
    // One OR subquery (expansion set) per input term; these become
    // the children of the final PHRASE/NEAR query.
    vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
    bool hadmultiple = false;
#endif
    // Per-term expansion lists, unprefixed, kept for highlighting.
    vector<vector<string> >groups;

    // Compute the index term prefix for the field, if any.
    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
        prefix = wrap_prefix(ftp->pfx);
    }

    // A start anchor adds the start-of-field marker term as the first
    // phrase element (and one slack unit to compensate).
    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
        orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
        slack++;
    }

    // Go through the list and perform stem/wildcard expansion for each element
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
    for (vector<string>::iterator it = splitData->terms.begin();
         it != splitData->terms.end(); it++, nxit++) {
        LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
        // Adjust when we do stem expansion. Not if disabled by
        // caller, not inside phrases, and some versions of xapian
        // will accept only one OR clause inside NEAR.
        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
            || hadmultiple
#endif // single OR inside NEAR
            ;
        int lmods = mods;
        if (nostemexp)
            lmods |= SearchDataClause::SDCM_NOSTEMMING;
        string sterm;
        vector<string> exp;
        if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
            return;
        LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
        listVector("", exp);
        // groups is used for highlighting, we don't want prefixes in there.
        vector<string> noprefs;
        for (vector<string>::const_iterator it = exp.begin();
             it != exp.end(); it++) {
            noprefs.push_back(it->substr(prefix.size()));
        }
        groups.push_back(noprefs);
        orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
                                          exp.begin(), exp.end()));
        m_curcl += exp.size();
        // Bail out if the Xapian clause budget is exhausted.
        if (m_curcl >= getMaxCl())
            return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
        if (exp.size() > 1)
            hadmultiple = true;
#endif
    }

    // An end anchor adds the end-of-field marker term as the last
    // phrase element (and one slack unit).
    if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
        orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
        slack++;
    }

    // Generate an appropriate PHRASE/NEAR query with adjusted slack
    // For phrases, give a relevance boost like we do for original terms
    LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
             splitData->alltermcount, splitData->lastpos));
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
                     splitData->lastpos + 1 + slack);
    if (op == Xapian::Query::OP_PHRASE)
        xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
                           original_term_wqf_booster);
    pqueries.push_back(xq);

    // Add all combinations of NEAR/PHRASE groups to the highlighting data.
    vector<vector<string> > allcombs;
    vector<string> comb;
    multiply_groups(groups.begin(), groups.end(), comb, allcombs);

    // Insert the search groups and slacks in the highlight data, with
    // a reference to the user entry that generated them:
    m_hldata.groups.insert(m_hldata.groups.end(),
                           allcombs.begin(), allcombs.end());
    m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
    m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
                              m_hldata.ugroups.size() - 1);
}
|
||||
|
||||
// Trim string beginning with ^ or ending with $ and convert to flags
|
||||
static int stringToMods(string& s)
|
||||
{
|
||||
int mods = 0;
|
||||
// Check for an anchored search
|
||||
trimstring(s);
|
||||
if (s.length() > 0 && s[0] == '^') {
|
||||
mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
|
||||
s.erase(0, 1);
|
||||
}
|
||||
if (s.length() > 0 && s[s.length()-1] == '$') {
|
||||
mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
|
||||
s.erase(s.length()-1);
|
||||
}
|
||||
return mods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Turn user entry string (NOT query language) into a list of xapian queries.
|
||||
* We just separate words and phrases, and do wildcard and stem expansion,
|
||||
*
|
||||
* This is used to process data entered into an OR/AND/NEAR/PHRASE field of
|
||||
* the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
|
||||
* entry).
|
||||
*
|
||||
* This appears awful, and it would seem that the split into
|
||||
* terms/phrases should be performed in the upper layer so that we
|
||||
* only receive pure term or near/phrase pure elements here, but in
|
||||
* fact there are things that would appear like terms to naive code,
|
||||
* and which will actually may be turned into phrases (ie: tom:jerry),
|
||||
* in a manner which intimately depends on the index implementation,
|
||||
* so that it makes sense to process this here.
|
||||
*
|
||||
* The final list contains one query for each term or phrase
|
||||
* - Elements corresponding to a stem-expanded part are an OP_OR
|
||||
* composition of the stem-expanded terms (or a single term query).
|
||||
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
|
||||
* composition of the phrase terms (no stem expansion in this case)
|
||||
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
||||
* count)
|
||||
*/
|
||||
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
                                               string &ermsg, void *pq,
                                               int slack, bool useNear)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    int mods = m_modifiers;

    LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
            "slack %d near %d\n",
            iq.c_str(), m_field.c_str(), mods, slack, useNear));
    ermsg.erase();
    m_curcl = 0;
    const StopList stops = db.getStopList();

    // Simple whitespace-split input into user-level words and
    // double-quoted phrases: word1 word2 "this is a phrase".
    //
    // The text splitter may further still decide that the resulting
    // "words" are really phrases, this depends on separators:
    // [paul@dom.net] would still be a word (span), but [about:me]
    // will probably be handled as a phrase.
    vector<string> phrases;
    TextSplit::stringToStrings(iq, phrases);

    // Process each element: textsplit into terms, handle stem/wildcard
    // expansion and transform into an appropriate Xapian::Query
    try {
        for (vector<string>::iterator it = phrases.begin();
             it != phrases.end(); it++) {
            LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
            // Anchoring modifiers
            int amods = stringToMods(*it);
            // Anchors count as one additional term when deciding
            // between span and phrase processing below.
            int terminc = amods != 0 ? 1 : 0;
            mods |= amods;
            // If there are multiple spans in this element, including
            // at least one composite, we have to increase the slack
            // else a phrase query including a span would fail.
            // Ex: "term0@term1 term2" is onlyspans-split as:
            // 0 term0@term1 0 12
            // 2 term2 13 18
            // The position of term2 is 2, not 1, so a phrase search
            // would fail.
            // We used to do word split, searching for
            // "term0 term1 term2" instead, which may have worse
            // performance, but will succeed.
            // We now adjust the phrase/near slack by comparing the term count
            // and the last position

            // The term processing pipeline:
            TermProcQ tpq;
            TermProc *nxt = &tpq;
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
            //tpcommon.onlygrams(true);
            TermProcPrep tpprep(nxt);
            // Lowercasing/unaccenting step only for stripped indexes.
            if (o_index_stripchars)
                nxt = &tpprep;

            TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
                                                 TextSplit::TXTS_KEEPWILD),
                                stops, nxt);
            tpq.setTSQ(&splitter);
            splitter.text_to_words(*it);

            // Slack adjustment for composite spans (see comment above).
            slack += splitter.lastpos - splitter.terms.size() + 1;

            LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
            switch (splitter.terms.size() + terminc) {
            case 0:
                continue;// ??
            case 1: {
                // Single term: simple span processing.
                int lmods = mods;
                if (splitter.nostemexps.front())
                    lmods |= SearchDataClause::SDCM_NOSTEMMING;
                m_hldata.ugroups.push_back(splitter.terms);
                processSimpleSpan(db, ermsg, splitter.terms.front(),
                                  lmods, &pqueries);
            }
                break;
            default:
                // Several terms (or anchored single term): phrase/near.
                m_hldata.ugroups.push_back(splitter.terms);
                processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
                                    useNear, slack);
            }
            if (m_curcl >= getMaxCl()) {
                ermsg = maxXapClauseMsg;
                if (!o_index_stripchars)
                    ermsg += maxXapClauseCaseDiacMsg;
                break;
            }
        }
    } catch (const Xapian::Error &e) {
        ermsg = e.get_msg();
    } catch (const string &s) {
        ermsg = s;
    } catch (const char *s) {
        ermsg = s;
    } catch (...) {
        ermsg = "Caught unknown exception";
    }
    if (!ermsg.empty()) {
        LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str()));
        return false;
    }
    return true;
}
|
||||
|
||||
// Translate a simple OR or AND search clause.
|
||||
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
             getStemLang().c_str()));

    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    // Map the clause type onto the Xapian combining operator. Only
    // AND and OR are valid for this clause kind.
    Xapian::Query::op op;
    if (m_tp == SCLT_AND) {
        op = Xapian::Query::OP_AND;
    } else if (m_tp == SCLT_OR) {
        op = Xapian::Query::OP_OR;
    } else {
        LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
        return false;
    }

    // Split/expand the user text into one subquery per word or phrase.
    vector<Xapian::Query> subqueries;
    if (!processUserString(db, m_text, m_reason, &subqueries))
        return false;
    if (subqueries.empty()) {
        LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
        return true;
    }

    // Combine the subqueries, then apply the clause weight if it is
    // not neutral.
    *qp = Xapian::Query(op, subqueries.begin(), subqueries.end());
    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}
|
||||
|
||||
// Translate a FILENAME search clause. This always comes
|
||||
// from a "filename" search from the gui or recollq. A query language
|
||||
// "filename:"-prefixed field will not go through here, but through
|
||||
// the generic field-processing code.
|
||||
//
|
||||
// We do not split the entry any more (used to do some crazy thing
|
||||
// about expanding multiple fragments in the past). We just take the
|
||||
// value blanks and all and expand this against the indexed unsplit
|
||||
// file names
|
||||
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
{
    Xapian::Query& q(*(Xapian::Query *)p);
    q = Xapian::Query();

    // Use the soft expansion limit if one is set, else the hard one.
    int expandlimit = getSoftMaxExp();
    if (expandlimit == -1)
        expandlimit = getMaxExp();

    // Expand the (possibly wildcard) pattern against the indexed
    // unsplit file names and OR the resulting terms together.
    vector<string> fnterms;
    db.filenameWildExp(m_text, fnterms, expandlimit);
    q = Xapian::Query(Xapian::Query::OP_OR, fnterms.begin(), fnterms.end());

    // Apply the clause weight if it is not neutral.
    if (m_weight != 1.0) {
        q = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, q, m_weight);
    }
    return true;
}
|
||||
|
||||
// Translate a dir: path filtering clause. See comments in .h
|
||||
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));
    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    if (m_text.empty()) {
        LOGERR(("SearchDataClausePath: empty path??\n"));
        m_reason = "Empty path ?";
        return false;
    }

    // Each path element becomes one phrase component (possibly an OR
    // of expansions); the final query is a PHRASE over all of them.
    vector<Xapian::Query> orqueries;

    // An absolute path is anchored with the bare path-element prefix
    // term; a relative one gets tilde-expanded first.
    if (m_text[0] == '/')
        orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
    else
        m_text = path_tildexpand(m_text);

    vector<string> vpath;
    stringToTokens(m_text, vpath, "/");

    for (vector<string>::const_iterator pit = vpath.begin();
         pit != vpath.end(); pit++){

        string sterm;
        vector<string> exp;
        // Expand wildcards only: no stemming, exact case/diacritics
        // for path elements.
        if (!expandTerm(db, m_reason,
                        SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
                        *pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
            return false;
        }
        LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
        listVector("", exp);
        if (exp.size() == 1)
            orqueries.push_back(Xapian::Query(exp[0]));
        else
            orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
                                              exp.begin(), exp.end()));
        m_curcl += exp.size();
        // Give up if the Xapian clause budget is exhausted.
        if (m_curcl >= getMaxCl())
            return false;
    }

    *qp = Xapian::Query(Xapian::Query::OP_PHRASE,
                        orqueries.begin(), orqueries.end());

    // Apply the clause weight if it is not neutral.
    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}
|
||||
|
||||
// Translate NEAR or PHRASE clause.
|
||||
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));

    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    vector<Xapian::Query> pqueries;

    // We produce a single phrase out of the user entry then use
    // stringToXapianQueries() to lowercase and simplify the phrase
    // terms etc. This will result into a single (complex)
    // Xapian::Query.
    // Neutralize any double quotes already present in the entry
    // before adding our own.
    if (m_text.find('\"') != string::npos) {
        m_text = neutchars(m_text, "\"");
    }
    string s = cstr_dquote + m_text + cstr_dquote;
    bool useNear = (m_tp == SCLT_NEAR);
    if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
        return false;
    if (pqueries.empty()) {
        LOGERR(("SearchDataClauseDist: resolved to null query\n"));
        return true;
    }

    // Single phrase input -> single output query.
    *qp = *pqueries.begin();
    // Apply the clause weight if it is not neutral.
    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}
|
||||
|
||||
} // Namespace Rcl
|
||||
|
||||
@ -102,7 +102,7 @@ public:
|
||||
bool toNativeQuery(Rcl::Db &db, void *);
|
||||
|
||||
/** We become the owner of cl and will delete it */
|
||||
bool addClause(SearchDataClause *cl);
|
||||
bool addClause(SearchDataClause* cl);
|
||||
|
||||
/** If this is a simple query (one field only, no distance clauses),
|
||||
* add phrase made of query terms to query, so that docs containing the
|
||||
@ -164,7 +164,7 @@ public:
|
||||
private:
|
||||
// Combine type. Only SCLT_AND or SCLT_OR here
|
||||
SClType m_tp;
|
||||
// Complex query descriptor
|
||||
// The clauses
|
||||
std::vector<SearchDataClause*> m_query;
|
||||
// Restricted set of filetypes if not empty.
|
||||
std::vector<std::string> m_filetypes;
|
||||
@ -173,14 +173,18 @@ private:
|
||||
// Autophrase if set. Can't be part of the normal chain because
|
||||
// it uses OP_AND_MAYBE
|
||||
RefCntr<SearchDataClauseDist> m_autophrase;
|
||||
//
|
||||
|
||||
// Special stuff produced by input which looks like a clause but means
|
||||
// something else (date and size specs)
|
||||
bool m_haveDates;
|
||||
DateInterval m_dates; // Restrict to date interval
|
||||
size_t m_maxSize;
|
||||
size_t m_minSize;
|
||||
|
||||
// Printable expanded version of the complete query, retrieved/set
|
||||
// from rcldb after the Xapian::setQuery() call
|
||||
std::string m_description;
|
||||
// Error diag
|
||||
std::string m_reason;
|
||||
bool m_haveWildCards;
|
||||
std::string m_stemlang;
|
||||
@ -215,10 +219,12 @@ class SearchDataClause {
|
||||
public:
|
||||
enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
|
||||
SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16};
|
||||
enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
|
||||
|
||||
SearchDataClause(SClType tp)
|
||||
: m_tp(tp), m_parentSearch(0), m_haveWildCards(0),
|
||||
m_modifiers(SDCM_NONE), m_weight(1.0), m_exclude(false)
|
||||
m_modifiers(SDCM_NONE), m_weight(1.0), m_exclude(false),
|
||||
m_rel(REL_CONTAINS)
|
||||
{}
|
||||
virtual ~SearchDataClause() {}
|
||||
virtual bool toNativeQuery(Rcl::Db &db, void *) = 0;
|
||||
@ -230,6 +236,9 @@ public:
|
||||
{
|
||||
return m_tp;
|
||||
}
|
||||
void setTp(SClType tp) {
|
||||
m_tp = tp;
|
||||
}
|
||||
void setParent(SearchData *p)
|
||||
{
|
||||
m_parentSearch = p;
|
||||
@ -279,7 +288,12 @@ public:
|
||||
{
|
||||
m_exclude = onoff;
|
||||
}
|
||||
|
||||
virtual void setrel(Relation rel) {
|
||||
m_rel = rel;
|
||||
}
|
||||
virtual Relation getrel() {
|
||||
return m_rel;
|
||||
}
|
||||
friend class SearchData;
|
||||
protected:
|
||||
std::string m_reason;
|
||||
@ -289,6 +303,8 @@ protected:
|
||||
Modifier m_modifiers;
|
||||
float m_weight;
|
||||
bool m_exclude;
|
||||
Relation m_rel;
|
||||
|
||||
private:
|
||||
SearchDataClause(const SearchDataClause&)
|
||||
{
|
||||
@ -339,13 +355,15 @@ public:
|
||||
{
|
||||
return m_field;
|
||||
}
|
||||
virtual void setfield(const string& field) {
|
||||
m_field = field;
|
||||
}
|
||||
protected:
|
||||
std::string m_text; // Raw user entry text.
|
||||
std::string m_field; // Field specification if any
|
||||
HighlightData m_hldata;
|
||||
// Current count of Xapian clauses, to check against expansion limit
|
||||
int m_curcl;
|
||||
|
||||
bool processUserString(Rcl::Db &db, const string &iq,
|
||||
std::string &ermsg,
|
||||
void* pq, int slack = 0, bool useNear = false);
|
||||
@ -444,6 +462,9 @@ public:
|
||||
{
|
||||
return m_slack;
|
||||
}
|
||||
virtual void setslack(int slack) {
|
||||
m_slack = slack;
|
||||
}
|
||||
private:
|
||||
int m_slack;
|
||||
};
|
||||
|
||||
983
src/rcldb/searchdatatox.cpp
Normal file
983
src/rcldb/searchdatatox.cpp
Normal file
@ -0,0 +1,983 @@
|
||||
/* Copyright (C) 2006 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
// Handle translation from rcl's SearchData structures to Xapian Queries
|
||||
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
using namespace std;
|
||||
|
||||
#include "xapian.h"
|
||||
|
||||
#include "cstr.h"
|
||||
#include "rcldb.h"
|
||||
#include "rcldb_p.h"
|
||||
#include "searchdata.h"
|
||||
#include "debuglog.h"
|
||||
#include "smallut.h"
|
||||
#include "textsplit.h"
|
||||
#include "unacpp.h"
|
||||
#include "utf8iter.h"
|
||||
#include "stoplist.h"
|
||||
#include "rclconfig.h"
|
||||
#include "termproc.h"
|
||||
#include "synfamily.h"
|
||||
#include "stemdb.h"
|
||||
#include "expansiondbs.h"
|
||||
#include "base64.h"
|
||||
#include "daterange.h"
|
||||
|
||||
namespace Rcl {
|
||||
|
||||
// Convenience iterator type over the clause list held by SearchData.
typedef vector<SearchDataClause *>::iterator qlist_it_t;

// Relevance (wqf) boost applied to the exact term the user typed so
// that it ranks above its stem/wildcard expansions.
static const int original_term_wqf_booster = 10;
|
||||
|
||||
// Expand categories and mime type wild card exps Categories are
|
||||
// expanded against the configuration, mimetypes against the index
|
||||
// (for wildcards).
|
||||
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
|
||||
{
|
||||
const RclConfig *cfg = db.getConf();
|
||||
if (!cfg) {
|
||||
LOGFATAL(("Db::expandFileTypes: null configuration!!\n"));
|
||||
return false;
|
||||
}
|
||||
vector<string> exptps;
|
||||
|
||||
for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
|
||||
if (cfg->isMimeCategory(*it)) {
|
||||
vector<string>tps;
|
||||
cfg->getMimeCatTypes(*it, tps);
|
||||
exptps.insert(exptps.end(), tps.begin(), tps.end());
|
||||
} else {
|
||||
TermMatchResult res;
|
||||
string mt = stringtolower((const string&)*it);
|
||||
// We set casesens|diacsens to get an equivalent of ixTermMatch()
|
||||
db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
|
||||
mt, res, -1, "mtype");
|
||||
if (res.entries.empty()) {
|
||||
exptps.push_back(it->c_str());
|
||||
} else {
|
||||
for (vector<TermMatchEntry>::const_iterator rit =
|
||||
res.entries.begin(); rit != res.entries.end(); rit++) {
|
||||
exptps.push_back(strip_prefix(rit->term));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sort(exptps.begin(), exptps.end());
|
||||
exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());
|
||||
|
||||
tps = exptps;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Error message returned when the Xapian query tree grows beyond the
// configured limit (maxXapianClauses).
static const char *maxXapClauseMsg = 
    "Maximum Xapian query size exceeded. Increase maxXapianClauses "
    "in the configuration. ";
// Additional hint appended for raw (non-stripped) indexes, where case
// and diacritics expansions can inflate the clause count considerably.
static const char *maxXapClauseCaseDiacMsg = 
    "Or try to use case (C) or diacritics (D) sensitivity qualifiers, or less "
    "wildcards ?"
    ;
|
||||
|
||||
// Translate a list of clauses into a single Xapian query, combining
// the subqueries with AND or OR depending on 'tp'. 'd' actually
// points to a Xapian::Query (void* keeps Xapian types out of the
// public interface). Returns false on translation failure or if the
// query grows beyond the configured clause limit.
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, 
                                vector<SearchDataClause*>& query, 
                                string& reason, void *d)
{
    Xapian::Query xq;
    for (qlist_it_t it = query.begin(); it != query.end(); it++) {
        Xapian::Query nq;
        if (!(*it)->toNativeQuery(db, &nq)) {
            LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
                    (*it)->getReason().c_str()));
            reason += (*it)->getReason() + " ";
            return false;
        }
        if (nq.empty()) {
            LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
            continue;
        }
        // If this structure is an AND list, must use AND_NOT for excl clauses.
        // Else this is an OR list, and there can't be excl clauses (checked by
        // addClause())
        Xapian::Query::op op;
        if (tp == SCLT_AND) {
            if ((*it)->getexclude()) {
                op = Xapian::Query::OP_AND_NOT;
            } else {
                op = Xapian::Query::OP_AND;
            }
        } else {
            op = Xapian::Query::OP_OR;
        }
        if (xq.empty()) {
            // First non-empty clause. AND_NOT needs a left-hand side:
            // use MatchAll so the exclusion applies to everything.
            if (op == Xapian::Query::OP_AND_NOT)
                xq = Xapian::Query(op, Xapian::Query::MatchAll, nq);
            else 
                xq = nq;
        } else {
            xq = Xapian::Query(op, xq, nq);
        }
        // Bail out before the query tree exceeds the configured limit.
        if (int(xq.get_length()) >= getMaxCl()) {
            LOGERR(("%s\n", maxXapClauseMsg));
            m_reason += maxXapClauseMsg;
            if (!o_index_stripchars)
                m_reason += maxXapClauseCaseDiacMsg;
            return false;
        }
    }

    LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length()));

    // An empty combined query matches everything (e.g. the caller is
    // going to apply pure filters on top of it).
    if (xq.empty())
        xq = Xapian::Query::MatchAll;

    *((Xapian::Query *)d) = xq;
    return true;
}
|
||||
|
||||
// Top-level translation of this SearchData object to a Xapian query.
// 'd' points to a Xapian::Query. Builds the query from the clause
// list, then composes the date, size, autophrase and (neg) file type
// filters on top of it.
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
    LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
    m_reason.erase();

    // Refresh the expansion/clause limits from the configuration.
    db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
    db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);

    // Walk the clause list translating each in turn and building the
    // Xapian query tree
    Xapian::Query xq;
    if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
        LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n", 
                m_reason.c_str()));
        return false;
    }

    if (m_haveDates) {
        // If one of the extremities is unset, compute db extremas
        if (m_dates.y1 == 0 || m_dates.y2 == 0) {
            int minyear = 1970, maxyear = 2100;
            if (!db.maxYearSpan(&minyear, &maxyear)) {
                LOGERR(("Can't retrieve index min/max dates\n"));
                //whatever, go on.
            }

            if (m_dates.y1 == 0) {
                m_dates.y1 = minyear;
                m_dates.m1 = 1;
                m_dates.d1 = 1;
            }
            if (m_dates.y2 == 0) {
                m_dates.y2 = maxyear;
                m_dates.m2 = 12;
                m_dates.d2 = 31;
            }
        }
        LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n",
                m_dates.y1, m_dates.m1, m_dates.d1,
                m_dates.y2, m_dates.m2, m_dates.d2));
        Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
                m_dates.y2, m_dates.m2, m_dates.d2);
        if (dq.empty()) {
            LOGINFO(("Db::toNativeQuery: date filter is empty\n"));
        }
        // If no probabilistic query is provided then promote the daterange
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
            xq = dq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
        }
    }

    // Size range filter. Either bound may be unset (encoded as
    // size_t(-1)). Sizes are stored as left-zero-padded decimal
    // strings so that lexicographic value comparison orders numerically.
    if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) {
        Xapian::Query sq;
        char min[50], max[50];
        sprintf(min, "%lld", (long long)m_minSize);
        sprintf(max, "%lld", (long long)m_maxSize);
        if (m_minSize == size_t(-1)) {
            string value(max);
            leftzeropad(value, 12);
            sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value);
        } else if (m_maxSize == size_t(-1)) {
            string value(min);
            leftzeropad(value, 12);
            sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);
        } else {
            string minvalue(min);
            leftzeropad(minvalue, 12);
            string maxvalue(max);
            leftzeropad(maxvalue, 12);
            sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE, 
                               minvalue, maxvalue);
        }

        // If no probabilistic query is provided then promote the 
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
            xq = sq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
        }
    }

    // Add the autophrase if any. AND_MAYBE: the phrase only boosts
    // relevance, it does not restrict the result set.
    if (m_autophrase.isNotNull()) {
        Xapian::Query apq;
        if (m_autophrase->toNativeQuery(db, &apq)) {
            xq = xq.empty() ? apq : 
                Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
        }
    }

    // Add the file type filtering clause if any
    if (!m_filetypes.empty()) {
        expandFileTypes(db, m_filetypes);

        Xapian::Query tq;
        for (vector<string>::iterator it = m_filetypes.begin(); 
             it != m_filetypes.end(); it++) {
            string term = wrap_prefix(mimetype_prefix) + *it;
            LOGDEB0(("Adding file type term: [%s]\n", term.c_str()));
            tq = tq.empty() ? Xapian::Query(term) : 
                Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
        }
        xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
    }

    // Add the neg file type filtering clause if any
    if (!m_nfiletypes.empty()) {
        expandFileTypes(db, m_nfiletypes);

        Xapian::Query tq;
        for (vector<string>::iterator it = m_nfiletypes.begin(); 
             it != m_nfiletypes.end(); it++) {
            string term = wrap_prefix(mimetype_prefix) + *it;
            LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str()));
            tq = tq.empty() ? Xapian::Query(term) : 
                Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
        }
        xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
    }

    *((Xapian::Query *)d) = xq;
    return true;
}
|
||||
|
||||
// Splitter callback for breaking a user string into simple terms and
// phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are
// actually multiple terms to rcldb (ie term1,term2)
class TextSplitQ : public TextSplitP {
 public:
    TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
        : TextSplitP(prc, flags), 
          curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
    {}

    bool takeword(const std::string &term, int pos, int bs, int be) 
    {
        // Check if the first letter is a majuscule in which
        // case we do not want to do stem expansion. Need to do this
        // before unac of course...
        curnostemexp = unaciscapital(term);

        return TextSplitP::takeword(term, pos, bs, be);
    }

    // True while processing a capitalized term (no stem expansion wanted).
    bool curnostemexp;
    // Resulting terms, in position order (filled by TermProcQ::flush()).
    vector<string> terms;
    // Per-term no-stem-expansion flags, parallel to 'terms'.
    vector<bool> nostemexps;
    const StopList &stops;
    // Count of terms including stopwords: this is for adjusting
    // phrase/near slack
    int alltermcount; 
    // Highest term position seen so far.
    int lastpos;
};
|
||||
|
||||
// Sink element of the term processing pipeline: records the terms
// produced at each position, then hands them back in order to the
// TextSplitQ object via flush().
class TermProcQ : public TermProc {
public:
    TermProcQ() : TermProc(0), m_ts(0) {}
    // Set the splitter whose terms/nostemexps/lastpos fields we fill.
    void setTSQ(TextSplitQ *ts) {m_ts = ts;}
    
    bool takeword(const std::string &term, int pos, int bs, int be) 
    {
        m_ts->alltermcount++;
        if (m_ts->lastpos < pos)
            m_ts->lastpos = pos;
        // be == 0: presumably a pipeline-generated term with no byte
        // span — never stem-expand it (TODO confirm against TermProc).
        bool noexpand = be ? m_ts->curnostemexp : true;
        LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", 
                 term.c_str(), pos, noexpand));
        // Keep only the longest term seen at each position.
        if (m_terms[pos].size() < term.size()) {
            m_terms[pos] = term;
            m_nste[pos] = noexpand;
        }
        return true;
    }
    // Transfer the per-position maps to the splitter's ordered vectors.
    bool flush()
    {
        for (map<int, string>::const_iterator it = m_terms.begin();
             it != m_terms.end(); it++) {
            m_ts->terms.push_back(it->second);
            m_ts->nostemexps.push_back(m_nste[it->first]);
        }
        return true;
    }
private:
    TextSplitQ *m_ts;
    // Position -> longest term seen there (map keeps positions sorted).
    map<int, string> m_terms;
    // Position -> no-stem-expansion flag for the retained term.
    map<int, bool> m_nste;
};
|
||||
|
||||
|
||||
#if 1
|
||||
static void listVector(const string& what, const vector<string>&l)
|
||||
{
|
||||
string a;
|
||||
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
|
||||
a = a + *it + " ";
|
||||
}
|
||||
LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
|
||||
}
|
||||
#endif
|
||||
|
||||
/** Expand term into term list, using appropriate mode: stem, wildcards, 
 * diacritics... 
 *
 * @param mods stem expansion, case and diacritics sensitivity control.
 * @param term input single word
 * @param oexp output expansion list
 * @param sterm output original input term if there were no wildcards
 * @param prefix field prefix in index. We could recompute it, but the caller
 *  has it already. Used in the simple case where there is nothing to expand, 
 *  and we just return the prefixed term (else Db::termMatch deals with it).
 * @return false only if the (hard) expansion limit was exceeded.
 */
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, 
                                        string& ermsg, int mods, 
                                        const string& term, 
                                        vector<string>& oexp, string &sterm,
                                        const string& prefix)
{
    LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
             mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
    sterm.clear();
    oexp.clear();
    if (term.empty())
        return true;

    // The soft limit (silently truncate) takes precedence over the
    // hard one (fail with an error) when set.
    bool maxexpissoft = false;
    int maxexpand = getSoftMaxExp();
    if (maxexpand != -1) {
        maxexpissoft = true;
    } else {
        maxexpand = getMaxExp();
    }

    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;

    // If there are no wildcards, add term to the list of user-entered terms
    if (!haswild) {
        m_hldata.uterms.insert(term);
        sterm = term;
    }

    // No stem expansion if there are wildcards or if prevented by caller
    bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
    if (haswild || getStemLang().empty()) {
        LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
        nostemexp = true;
    }

    // noexpansion can be modified further down by possible case/diac expansion
    bool noexpansion = nostemexp && !haswild;

    int termmatchsens = 0;

    bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
    bool case_sensitive = (mods & SDCM_CASESENS) != 0;

    if (o_index_stripchars) {
        // Stripped index: case/diacritics do not exist in the index,
        // sensitivity flags are meaningless there.
        diac_sensitive = case_sensitive = false;
    } else {
        // If we are working with a raw index, apply the rules for case and 
        // diacritics sensitivity.

        // If any character has a diacritic, we become
        // diacritic-sensitive. Note that the way that the test is
        // performed (conversion+comparison) will automatically ignore
        // accented characters which are actually a separate letter
        if (getAutoDiac() && unachasaccents(term)) {
            LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
            diac_sensitive = true;
        }

        // If any character apart the first is uppercase, we become
        // case-sensitive. The first character is reserved for
        // turning off stemming. You need to use a query language
        // modifier to search for Floor in a case-sensitive way.
        Utf8Iter it(term);
        it++;
        if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
            LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
            case_sensitive = true;
        }

        // If we are sensitive to case or diacritics turn stemming off
        if (diac_sensitive || case_sensitive) {
            LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n"));
            nostemexp = true;
        }

        if (!case_sensitive || !diac_sensitive)
            noexpansion = false;
    }

    if (case_sensitive)
        termmatchsens |= Db::ET_CASESENS;
    if (diac_sensitive)
        termmatchsens |= Db::ET_DIACSENS;

    if (noexpansion) {
        // Nothing to expand at all: return the prefixed input term.
        oexp.push_back(prefix + term);
        m_hldata.terms[term] = term;
        LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str()));
        return true;
    } 

    Db::MatchType mtyp = haswild ? Db::ET_WILD : 
        nostemexp ? Db::ET_NONE : Db::ET_STEM;
    TermMatchResult res;
    if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
                      m_field)) {
        // Let it go through
    }

    // Term match entries to vector of terms
    if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
        ermsg = "Maximum term expansion size exceeded."
            " Maybe use case/diacritics sensitivity or increase maxTermExpand.";
        return false;
    }
    for (vector<TermMatchEntry>::const_iterator it = res.entries.begin(); 
         it != res.entries.end(); it++) {
        oexp.push_back(it->term);
    }
    // If the term does not exist at all in the db, the return from
    // termMatch() is going to be empty, which is not what we want (we
    // would then compute an empty Xapian query)
    if (oexp.empty())
        oexp.push_back(prefix + term);

    // Remember the uterm-to-expansion links
    for (vector<string>::const_iterator it = oexp.begin(); 
         it != oexp.end(); it++) {
        m_hldata.terms[strip_prefix(*it)] = term;
    }
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
    return true;
}
|
||||
|
||||
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
//
// Walks the first string vector of the range; for each element it
// pushes the string onto the current partial combination and recurses
// on the remaining vectors. The deepest call (last vector) stores the
// completed combination into the output.
//
// @param vvit     start of the range of string vectors to distribute.
// @param vvend    end of the range.
// @param comb     current partial combination (recursion accumulator,
//                 left unchanged on return).
// @param allcombs output: every combination, one element per vector.
void multiply_groups(vector<vector<string> >::const_iterator vvit,
                     vector<vector<string> >::const_iterator vvend,
                     vector<string>& comb,
                     vector<vector<string> >&allcombs)
{
    // Guard: with an empty input range the original code dereferenced
    // the end iterator (undefined behavior). No combinations to emit.
    if (vvit == vvend)
        return;

    // Remember my string vector and compute next, for recursive calls.
    vector<vector<string> >::const_iterator myvit = vvit++;

    // Walk my string vector
    for (vector<string>::const_iterator strit = (*myvit).begin();
         strit != (*myvit).end(); strit++) {

        // Add my current value to the string vector we're building
        comb.push_back(*strit);

        if (vvit == vvend) {
            // Last call: store current result
            allcombs.push_back(comb);
        } else {
            // Call recursively on next string vector
            multiply_groups(vvit, vvend, comb, allcombs);
        }
        // Pop the value I just added (make room for the next element in my
        // vector)
        comb.pop_back();
    }
}
|
||||
|
||||
// Translate a single user span (one word for the splitter) into a
// Xapian query, performing stem/wildcard/case/diacritics expansion as
// controlled by 'mods'. Appends the result to *pq, which points to a
// vector<Xapian::Query>. On expansion failure, ermsg is set and
// nothing is appended.
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
                                               const string& span, 
                                               int mods, void * pq)
{
    vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
    LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
             span.c_str(), (unsigned int)mods));
    vector<string> exp;  
    string sterm; // dumb version of user term

    // Compute the index term prefix for the field, if any.
    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
        prefix = wrap_prefix(ftp->pfx);
    }

    if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
        return;
    
    // Set up the highlight data. No prefix should go in there
    for (vector<string>::const_iterator it = exp.begin(); 
         it != exp.end(); it++) {
        m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
        m_hldata.slacks.push_back(0);
        m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
    }

    // Push either term or OR of stem-expanded set
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
    m_curcl += exp.size();

    // If sterm (simplified original user term) is not null, give it a 
    // relevance boost. We do this even if no expansion occurred (else
    // the non-expanded terms in a term list would end-up with even
    // less wqf). This does not happen if there are wildcards anywhere
    // in the search.
    // We normally boost the original term in the stem expansion list. Don't
    // do it if there are wildcards anywhere, this would skew the results.
    bool doBoostUserTerm = 
        (m_parentSearch && !m_parentSearch->haveWildCards()) || 
        (m_parentSearch == 0 && !m_haveWildCards);
    if (doBoostUserTerm && !sterm.empty()) {
        xq = Xapian::Query(Xapian::Query::OP_OR, xq, 
                           Xapian::Query(prefix+sterm, 
                                         original_term_wqf_booster));
    }
    pqueries.push_back(xq);
}
|
||||
|
||||
// User entry element had several terms: transform into a PHRASE or
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
                                                 TextSplitQ *splitData, 
                                                 int mods, void *pq,
                                                 bool useNear, int slack)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
        Xapian::Query::OP_PHRASE;
    vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
    bool hadmultiple = false;
#endif
    // Expansion groups kept for highlight data (prefix-less).
    vector<vector<string> >groups;

    // Compute the index term prefix for the field, if any.
    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
        prefix = wrap_prefix(ftp->pfx);
    }

    // Anchored at start: prepend the start-of-field marker term and
    // widen the slack to account for the extra element.
    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
        orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
        slack++;
    }

    // Go through the list and perform stem/wildcard expansion for each element
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
    for (vector<string>::iterator it = splitData->terms.begin();
         it != splitData->terms.end(); it++, nxit++) {
        LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
        // Adjust when we do stem expansion. Not if disabled by
        // caller, not inside phrases, and some versions of xapian
        // will accept only one OR clause inside NEAR.
        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
            || hadmultiple
#endif // single OR inside NEAR
            ;
        int lmods = mods;
        if (nostemexp)
            lmods |= SearchDataClause::SDCM_NOSTEMMING;
        string sterm;
        vector<string> exp;
        if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
            return;
        LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
        listVector("", exp);
        // groups is used for highlighting, we don't want prefixes in there.
        vector<string> noprefs;
        for (vector<string>::const_iterator it = exp.begin(); 
             it != exp.end(); it++) {
            noprefs.push_back(it->substr(prefix.size()));
        }
        groups.push_back(noprefs);
        orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
                                          exp.begin(), exp.end()));
        m_curcl += exp.size();
        if (m_curcl >= getMaxCl())
            return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
        if (exp.size() > 1) 
            hadmultiple = true;
#endif
    }

    // Anchored at end: append the end-of-field marker term.
    if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
        orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
        slack++;
    }

    // Generate an appropriate PHRASE/NEAR query with adjusted slack
    // For phrases, give a relevance boost like we do for original terms
    LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", 
             splitData->alltermcount, splitData->lastpos));
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
                     splitData->lastpos + 1 + slack);
    if (op == Xapian::Query::OP_PHRASE)
        xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, 
                           original_term_wqf_booster);
    pqueries.push_back(xq);

    // Add all combinations of NEAR/PHRASE groups to the highlighting data. 
    vector<vector<string> > allcombs;
    vector<string> comb;
    multiply_groups(groups.begin(), groups.end(), comb, allcombs);

    // Insert the search groups and slacks in the highlight data, with
    // a reference to the user entry that generated them:
    m_hldata.groups.insert(m_hldata.groups.end(), 
                           allcombs.begin(), allcombs.end());
    m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
    m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(), 
                              m_hldata.ugroups.size() - 1);
}
|
||||
|
||||
// Trim string beginning with ^ or ending with $ and convert to flags
|
||||
static int stringToMods(string& s)
|
||||
{
|
||||
int mods = 0;
|
||||
// Check for an anchored search
|
||||
trimstring(s);
|
||||
if (s.length() > 0 && s[0] == '^') {
|
||||
mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
|
||||
s.erase(0, 1);
|
||||
}
|
||||
if (s.length() > 0 && s[s.length()-1] == '$') {
|
||||
mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
|
||||
s.erase(s.length()-1);
|
||||
}
|
||||
return mods;
|
||||
}
|
||||
|
||||
/** 
 * Turn user entry string (NOT query language) into a list of xapian queries.
 * We just separate words and phrases, and do wildcard and stem expansion,
 *
 * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
 * the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
 * entry).
 *
 * This appears awful, and it would seem that the split into
 * terms/phrases should be performed in the upper layer so that we
 * only receive pure term or near/phrase pure elements here, but in
 * fact there are things that would appear like terms to naive code,
 * and which will actually may be turned into phrases (ie: tom:jerry),
 * in a manner which intimately depends on the index implementation,
 * so that it makes sense to process this here.
 *
 * The final list contains one query for each term or phrase
 *   - Elements corresponding to a stem-expanded part are an OP_OR
 *     composition of the stem-expanded terms (or a single term query).
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
 *     composition of the phrase terms (no stem expansion in this case)
 * @return false if an error message was set (expansion/clause limit
 *   exceeded or a Xapian exception was caught), true otherwise.
 */
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
                                               string &ermsg, void *pq,
                                               int slack, bool useNear)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    int mods = m_modifiers;

    LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
            "slack %d near %d\n", 
            iq.c_str(), m_field.c_str(), mods, slack, useNear));
    ermsg.erase();
    m_curcl = 0;
    const StopList stops = db.getStopList();

    // Simple whitespace-split input into user-level words and
    // double-quoted phrases: word1 word2 "this is a phrase". 
    //
    // The text splitter may further still decide that the resulting
    // "words" are really phrases, this depends on separators:
    // [paul@dom.net] would still be a word (span), but [about:me]
    // will probably be handled as a phrase.
    vector<string> phrases;
    TextSplit::stringToStrings(iq, phrases);

    // Process each element: textsplit into terms, handle stem/wildcard 
    // expansion and transform into an appropriate Xapian::Query
    try {
        for (vector<string>::iterator it = phrases.begin(); 
             it != phrases.end(); it++) {
            LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
            // Anchoring modifiers
            int amods = stringToMods(*it);
            int terminc = amods != 0 ? 1 : 0;
            mods |= amods;
            // If there are multiple spans in this element, including
            // at least one composite, we have to increase the slack
            // else a phrase query including a span would fail. 
            // Ex: "term0@term1 term2" is onlyspans-split as:
            //   0 term0@term1  0   12
            //   2 term2       13   18
            // The position of term2 is 2, not 1, so a phrase search
            // would fail.
            // We used to do word split, searching for 
            // "term0 term1 term2" instead, which may have worse 
            // performance, but will succeed.
            // We now adjust the phrase/near slack by comparing the term count
            // and the last position

            // The term processing pipeline:
            TermProcQ tpq;
            TermProc *nxt = &tpq;
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
            //tpcommon.onlygrams(true);
            TermProcPrep tpprep(nxt);
            // Only prepend the unac/lowercase step on stripped indexes.
            if (o_index_stripchars)
                nxt = &tpprep;

            TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
                                                 TextSplit::TXTS_KEEPWILD), 
                                stops, nxt);
            tpq.setTSQ(&splitter);
            splitter.text_to_words(*it);

            // Gap between term count and last position: stopwords were
            // removed in between, widen the slack accordingly.
            slack += splitter.lastpos - splitter.terms.size() + 1;

            LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
            switch (splitter.terms.size() + terminc) {
            case 0: 
                continue;// ??
            case 1: {
                int lmods = mods;
                if (splitter.nostemexps.front())
                    lmods |= SearchDataClause::SDCM_NOSTEMMING;
                m_hldata.ugroups.push_back(splitter.terms);
                processSimpleSpan(db, ermsg, splitter.terms.front(),
                                  lmods, &pqueries);
            }
                break;
            default:
                m_hldata.ugroups.push_back(splitter.terms);
                processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
                                    useNear, slack);
            }
            if (m_curcl >= getMaxCl()) {
                ermsg = maxXapClauseMsg;
                if (!o_index_stripchars)
                    ermsg += maxXapClauseCaseDiacMsg;
                break;
            }
        }
    } catch (const Xapian::Error &e) {
        ermsg = e.get_msg();
    } catch (const string &s) {
        ermsg = s;
    } catch (const char *s) {
        ermsg = s;
    } catch (...) {
        ermsg = "Caught unknown exception";
    }
    if (!ermsg.empty()) {
        LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str()));
        return false;
    }
    return true;
}
|
||||
|
||||
// Translate a simple OR or AND search clause. 'p' points to a
// Xapian::Query that receives the result. Returns true with an empty
// query when the clause resolves to nothing.
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB(("SearchDataClauseSimple::toNativeQuery: fld [%s] val [%s] "
            "stemlang [%s]\n", m_field.c_str(), m_text.c_str(),
            getStemLang().c_str()));

    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    // Map the clause type to the corresponding Xapian operator.
    Xapian::Query::op op;
    switch (m_tp) {
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
    case SCLT_OR: op = Xapian::Query::OP_OR; break;
    default:
        LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
        return false;
    }

    // Split and expand the user entry into a list of subqueries.
    vector<Xapian::Query> pqueries;
    if (!processUserString(db, m_text, m_reason, &pqueries))
        return false;
    if (pqueries.empty()) {
        LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
        return true;
    }

    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}
|
||||
|
||||
// Translate a FILENAME search clause. This always comes
// from a "filename" search from the gui or recollq. A query language
// "filename:"-prefixed field will not go through here, but through
// the generic field-processing code.
//
// We do not split the entry any more (used to do some crazy thing
// about expanding multiple fragments in the past). We just take the
// value blanks and all and expand this against the indexed unsplit
// file names
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
{
    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    // Prefer the soft expansion limit (truncate) if set, else use the
    // hard one.
    int maxexp = getSoftMaxExp();
    if (maxexp == -1)
        maxexp = getMaxExp();

    // Expand the pattern against the indexed unsplit file names and
    // OR the matches together.
    vector<string> names;
    db.filenameWildExp(m_text, names, maxexp);
    *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());

    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}
|
||||
|
||||
// Translate a dir: path filtering clause. See comments in .h
|
||||
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
|
||||
{
|
||||
LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));
|
||||
Xapian::Query *qp = (Xapian::Query *)p;
|
||||
*qp = Xapian::Query();
|
||||
|
||||
if (m_text.empty()) {
|
||||
LOGERR(("SearchDataClausePath: empty path??\n"));
|
||||
m_reason = "Empty path ?";
|
||||
return false;
|
||||
}
|
||||
|
||||
vector<Xapian::Query> orqueries;
|
||||
|
||||
if (m_text[0] == '/')
|
||||
orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
|
||||
else
|
||||
m_text = path_tildexpand(m_text);
|
||||
|
||||
vector<string> vpath;
|
||||
stringToTokens(m_text, vpath, "/");
|
||||
|
||||
for (vector<string>::const_iterator pit = vpath.begin();
|
||||
pit != vpath.end(); pit++){
|
||||
|
||||
string sterm;
|
||||
vector<string> exp;
|
||||
if (!expandTerm(db, m_reason,
|
||||
SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
|
||||
*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
|
||||
return false;
|
||||
}
|
||||
LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
|
||||
listVector("", exp);
|
||||
if (exp.size() == 1)
|
||||
orqueries.push_back(Xapian::Query(exp[0]));
|
||||
else
|
||||
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
||||
exp.begin(), exp.end()));
|
||||
m_curcl += exp.size();
|
||||
if (m_curcl >= getMaxCl())
|
||||
return false;
|
||||
}
|
||||
|
||||
*qp = Xapian::Query(Xapian::Query::OP_PHRASE,
|
||||
orqueries.begin(), orqueries.end());
|
||||
|
||||
if (m_weight != 1.0) {
|
||||
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Translate NEAR or PHRASE clause.
|
||||
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
|
||||
{
|
||||
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
|
||||
|
||||
Xapian::Query *qp = (Xapian::Query *)p;
|
||||
*qp = Xapian::Query();
|
||||
|
||||
vector<Xapian::Query> pqueries;
|
||||
Xapian::Query nq;
|
||||
|
||||
// We produce a single phrase out of the user entry then use
|
||||
// stringToXapianQueries() to lowercase and simplify the phrase
|
||||
// terms etc. This will result into a single (complex)
|
||||
// Xapian::Query.
|
||||
if (m_text.find('\"') != string::npos) {
|
||||
m_text = neutchars(m_text, "\"");
|
||||
}
|
||||
string s = cstr_dquote + m_text + cstr_dquote;
|
||||
bool useNear = (m_tp == SCLT_NEAR);
|
||||
if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
|
||||
return false;
|
||||
if (pqueries.empty()) {
|
||||
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
||||
return true;
|
||||
}
|
||||
|
||||
*qp = *pqueries.begin();
|
||||
if (m_weight != 1.0) {
|
||||
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // Namespace Rcl
|
||||
Loading…
x
Reference in New Issue
Block a user