From 3fb7183eae17d274cd68197ae41ef9f5335d7b5b Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 29 Jan 2015 16:15:17 +0100 Subject: [PATCH] Converted query language parser from the old regexp jungle to bison. Allow using parentheses for clearer syntax. --- src/Makefile.in | 2 + src/kde/kioslave/kio_recoll/htmlif.cpp | 1 - src/kde/kioslave/kio_recoll/kio_recoll.cpp | 1 - src/lib/mkMake.in | 5 +- src/php/recoll/recoll.cpp | 1 - src/python/recoll/pyrecoll.cpp | 1 - src/query/Makefile | 10 +- src/query/recollq.cpp | 1 - src/query/wasaparse.cpp | 235 +++++ src/query/wasaparse.y | 415 +++++++++ src/query/wasaparserdriver.h | 81 ++ src/query/wasastringtoquery.cpp | 515 ----------- src/query/wasastringtoquery.h | 112 --- src/query/wasatorcl.cpp | 286 ------ src/query/wasatorcl.h | 17 +- src/rcldb/searchdata.cpp | 928 ------------------- src/rcldb/searchdata.h | 33 +- src/rcldb/searchdatatox.cpp | 983 +++++++++++++++++++++ 18 files changed, 1765 insertions(+), 1862 deletions(-) create mode 100644 src/query/wasaparse.cpp create mode 100644 src/query/wasaparse.y create mode 100644 src/query/wasaparserdriver.h delete mode 100644 src/query/wasastringtoquery.cpp delete mode 100644 src/query/wasastringtoquery.h delete mode 100644 src/query/wasatorcl.cpp create mode 100644 src/rcldb/searchdatatox.cpp diff --git a/src/Makefile.in b/src/Makefile.in index b8b30e0b..a49e500a 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -15,6 +15,7 @@ QTGUI = @QTGUI@ RCLLIBVERSION=@RCLLIBVERSION@ all: configure mk/sysconf + ${MAKE} -C query wasaparse.tab.cpp (cd lib; sh mkMake) ${MAKE} -C lib ${MAKE} -C index depend recollindex @@ -59,6 +60,7 @@ clean: # Note: we don't remove the top Makefile, to keep the "clean" targets # available but a "Make" won't work without a configure anyway distclean: clean + ${MAKE} -C query distclean -${MAKE} -C desktop/unity-lens-recoll distclean -${MAKE} -C python/recoll distclean rm -f mk/sysconf mk/localdefs sampleconf/recoll.conf \ diff --git 
a/src/kde/kioslave/kio_recoll/htmlif.cpp b/src/kde/kioslave/kio_recoll/htmlif.cpp index 1ca1bf16..d31dd72c 100644 --- a/src/kde/kioslave/kio_recoll/htmlif.cpp +++ b/src/kde/kioslave/kio_recoll/htmlif.cpp @@ -34,7 +34,6 @@ using namespace std; #include "pathut.h" #include "searchdata.h" #include "rclquery.h" -#include "wasastringtoquery.h" #include "wasatorcl.h" #include "kio_recoll.h" #include "docseqdb.h" diff --git a/src/kde/kioslave/kio_recoll/kio_recoll.cpp b/src/kde/kioslave/kio_recoll/kio_recoll.cpp index a477e8c4..df986f1a 100644 --- a/src/kde/kioslave/kio_recoll/kio_recoll.cpp +++ b/src/kde/kioslave/kio_recoll/kio_recoll.cpp @@ -38,7 +38,6 @@ using namespace std; #include "pathut.h" #include "searchdata.h" #include "rclquery.h" -#include "wasastringtoquery.h" #include "wasatorcl.h" #include "kio_recoll.h" #include "docseqdb.h" diff --git a/src/lib/mkMake.in b/src/lib/mkMake.in index 43da597a..7c803e62 100755 --- a/src/lib/mkMake.in +++ b/src/lib/mkMake.in @@ -42,8 +42,8 @@ ${depth}/query/plaintorich.cpp \ ${depth}/query/recollq.cpp \ ${depth}/query/reslistpager.cpp \ ${depth}/query/sortseq.cpp \ -${depth}/query/wasastringtoquery.cpp \ -${depth}/query/wasatorcl.cpp \ +${depth}/query/wasaparse.cpp \ +${depth}/query/wasaparse.tab.cpp \ ${depth}/rcldb/daterange.cpp \ ${depth}/rcldb/expansiondbs.cpp \ ${depth}/rcldb/rclabstract.cpp \ @@ -53,6 +53,7 @@ ${depth}/rcldb/rcldups.cpp \ ${depth}/rcldb/rclquery.cpp \ ${depth}/rcldb/rclterms.cpp \ ${depth}/rcldb/searchdata.cpp \ +${depth}/rcldb/searchdatatox.cpp \ ${depth}/rcldb/searchdataxml.cpp \ ${depth}/rcldb/stemdb.cpp \ ${depth}/rcldb/stoplist.cpp \ diff --git a/src/php/recoll/recoll.cpp b/src/php/recoll/recoll.cpp index 4d68c8db..8a096feb 100644 --- a/src/php/recoll/recoll.cpp +++ b/src/php/recoll/recoll.cpp @@ -37,7 +37,6 @@ #include "pathut.h" #include "rclinit.h" #include "debuglog.h" -#include "wasastringtoquery.h" #include "wasatorcl.h" #include "internfile.h" #include "wipedir.h" diff --git 
a/src/python/recoll/pyrecoll.cpp b/src/python/recoll/pyrecoll.cpp index 2c16b74f..55a5e2c9 100644 --- a/src/python/recoll/pyrecoll.cpp +++ b/src/python/recoll/pyrecoll.cpp @@ -32,7 +32,6 @@ using namespace std; #include "searchdata.h" #include "rclquery.h" #include "pathut.h" -#include "wasastringtoquery.h" #include "wasatorcl.h" #include "debuglog.h" #include "pathut.h" diff --git a/src/query/Makefile b/src/query/Makefile index a1c2f935..303a8a11 100644 --- a/src/query/Makefile +++ b/src/query/Makefile @@ -4,8 +4,12 @@ include $(depth)/mk/sysconf PROGS = xadump recollq #trhist qtry qxtry SRCS = xadump.cpp -all: depend librecoll $(PROGS) +all: wasaparse.tab.cpp depend librecoll $(PROGS) +wasaparse.tab.cpp : wasaparse.y + bison wasaparse.y + mv -f wasaparse.tab.c wasaparse.tab.cpp + XADUMP_OBJS= xadump.o xadump : $(XADUMP_OBJS) $(CXX) $(ALL_CXXFLAGS) -o xadump $(XADUMP_OBJS) \ @@ -39,3 +43,7 @@ trwasastrtoq.o : wasastringtoquery.cpp wasastringtoquery.h include $(depth)/mk/commontargets include alldeps + +distclean:: + -rm -f location.hh position.hh stack.hh \ + wasaparse.tab.c wasaparse.tab.cpp wasaparse.tab.h diff --git a/src/query/recollq.cpp b/src/query/recollq.cpp index 76cc5d2f..c68bc968 100644 --- a/src/query/recollq.cpp +++ b/src/query/recollq.cpp @@ -36,7 +36,6 @@ using namespace std; #include "pathut.h" #include "rclinit.h" #include "debuglog.h" -#include "wasastringtoquery.h" #include "wasatorcl.h" #include "internfile.h" #include "wipedir.h" diff --git a/src/query/wasaparse.cpp b/src/query/wasaparse.cpp new file mode 100644 index 00000000..b8eb5669 --- /dev/null +++ b/src/query/wasaparse.cpp @@ -0,0 +1,235 @@ +/* Copyright (C) 2006 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include "autoconfig.h"
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "wasatorcl.h"
+#include "wasaparserdriver.h"
+#include "searchdata.h"
+#include "debuglog.h"
+
+#define YYDEBUG 1
+
+#include "wasaparse.tab.h"
+
+using namespace std;
+using namespace Rcl;
+
+
+// Bison error callback: store the parser message in the driver so
+// that the caller of parse() can retrieve it with getreason(). The
+// location argument is not used.
+void
+yy::parser::error (const location_type& l, const std::string& m)
+{
+    d->setreason(m);
+}
+
+
+// Translate a query language string into a SearchData tree.
+// Returns 0 on failure, in which case 'reason' receives the driver's
+// explanation. The returned object belongs to the caller.
+SearchData *wasaStringToRcl(const RclConfig *config,
+                            const std::string& stemlang,
+                            const std::string& query, string &reason,
+                            const std::string& autosuffs)
+{
+    WasaParserDriver d(config, stemlang, autosuffs);
+    SearchData *sd = d.parse(query);
+    if (!sd)
+        reason = d.getreason();
+    return sd;
+}
+
+// Run the bison parser over 'in'.
+// Returns the resulting query (owned by the caller) or 0 on error, in
+// which case getreason() holds the explanation.
+SearchData *WasaParserDriver::parse(const std::string& in)
+{
+    m_input = in;
+    m_index = 0;
+    delete m_result;
+    m_result = 0;
+    m_returns = stack<int>();
+
+    yy::parser parser(this);
+    parser.set_debug_level(0);
+
+    if (parser.parse() != 0) {
+        delete m_result;
+        m_result = 0;
+    }
+
+    // Hand the result over to the caller and forget about it here:
+    // keeping the pointer would make the next parse() call delete an
+    // object which we do not own any more.
+    SearchData *result = m_result;
+    m_result = 0;
+    return result;
+}
+
+// Lexer input: return the next input character, giving priority to
+// characters pushed back with UNGETCHAR(). Returns 0 at end of input.
+int WasaParserDriver::GETCHAR()
+{
+    if (!m_returns.empty()) {
+        int c = m_returns.top();
+        m_returns.pop();
+        return c;
+    }
+    // Go through unsigned char so that non-ASCII bytes come out as
+    // positive values, as the <ctype.h> functions used by the lexer
+    // require.
+    if (m_index < m_input.size())
+        return static_cast<unsigned char>(m_input[m_index++]);
+    return 0;
+}
+
+// Lexer input: push back one character, it will be the next one
+// returned by GETCHAR().
+void WasaParserDriver::UNGETCHAR(int c)
+{
+    m_returns.push(c);
+}
+
+// Add clause to query, handling special pseudo-clauses for size/date
+// etc. (mostly determined on field name).
+// In all cases 'cl' is consumed: it is either passed on to
+// sd->addClause() or deleted here.
+// Returns false on error, m_reason is set for the errors detected in
+// this function.
+bool WasaParserDriver::addClause(SearchData *sd,
+                                 SearchDataClauseSimple* cl)
+{
+    if (cl->getfield().empty()) {
+        // Simple clause with empty field spec.
+        // Possibly change terms found in the "autosuffs" list into "ext"
+        // field queries
+        if (!m_autosuffs.empty()) {
+            vector<string> asfv;
+            if (stringToStrings(m_autosuffs, asfv)) {
+                if (find_if(asfv.begin(), asfv.end(),
+                            StringIcmpPred(cl->gettext())) != asfv.end()) {
+                    cl->setfield("ext");
+                    cl->addModifier(SearchDataClause::SDCM_NOSTEMMING);
+                }
+            }
+        }
+        return sd->addClause(cl);
+    }
+
+
+    const string& fld = cl->getfield();
+
+    // MIME types and categories
+    if (!stringicmp("mime", fld) ||!stringicmp("format", fld)) {
+        if (cl->getexclude()) {
+            sd->remFiletype(cl->gettext());
+        } else {
+            sd->addFiletype(cl->gettext());
+        }
+        delete cl;
+        return true;
+    }
+
+    if (!stringicmp("rclcat", fld) || !stringicmp("type", fld)) {
+        vector<string> mtypes;
+        if (m_config && m_config->getMimeCatTypes(cl->gettext(), mtypes)) {
+            for (vector<string>::iterator mit = mtypes.begin();
+                 mit != mtypes.end(); mit++) {
+                if (cl->getexclude()) {
+                    sd->remFiletype(*mit);
+                } else {
+                    sd->addFiletype(*mit);
+                }
+            }
+        }
+        delete cl;
+        return true;
+    }
+
+    // Handle "date" spec
+    if (!stringicmp("date", fld)) {
+        DateInterval di;
+        if (!parsedateinterval(cl->gettext(), &di)) {
+            LOGERR(("Bad date interval format: %s\n",
+                    cl->gettext().c_str()));
+            m_reason = "Bad date interval format";
+            delete cl;
+            return false;
+        }
+        LOGDEB(("addClause:: date span: %d-%d-%d/%d-%d-%d\n",
+                di.y1,di.m1,di.d1, di.y2,di.m2,di.d2));
+        sd->setDateSpan(&di);
+        delete cl;
+        return true;
+    }
+
+    // Handle "size" spec
+    if (!stringicmp("size", fld)) {
+        char *cp;
+        size_t size = strtoll(cl->gettext().c_str(), &cp, 10);
+        if (*cp != 0) {
+            // Optional decimal multiplier suffix
+            switch (*cp) {
+            case 'k': case 'K': size *= 1E3;break;
+            case 'm': case 'M': size *= 1E6;break;
+            case 'g': case 'G': size *= 1E9;break;
+            case 't': case 'T': size *= 1E12;break;
+            default:
+                m_reason = string("Bad multiplier suffix: ") + *cp;
+                delete cl;
+                return false;
+            }
+        }
+
+        SearchDataClause::Relation rel = cl->getrel();
+
+        delete cl;
+
+        switch (rel) {
+        case SearchDataClause::REL_EQUALS:
+            sd->setMaxSize(size);
+            sd->setMinSize(size);
+            break;
+        case SearchDataClause::REL_LT:
+        case SearchDataClause::REL_LTE:
+            sd->setMaxSize(size);
+            break;
+        case SearchDataClause::REL_GT:
+        case SearchDataClause::REL_GTE:
+            sd->setMinSize(size);
+            break;
+        default:
+            m_reason = "Bad relation operator with size query. Use > < or =";
+            return false;
+        }
+        return true;
+    }
+
+    if (!stringicmp("dir", fld)) {
+        // dir filtering special case
+        SearchDataClausePath *nclause =
+            new SearchDataClausePath(cl->gettext(), cl->getexclude());
+        delete cl;
+        // We are done: cl is gone and must not be looked at (nor
+        // added to the query) by the code below.
+        return sd->addClause(nclause);
+    }
+
+    if (cl->getTp() == SCLT_OR || cl->getTp() == SCLT_AND) {
+        // If this is a normal clause and the term has commas or
+        // slashes inside, take it as a list, turn the slashes/commas
+        // to spaces, leave unquoted. Otherwise, this would end up as
+        // a phrase query. This is a handy way to enter multiple terms
+        // to be searched inside a field. We interpret ',' as AND, and
+        // '/' as OR. No mixes allowed and ',' wins.
+        SClType tp = SCLT_FILENAME;// impossible value
+        string ns = neutchars(cl->gettext(), ",");
+        if (ns.compare(cl->gettext())) {
+            // had ','
+            tp = SCLT_AND;
+        } else {
+            ns = neutchars(cl->gettext(), "/");
+            if (ns.compare(cl->gettext())) {
+                // had not ',' but has '/'
+                tp = SCLT_OR;
+            }
+        }
+
+        if (tp != SCLT_FILENAME) {
+            // fld refers to data inside cl: build the new clause
+            // before deleting the old one.
+            SearchDataClauseSimple *ncl =
+                new SearchDataClauseSimple(tp, ns, fld);
+            delete cl;
+            return sd->addClause(ncl);
+        }
+    }
+    return sd->addClause(cl);
+}
+
diff --git a/src/query/wasaparse.y b/src/query/wasaparse.y
new file mode 100644
index 00000000..68b41056
--- /dev/null
+++ b/src/query/wasaparse.y
@@ -0,0 +1,415 @@
+%{
+#define YYDEBUG 1
+
+#include <stdio.h>
+
+#include <iostream>
+#include <string>
+
+#include "searchdata.h"
+#include "wasaparserdriver.h"
+#include "wasaparse.tab.h"
+
+using namespace std;
+
+int yylex(yy::parser::semantic_type *, WasaParserDriver *);
+void yyerror(char const *);
+static void qualify(Rcl::SearchDataClauseDist *, const string &);
+
+// Wrap 'sq' into a sub-query clause and add it to 'sd'. The driver
+// parameter is currently unused.
+static void addSubQuery(WasaParserDriver *d,
+                        Rcl::SearchData *sd, Rcl::SearchData *sq)
+{
+    sd->addClause(new Rcl::SearchDataClauseSub(RefCntr<Rcl::SearchData>(sq)));
+}
+
+%}
+
+%skeleton "lalr1.cc"
+%defines
+%error-verbose
+
+%parse-param {WasaParserDriver* d}
+%lex-param {WasaParserDriver* d}
+
+%union {
+    std::string *str;
+    Rcl::SearchDataClauseSimple *cl;
+    Rcl::SearchData *sd;
+}
+%destructor {delete $$;} <str>
+
+%type <cl> qualquote
+%type <cl> fieldexpr
+%type <cl> term
+%type <sd> query
+%type <str> complexfieldname
+
+    /* Non operator tokens need precedence because of the possibility of
+       concatenation which needs to have lower prec than OR */
+%left <str> WORD
+%left <str> QUOTED
+%left <str> QUALIFIERS
+%left AND UCONCAT
+%left OR
+
+%token EQUALS CONTAINS SMALLEREQ SMALLER GREATEREQ GREATER
+
+%%
+
+topquery: query
+{
+    d->m_result = $1;
+}
+
+query:
+query query %prec UCONCAT
+{
+    //cerr << "q: query query" << endl;
+    Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
+    addSubQuery(d, sd, $1);
+    addSubQuery(d, sd, $2);
+    $$ = sd;
+}
+| query AND query
+{
+    //cerr << "q: query AND query" << endl;
+    Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
+    addSubQuery(d, sd, $1);
+    addSubQuery(d, sd, $3);
+    $$ = sd;
+}
+| query OR query
+{
+    //cerr << "q: query OR query" << endl;
+    Rcl::SearchData *top = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
+    Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_OR, d->m_stemlang);
+    addSubQuery(d, sd, $1);
+    addSubQuery(d, sd, $3);
+    addSubQuery(d, top, sd);
+    $$ = top;
+}
+| '(' query ')'
+{
+    //cerr << "q: ( query )" << endl;
+    $$ = $2;
+}
+|
+fieldexpr %prec UCONCAT
+{
+    //cerr << "q: fieldexpr" << endl;
+    Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
+    // addClause() consumes $1 in all cases. If it fails (e.g. bad
+    // date or size spec) it has set the driver reason: give up instead
+    // of returning a query which silently ignores the clause.
+    if (!d->addClause(sd, $1)) {
+        delete sd;
+        YYABORT;
+    }
+    $$ = sd;
+}
+;
+
+fieldexpr: term
+{
+    // cerr << "fe: simple fieldexpr: " << $1->gettext() << endl;
+    $$ = $1;
+}
+| complexfieldname EQUALS term
+{
+    // cerr << "fe: " << *$1 << " = " << $3->gettext() << endl;
+    $3->setfield(*$1);
+    $3->setrel(Rcl::SearchDataClause::REL_EQUALS);
+    $$ = $3;
+    delete $1;
+}
+| complexfieldname CONTAINS term
+{
+    // cerr << "fe: " << *$1 << " : " << $3->gettext() << endl;
+    $3->setfield(*$1);
+    $3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
+    $$ = $3;
+    delete $1;
+}
+| complexfieldname SMALLER term
+{
+    // cerr << "fe: " << *$1 << " < " << $3->gettext() << endl;
+    $3->setfield(*$1);
+    $3->setrel(Rcl::SearchDataClause::REL_LT);
+    $$ = $3;
+    delete $1;
+}
+| complexfieldname SMALLEREQ term
+{
+    // cerr << "fe: " << *$1 << " <= " << $3->gettext() << endl;
+    $3->setfield(*$1);
+    $3->setrel(Rcl::SearchDataClause::REL_LTE);
+    $$ = $3;
+    delete $1;
+}
+| complexfieldname GREATER term
+{
+    // cerr << "fe: " << *$1 << " > " << $3->gettext() << endl;
+    $3->setfield(*$1);
+    $3->setrel(Rcl::SearchDataClause::REL_GT);
+    $$ = $3;
+    delete $1;
+}
+| complexfieldname GREATEREQ term
+{
+    // cerr << "fe: " << *$1 << " >= " << $3->gettext() << endl;
+    $3->setfield(*$1);
+    $3->setrel(Rcl::SearchDataClause::REL_GTE);
+    $$ = $3;
+    delete $1;
+}
+| '-' fieldexpr
+{
+    // cerr << "fe: - fieldexpr[" << $2->gettext() << "]" << endl;
+    $2->setexclude(true);
+    $$ = $2;
+}
+;
+
+/* Deal with field names like dc:title */
+complexfieldname:
+WORD
+{
+    // cerr << "cfn: WORD" << endl;
+    $$ = $1;
+}
+|
+complexfieldname CONTAINS WORD
+{
+    // cerr << "cfn: complexfieldname ':' WORD" << endl;
+    $$ = new string(*$1 + string(":") + *$3);
+    delete $1;
+    delete $3;
+}
+
+term:
+WORD
+{
+    //cerr << "term[" << *$1 << "]" << endl;
+    $$ = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1);
+    delete $1;
+}
+| qualquote
+{
+    $$ = $1;
+}
+
+qualquote:
+QUOTED
+{
+    // cerr << "QUOTED[" << *$1 << "]" << endl;
+    $$ = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
+    delete $1;
+}
+| QUOTED QUALIFIERS
+{
+    // cerr << "QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl;
+    Rcl::SearchDataClauseDist *cl =
+        new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
+    qualify(cl, *$2);
+    $$ = cl;
+    delete $1;
+    delete $2;
+}
+
+
+%%
+
+#include <ctype.h>
+
+// Look for int at index, skip and return new index found? value.
+// 'cur' is the index of the qualifier letter. If digits follow it,
+// store their value in *pval and return the index of the last digit,
+// else leave *pval alone and return 'cur' unchanged.
+static unsigned int qualGetInt(const string& q, unsigned int cur, int *pval)
+{
+    unsigned int ncur = cur;
+    // Written as cur + 1 < size() rather than cur < size() - 1: the
+    // latter wraps around for an empty string.
+    if (cur + 1 < q.size()) {
+        // c_str() guarantees that strtol() sees a terminated string
+        const char *start = q.c_str() + cur + 1;
+        char *endptr;
+        int val = strtol(start, &endptr, 10);
+        if (endptr != start) {
+            ncur += endptr - start;
+            *pval = val;
+        }
+    }
+    return ncur;
+}
+
+// Apply the qualifier characters found after the closing double quote
+// of a quoted string (e.g. "some phrase"p3C) to the clause.
+// Characters without a meaning here are silently ignored.
+static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
+{
+    // cerr << "qualify(" << cl << ", " << quals << ")" << endl;
+    for (unsigned int i = 0; i < quals.length(); i++) {
+        //fprintf(stderr, "qual char %c\n", quals[i]);
+        switch (quals[i]) {
+        case 'b':
+            cl->setWeight(10.0);
+            break;
+        case 'c': break;
+        case 'C':
+            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
+            break;
+        case 'd': break;
+        case 'D':
+            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
+            break;
+        case 'e':
+            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
+            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
+            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
+            break;
+        case 'l':
+            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
+            break;
+        case 'L': break;
+        case 'o':
+        {
+            // Slack defaults to 10 if no value follows the 'o'
+            int slack = 10;
+            i = qualGetInt(quals, i, &slack);
+            cl->setslack(slack);
+            //cerr << "set slack " << cl->getslack() << " done" << endl;
+        }
+        break;
+        case 'p':
+            cl->setTp(Rcl::SCLT_NEAR);
+            if (cl->getslack() == 0) {
+                cl->setslack(10);
+                //cerr << "set slack " << cl->getslack() << " done" << endl;
+            }
+            break;
+        case '.':case '0':case '1':case '2':case '3':case '4':
+        case '5':case '6':case '7':case '8':case '9':
+        {
+            // Numeric weight factor. %n is not counted in the sscanf()
+            // return value, so exactly 1 means that a number was read.
+            int n = 0;
+            float factor = 1.0;
+            if (sscanf(quals.c_str() + i, "%f %n", &factor, &n) == 1) {
+                if (factor != 1.0) {
+                    cl->setWeight(factor);
+                }
+            }
+            // Skip what was consumed, the loop increment accounts
+            // for the last character.
+            if (n > 0)
+                i += n - 1;
+        }
+        break;
+        default:
+            break;
+        }
+    }
+}
+
+
+// specialstartchars are special only at the beginning of a token
+// (e.g.
doctor-who is a term, not 2 terms separated by '-')
+static const string specialstartchars("-");
+// specialinchars are special everywhere except inside a quoted string
+static const string specialinchars(":=<>()");
+
+// Called with the first dquote already read
+// Reads up to the closing dquote (or the end of input) and returns a
+// QUOTED token holding the string value. Qualifier characters glued
+// to the closing dquote are stored in the driver, yylex() will return
+// them as a QUALIFIERS token on the next call.
+static int parseString(WasaParserDriver *d, yy::parser::semantic_type *yylval)
+{
+    string* value = new string();
+    d->qualifiers().clear();
+    int c;
+    while ((c = d->GETCHAR())) {
+        switch (c) {
+        case '\\':
+            /* Escape: get next char */
+            c = d->GETCHAR();
+            if (c == 0) {
+                /* Dangling backslash at end of input: nothing to
+                   escape. Don't store a null byte in the term. */
+                goto out;
+            }
+            value->push_back(c);
+            break;
+        case '"':
+            /* End of string. Look for qualifiers. These end at white
+               space or at the next operator character, which must be
+               left for the next token: else the closing parenthesis in
+               ("some phrase") would be eaten as a qualifier. */
+            while ((c = d->GETCHAR()) && !isspace((unsigned char)c) &&
+                   specialinchars.find_first_of(char(c)) == string::npos)
+                d->qualifiers().push_back(c);
+            if (c != 0 && !isspace((unsigned char)c))
+                d->UNGETCHAR(c);
+            goto out;
+        default:
+            value->push_back(c);
+        }
+    }
+out:
+    //cerr << "GOT QUOTED [" << *value << "] quals [" << d->qualifiers() << "]" << endl;
+    yylval->str = value;
+    return yy::parser::token::QUOTED;
+}
+
+
+// The lexer. Returns 0 at end of input, else the next token. WORD,
+// QUOTED and QUALIFIERS tokens come with a newly allocated string in
+// yylval->str, which the parser has to delete.
+int yylex(yy::parser::semantic_type *yylval, WasaParserDriver *d)
+{
+    // Qualifiers left over by the last quoted string come first
+    if (!d->qualifiers().empty()) {
+        yylval->str = new string();
+        yylval->str->swap(d->qualifiers());
+        return yy::parser::token::QUALIFIERS;
+    }
+
+    int c;
+
+    /* Skip white space. The cast keeps the isspace() argument in the
+       unsigned char range whatever GETCHAR() returns for non-ASCII. */
+    while ((c = d->GETCHAR()) && isspace((unsigned char)c))
+        continue;
+
+    if (c == 0)
+        return 0;
+
+    if (specialstartchars.find_first_of(c) != string::npos) {
+        //cerr << "yylex: return " << c << endl;
+        return c;
+    }
+
+    // field-term relations
+    switch (c) {
+    case '=': return yy::parser::token::EQUALS;
+    case ':': return yy::parser::token::CONTAINS;
+    case '<': {
+        int c1 = d->GETCHAR();
+        if (c1 == '=') {
+            return yy::parser::token::SMALLEREQ;
+        } else {
+            d->UNGETCHAR(c1);
+            return yy::parser::token::SMALLER;
+        }
+    }
+    case '>': {
+        int c1 = d->GETCHAR();
+        if (c1 == '=') {
+            return yy::parser::token::GREATEREQ;
+        } else {
+            d->UNGETCHAR(c1);
+            return yy::parser::token::GREATER;
+        }
+    }
+    case '(': case ')':
+        return c;
+    }
+
+    if (c == '"')
+        return parseString(d, yylval);
+
+    d->UNGETCHAR(c);
+
+    // Other chars start a term or field name or reserved word
+    // The loop ends at end of input (GETCHAR() returns 0), on white
+    // space, which is consumed, or on an operator character, which is
+    // pushed back for the next call.
+    string* word = new string();
+    while ((c = d->GETCHAR())) {
+        if (isspace((unsigned char)c)) {
+            //cerr << "Word broken by whitespace" << endl;
+            break;
+        } else if (specialinchars.find_first_of(c) != string::npos) {
+            //cerr << "Word broken by special char" << endl;
+            d->UNGETCHAR(c);
+            break;
+        } else {
+            word->push_back(c);
+        }
+    }
+
+    if (!word->compare("AND") || !word->compare("&&")) {
+        delete word;
+        return yy::parser::token::AND;
+    } else if (!word->compare("OR") || !word->compare("||")) {
+        delete word;
+        return yy::parser::token::OR;
+    }
+
+//    cerr << "Got word [" << *word << "]" << endl;
+    yylval->str = word;
+    return yy::parser::token::WORD;
+}
diff --git a/src/query/wasaparserdriver.h b/src/query/wasaparserdriver.h
new file mode 100644
index 00000000..da6fe6dd
--- /dev/null
+++ b/src/query/wasaparserdriver.h
@@ -0,0 +1,81 @@
+/* Copyright (C) 2006 J.F.Dockes
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either
version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#ifndef _WASAPARSERDRIVER_H_INCLUDED_
+#define _WASAPARSERDRIVER_H_INCLUDED_
+
+#include <string>
+#include <stack>
+
+class WasaParserDriver;
+namespace Rcl {
+    class SearchData;
+    class SearchDataClauseSimple;
+}
+namespace yy {
+    class parser;
+}
+
+class RclConfig;
+
+// Glue between the bison-generated parser (yy::parser), the lexer and
+// the calling code: holds the input string and read position, the
+// parameters needed for building the query, and the parse result or
+// error message.
+class WasaParserDriver {
+public:
+
+    // c: configuration, used for expanding mime categories, may be null.
+    // sl: stemming language, used when creating the SearchData objects.
+    // as: "autosuffs" list: unqualified terms found in there are
+    //  turned into "ext" field searches.
+    WasaParserDriver(const RclConfig *c, const std::string& sl,
+                     const std::string& as)
+        : m_stemlang(sl), m_autosuffs(as), m_config(c),
+          m_index(0), m_result(0) {}
+
+    // Parse the input string. Returns the query, or null on error, in
+    // which case getreason() yields an explanation.
+    Rcl::SearchData *parse(const std::string&);
+    // Add a clause to the query, processing the special fields (date,
+    // size, mime, dir...). Called from the parser actions.
+    bool addClause(Rcl::SearchData *sd, Rcl::SearchDataClauseSimple* cl);
+
+    // Character level input for the lexer: next char (0 at end of
+    // input), and push back.
+    int GETCHAR();
+    void UNGETCHAR(int c);
+
+    std::string& qualifiers() {
+        return m_qualifiers;
+    }
+    void setreason(const std::string& reason) {
+        m_reason = reason;
+    }
+    const std::string& getreason() const {
+        return m_reason;
+    }
+
+private:
+    // The parser actions access m_result and m_stemlang directly
+    friend class yy::parser;
+
+    std::string m_stemlang;
+    std::string m_autosuffs;
+    const RclConfig  *m_config;
+
+    // Input string and current read position inside it
+    std::string m_input;
+    std::string::size_type m_index;
+    // Characters pushed back by UNGETCHAR(), to be returned first
+    std::stack<int> m_returns;
+    // Parse result, set by the top grammar rule
+    Rcl::SearchData *m_result;
+
+    std::string m_reason;
+
+    // Let the quoted string reader store qualifiers in there, simpler
+    // than handling this in the parser, because their nature is
+    // determined by the absence of white space after the closing
+    // dquote. e.g "some term"abc. We could avoid this by making white
+    // space a token.
+    std::string m_qualifiers;
+};
+
+
+#endif /* _WASAPARSERDRIVER_H_INCLUDED_ */
diff --git a/src/query/wasastringtoquery.cpp b/src/query/wasastringtoquery.cpp deleted file mode 100644 index c9718665..00000000 --- a/src/query/wasastringtoquery.cpp +++ /dev/null @@ -1,515 +0,0 @@ -/* Copyright (C) 2006 J.F.Dockes - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#ifndef TEST_WASASTRINGTOQUERY -#include -#include -#include -#include - -#include "smallut.h" -#include "wasastringtoquery.h" - -#undef DEB_WASASTRINGTOQ -#ifdef DEB_WASASTRINGTOQ -#define DPRINT(X) fprintf X -#define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());} -#else -#define DPRINT(X) -#define DUMPQ(Q) -#endif - -WasaQuery::~WasaQuery() -{ - for (vector::iterator it = m_subs.begin(); - it != m_subs.end(); it++) { - delete *it; - } - m_subs.clear(); -} - -static const char* reltosrel(WasaQuery::Rel rel) -{ - switch (rel) { - case WasaQuery::REL_EQUALS: return "="; - case WasaQuery::REL_CONTAINS: return ":"; - case WasaQuery::REL_LT: return "<"; - case WasaQuery::REL_LTE: return "<="; - case WasaQuery::REL_GT: return ">"; - case WasaQuery::REL_GTE: return ">="; - default: return "?"; - } -} - -void WasaQuery::describe(string &desc) const -{ - desc += "("; - string fieldspec = m_fieldspec.empty() ?
string() : m_fieldspec + - reltosrel(m_rel); - switch (m_op) { - case OP_NULL: - desc += "NULL"; - break; - case OP_LEAF: - if (m_exclude) - desc += "NOT ("; - desc += fieldspec + m_value; - if (m_exclude) - desc += ")"; - break; - case OP_OR: - case OP_AND: - for (vector::const_iterator it = m_subs.begin(); - it != m_subs.end(); it++) { - (*it)->describe(desc); - vector::const_iterator it1 = it; - it1++; - if (it1 != m_subs.end()) - desc += m_op == OP_OR ? "OR ": "AND "; - } - break; - } - if (desc[desc.length() - 1] == ' ') - desc.erase(desc.length() - 1); - desc += ")"; - if (m_modifiers != 0) { - if (m_modifiers & WQM_BOOST) desc += "BOOST|"; - if (m_modifiers & WQM_CASESENS) desc += "CASESENS|"; - if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|"; - if (m_modifiers & WQM_FUZZY) desc += "FUZZY|"; - if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|"; - if (m_modifiers & WQM_PHRASESLACK) { - char buf[100]; - sprintf(buf, "%d", m_slack); - desc += "PHRASESLACK(" + string(buf) + string(")|"); - } - if (m_modifiers & WQM_PROX) desc += "PROX|"; - if (m_modifiers & WQM_REGEX) desc += "REGEX|"; - if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|"; - if (m_modifiers & WQM_WORDS) desc += "WORDS|"; - - if (desc.length() > 0 && desc[desc.length()-1] == '|') - desc.erase(desc.length()-1); - } - desc += " "; -} - -// The string query parser code: - -/* Shamelessly lifted from Beagle: - * This is our regular Expression Pattern: - * we expect something like this: - * -key:"Value String"modifiers - * key:Value - * or - * Value -*/ - -/* The master regular expression used to parse a query string - * Sub-expressions in parenthesis are numbered from 1. 
Each opening - * parenthesis increases the index, but we're not interested in all - * Deviations from standard: - * Relation: the standard-conformant line read as (release<1.16): - "(:|=|<|>|<=|>=)" //7 Relation - but we are not actually making use of the relation type - (interpreting all as ":"), and this can product unexpected results - as a (ie pasted) search for nonexfield=value will silently drop - the nonexfield part, while the user probably was not aware of - triggering a field search (expecting just ':' to do this). - */ -static const char * parserExpr = - "(OR|\\|\\|)[[:space:]]*" //1 OR,|| - "|" - "(AND|&&)[[:space:]]*" // 2 AND,&& (ignored, default) - "|" - "(" //3 - "([+-])?" //4 Force or exclude indicator - "(" //5 - "([[:alpha:]][[:alnum:]:]*)" //6 Field spec: ie: "dc:title:letitre" - "[[:space:]]*" - "(:|=|>|<)" //7 Relation - "[[:space:]]*)?" - "(" //8 - "(\"" //9 - "([^\"]+)" //10 "A quoted term" - "\")" - "([bcCdDeflLoprsw.0-9]*)" //11 modifiers - "|" - "([^[:space:]\"]+)" //12 ANormalTerm - ")" - ")[[:space:]]*" -; - -// For debugging the parser. 
But see also NMATCH -static const char *matchNames[] = { - /* 0*/ "", - /* 1*/ "OR", - /* 2*/ "AND", - /* 3*/ "", - /* 4*/ "+-", - /* 5*/ "", - /* 6*/ "FIELD", - /* 7*/ "RELATION", - /* 8*/ "", - /* 9*/ "", - /*10*/ "QUOTEDTERM", - /*11*/ "MODIFIERS", - /*12*/ "TERM", -}; -#define NMATCH (sizeof(matchNames) / sizeof(char *)) - -// Symbolic names for the interesting submatch indices -enum SbMatchIdx {SMI_OR=1, SMI_AND=2, SMI_PM=4, SMI_FIELD=6, SMI_REL=7, - SMI_QUOTED=10, SMI_MODIF=11, SMI_TERM=12}; - -static const int maxmatchlen = 1024; -static const int errbuflen = 300; - -class StringToWasaQuery::Internal { -public: - Internal() - : m_rxneedsfree(false) - {} - ~Internal() - { - if (m_rxneedsfree) - regfree(&m_rx); - } - bool checkSubMatch(int i, char *match, string& reason) - { - if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1) { - //DPRINT((stderr, "checkSubMatch: no match: i %d rm_so %d\n", - //i, m_pmatch[i].rm_so)); - return false; - } - if (m_pmatch[i].rm_eo - m_pmatch[i].rm_so <= 0) { - // weird and fatal - reason = "Internal regular expression handling error"; - return false; - } - //DPRINT((stderr, "checkSubMatch: so %d eo %d\n", m_pmatch[i].rm_so, - //m_pmatch[i].rm_eo)); - memcpy(match, m_cp + m_pmatch[i].rm_so, - m_pmatch[i].rm_eo - m_pmatch[i].rm_so); - match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0; - return true; - } - - WasaQuery *stringToQuery(const string& str, string& reason); - - friend class StringToWasaQuery; -private: - const char *m_cp; - regex_t m_rx; - bool m_rxneedsfree; - regmatch_t m_pmatch[NMATCH]; -}; - -StringToWasaQuery::StringToWasaQuery() - : internal(new Internal) -{ -} - -StringToWasaQuery::~StringToWasaQuery() -{ - delete internal; -} - -WasaQuery * -StringToWasaQuery::stringToQuery(const string& str, string& reason) -{ - if (internal == 0) - return 0; - WasaQuery *wq = internal->stringToQuery(str, reason); - DUMPQ(wq); - return wq; -} - -WasaQuery * -StringToWasaQuery::Internal::stringToQuery(const string& str, 
string& reason) -{ - if (m_rxneedsfree) - regfree(&m_rx); - - char errbuf[errbuflen+1]; - int errcode; - if ((errcode = regcomp(&m_rx, parserExpr, REG_EXTENDED)) != 0) { - regerror(errcode, &m_rx, errbuf, errbuflen); - reason = errbuf; - return 0; - } - m_rxneedsfree = true; - - const char *cpe; - m_cp = str.c_str(); - cpe = str.c_str() + str.length(); - - WasaQuery *query = new WasaQuery; - query->m_op = WasaQuery::OP_AND; - WasaQuery *orChain = 0; - bool prev_or = false; - - // Loop on repeated regexp matches on the main string. - for (int loop = 0;;loop++) { - if ((errcode = regexec(&m_rx, m_cp, NMATCH, m_pmatch, 0))) { - regerror(errcode, &m_rx, errbuf, errbuflen); - reason = errbuf; - return 0; - } - if (m_pmatch[0].rm_eo <= 0) { - // weird and fatal - reason = "Internal regular expression handling error"; - return 0; - } - -#ifdef DEB_WASASTRINGTOQ - DPRINT((stderr, "Next part:\n")); - for (unsigned int i = 0; i < NMATCH; i++) { - if (m_pmatch[i].rm_so == -1) continue; - char match[maxmatchlen+1]; - memcpy(match, m_cp + m_pmatch[i].rm_so, - m_pmatch[i].rm_eo - m_pmatch[i].rm_so); - match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0; - if (matchNames[i][0]) - DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match, - (int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo)); - } -#endif - - char match[maxmatchlen+1]; - if (checkSubMatch(SMI_OR, match, reason)) { - if (prev_or) { - // Bad syntax - reason = "Bad syntax: consecutive OR"; - return 0; - } - - if (orChain == 0) { - // Fist OR seen: start OR subclause. 
- if ((orChain = new WasaQuery()) == 0) { - reason = "Out of memory"; - return 0; - } - orChain->m_op = WasaQuery::OP_OR; - } - - // For the first OR, we need to transfer the previous - // query from the main vector to the OR subquery - if (orChain->m_subs.empty() && !query->m_subs.empty()) { - orChain->m_subs.push_back(query->m_subs.back()); - query->m_subs.pop_back(); - } - prev_or = true; - - } else if (checkSubMatch(SMI_AND, match, reason)) { - // Do nothing, AND is the default. We might want to check for - // errors like consecutive ANDs, or OR AND - - } else { - - WasaQuery *nclause = new WasaQuery; - if (nclause == 0) { - reason = "Out of memory"; - return 0; - } - - // Check for quoted or unquoted value - unsigned int mods = 0; - if (checkSubMatch(SMI_QUOTED, match, reason)) { - nclause->m_value = match; - mods |= WasaQuery::WQM_QUOTED; - } else if (checkSubMatch(SMI_TERM, match, reason)) { - nclause->m_value = match; - } - - if (nclause->m_value.empty()) { - // Isolated +- or fieldname: without a value. Ignore until - // told otherwise. - DPRINT((stderr, "Clause with empty value, skipping\n")); - delete nclause; - goto nextfield; - } - - if (checkSubMatch(SMI_MODIF, match, reason)) { - DPRINT((stderr, "Got modifiers: [%s]\n", match)); - for (unsigned int i = 0; i < strlen(match); i++) { - switch (match[i]) { - case 'b': - mods |= WasaQuery::WQM_BOOST; - nclause->m_weight = 10.0; - break; - case 'c': break; - case 'C': mods |= WasaQuery::WQM_CASESENS; break; - case 'd': break; - case 'D': mods |= WasaQuery::WQM_DIACSENS; break; - case 'e': mods |= WasaQuery::WQM_CASESENS | - WasaQuery::WQM_DIACSENS | - WasaQuery::WQM_NOSTEM; - break; - case 'f': mods |= WasaQuery::WQM_FUZZY; break; - case 'l': mods |= WasaQuery::WQM_NOSTEM; break; - case 'L': break; - case 'o': - mods |= WasaQuery::WQM_PHRASESLACK; - // Default slack if specified only by 'o' is 10. 
- nclause->m_slack = 10; - if (i < strlen(match) - 1) { - char *endptr; - int slack = strtol(match+i+1, &endptr, 10); - if (endptr != match+i+1) { - i += endptr - (match+i+1); - nclause->m_slack = slack; - } - } - break; - case 'p': - mods |= WasaQuery::WQM_PROX; - nclause->m_slack = 10; - break; - case 'r': mods |= WasaQuery::WQM_REGEX; break; - case 's': mods |= WasaQuery::WQM_SLOPPY; break; - case 'w': mods |= WasaQuery::WQM_WORDS; break; - case '.':case '0':case '1':case '2':case '3':case '4': - case '5':case '6':case '7':case '8':case '9': - { - int n; - float factor; - if (sscanf(match+i, "%f %n", &factor, &n)) { - nclause->m_weight = factor; - DPRINT((stderr, "Got factor %.2f len %d\n", - factor, n)); - } - if (n) - i += n-1; - } - } - } - } - nclause->m_modifiers = WasaQuery::Modifier(mods); - - // Field indicator ? - if (checkSubMatch(SMI_FIELD, match, reason)) { - // We used Check for special fields indicating sorting - // etc. here but this went away from the spec. See 1.4 - // if it comes back - nclause->m_fieldspec = match; - if (checkSubMatch(SMI_REL, match, reason)) { - switch (match[0]) { - case '=':nclause->m_rel = WasaQuery::REL_EQUALS;break; - case ':':nclause->m_rel = WasaQuery::REL_CONTAINS;break; - case '<': - if (match[1] == '=') - nclause->m_rel = WasaQuery::REL_LTE; - else - nclause->m_rel = WasaQuery::REL_LT; - break; - case '>': - if (match[1] == '=') - nclause->m_rel = WasaQuery::REL_GTE; - else - nclause->m_rel = WasaQuery::REL_GT; - break; - default: - nclause->m_rel = WasaQuery::REL_CONTAINS; - } - } else { - // ?? If field matched we should have a relation - nclause->m_rel = WasaQuery::REL_CONTAINS; - } - } - - nclause->m_op = WasaQuery::OP_LEAF; - // +- indicator ? 
- if (checkSubMatch(SMI_PM, match, reason) && match[0] == '-') { - nclause->m_exclude = true; - } else { - nclause->m_exclude = false; - } - - if (prev_or) { - // The precedent token was an OR, add new clause to or chain - //DPRINT((stderr, "Adding to OR chain\n")); - orChain->m_subs.push_back(nclause); - } else { - if (orChain) { - // Getting out of OR. Add the OR subquery to the main one - //DPRINT((stderr, "Adding OR chain to main\n")); - query->m_subs.push_back(orChain); - orChain = 0; - } - //DPRINT((stderr, "Adding to main chain\n")); - // Add new clause to main query - query->m_subs.push_back(nclause); - } - - prev_or = false; - } - - nextfield: - // Advance current string position. We checked earlier that - // the increment is strictly positive, so we won't loop - // forever - m_cp += m_pmatch[0].rm_eo; - if (m_cp >= cpe) - break; - } - - if (orChain) { - // Getting out of OR. Add the OR subquery to the main one - DPRINT((stderr, "Adding OR chain to main.Before: \n")); - DUMPQ(query); - DUMPQ(orChain); - query->m_subs.push_back(orChain); - } - - regfree(&m_rx); - m_rxneedsfree = false; - return query; -} - -#else // TEST - -#include -#include - -#include "wasastringtoquery.h" - -static char *thisprog; - -int main(int argc, char **argv) -{ - thisprog = argv[0]; - argc--; argv++; - - if (argc != 1) { - fprintf(stderr, "need one arg\n"); - exit(1); - } - const string str = *argv++;argc--; - string reason; - StringToWasaQuery qparser; - WasaQuery *q = qparser.stringToQuery(str, reason); - if (q == 0) { - fprintf(stderr, "stringToQuery failed: %s\n", reason.c_str()); - exit(1); - } - string desc; - q->describe(desc); - fprintf(stderr, "Finally: %s\n", desc.c_str()); - exit(0); -} - -#endif // TEST_WASASTRINGTOQUERY diff --git a/src/query/wasastringtoquery.h b/src/query/wasastringtoquery.h deleted file mode 100644 index b4aa0491..00000000 --- a/src/query/wasastringtoquery.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (C) 2006 J.F.Dockes - * This program is free 
software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#ifndef _WASASTRINGTOQUERY_H_INCLUDED_ -#define _WASASTRINGTOQUERY_H_INCLUDED_ - -#include -#include - -using std::string; -using std::vector; -/* Note: Xesam used to be named wasabi. We changed the references to wasabi in - the comments, but not the code */ - -/** - * A simple class to represent a parsed Xesam user language element. - * Can hold one leaf element or an array of subqueries to be joined by AND/OR - * - * The complete query is represented by a top WasaQuery holding a - * chain of ANDed subclauses. Some of the subclauses may be themselves - * OR'ed lists (it doesn't go deeper). Entries in the AND list may be - * negated (AND NOT). - * - * For LEAF elements, the value can hold one or several words. In the - * latter case, it should be interpreted as a phrase (comes from a - * user-entered "quoted string"), except if the modifier flags say otherwise. - * - * Some fields only make sense either for compound or LEAF queries. This - * is commented for each. We should subclass really. 
- * - * Note that wasaStringToQuery supposedly parses the whole Xesam - * User Search Language v 0.95, but that some elements are dropped or - * ignored during the translation to a native Recoll query in wasaToRcl - */ -class WasaQuery { -public: - /** Type of this element: leaf or AND/OR chain */ - enum Op {OP_NULL, OP_LEAF, OP_OR, OP_AND}; - /** Relation to be searched between field and value. Recoll actually only - supports "contain" except for a size field */ - enum Rel {REL_NULL, REL_EQUALS, REL_CONTAINS, REL_LT, REL_LTE, - REL_GT, REL_GTE}; - /** Modifiers for terms: case/diacritics handling, - stemming control... */ - enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4, - WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20, - WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100, - WQM_FUZZY = 0x200, WQM_QUOTED = 0x400}; - - typedef vector subqlist_t; - - WasaQuery() - : m_op(OP_NULL), m_rel(REL_NULL), m_exclude(false), - m_modifiers(0), m_slack(0), m_weight(1.0) - {} - - ~WasaQuery(); - - /** Get string describing the query tree from this point */ - void describe(string &desc) const; - - /** Op to be performed on either value (may be LEAF or EXCL, or subqs */ - WasaQuery::Op m_op; - - /** Field specification if any (ie: title, author ...) Only OPT_LEAF */ - string m_fieldspec; - /** Relation between field and value: =, :, <,>,<=, >= */ - WasaQuery::Rel m_rel; - - /* Negating flag */ - bool m_exclude; - - /* String value. Valid for op == OP_LEAF or EXCL */ - string m_value; - - /** Subqueries. Valid for conjunctions */ - vector m_subs; - - unsigned int m_modifiers; - int m_slack; - float m_weight; -}; - -/** - * Wasabi query string parser class. Could be a simple function - * really, but there might be some parser initialization work done in - * the constructor. 
- */ -class StringToWasaQuery { -public: - StringToWasaQuery(); - ~StringToWasaQuery(); - WasaQuery *stringToQuery(const string& str, string& reason); - class Internal; -private: - Internal *internal; -}; - -#endif /* _WASASTRINGTOQUERY_H_INCLUDED_ */ diff --git a/src/query/wasatorcl.cpp b/src/query/wasatorcl.cpp deleted file mode 100644 index 10a0e6f3..00000000 --- a/src/query/wasatorcl.cpp +++ /dev/null @@ -1,286 +0,0 @@ -/* Copyright (C) 2006 J.F.Dockes - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- */ -#include -#include -#include -#include -using std::string; -using std::list; - -#include "rclconfig.h" -#include "wasastringtoquery.h" -#include "rcldb.h" -#include "searchdata.h" -#include "wasatorcl.h" -#include "debuglog.h" -#include "smallut.h" -#include "rclconfig.h" -#include "refcntr.h" -#include "textsplit.h" - -static Rcl::SearchData *wasaQueryToRcl(const RclConfig *config, - const string& stemlang, - WasaQuery *wasa, - const string& autosuffs, string& reason) -{ - if (wasa == 0) { - reason = "NULL query"; - return 0; - } - if (wasa->m_op != WasaQuery::OP_AND && wasa->m_op != WasaQuery::OP_OR) { - reason = "Top query neither AND nor OR ?"; - LOGERR(("wasaQueryToRcl: top query neither AND nor OR!\n")); - return 0; - } - - Rcl::SearchData *sdata = new - Rcl::SearchData(wasa->m_op == WasaQuery::OP_AND ? Rcl::SCLT_AND : - Rcl::SCLT_OR, stemlang); - LOGDEB2(("wasaQueryToRcl: %s chain\n", wasa->m_op == WasaQuery::OP_AND ? - "AND" : "OR")); - - WasaQuery::subqlist_t::iterator it; - Rcl::SearchDataClause *nclause; - - // Walk the list of clauses. Some pseudo-field types need special - // processing, which results in setting data in the top struct - // instead of adding a clause. We check for these first - for (it = wasa->m_subs.begin(); it != wasa->m_subs.end(); it++) { - - if (!stringicmp("mime", (*it)->m_fieldspec) || - !stringicmp("format", (*it)->m_fieldspec)) { - if ((*it)->m_op == WasaQuery::OP_LEAF) { - if ((*it)->m_exclude) { - sdata->remFiletype((*it)->m_value); - } else { - sdata->addFiletype((*it)->m_value); - } - } else { - reason = "internal error: mime clause not leaf??"; - return 0; - } - continue; - } - - // Xesam uses "type", we also support "rclcat", for broad - // categories like "audio", "presentation", etc. 
- if (!stringicmp("rclcat", (*it)->m_fieldspec) || - !stringicmp("type", (*it)->m_fieldspec)) { - if ((*it)->m_op != WasaQuery::OP_LEAF) { - reason = "internal error: rclcat/type clause not leaf??"; - return 0; - } - vector mtypes; - if (config && config->getMimeCatTypes((*it)->m_value, mtypes) - && !mtypes.empty()) { - for (vector::iterator mit = mtypes.begin(); - mit != mtypes.end(); mit++) { - if ((*it)->m_exclude) { - sdata->remFiletype(*mit); - } else { - sdata->addFiletype(*mit); - } - } - } else { - reason = "Unknown rclcat/type value: no mime types found"; - return 0; - } - continue; - } - - // Handle "date" spec - if (!stringicmp("date", (*it)->m_fieldspec)) { - if ((*it)->m_op != WasaQuery::OP_LEAF) { - reason = "Negative date filtering not supported"; - return 0; - } - DateInterval di; - if (!parsedateinterval((*it)->m_value, &di)) { - LOGERR(("wasaQueryToRcl: bad date interval format\n")); - reason = "Bad date interval format"; - return 0; - } - LOGDEB(("wasaQueryToRcl:: date span: %d-%d-%d/%d-%d-%d\n", - di.y1,di.m1,di.d1, di.y2,di.m2,di.d2)); - sdata->setDateSpan(&di); - continue; - } - - // Handle "size" spec - if (!stringicmp("size", (*it)->m_fieldspec)) { - if ((*it)->m_op != WasaQuery::OP_LEAF) { - reason = "Negative size filtering not supported"; - return 0; - } - char *cp; - size_t size = strtoll((*it)->m_value.c_str(), &cp, 10); - if (*cp != 0) { - switch (*cp) { - case 'k': case 'K': size *= 1E3;break; - case 'm': case 'M': size *= 1E6;break; - case 'g': case 'G': size *= 1E9;break; - case 't': case 'T': size *= 1E12;break; - default: - reason = string("Bad multiplier suffix: ") + *cp; - return 0; - } - } - - switch ((*it)->m_rel) { - case WasaQuery::REL_EQUALS: - sdata->setMaxSize(size); - sdata->setMinSize(size); - break; - case WasaQuery::REL_LT: - case WasaQuery::REL_LTE: - sdata->setMaxSize(size); - break; - case WasaQuery::REL_GT: - case WasaQuery::REL_GTE: - sdata->setMinSize(size); - break; - default: - reason = "Bad relation operator 
with size query. Use > < or ="; - return 0; - } - continue; - } - - // "Regular" processing follows: - unsigned int mods = (unsigned int)(*it)->m_modifiers; - LOGDEB0(("wasaQueryToRcl: clause modifiers 0x%x\n", mods)); - nclause = 0; - - switch ((*it)->m_op) { - case WasaQuery::OP_NULL: - case WasaQuery::OP_AND: - default: - reason = "Found bad NULL or AND query type in list"; - LOGERR(("wasaQueryToRcl: found bad NULL or AND q type in list\n")); - continue; - - case WasaQuery::OP_LEAF: { - LOGDEB0(("wasaQueryToRcl: leaf clause [%s:%s] slack %d excl %d\n", - (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(), - (*it)->m_slack, (*it)->m_exclude)); - - // Change terms found in the "autosuffs" list into "ext" - // field queries - if ((*it)->m_fieldspec.empty() && !autosuffs.empty()) { - vector asfv; - if (stringToStrings(autosuffs, asfv)) { - if (find_if(asfv.begin(), asfv.end(), - StringIcmpPred((*it)->m_value)) != asfv.end()) { - (*it)->m_fieldspec = "ext"; - (*it)->m_modifiers |= WasaQuery::WQM_NOSTEM; - } - } - } - - if (!stringicmp("dir", (*it)->m_fieldspec)) { - // dir filtering special case - nclause = new Rcl::SearchDataClausePath((*it)->m_value, - (*it)->m_exclude); - } else { - if ((*it)->m_exclude && wasa->m_op != WasaQuery::OP_AND) { - LOGERR(("wasaQueryToRcl: excl clause inside OR list!\n")); - continue; - } - - if (mods & WasaQuery::WQM_QUOTED) { - Rcl::SClType tp = (mods & WasaQuery::WQM_PROX) ? - Rcl::SCLT_NEAR : - Rcl::SCLT_PHRASE; - nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value, - (*it)->m_slack, - (*it)->m_fieldspec); - } else { - // If term has commas or slashes inside, take it - // as a list, turn the slashes/commas to spaces, - // leave unquoted. Otherwise, this would end up as - // a phrase query. This is a handy way to enter - // multiple terms to be searched inside a - // field. We interpret ',' as AND, and '/' as - // OR. No mixes allowed and ',' wins. - Rcl::SClType tp = (*it)->m_exclude ? 
Rcl::SCLT_OR: - Rcl::SCLT_AND; - string ns = neutchars((*it)->m_value, ","); - if (ns.compare((*it)->m_value)) { - // had ',' - tp = Rcl::SCLT_AND; - } else { - ns = neutchars((*it)->m_value, "/"); - if (ns.compare((*it)->m_value)) { - tp = Rcl::SCLT_OR; - } - } - nclause = new Rcl::SearchDataClauseSimple(tp, ns, - (*it)->m_fieldspec); - } - nclause->setexclude((*it)->m_exclude); - } - - if (nclause == 0) { - reason = "Out of memory"; - LOGERR(("wasaQueryToRcl: out of memory\n")); - return 0; - } - } - break; - - case WasaQuery::OP_OR: - LOGDEB2(("wasaQueryToRcl: OR clause [%s]:[%s]\n", - (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str())); - // Create a subquery. - Rcl::SearchData *sub = - wasaQueryToRcl(config, stemlang, *it, autosuffs, reason); - if (sub == 0) { - continue; - } - nclause = - new Rcl::SearchDataClauseSub(RefCntr(sub)); - if (nclause == 0) { - LOGERR(("wasaQueryToRcl: out of memory\n")); - reason = "Out of memory"; - return 0; - } - } - - if (mods & WasaQuery::WQM_NOSTEM) - nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING); - if (mods & WasaQuery::WQM_DIACSENS) - nclause->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS); - if (mods & WasaQuery::WQM_CASESENS) - nclause->addModifier(Rcl::SearchDataClause::SDCM_CASESENS); - if ((*it)->m_weight != 1.0) - nclause->setWeight((*it)->m_weight); - sdata->addClause(nclause); - } - - return sdata; -} - -Rcl::SearchData *wasaStringToRcl(const RclConfig *config, const string& stemlang, - const string &qs, string &reason, - const string& autosuffs) -{ - StringToWasaQuery parser; - WasaQuery *wq = parser.stringToQuery(qs, reason); - if (wq == 0) - return 0; - return wasaQueryToRcl(config, stemlang, wq, autosuffs, reason); -} diff --git a/src/query/wasatorcl.h b/src/query/wasatorcl.h index 5b334f8a..154d2426 100644 --- a/src/query/wasatorcl.h +++ b/src/query/wasatorcl.h @@ -17,15 +17,18 @@ #ifndef _WASATORCL_H_INCLUDED_ #define _WASATORCL_H_INCLUDED_ + #include -using std::string; - -#include 
"rcldb.h" -#include "searchdata.h" +namespace Rcl { + class SearchData; +} class RclConfig; -extern Rcl::SearchData *wasaStringToRcl(const RclConfig *, const string& stemlang, - const string& query, string &reason, - const string& autosuffs = string()); +extern Rcl::SearchData *wasaStringToRcl(const RclConfig *, + const std::string& stemlang, + const std::string& query, + std::string &reason, + const std::string& autosuffs = ""); + #endif /* _WASATORCL_H_INCLUDED_ */ diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index e6807991..8dcc19d1 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -52,8 +52,6 @@ namespace Rcl { typedef vector::iterator qlist_it_t; typedef vector::const_iterator qlist_cit_t; -static const int original_term_wqf_booster = 10; - void SearchData::commoninit() { m_haveDates = false; @@ -74,241 +72,6 @@ SearchData::~SearchData() delete *it; } -// Expand categories and mime type wild card exps Categories are -// expanded against the configuration, mimetypes against the index -// (for wildcards). 
-bool SearchData::expandFileTypes(Db &db, vector& tps) -{ - const RclConfig *cfg = db.getConf(); - if (!cfg) { - LOGFATAL(("Db::expandFileTypes: null configuration!!\n")); - return false; - } - vector exptps; - - for (vector::iterator it = tps.begin(); it != tps.end(); it++) { - if (cfg->isMimeCategory(*it)) { - vectortps; - cfg->getMimeCatTypes(*it, tps); - exptps.insert(exptps.end(), tps.begin(), tps.end()); - } else { - TermMatchResult res; - string mt = stringtolower((const string&)*it); - // We set casesens|diacsens to get an equivalent of ixTermMatch() - db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(), - mt, res, -1, "mtype"); - if (res.entries.empty()) { - exptps.push_back(it->c_str()); - } else { - for (vector::const_iterator rit = - res.entries.begin(); rit != res.entries.end(); rit++) { - exptps.push_back(strip_prefix(rit->term)); - } - } - } - } - sort(exptps.begin(), exptps.end()); - exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end()); - - tps = exptps; - return true; -} - -static const char *maxXapClauseMsg = - "Maximum Xapian query size exceeded. Increase maxXapianClauses " - "in the configuration. "; -static const char *maxXapClauseCaseDiacMsg = - "Or try to use case (C) or diacritics (D) sensitivity qualifiers, or less " - "wildcards ?" - ; - -bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, - vector& query, - string& reason, void *d) -{ - Xapian::Query xq; - for (qlist_it_t it = query.begin(); it != query.end(); it++) { - Xapian::Query nq; - if (!(*it)->toNativeQuery(db, &nq)) { - LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n", - (*it)->getReason().c_str())); - reason += (*it)->getReason() + " "; - return false; - } - if (nq.empty()) { - LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n")); - continue; - } - // If this structure is an AND list, must use AND_NOT for excl clauses. 
- // Else this is an OR list, and there can't be excl clauses (checked by - // addClause()) - Xapian::Query::op op; - if (tp == SCLT_AND) { - if ((*it)->getexclude()) { - op = Xapian::Query::OP_AND_NOT; - } else { - op = Xapian::Query::OP_AND; - } - } else { - op = Xapian::Query::OP_OR; - } - if (xq.empty()) { - if (op == Xapian::Query::OP_AND_NOT) - xq = Xapian::Query(op, Xapian::Query::MatchAll, nq); - else - xq = nq; - } else { - xq = Xapian::Query(op, xq, nq); - } - if (int(xq.get_length()) >= getMaxCl()) { - LOGERR(("%s\n", maxXapClauseMsg)); - m_reason += maxXapClauseMsg; - if (!o_index_stripchars) - m_reason += maxXapClauseCaseDiacMsg; - return false; - } - } - - LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length())); - - if (xq.empty()) - xq = Xapian::Query::MatchAll; - - *((Xapian::Query *)d) = xq; - return true; -} - -bool SearchData::toNativeQuery(Rcl::Db &db, void *d) -{ - LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str())); - m_reason.erase(); - - db.getConf()->getConfParam("maxTermExpand", &m_maxexp); - db.getConf()->getConfParam("maxXapianClauses", &m_maxcl); - - // Walk the clause list translating each in turn and building the - // Xapian query tree - Xapian::Query xq; - if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) { - LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n", - m_reason.c_str())); - return false; - } - - if (m_haveDates) { - // If one of the extremities is unset, compute db extremas - if (m_dates.y1 == 0 || m_dates.y2 == 0) { - int minyear = 1970, maxyear = 2100; - if (!db.maxYearSpan(&minyear, &maxyear)) { - LOGERR(("Can't retrieve index min/max dates\n")); - //whatever, go on. 
- } - - if (m_dates.y1 == 0) { - m_dates.y1 = minyear; - m_dates.m1 = 1; - m_dates.d1 = 1; - } - if (m_dates.y2 == 0) { - m_dates.y2 = maxyear; - m_dates.m2 = 12; - m_dates.d2 = 31; - } - } - LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n", - m_dates.y1, m_dates.m1, m_dates.d1, - m_dates.y2, m_dates.m2, m_dates.d2)); - Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1, - m_dates.y2, m_dates.m2, m_dates.d2); - if (dq.empty()) { - LOGINFO(("Db::toNativeQuery: date filter is empty\n")); - } - // If no probabilistic query is provided then promote the daterange - // filter to be THE query instead of filtering an empty query. - if (xq.empty()) { - LOGINFO(("Db::toNativeQuery: proba query is empty\n")); - xq = dq; - } else { - xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq); - } - } - - - if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) { - Xapian::Query sq; - char min[50], max[50]; - sprintf(min, "%lld", (long long)m_minSize); - sprintf(max, "%lld", (long long)m_maxSize); - if (m_minSize == size_t(-1)) { - string value(max); - leftzeropad(value, 12); - sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value); - } else if (m_maxSize == size_t(-1)) { - string value(min); - leftzeropad(value, 12); - sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value); - } else { - string minvalue(min); - leftzeropad(minvalue, 12); - string maxvalue(max); - leftzeropad(maxvalue, 12); - sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE, - minvalue, maxvalue); - } - - // If no probabilistic query is provided then promote the - // filter to be THE query instead of filtering an empty query. - if (xq.empty()) { - LOGINFO(("Db::toNativeQuery: proba query is empty\n")); - xq = sq; - } else { - xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq); - } - } - - // Add the autophrase if any - if (m_autophrase.isNotNull()) { - Xapian::Query apq; - if (m_autophrase->toNativeQuery(db, &apq)) { - xq = xq.empty() ? 
apq : - Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq); - } - } - - // Add the file type filtering clause if any - if (!m_filetypes.empty()) { - expandFileTypes(db, m_filetypes); - - Xapian::Query tq; - for (vector::iterator it = m_filetypes.begin(); - it != m_filetypes.end(); it++) { - string term = wrap_prefix(mimetype_prefix) + *it; - LOGDEB0(("Adding file type term: [%s]\n", term.c_str())); - tq = tq.empty() ? Xapian::Query(term) : - Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term)); - } - xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq); - } - - // Add the neg file type filtering clause if any - if (!m_nfiletypes.empty()) { - expandFileTypes(db, m_nfiletypes); - - Xapian::Query tq; - for (vector::iterator it = m_nfiletypes.begin(); - it != m_nfiletypes.end(); it++) { - string term = wrap_prefix(mimetype_prefix) + *it; - LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str())); - tq = tq.empty() ? Xapian::Query(term) : - Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term)); - } - xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq); - } - - *((Xapian::Query *)d) = xq; - return true; -} - // This is called by the GUI simple search if the option is set: add // (OR) phrase to a query (if it is simple enough) so that results // where the search terms are close and in order will come up on top. @@ -428,695 +191,4 @@ void SearchData::getTerms(HighlightData &hld) const return; } -// Splitter callback for breaking a user string into simple terms and -// phrases. 
This is for parts of the user entry which would appear as -// a single word because there is no white space inside, but are -// actually multiple terms to rcldb (ie term1,term2) -class TextSplitQ : public TextSplitP { - public: - TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc) - : TextSplitP(prc, flags), - curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0) - {} - - bool takeword(const std::string &term, int pos, int bs, int be) - { - // Check if the first letter is a majuscule in which - // case we do not want to do stem expansion. Need to do this - // before unac of course... - curnostemexp = unaciscapital(term); - - return TextSplitP::takeword(term, pos, bs, be); - } - - bool curnostemexp; - vector terms; - vector nostemexps; - const StopList &stops; - // Count of terms including stopwords: this is for adjusting - // phrase/near slack - int alltermcount; - int lastpos; -}; - -class TermProcQ : public TermProc { -public: - TermProcQ() : TermProc(0), m_ts(0) {} - void setTSQ(TextSplitQ *ts) {m_ts = ts;} - - bool takeword(const std::string &term, int pos, int bs, int be) - { - m_ts->alltermcount++; - if (m_ts->lastpos < pos) - m_ts->lastpos = pos; - bool noexpand = be ? 
m_ts->curnostemexp : true; - LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", - term.c_str(), pos, noexpand)); - if (m_terms[pos].size() < term.size()) { - m_terms[pos] = term; - m_nste[pos] = noexpand; - } - return true; - } - bool flush() - { - for (map::const_iterator it = m_terms.begin(); - it != m_terms.end(); it++) { - m_ts->terms.push_back(it->second); - m_ts->nostemexps.push_back(m_nste[it->first]); - } - return true; - } -private: - TextSplitQ *m_ts; - map m_terms; - map m_nste; -}; - - -#if 1 -static void listVector(const string& what, const vector&l) -{ - string a; - for (vector::const_iterator it = l.begin(); it != l.end(); it++) { - a = a + *it + " "; - } - LOGDEB0(("%s: %s\n", what.c_str(), a.c_str())); -} -#endif - -/** Expand term into term list, using appropriate mode: stem, wildcards, - * diacritics... - * - * @param mods stem expansion, case and diacritics sensitivity control. - * @param term input single word - * @param oexp output expansion list - * @param sterm output original input term if there were no wildcards - * @param prefix field prefix in index. We could recompute it, but the caller - * has it already. Used in the simple case where there is nothing to expand, - * and we just return the prefixed term (else Db::termMatch deals with it). 
- */ -bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, - string& ermsg, int mods, - const string& term, - vector& oexp, string &sterm, - const string& prefix) -{ - LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n", - mods, m_field.c_str(), term.c_str(), getStemLang().c_str())); - sterm.clear(); - oexp.clear(); - if (term.empty()) - return true; - - bool maxexpissoft = false; - int maxexpand = getSoftMaxExp(); - if (maxexpand != -1) { - maxexpissoft = true; - } else { - maxexpand = getMaxExp(); - } - - bool haswild = term.find_first_of(cstr_minwilds) != string::npos; - - // If there are no wildcards, add term to the list of user-entered terms - if (!haswild) { - m_hldata.uterms.insert(term); - sterm = term; - } - // No stem expansion if there are wildcards or if prevented by caller - bool nostemexp = (mods & SDCM_NOSTEMMING) != 0; - if (haswild || getStemLang().empty()) { - LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n")); - nostemexp = true; - } - - // noexpansion can be modified further down by possible case/diac expansion - bool noexpansion = nostemexp && !haswild; - - int termmatchsens = 0; - - bool diac_sensitive = (mods & SDCM_DIACSENS) != 0; - bool case_sensitive = (mods & SDCM_CASESENS) != 0; - - if (o_index_stripchars) { - diac_sensitive = case_sensitive = false; - } else { - // If we are working with a raw index, apply the rules for case and - // diacritics sensitivity. - - // If any character has a diacritic, we become - // diacritic-sensitive. Note that the way that the test is - // performed (conversion+comparison) will automatically ignore - // accented characters which are actually a separate letter - if (getAutoDiac() && unachasaccents(term)) { - LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n")); - diac_sensitive = true; - } - - // If any character apart the first is uppercase, we become - // case-sensitive. The first character is reserved for - // turning off stemming. 
You need to use a query language - // modifier to search for Floor in a case-sensitive way. - Utf8Iter it(term); - it++; - if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) { - LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n")); - case_sensitive = true; - } - - // If we are sensitive to case or diacritics turn stemming off - if (diac_sensitive || case_sensitive) { - LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n")); - nostemexp = true; - } - - if (!case_sensitive || !diac_sensitive) - noexpansion = false; - } - - if (case_sensitive) - termmatchsens |= Db::ET_CASESENS; - if (diac_sensitive) - termmatchsens |= Db::ET_DIACSENS; - - if (noexpansion) { - oexp.push_back(prefix + term); - m_hldata.terms[term] = term; - LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str())); - return true; - } - - Db::MatchType mtyp = haswild ? Db::ET_WILD : - nostemexp ? Db::ET_NONE : Db::ET_STEM; - TermMatchResult res; - if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand, - m_field)) { - // Let it go through - } - - // Term match entries to vector of terms - if (int(res.entries.size()) >= maxexpand && !maxexpissoft) { - ermsg = "Maximum term expansion size exceeded." 
- " Maybe use case/diacritics sensitivity or increase maxTermExpand."; - return false; - } - for (vector::const_iterator it = res.entries.begin(); - it != res.entries.end(); it++) { - oexp.push_back(it->term); - } - // If the term does not exist at all in the db, the return from - // termMatch() is going to be empty, which is not what we want (we - // would then compute an empty Xapian query) - if (oexp.empty()) - oexp.push_back(prefix + term); - - // Remember the uterm-to-expansion links - for (vector::const_iterator it = oexp.begin(); - it != oexp.end(); it++) { - m_hldata.terms[strip_prefix(*it)] = term; - } - LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); - return true; -} - -// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d -void multiply_groups(vector >::const_iterator vvit, - vector >::const_iterator vvend, - vector& comb, - vector >&allcombs) -{ - // Remember my string vector and compute next, for recursive calls. - vector >::const_iterator myvit = vvit++; - - // Walk the string vector I'm called upon and, for each string, - // add it to current result, an call myself recursively on the - // next string vector. 
The last call (last element of the vector of - // vectors), adds the elementary result to the output - - // Walk my string vector - for (vector::const_iterator strit = (*myvit).begin(); - strit != (*myvit).end(); strit++) { - - // Add my current value to the string vector we're building - comb.push_back(*strit); - - if (vvit == vvend) { - // Last call: store current result - allcombs.push_back(comb); - } else { - // Call recursively on next string vector - multiply_groups(vvit, vvend, comb, allcombs); - } - // Pop the value I just added (make room for the next element in my - // vector) - comb.pop_back(); - } -} - -void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg, - const string& span, - int mods, void * pq) -{ - vector& pqueries(*(vector*)pq); - LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n", - span.c_str(), (unsigned int)mods)); - vector exp; - string sterm; // dumb version of user term - - string prefix; - const FieldTraits *ftp; - if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { - prefix = wrap_prefix(ftp->pfx); - } - - if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix)) - return; - - // Set up the highlight data. No prefix should go in there - for (vector::const_iterator it = exp.begin(); - it != exp.end(); it++) { - m_hldata.groups.push_back(vector(1, it->substr(prefix.size()))); - m_hldata.slacks.push_back(0); - m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1); - } - - // Push either term or OR of stem-expanded set - Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end()); - m_curcl += exp.size(); - - // If sterm (simplified original user term) is not null, give it a - // relevance boost. We do this even if no expansion occurred (else - // the non-expanded terms in a term list would end-up with even - // less wqf). This does not happen if there are wildcards anywhere - // in the search. - // We normally boost the original term in the stem expansion list. 
Don't - // do it if there are wildcards anywhere, this would skew the results. - bool doBoostUserTerm = - (m_parentSearch && !m_parentSearch->haveWildCards()) || - (m_parentSearch == 0 && !m_haveWildCards); - if (doBoostUserTerm && !sterm.empty()) { - xq = Xapian::Query(Xapian::Query::OP_OR, xq, - Xapian::Query(prefix+sterm, - original_term_wqf_booster)); - } - pqueries.push_back(xq); -} - -// User entry element had several terms: transform into a PHRASE or -// NEAR xapian query, the elements of which can themselves be OR -// queries if the terms get expanded by stemming or wildcards (we -// don't do stemming for PHRASE though) -void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, - TextSplitQ *splitData, - int mods, void *pq, - bool useNear, int slack) -{ - vector &pqueries(*(vector*)pq); - Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : - Xapian::Query::OP_PHRASE; - vector orqueries; -#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF - bool hadmultiple = false; -#endif - vector >groups; - - string prefix; - const FieldTraits *ftp; - if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { - prefix = wrap_prefix(ftp->pfx); - } - - if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) { - orqueries.push_back(Xapian::Query(prefix + start_of_field_term)); - slack++; - } - - // Go through the list and perform stem/wildcard expansion for each element - vector::iterator nxit = splitData->nostemexps.begin(); - for (vector::iterator it = splitData->terms.begin(); - it != splitData->terms.end(); it++, nxit++) { - LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str())); - // Adjust when we do stem expansion. Not if disabled by - // caller, not inside phrases, and some versions of xapian - // will accept only one OR clause inside NEAR. 
- bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) -#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF - || hadmultiple -#endif // single OR inside NEAR - ; - int lmods = mods; - if (nostemexp) - lmods |= SearchDataClause::SDCM_NOSTEMMING; - string sterm; - vector exp; - if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix)) - return; - LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size())); - listVector("", exp); - // groups is used for highlighting, we don't want prefixes in there. - vector noprefs; - for (vector::const_iterator it = exp.begin(); - it != exp.end(); it++) { - noprefs.push_back(it->substr(prefix.size())); - } - groups.push_back(noprefs); - orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, - exp.begin(), exp.end())); - m_curcl += exp.size(); - if (m_curcl >= getMaxCl()) - return; -#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF - if (exp.size() > 1) - hadmultiple = true; -#endif - } - - if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) { - orqueries.push_back(Xapian::Query(prefix + end_of_field_term)); - slack++; - } - - // Generate an appropriate PHRASE/NEAR query with adjusted slack - // For phrases, give a relevance boost like we do for original terms - LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", - splitData->alltermcount, splitData->lastpos)); - Xapian::Query xq(op, orqueries.begin(), orqueries.end(), - splitData->lastpos + 1 + slack); - if (op == Xapian::Query::OP_PHRASE) - xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, - original_term_wqf_booster); - pqueries.push_back(xq); - - // Add all combinations of NEAR/PHRASE groups to the highlighting data. 
- vector > allcombs; - vector comb; - multiply_groups(groups.begin(), groups.end(), comb, allcombs); - - // Insert the search groups and slacks in the highlight data, with - // a reference to the user entry that generated them: - m_hldata.groups.insert(m_hldata.groups.end(), - allcombs.begin(), allcombs.end()); - m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack); - m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(), - m_hldata.ugroups.size() - 1); -} - -// Trim string beginning with ^ or ending with $ and convert to flags -static int stringToMods(string& s) -{ - int mods = 0; - // Check for an anchored search - trimstring(s); - if (s.length() > 0 && s[0] == '^') { - mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART; - s.erase(0, 1); - } - if (s.length() > 0 && s[s.length()-1] == '$') { - mods |= Rcl::SearchDataClause::SDCM_ANCHOREND; - s.erase(s.length()-1); - } - return mods; -} - -/** - * Turn user entry string (NOT query language) into a list of xapian queries. - * We just separate words and phrases, and do wildcard and stem expansion, - * - * This is used to process data entered into an OR/AND/NEAR/PHRASE field of - * the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user - * entry). - * - * This appears awful, and it would seem that the split into - * terms/phrases should be performed in the upper layer so that we - * only receive pure term or near/phrase pure elements here, but in - * fact there are things that would appear like terms to naive code, - * and which will actually may be turned into phrases (ie: tom:jerry), - * in a manner which intimately depends on the index implementation, - * so that it makes sense to process this here. - * - * The final list contains one query for each term or phrase - * - Elements corresponding to a stem-expanded part are an OP_OR - * composition of the stem-expanded terms (or a single term query). 
- * - Elements corresponding to phrase/near are an OP_PHRASE/NEAR - * composition of the phrase terms (no stem expansion in this case) - * @return the subquery count (either or'd stem-expanded terms or phrase word - * count) - */ -bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq, - string &ermsg, void *pq, - int slack, bool useNear) -{ - vector &pqueries(*(vector*)pq); - int mods = m_modifiers; - - LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x " - "slack %d near %d\n", - iq.c_str(), m_field.c_str(), mods, slack, useNear)); - ermsg.erase(); - m_curcl = 0; - const StopList stops = db.getStopList(); - - // Simple whitespace-split input into user-level words and - // double-quoted phrases: word1 word2 "this is a phrase". - // - // The text splitter may further still decide that the resulting - // "words" are really phrases, this depends on separators: - // [paul@dom.net] would still be a word (span), but [about:me] - // will probably be handled as a phrase. - vector phrases; - TextSplit::stringToStrings(iq, phrases); - - // Process each element: textsplit into terms, handle stem/wildcard - // expansion and transform into an appropriate Xapian::Query - try { - for (vector::iterator it = phrases.begin(); - it != phrases.end(); it++) { - LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str())); - // Anchoring modifiers - int amods = stringToMods(*it); - int terminc = amods != 0 ? 1 : 0; - mods |= amods; - // If there are multiple spans in this element, including - // at least one composite, we have to increase the slack - // else a phrase query including a span would fail. - // Ex: "term0@term1 term2" is onlyspans-split as: - // 0 term0@term1 0 12 - // 2 term2 13 18 - // The position of term2 is 2, not 1, so a phrase search - // would fail. - // We used to do word split, searching for - // "term0 term1 term2" instead, which may have worse - // performance, but will succeed. 
- // We now adjust the phrase/near slack by comparing the term count - // and the last position - - // The term processing pipeline: - TermProcQ tpq; - TermProc *nxt = &tpq; - TermProcStop tpstop(nxt, stops); nxt = &tpstop; - //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon; - //tpcommon.onlygrams(true); - TermProcPrep tpprep(nxt); - if (o_index_stripchars) - nxt = &tpprep; - - TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | - TextSplit::TXTS_KEEPWILD), - stops, nxt); - tpq.setTSQ(&splitter); - splitter.text_to_words(*it); - - slack += splitter.lastpos - splitter.terms.size() + 1; - - LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size())); - switch (splitter.terms.size() + terminc) { - case 0: - continue;// ?? - case 1: { - int lmods = mods; - if (splitter.nostemexps.front()) - lmods |= SearchDataClause::SDCM_NOSTEMMING; - m_hldata.ugroups.push_back(splitter.terms); - processSimpleSpan(db, ermsg, splitter.terms.front(), - lmods, &pqueries); - } - break; - default: - m_hldata.ugroups.push_back(splitter.terms); - processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries, - useNear, slack); - } - if (m_curcl >= getMaxCl()) { - ermsg = maxXapClauseMsg; - if (!o_index_stripchars) - ermsg += maxXapClauseCaseDiacMsg; - break; - } - } - } catch (const Xapian::Error &e) { - ermsg = e.get_msg(); - } catch (const string &s) { - ermsg = s; - } catch (const char *s) { - ermsg = s; - } catch (...) { - ermsg = "Caught unknown exception"; - } - if (!ermsg.empty()) { - LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str())); - return false; - } - return true; -} - -// Translate a simple OR or AND search clause. 
-bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p) -{ - LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n", - getStemLang().c_str())); - - Xapian::Query *qp = (Xapian::Query *)p; - *qp = Xapian::Query(); - - Xapian::Query::op op; - switch (m_tp) { - case SCLT_AND: op = Xapian::Query::OP_AND; break; - case SCLT_OR: op = Xapian::Query::OP_OR; break; - default: - LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp)); - return false; - } - - vector pqueries; - if (!processUserString(db, m_text, m_reason, &pqueries)) - return false; - if (pqueries.empty()) { - LOGERR(("SearchDataClauseSimple: resolved to null query\n")); - return true; - } - - *qp = Xapian::Query(op, pqueries.begin(), pqueries.end()); - if (m_weight != 1.0) { - *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight); - } - return true; -} - -// Translate a FILENAME search clause. This always comes -// from a "filename" search from the gui or recollq. A query language -// "filename:"-prefixed field will not go through here, but through -// the generic field-processing code. -// -// We do not split the entry any more (used to do some crazy thing -// about expanding multiple fragments in the past). We just take the -// value blanks and all and expand this against the indexed unsplit -// file names -bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p) -{ - Xapian::Query *qp = (Xapian::Query *)p; - *qp = Xapian::Query(); - - int maxexp = getSoftMaxExp(); - if (maxexp == -1) - maxexp = getMaxExp(); - - vector names; - db.filenameWildExp(m_text, names, maxexp); - *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end()); - - if (m_weight != 1.0) { - *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight); - } - return true; -} - -// Translate a dir: path filtering clause. 
See comments in .h -bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p) -{ - LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str())); - Xapian::Query *qp = (Xapian::Query *)p; - *qp = Xapian::Query(); - - if (m_text.empty()) { - LOGERR(("SearchDataClausePath: empty path??\n")); - m_reason = "Empty path ?"; - return false; - } - - vector orqueries; - - if (m_text[0] == '/') - orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix))); - else - m_text = path_tildexpand(m_text); - - vector vpath; - stringToTokens(m_text, vpath, "/"); - - for (vector::const_iterator pit = vpath.begin(); - pit != vpath.end(); pit++){ - - string sterm; - vector exp; - if (!expandTerm(db, m_reason, - SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS, - *pit, exp, sterm, wrap_prefix(pathelt_prefix))) { - return false; - } - LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size())); - listVector("", exp); - if (exp.size() == 1) - orqueries.push_back(Xapian::Query(exp[0])); - else - orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, - exp.begin(), exp.end())); - m_curcl += exp.size(); - if (m_curcl >= getMaxCl()) - return false; - } - - *qp = Xapian::Query(Xapian::Query::OP_PHRASE, - orqueries.begin(), orqueries.end()); - - if (m_weight != 1.0) { - *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight); - } - return true; -} - -// Translate NEAR or PHRASE clause. -bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p) -{ - LOGDEB(("SearchDataClauseDist::toNativeQuery\n")); - - Xapian::Query *qp = (Xapian::Query *)p; - *qp = Xapian::Query(); - - vector pqueries; - Xapian::Query nq; - - // We produce a single phrase out of the user entry then use - // stringToXapianQueries() to lowercase and simplify the phrase - // terms etc. This will result into a single (complex) - // Xapian::Query. 
- if (m_text.find('\"') != string::npos) { - m_text = neutchars(m_text, "\""); - } - string s = cstr_dquote + m_text + cstr_dquote; - bool useNear = (m_tp == SCLT_NEAR); - if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear)) - return false; - if (pqueries.empty()) { - LOGERR(("SearchDataClauseDist: resolved to null query\n")); - return true; - } - - *qp = *pqueries.begin(); - if (m_weight != 1.0) { - *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight); - } - return true; -} - } // Namespace Rcl diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index 3879e859..0fbced7b 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -102,7 +102,7 @@ public: bool toNativeQuery(Rcl::Db &db, void *); /** We become the owner of cl and will delete it */ - bool addClause(SearchDataClause *cl); + bool addClause(SearchDataClause* cl); /** If this is a simple query (one field only, no distance clauses), * add phrase made of query terms to query, so that docs containing the @@ -164,7 +164,7 @@ public: private: // Combine type. Only SCLT_AND or SCLT_OR here SClType m_tp; - // Complex query descriptor + // The clauses std::vector m_query; // Restricted set of filetypes if not empty. std::vector m_filetypes; @@ -173,14 +173,18 @@ private: // Autophrase if set. 
Can't be part of the normal chain because // it uses OP_AND_MAYBE RefCntr m_autophrase; - // + + // Special stuff produced by input which looks like a clause but means + // something else (date and size specs) bool m_haveDates; DateInterval m_dates; // Restrict to date interval size_t m_maxSize; size_t m_minSize; + // Printable expanded version of the complete query, retrieved/set // from rcldb after the Xapian::setQuery() call std::string m_description; + // Error diag std::string m_reason; bool m_haveWildCards; std::string m_stemlang; @@ -215,10 +219,12 @@ class SearchDataClause { public: enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2, SDCM_ANCHOREND=4, SDCM_CASESENS=8, SDCM_DIACSENS=16}; + enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE}; SearchDataClause(SClType tp) : m_tp(tp), m_parentSearch(0), m_haveWildCards(0), - m_modifiers(SDCM_NONE), m_weight(1.0), m_exclude(false) + m_modifiers(SDCM_NONE), m_weight(1.0), m_exclude(false), + m_rel(REL_CONTAINS) {} virtual ~SearchDataClause() {} virtual bool toNativeQuery(Rcl::Db &db, void *) = 0; @@ -230,6 +236,9 @@ public: { return m_tp; } + void setTp(SClType tp) { + m_tp = tp; + } void setParent(SearchData *p) { m_parentSearch = p; @@ -279,7 +288,12 @@ public: { m_exclude = onoff; } - + virtual void setrel(Relation rel) { + m_rel = rel; + } + virtual Relation getrel() { + return m_rel; + } friend class SearchData; protected: std::string m_reason; @@ -289,6 +303,8 @@ protected: Modifier m_modifiers; float m_weight; bool m_exclude; + Relation m_rel; + private: SearchDataClause(const SearchDataClause&) { @@ -339,13 +355,15 @@ public: { return m_field; } + virtual void setfield(const string& field) { + m_field = field; + } protected: std::string m_text; // Raw user entry text. 
std::string m_field; // Field specification if any HighlightData m_hldata; // Current count of Xapian clauses, to check against expansion limit int m_curcl; - bool processUserString(Rcl::Db &db, const string &iq, std::string &ermsg, void* pq, int slack = 0, bool useNear = false); @@ -444,6 +462,9 @@ public: { return m_slack; } + virtual void setslack(int slack) { + m_slack = slack; + } private: int m_slack; }; diff --git a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp new file mode 100644 index 00000000..9124ad5f --- /dev/null +++ b/src/rcldb/searchdatatox.cpp @@ -0,0 +1,983 @@ +/* Copyright (C) 2006 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +// Handle translation from rcl's SearchData structures to Xapian Queries + +#include "autoconfig.h" + +#include + +#include +#include +#include +#include +using namespace std; + +#include "xapian.h" + +#include "cstr.h" +#include "rcldb.h" +#include "rcldb_p.h" +#include "searchdata.h" +#include "debuglog.h" +#include "smallut.h" +#include "textsplit.h" +#include "unacpp.h" +#include "utf8iter.h" +#include "stoplist.h" +#include "rclconfig.h" +#include "termproc.h" +#include "synfamily.h" +#include "stemdb.h" +#include "expansiondbs.h" +#include "base64.h" +#include "daterange.h" + +namespace Rcl { + +typedef vector::iterator qlist_it_t; + +static const int original_term_wqf_booster = 10; + +// Expand categories and mime type wild card exps Categories are +// expanded against the configuration, mimetypes against the index +// (for wildcards). +bool SearchData::expandFileTypes(Db &db, vector& tps) +{ + const RclConfig *cfg = db.getConf(); + if (!cfg) { + LOGFATAL(("Db::expandFileTypes: null configuration!!\n")); + return false; + } + vector exptps; + + for (vector::iterator it = tps.begin(); it != tps.end(); it++) { + if (cfg->isMimeCategory(*it)) { + vectortps; + cfg->getMimeCatTypes(*it, tps); + exptps.insert(exptps.end(), tps.begin(), tps.end()); + } else { + TermMatchResult res; + string mt = stringtolower((const string&)*it); + // We set casesens|diacsens to get an equivalent of ixTermMatch() + db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(), + mt, res, -1, "mtype"); + if (res.entries.empty()) { + exptps.push_back(it->c_str()); + } else { + for (vector::const_iterator rit = + res.entries.begin(); rit != res.entries.end(); rit++) { + exptps.push_back(strip_prefix(rit->term)); + } + } + } + } + sort(exptps.begin(), exptps.end()); + exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end()); + + tps = exptps; + return true; +} + +static const char *maxXapClauseMsg = + "Maximum Xapian query size exceeded. 
Increase maxXapianClauses " + "in the configuration. "; +static const char *maxXapClauseCaseDiacMsg = + "Or try to use case (C) or diacritics (D) sensitivity qualifiers, or less " + "wildcards ?" + ; + +bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, + vector& query, + string& reason, void *d) +{ + Xapian::Query xq; + for (qlist_it_t it = query.begin(); it != query.end(); it++) { + Xapian::Query nq; + if (!(*it)->toNativeQuery(db, &nq)) { + LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n", + (*it)->getReason().c_str())); + reason += (*it)->getReason() + " "; + return false; + } + if (nq.empty()) { + LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n")); + continue; + } + // If this structure is an AND list, must use AND_NOT for excl clauses. + // Else this is an OR list, and there can't be excl clauses (checked by + // addClause()) + Xapian::Query::op op; + if (tp == SCLT_AND) { + if ((*it)->getexclude()) { + op = Xapian::Query::OP_AND_NOT; + } else { + op = Xapian::Query::OP_AND; + } + } else { + op = Xapian::Query::OP_OR; + } + if (xq.empty()) { + if (op == Xapian::Query::OP_AND_NOT) + xq = Xapian::Query(op, Xapian::Query::MatchAll, nq); + else + xq = nq; + } else { + xq = Xapian::Query(op, xq, nq); + } + if (int(xq.get_length()) >= getMaxCl()) { + LOGERR(("%s\n", maxXapClauseMsg)); + m_reason += maxXapClauseMsg; + if (!o_index_stripchars) + m_reason += maxXapClauseCaseDiacMsg; + return false; + } + } + + LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length())); + + if (xq.empty()) + xq = Xapian::Query::MatchAll; + + *((Xapian::Query *)d) = xq; + return true; +} + +bool SearchData::toNativeQuery(Rcl::Db &db, void *d) +{ + LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str())); + m_reason.erase(); + + db.getConf()->getConfParam("maxTermExpand", &m_maxexp); + db.getConf()->getConfParam("maxXapianClauses", &m_maxcl); + + // Walk the clause list translating each in turn and building the + // 
Xapian query tree + Xapian::Query xq; + if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) { + LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n", + m_reason.c_str())); + return false; + } + + if (m_haveDates) { + // If one of the extremities is unset, compute db extremas + if (m_dates.y1 == 0 || m_dates.y2 == 0) { + int minyear = 1970, maxyear = 2100; + if (!db.maxYearSpan(&minyear, &maxyear)) { + LOGERR(("Can't retrieve index min/max dates\n")); + //whatever, go on. + } + + if (m_dates.y1 == 0) { + m_dates.y1 = minyear; + m_dates.m1 = 1; + m_dates.d1 = 1; + } + if (m_dates.y2 == 0) { + m_dates.y2 = maxyear; + m_dates.m2 = 12; + m_dates.d2 = 31; + } + } + LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n", + m_dates.y1, m_dates.m1, m_dates.d1, + m_dates.y2, m_dates.m2, m_dates.d2)); + Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1, + m_dates.y2, m_dates.m2, m_dates.d2); + if (dq.empty()) { + LOGINFO(("Db::toNativeQuery: date filter is empty\n")); + } + // If no probabilistic query is provided then promote the daterange + // filter to be THE query instead of filtering an empty query. 
+ if (xq.empty()) { + LOGINFO(("Db::toNativeQuery: proba query is empty\n")); + xq = dq; + } else { + xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq); + } + } + + + if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) { + Xapian::Query sq; + char min[50], max[50]; + sprintf(min, "%lld", (long long)m_minSize); + sprintf(max, "%lld", (long long)m_maxSize); + if (m_minSize == size_t(-1)) { + string value(max); + leftzeropad(value, 12); + sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value); + } else if (m_maxSize == size_t(-1)) { + string value(min); + leftzeropad(value, 12); + sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value); + } else { + string minvalue(min); + leftzeropad(minvalue, 12); + string maxvalue(max); + leftzeropad(maxvalue, 12); + sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE, + minvalue, maxvalue); + } + + // If no probabilistic query is provided then promote the + // filter to be THE query instead of filtering an empty query. + if (xq.empty()) { + LOGINFO(("Db::toNativeQuery: proba query is empty\n")); + xq = sq; + } else { + xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq); + } + } + + // Add the autophrase if any + if (m_autophrase.isNotNull()) { + Xapian::Query apq; + if (m_autophrase->toNativeQuery(db, &apq)) { + xq = xq.empty() ? apq : + Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq); + } + } + + // Add the file type filtering clause if any + if (!m_filetypes.empty()) { + expandFileTypes(db, m_filetypes); + + Xapian::Query tq; + for (vector::iterator it = m_filetypes.begin(); + it != m_filetypes.end(); it++) { + string term = wrap_prefix(mimetype_prefix) + *it; + LOGDEB0(("Adding file type term: [%s]\n", term.c_str())); + tq = tq.empty() ? Xapian::Query(term) : + Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term)); + } + xq = xq.empty() ? 
tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq); + } + + // Add the neg file type filtering clause if any + if (!m_nfiletypes.empty()) { + expandFileTypes(db, m_nfiletypes); + + Xapian::Query tq; + for (vector::iterator it = m_nfiletypes.begin(); + it != m_nfiletypes.end(); it++) { + string term = wrap_prefix(mimetype_prefix) + *it; + LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str())); + tq = tq.empty() ? Xapian::Query(term) : + Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term)); + } + xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq); + } + + *((Xapian::Query *)d) = xq; + return true; +} + +// Splitter callback for breaking a user string into simple terms and +// phrases. This is for parts of the user entry which would appear as +// a single word because there is no white space inside, but are +// actually multiple terms to rcldb (ie term1,term2) +class TextSplitQ : public TextSplitP { + public: + TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc) + : TextSplitP(prc, flags), + curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0) + {} + + bool takeword(const std::string &term, int pos, int bs, int be) + { + // Check if the first letter is a majuscule in which + // case we do not want to do stem expansion. Need to do this + // before unac of course... + curnostemexp = unaciscapital(term); + + return TextSplitP::takeword(term, pos, bs, be); + } + + bool curnostemexp; + vector terms; + vector nostemexps; + const StopList &stops; + // Count of terms including stopwords: this is for adjusting + // phrase/near slack + int alltermcount; + int lastpos; +}; + +class TermProcQ : public TermProc { +public: + TermProcQ() : TermProc(0), m_ts(0) {} + void setTSQ(TextSplitQ *ts) {m_ts = ts;} + + bool takeword(const std::string &term, int pos, int bs, int be) + { + m_ts->alltermcount++; + if (m_ts->lastpos < pos) + m_ts->lastpos = pos; + bool noexpand = be ? 
m_ts->curnostemexp : true; + LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", + term.c_str(), pos, noexpand)); + if (m_terms[pos].size() < term.size()) { + m_terms[pos] = term; + m_nste[pos] = noexpand; + } + return true; + } + bool flush() + { + for (map::const_iterator it = m_terms.begin(); + it != m_terms.end(); it++) { + m_ts->terms.push_back(it->second); + m_ts->nostemexps.push_back(m_nste[it->first]); + } + return true; + } +private: + TextSplitQ *m_ts; + map m_terms; + map m_nste; +}; + + +#if 1 +static void listVector(const string& what, const vector&l) +{ + string a; + for (vector::const_iterator it = l.begin(); it != l.end(); it++) { + a = a + *it + " "; + } + LOGDEB0(("%s: %s\n", what.c_str(), a.c_str())); +} +#endif + +/** Expand term into term list, using appropriate mode: stem, wildcards, + * diacritics... + * + * @param mods stem expansion, case and diacritics sensitivity control. + * @param term input single word + * @param oexp output expansion list + * @param sterm output original input term if there were no wildcards + * @param prefix field prefix in index. We could recompute it, but the caller + * has it already. Used in the simple case where there is nothing to expand, + * and we just return the prefixed term (else Db::termMatch deals with it). 
+ */ +bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, + string& ermsg, int mods, + const string& term, + vector& oexp, string &sterm, + const string& prefix) +{ + LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n", + mods, m_field.c_str(), term.c_str(), getStemLang().c_str())); + sterm.clear(); + oexp.clear(); + if (term.empty()) + return true; + + bool maxexpissoft = false; + int maxexpand = getSoftMaxExp(); + if (maxexpand != -1) { + maxexpissoft = true; + } else { + maxexpand = getMaxExp(); + } + + bool haswild = term.find_first_of(cstr_minwilds) != string::npos; + + // If there are no wildcards, add term to the list of user-entered terms + if (!haswild) { + m_hldata.uterms.insert(term); + sterm = term; + } + // No stem expansion if there are wildcards or if prevented by caller + bool nostemexp = (mods & SDCM_NOSTEMMING) != 0; + if (haswild || getStemLang().empty()) { + LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n")); + nostemexp = true; + } + + // noexpansion can be modified further down by possible case/diac expansion + bool noexpansion = nostemexp && !haswild; + + int termmatchsens = 0; + + bool diac_sensitive = (mods & SDCM_DIACSENS) != 0; + bool case_sensitive = (mods & SDCM_CASESENS) != 0; + + if (o_index_stripchars) { + diac_sensitive = case_sensitive = false; + } else { + // If we are working with a raw index, apply the rules for case and + // diacritics sensitivity. + + // If any character has a diacritic, we become + // diacritic-sensitive. Note that the way that the test is + // performed (conversion+comparison) will automatically ignore + // accented characters which are actually a separate letter + if (getAutoDiac() && unachasaccents(term)) { + LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n")); + diac_sensitive = true; + } + + // If any character apart the first is uppercase, we become + // case-sensitive. The first character is reserved for + // turning off stemming. 
You need to use a query language + // modifier to search for Floor in a case-sensitive way. + Utf8Iter it(term); + it++; + if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) { + LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n")); + case_sensitive = true; + } + + // If we are sensitive to case or diacritics turn stemming off + if (diac_sensitive || case_sensitive) { + LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n")); + nostemexp = true; + } + + if (!case_sensitive || !diac_sensitive) + noexpansion = false; + } + + if (case_sensitive) + termmatchsens |= Db::ET_CASESENS; + if (diac_sensitive) + termmatchsens |= Db::ET_DIACSENS; + + if (noexpansion) { + oexp.push_back(prefix + term); + m_hldata.terms[term] = term; + LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str())); + return true; + } + + Db::MatchType mtyp = haswild ? Db::ET_WILD : + nostemexp ? Db::ET_NONE : Db::ET_STEM; + TermMatchResult res; + if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand, + m_field)) { + // Let it go through + } + + // Term match entries to vector of terms + if (int(res.entries.size()) >= maxexpand && !maxexpissoft) { + ermsg = "Maximum term expansion size exceeded." 
+ " Maybe use case/diacritics sensitivity or increase maxTermExpand."; + return false; + } + for (vector::const_iterator it = res.entries.begin(); + it != res.entries.end(); it++) { + oexp.push_back(it->term); + } + // If the term does not exist at all in the db, the return from + // termMatch() is going to be empty, which is not what we want (we + // would then compute an empty Xapian query) + if (oexp.empty()) + oexp.push_back(prefix + term); + + // Remember the uterm-to-expansion links + for (vector::const_iterator it = oexp.begin(); + it != oexp.end(); it++) { + m_hldata.terms[strip_prefix(*it)] = term; + } + LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str())); + return true; +} + +// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d +void multiply_groups(vector >::const_iterator vvit, + vector >::const_iterator vvend, + vector& comb, + vector >&allcombs) +{ + // Remember my string vector and compute next, for recursive calls. + vector >::const_iterator myvit = vvit++; + + // Walk the string vector I'm called upon and, for each string, + // add it to current result, an call myself recursively on the + // next string vector. 
The last call (last element of the vector of + // vectors), adds the elementary result to the output + + // Walk my string vector + for (vector::const_iterator strit = (*myvit).begin(); + strit != (*myvit).end(); strit++) { + + // Add my current value to the string vector we're building + comb.push_back(*strit); + + if (vvit == vvend) { + // Last call: store current result + allcombs.push_back(comb); + } else { + // Call recursively on next string vector + multiply_groups(vvit, vvend, comb, allcombs); + } + // Pop the value I just added (make room for the next element in my + // vector) + comb.pop_back(); + } +} + +void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg, + const string& span, + int mods, void * pq) +{ + vector& pqueries(*(vector*)pq); + LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n", + span.c_str(), (unsigned int)mods)); + vector exp; + string sterm; // dumb version of user term + + string prefix; + const FieldTraits *ftp; + if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { + prefix = wrap_prefix(ftp->pfx); + } + + if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix)) + return; + + // Set up the highlight data. No prefix should go in there + for (vector::const_iterator it = exp.begin(); + it != exp.end(); it++) { + m_hldata.groups.push_back(vector(1, it->substr(prefix.size()))); + m_hldata.slacks.push_back(0); + m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1); + } + + // Push either term or OR of stem-expanded set + Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end()); + m_curcl += exp.size(); + + // If sterm (simplified original user term) is not null, give it a + // relevance boost. We do this even if no expansion occurred (else + // the non-expanded terms in a term list would end-up with even + // less wqf). This does not happen if there are wildcards anywhere + // in the search. + // We normally boost the original term in the stem expansion list. 
Don't + // do it if there are wildcards anywhere, this would skew the results. + bool doBoostUserTerm = + (m_parentSearch && !m_parentSearch->haveWildCards()) || + (m_parentSearch == 0 && !m_haveWildCards); + if (doBoostUserTerm && !sterm.empty()) { + xq = Xapian::Query(Xapian::Query::OP_OR, xq, + Xapian::Query(prefix+sterm, + original_term_wqf_booster)); + } + pqueries.push_back(xq); +} + +// User entry element had several terms: transform into a PHRASE or +// NEAR xapian query, the elements of which can themselves be OR +// queries if the terms get expanded by stemming or wildcards (we +// don't do stemming for PHRASE though) +void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, + TextSplitQ *splitData, + int mods, void *pq, + bool useNear, int slack) +{ + vector &pqueries(*(vector*)pq); + Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : + Xapian::Query::OP_PHRASE; + vector orqueries; +#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF + bool hadmultiple = false; +#endif + vector >groups; + + string prefix; + const FieldTraits *ftp; + if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) { + prefix = wrap_prefix(ftp->pfx); + } + + if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) { + orqueries.push_back(Xapian::Query(prefix + start_of_field_term)); + slack++; + } + + // Go through the list and perform stem/wildcard expansion for each element + vector::iterator nxit = splitData->nostemexps.begin(); + for (vector::iterator it = splitData->terms.begin(); + it != splitData->terms.end(); it++, nxit++) { + LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str())); + // Adjust when we do stem expansion. Not if disabled by + // caller, not inside phrases, and some versions of xapian + // will accept only one OR clause inside NEAR. 
+ bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) +#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF + || hadmultiple +#endif // single OR inside NEAR + ; + int lmods = mods; + if (nostemexp) + lmods |= SearchDataClause::SDCM_NOSTEMMING; + string sterm; + vector exp; + if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix)) + return; + LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size())); + listVector("", exp); + // groups is used for highlighting, we don't want prefixes in there. + vector noprefs; + for (vector::const_iterator it = exp.begin(); + it != exp.end(); it++) { + noprefs.push_back(it->substr(prefix.size())); + } + groups.push_back(noprefs); + orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, + exp.begin(), exp.end())); + m_curcl += exp.size(); + if (m_curcl >= getMaxCl()) + return; +#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF + if (exp.size() > 1) + hadmultiple = true; +#endif + } + + if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) { + orqueries.push_back(Xapian::Query(prefix + end_of_field_term)); + slack++; + } + + // Generate an appropriate PHRASE/NEAR query with adjusted slack + // For phrases, give a relevance boost like we do for original terms + LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", + splitData->alltermcount, splitData->lastpos)); + Xapian::Query xq(op, orqueries.begin(), orqueries.end(), + splitData->lastpos + 1 + slack); + if (op == Xapian::Query::OP_PHRASE) + xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, + original_term_wqf_booster); + pqueries.push_back(xq); + + // Add all combinations of NEAR/PHRASE groups to the highlighting data. 
+ vector > allcombs; + vector comb; + multiply_groups(groups.begin(), groups.end(), comb, allcombs); + + // Insert the search groups and slacks in the highlight data, with + // a reference to the user entry that generated them: + m_hldata.groups.insert(m_hldata.groups.end(), + allcombs.begin(), allcombs.end()); + m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack); + m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(), + m_hldata.ugroups.size() - 1); +} + +// Trim string beginning with ^ or ending with $ and convert to flags +static int stringToMods(string& s) +{ + int mods = 0; + // Check for an anchored search + trimstring(s); + if (s.length() > 0 && s[0] == '^') { + mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART; + s.erase(0, 1); + } + if (s.length() > 0 && s[s.length()-1] == '$') { + mods |= Rcl::SearchDataClause::SDCM_ANCHOREND; + s.erase(s.length()-1); + } + return mods; +} + +/** + * Turn user entry string (NOT query language) into a list of xapian queries. + * We just separate words and phrases, and do wildcard and stem expansion, + * + * This is used to process data entered into an OR/AND/NEAR/PHRASE field of + * the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user + * entry). + * + * This appears awful, and it would seem that the split into + * terms/phrases should be performed in the upper layer so that we + * only receive pure term or near/phrase pure elements here, but in + * fact there are things that would appear like terms to naive code, + * and which will actually may be turned into phrases (ie: tom:jerry), + * in a manner which intimately depends on the index implementation, + * so that it makes sense to process this here. + * + * The final list contains one query for each term or phrase + * - Elements corresponding to a stem-expanded part are an OP_OR + * composition of the stem-expanded terms (or a single term query). 
+ * - Elements corresponding to phrase/near are an OP_PHRASE/NEAR + * composition of the phrase terms (no stem expansion in this case) + * @return the subquery count (either or'd stem-expanded terms or phrase word + * count) + */ +bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq, + string &ermsg, void *pq, + int slack, bool useNear) +{ + vector &pqueries(*(vector*)pq); + int mods = m_modifiers; + + LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x " + "slack %d near %d\n", + iq.c_str(), m_field.c_str(), mods, slack, useNear)); + ermsg.erase(); + m_curcl = 0; + const StopList stops = db.getStopList(); + + // Simple whitespace-split input into user-level words and + // double-quoted phrases: word1 word2 "this is a phrase". + // + // The text splitter may further still decide that the resulting + // "words" are really phrases, this depends on separators: + // [paul@dom.net] would still be a word (span), but [about:me] + // will probably be handled as a phrase. + vector phrases; + TextSplit::stringToStrings(iq, phrases); + + // Process each element: textsplit into terms, handle stem/wildcard + // expansion and transform into an appropriate Xapian::Query + try { + for (vector::iterator it = phrases.begin(); + it != phrases.end(); it++) { + LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str())); + // Anchoring modifiers + int amods = stringToMods(*it); + int terminc = amods != 0 ? 1 : 0; + mods |= amods; + // If there are multiple spans in this element, including + // at least one composite, we have to increase the slack + // else a phrase query including a span would fail. + // Ex: "term0@term1 term2" is onlyspans-split as: + // 0 term0@term1 0 12 + // 2 term2 13 18 + // The position of term2 is 2, not 1, so a phrase search + // would fail. + // We used to do word split, searching for + // "term0 term1 term2" instead, which may have worse + // performance, but will succeed. 
+ // We now adjust the phrase/near slack by comparing the term count + // and the last position + + // The term processing pipeline: + TermProcQ tpq; + TermProc *nxt = &tpq; + TermProcStop tpstop(nxt, stops); nxt = &tpstop; + //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon; + //tpcommon.onlygrams(true); + TermProcPrep tpprep(nxt); + if (o_index_stripchars) + nxt = &tpprep; + + TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | + TextSplit::TXTS_KEEPWILD), + stops, nxt); + tpq.setTSQ(&splitter); + splitter.text_to_words(*it); + + slack += splitter.lastpos - splitter.terms.size() + 1; + + LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size())); + switch (splitter.terms.size() + terminc) { + case 0: + continue;// ?? + case 1: { + int lmods = mods; + if (splitter.nostemexps.front()) + lmods |= SearchDataClause::SDCM_NOSTEMMING; + m_hldata.ugroups.push_back(splitter.terms); + processSimpleSpan(db, ermsg, splitter.terms.front(), + lmods, &pqueries); + } + break; + default: + m_hldata.ugroups.push_back(splitter.terms); + processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries, + useNear, slack); + } + if (m_curcl >= getMaxCl()) { + ermsg = maxXapClauseMsg; + if (!o_index_stripchars) + ermsg += maxXapClauseCaseDiacMsg; + break; + } + } + } catch (const Xapian::Error &e) { + ermsg = e.get_msg(); + } catch (const string &s) { + ermsg = s; + } catch (const char *s) { + ermsg = s; + } catch (...) { + ermsg = "Caught unknown exception"; + } + if (!ermsg.empty()) { + LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str())); + return false; + } + return true; +} + +// Translate a simple OR or AND search clause. 
+bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p) +{ + LOGDEB(("SearchDataClauseSimple::toNativeQuery: fld [%s] val [%s] " + "stemlang [%s]\n", m_field.c_str(), m_text.c_str(), + getStemLang().c_str())); + + Xapian::Query *qp = (Xapian::Query *)p; + *qp = Xapian::Query(); + + Xapian::Query::op op; + switch (m_tp) { + case SCLT_AND: op = Xapian::Query::OP_AND; break; + case SCLT_OR: op = Xapian::Query::OP_OR; break; + default: + LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp)); + return false; + } + + vector pqueries; + if (!processUserString(db, m_text, m_reason, &pqueries)) + return false; + if (pqueries.empty()) { + LOGERR(("SearchDataClauseSimple: resolved to null query\n")); + return true; + } + + *qp = Xapian::Query(op, pqueries.begin(), pqueries.end()); + if (m_weight != 1.0) { + *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight); + } + return true; +} + +// Translate a FILENAME search clause. This always comes +// from a "filename" search from the gui or recollq. A query language +// "filename:"-prefixed field will not go through here, but through +// the generic field-processing code. +// +// We do not split the entry any more (used to do some crazy thing +// about expanding multiple fragments in the past). We just take the +// value blanks and all and expand this against the indexed unsplit +// file names +bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p) +{ + Xapian::Query *qp = (Xapian::Query *)p; + *qp = Xapian::Query(); + + int maxexp = getSoftMaxExp(); + if (maxexp == -1) + maxexp = getMaxExp(); + + vector names; + db.filenameWildExp(m_text, names, maxexp); + *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end()); + + if (m_weight != 1.0) { + *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight); + } + return true; +} + +// Translate a dir: path filtering clause. 
See comments in .h +bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p) +{ + LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str())); + Xapian::Query *qp = (Xapian::Query *)p; + *qp = Xapian::Query(); + + if (m_text.empty()) { + LOGERR(("SearchDataClausePath: empty path??\n")); + m_reason = "Empty path ?"; + return false; + } + + vector orqueries; + + if (m_text[0] == '/') + orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix))); + else + m_text = path_tildexpand(m_text); + + vector vpath; + stringToTokens(m_text, vpath, "/"); + + for (vector::const_iterator pit = vpath.begin(); + pit != vpath.end(); pit++){ + + string sterm; + vector exp; + if (!expandTerm(db, m_reason, + SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS, + *pit, exp, sterm, wrap_prefix(pathelt_prefix))) { + return false; + } + LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size())); + listVector("", exp); + if (exp.size() == 1) + orqueries.push_back(Xapian::Query(exp[0])); + else + orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, + exp.begin(), exp.end())); + m_curcl += exp.size(); + if (m_curcl >= getMaxCl()) + return false; + } + + *qp = Xapian::Query(Xapian::Query::OP_PHRASE, + orqueries.begin(), orqueries.end()); + + if (m_weight != 1.0) { + *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight); + } + return true; +} + +// Translate NEAR or PHRASE clause. +bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p) +{ + LOGDEB(("SearchDataClauseDist::toNativeQuery\n")); + + Xapian::Query *qp = (Xapian::Query *)p; + *qp = Xapian::Query(); + + vector pqueries; + Xapian::Query nq; + + // We produce a single phrase out of the user entry then use + // stringToXapianQueries() to lowercase and simplify the phrase + // terms etc. This will result into a single (complex) + // Xapian::Query. 
+ if (m_text.find('\"') != string::npos) { + m_text = neutchars(m_text, "\""); + } + string s = cstr_dquote + m_text + cstr_dquote; + bool useNear = (m_tp == SCLT_NEAR); + if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear)) + return false; + if (pqueries.empty()) { + LOGERR(("SearchDataClauseDist: resolved to null query\n")); + return true; + } + + *qp = *pqueries.begin(); + if (m_weight != 1.0) { + *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight); + } + return true; +} + +} // Namespace Rcl