%{
// Bison grammar for the query language handled by WasaParserDriver.
// The semantic actions below build Rcl::SearchData / Rcl::SearchDataClause*
// objects and the top rule stores the final result in d->m_result.
#define YYDEBUG 1
#include "autoconfig.h"
// NOTE(review): the three #include directives below have no header operand
// in this copy of the file -- confirm the intended headers.
#include
#include
#include
#include "searchdata.h"
#include "wasaparserdriver.h"
#include "wasaparse.hpp"
using namespace std;
// Parser tracing: LOGP(X) streams X to cerr when LOG_PARSER is defined and
// expands to nothing otherwise.
//#define LOG_PARSER
#ifdef LOG_PARSER
#define LOGP(X) {cerr << X;}
#else
#define LOGP(X)
#endif
// Lexer entry point, defined in the epilogue after the second %%.
int yylex(yy::parser::semantic_type *, yy::parser::location_type *, WasaParserDriver *);
void yyerror(char const *);
// Applies the qualifier letters following a quoted string to a clause;
// defined in the epilogue.
static void qualify(Rcl::SearchDataClauseDist *, const string &);
// Attach sq to sd as a SearchDataClauseSub clause. Does nothing when either
// pointer is null. The driver argument d is not used by the body.
// NOTE(review): std::shared_ptr is written without a template argument in
// this copy -- confirm the intended element type.
static void addSubQuery(WasaParserDriver *d, Rcl::SearchData *sd, Rcl::SearchData *sq)
{
    if (sd && sq)
        sd->addClause( new Rcl::SearchDataClauseSub(std::shared_ptr(sq)));
}
%}
%skeleton "lalr1.cc"
%defines
%locations
%error-verbose
/* The driver is passed to both the parser and the lexer as "d". */
%parse-param {WasaParserDriver* d}
%lex-param {WasaParserDriver* d}
/* Semantic values are raw pointers; the actions below delete the string
   operands they consume. */
%union {
    std::string *str;
    Rcl::SearchDataClauseRange *rg;
    Rcl::SearchDataClauseSimple *cl;
    Rcl::SearchData *sd;
}
/* NOTE(review): %destructor and the %type lines below carry no symbol list
   or <tag> in this copy -- confirm against the original grammar. */
%destructor {delete $$;}
%type qualquote
%type fieldexpr
%type range
%type term
%type query
%type complexfieldname
/* Non operator tokens need precedence because of the possibility of
   concatenation which needs to have lower prec than OR */
%left WORD
%left QUOTED
%left QUALIFIERS
%left AND UCONCAT '(' '-'
%left OR
%token EQUALS CONTAINS SMALLEREQ SMALLER GREATEREQ GREATER RANGE
%%
/* Start rule: stores the resulting query in the driver. */
topquery: query {
    // It's possible that we end up with no query (e.g.: because just a
    // date filter was set, no terms). Allocate an empty query so that we
    // have something to set the global criteria on (this will yield a
    // Xapian search like FILTER xxx).
    if ($1 == 0)
        d->m_result = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    else
        d->m_result = $1;
}
/* A query is a combination of sub-queries or a single field expression.
   A null semantic value stands for "no query". */
query: query query %prec UCONCAT {
    // Two adjacent queries are combined under an SCLT_AND node, like an
    // explicit AND. Null operands are skipped by addSubQuery(); the
    // result is null only when both operands are null.
    LOGP("q: query query\n");
    Rcl::SearchData *sd = 0;
    if ($1 || $2) {
        sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
        addSubQuery(d, sd, $1);
        addSubQuery(d, sd, $2);
    }
    $$ = sd;
}
| query AND query {
    // Explicit AND: same construction as the concatenation case above.
    LOGP("q: query AND query\n");
    Rcl::SearchData *sd = 0;
    if ($1 || $3) {
        sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
        addSubQuery(d, sd, $1);
        addSubQuery(d, sd, $3);
    }
    $$ = sd;
}
| query OR query {
    // OR: both operands become sub-queries of an SCLT_OR node.
    LOGP("query: query OR query\n");
    Rcl::SearchData *top = 0;
    if ($1 || $3) {
        top = new Rcl::SearchData(Rcl::SCLT_OR, d->m_stemlang);
        addSubQuery(d, top, $1);
        addSubQuery(d, top, $3);
    }
    $$ = top;
}
| '(' query ')' {
    // Parentheses only group: the inner query is passed through.
    LOGP("q: ( query )\n");
    $$ = $2;
}
| fieldexpr %prec UCONCAT {
    // Wrap a single clause in its own SCLT_AND SearchData. If the
    // driver's addClause() returns a false value, the new SearchData is
    // discarded and the rule yields a null query.
    LOGP("q: fieldexpr\n");
    Rcl::SearchData *sd = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    if (d->addClause(sd, $1)) {
        $$ = sd;
    } else {
        delete sd;
        $$ = 0;
    }
}
;
/* A field expression is a bare term, or "fieldname RELATION value". The
   field-name alternatives set the field and relation on the value clause
   and then delete the field-name string. */
fieldexpr: term {
    LOGP("fe: simple fieldexpr: " << $1->gettext() << endl);
    $$ = $1;
}
| complexfieldname EQUALS term {
    LOGP("fe: " << *$1 << " = " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_EQUALS);
    $$ = $3;
    delete $1;
}
| complexfieldname CONTAINS term {
    LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
    $$ = $3;
    delete $1;
}
| complexfieldname CONTAINS range {
    // Same as the previous alternative, with a range clause as the value.
    LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
    $$ = $3;
    delete $1;
}
| complexfieldname SMALLER term {
    LOGP("fe: " << *$1 << " < " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LT);
    $$ = $3;
    delete $1;
}
| complexfieldname SMALLEREQ term {
    LOGP("fe: " << *$1 << " <= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LTE);
    $$ = $3;
    delete $1;
}
| complexfieldname GREATER term {
    LOGP("fe: " << *$1 << " > " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GT);
    $$ = $3;
    delete $1;
}
| complexfieldname GREATEREQ term {
    LOGP("fe: " << *$1 << " >= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GTE);
    $$ = $3;
    delete $1;
}
| '-' fieldexpr {
    // A leading '-' marks the clause as excluded.
    LOGP("fe: - fieldexpr[" << $2->gettext() << "]" << endl);
    $2->setexclude(true);
    $$ = $2;
}
;
/* Deal with field names like dc:title */
complexfieldname: WORD {
    LOGP("cfn: WORD" << endl);
    $$ = $1;
}
| complexfieldname CONTAINS WORD {
    // Join the parts with ':' into a new string; both operands are
    // deleted once they have been copied.
    LOGP("cfn: complexfieldname ':' WORD" << endl);
    $$ = new string(*$1 + string(":") + *$3);
    delete $1;
    delete $3;
}
/* Ranges: either bound may be omitted, in which case an empty string is
   passed to SearchDataClauseRange for that bound. */
range: WORD RANGE WORD {
    LOGP("Range: " << *$1 << string(" .. ") << *$3 << endl);
    $$ = new Rcl::SearchDataClauseRange(*$1, *$3);
    delete $1;
    delete $3;
}
| RANGE WORD {
    LOGP("Range: " << "" << string(" .. ") << *$2 << endl);
    $$ = new Rcl::SearchDataClauseRange("", *$2);
    delete $2;
}
| WORD RANGE {
    LOGP("Range: " << *$1 << string(" .. ") << "" << endl);
    $$ = new Rcl::SearchDataClauseRange(*$1, "");
    delete $1;
}
;
/* A term is a single word (an SCLT_AND simple clause) or a quoted string. */
term: WORD {
    LOGP("term[" << *$1 << "]" << endl);
    $$ = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1);
    delete $1;
}
| qualquote {
    $$ = $1;
}
/* A quoted string becomes an SCLT_PHRASE SearchDataClauseDist, optionally
   adjusted by the qualifiers that follow the closing quote. */
qualquote: QUOTED {
    LOGP("QUOTED[" << *$1 << "]" << endl);
    $$ = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    delete $1;
}
| QUOTED QUALIFIERS {
    LOGP("QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl);
    Rcl::SearchDataClauseDist *cl = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    // qualify() interprets the qualifier characters and updates cl.
    qualify(cl, *$2);
    $$ = cl;
    delete $1;
    delete $2;
}
%%
// NOTE(review): this #include has no header operand in this copy of the
// file -- confirm the intended header.
#include
// Look for int at index, skip and return new index found? value.
// Parse an optional decimal integer located right after position cur in q.
//
// @param q     qualifier string being scanned
// @param cur   index of the current qualifier character; the number, if
//              any, starts at cur + 1
// @param pval  receives the parsed value; left untouched when strtol()
//              consumes nothing
// @return index of the last character consumed (cur itself when no number
//         was found), so that the caller's loop increment lands on the
//         first unread character.
static unsigned int qualGetInt(const string& q, unsigned int cur, int *pval)
{
    unsigned int ncur = cur;
    // Written as "cur + 1 < size" rather than "cur < size - 1": the latter
    // wraps around for an empty string and would let strtol() read out of
    // bounds.
    if (static_cast<string::size_type>(cur) + 1 < q.size()) {
        const char *start = q.c_str() + cur + 1;
        char *endptr = nullptr;
        const long val = strtol(start, &endptr, 10);
        if (endptr != start) {
            ncur += static_cast<unsigned int>(endptr - start);
            *pval = static_cast<int>(val);
        }
    }
    return ncur;
}

// Apply the qualifier characters found after a quoted string to clause cl.
//
// Characters handled:
//   b        weight 10.0
//   C        SDCM_CASESENS
//   D        SDCM_DIACSENS
//   e        SDCM_CASESENS + SDCM_DIACSENS + SDCM_NOSTEMMING
//   l        SDCM_NOSTEMMING
//   o[N]     slack N (10 when no number follows)
//   p        clause type SCLT_NEAR; slack set to 10 if it is currently 0
//   s        SDCM_NOSYNS
//   number   weight, unless the value is 1.0
//   c d L S  accepted, no effect
// Any other character is ignored.
static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
{
    for (unsigned int i = 0; i < quals.length(); i++) {
        switch (quals[i]) {
        case 'b':
            cl->setWeight(10.0);
            break;
        case 'c':
            break;
        case 'C':
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            break;
        case 'd':
            break;
        case 'D':
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            break;
        case 'e':
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'l':
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'L':
            break;
        case 'o': {
            // The default of 10 is kept when no integer follows the 'o'.
            int slack = 10;
            i = qualGetInt(quals, i, &slack);
            cl->setslack(slack);
            break;
        }
        case 'p':
            cl->setTp(Rcl::SCLT_NEAR);
            // Do not override a slack that was already set to non-zero.
            if (cl->getslack() == 0) {
                cl->setslack(10);
            }
            break;
        case 's':
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS);
            break;
        case 'S':
            break;
        case '.': case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9': {
            int n = 0;
            float factor = 1.0;
            // Only trust factor when exactly one conversion succeeded;
            // sscanf() may also return EOF, which is non-zero.
            if (sscanf(&(quals[i]), "%f %n", &factor, &n) == 1) {
                if (factor != 1.0) {
                    cl->setWeight(factor);
                }
            }
            // %n reports how many characters were consumed: skip them,
            // leaving one for the loop increment.
            if (n > 0)
                i += n - 1;
            break;
        }
        default:
            break;
        }
    }
}

// specialstartchars are special only at the beginning of a token
// (e.g.
doctor-who is a term, not 2 terms separated by '-') static const string specialstartchars("-"); // specialinchars are special everywhere except inside a quoted string static const string specialinchars(":=<>()"); // Called with the first dquote already read static int parseString(WasaParserDriver *d, yy::parser::semantic_type *yylval) { string* value = new string(); d->qualifiers().clear(); int c; while ((c = d->GETCHAR())) { switch (c) { case '\\': /* Escape: get next char */ c = d->GETCHAR(); if (c == 0) { value->push_back(c); goto out; } value->push_back(c); break; case '"': /* End of string. Look for qualifiers */ while ((c = d->GETCHAR()) && (isalnum(c) || c == '.')) d->qualifiers().push_back(c); d->UNGETCHAR(c); goto out; default: value->push_back(c); } } out: //cerr << "GOT QUOTED ["<qualifiers() << "]" << endl; yylval->str = value; return yy::parser::token::QUOTED; } int yylex(yy::parser::semantic_type *yylval, yy::parser::location_type *, WasaParserDriver *d) { if (!d->qualifiers().empty()) { yylval->str = new string(); yylval->str->swap(d->qualifiers()); return yy::parser::token::QUALIFIERS; } int c; /* Skip white space. 
*/ while ((c = d->GETCHAR()) && isspace(c)) continue; if (c == 0) return 0; if (specialstartchars.find_first_of(c) != string::npos) { //cerr << "yylex: return " << c << endl; return c; } // field-term relations, and ranges switch (c) { case '=': return yy::parser::token::EQUALS; case ':': return yy::parser::token::CONTAINS; case '<': { int c1 = d->GETCHAR(); if (c1 == '=') { return yy::parser::token::SMALLEREQ; } else { d->UNGETCHAR(c1); return yy::parser::token::SMALLER; } } case '.': { int c1 = d->GETCHAR(); if (c1 == '.') { return yy::parser::token::RANGE; } else { d->UNGETCHAR(c1); break; } } case '>': { int c1 = d->GETCHAR(); if (c1 == '=') { return yy::parser::token::GREATEREQ; } else { d->UNGETCHAR(c1); return yy::parser::token::GREATER; } } case '(': case ')': return c; } if (c == '"') return parseString(d, yylval); d->UNGETCHAR(c); // Other chars start a term or field name or reserved word string* word = new string(); while ((c = d->GETCHAR())) { if (isspace(c)) { //cerr << "Word broken by whitespace" << endl; break; } else if (specialinchars.find_first_of(c) != string::npos) { //cerr << "Word broken by special char" << endl; d->UNGETCHAR(c); break; } else if (c == '.') { int c1 = d->GETCHAR(); if (c1 == '.') { d->UNGETCHAR(c1); d->UNGETCHAR(c); break; } else { d->UNGETCHAR(c1); word->push_back(c); } } else if (c == 0) { //cerr << "Word broken by EOF" << endl; break; } else { word->push_back(c); } } if (!word->compare("AND") || !word->compare("&&")) { delete word; return yy::parser::token::AND; } else if (!word->compare("OR") || !word->compare("||")) { delete word; return yy::parser::token::OR; } // cerr << "Got word [" << word << "]" << endl; yylval->str = word; return yy::parser::token::WORD; }