506 lines
12 KiB
Plaintext
506 lines
12 KiB
Plaintext
%{
|
|
#define YYDEBUG 1
|
|
#include "autoconfig.h"
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <iostream>
|
|
#include <string>
|
|
|
|
#include "searchdata.h"
|
|
#include "wasaparserdriver.h"
|
|
#include "wasaparse.hpp"
|
|
|
|
using namespace std;
|
|
|
|
// Uncomment to trace grammar reductions on stderr.
//#define LOG_PARSER
// LOGP(X) streams X to cerr when LOG_PARSER is defined and compiles to
// nothing otherwise (X is then not evaluated). Both variants are wrapped in
// do/while(0) so that "LOGP(...);" is exactly one statement and stays safe
// inside an unbraced if/else.
#ifdef LOG_PARSER
#define LOGP(X) do { std::cerr << X; } while (0)
#else
#define LOGP(X) do { } while (0)
#endif
|
|
|
|
int yylex(yy::parser::semantic_type *, yy::parser::location_type *,
|
|
WasaParserDriver *);
|
|
void yyerror(char const *);
|
|
static void qualify(Rcl::SearchDataClauseDist *, const string &);
|
|
|
|
// Wraps sub-query sq in a SearchDataClauseSub and appends it to sd.
// Nothing happens when either pointer is null. Otherwise sq is handed to a
// std::shared_ptr, so the caller must not delete it afterwards.
// The driver argument d is not used here.
static void addSubQuery(WasaParserDriver *d,
                        Rcl::SearchData *sd, Rcl::SearchData *sq)
{
    if (!sd || !sq)
        return;
    sd->addClause(
        new Rcl::SearchDataClauseSub(std::shared_ptr<Rcl::SearchData>(sq)));
}
|
|
|
|
%}
|
|
|
|
/* C++ parser skeleton, generated header, location tracking and verbose
   syntax error messages. */
%skeleton "lalr1.cc"
%defines
%locations
%error-verbose

/* The same driver object is passed to the parser and to yylex(). */
%parse-param {WasaParserDriver* d}
%lex-param {WasaParserDriver* d}

/* Semantic values are raw pointers. */
%union {
    std::string *str;
    Rcl::SearchDataClauseRange *rg;
    Rcl::SearchDataClauseSimple *cl;
    Rcl::SearchData *sd;
}
/* Only string values are released when the parser discards a symbol.
   NOTE(review): rg, cl and sd values have no destructor here; confirm
   whether values dropped during error recovery are meant to leak. */
%destructor {delete $$;} <str>

%type <cl> qualquote
%type <cl> fieldexpr
%type <rg> range
%type <cl> term
%type <sd> query
%type <str> complexfieldname

/* Non-operator tokens need a precedence because of the possibility of
   concatenation (implicit AND), which needs to have lower precedence
   than OR. Later lines bind tighter. */
%left <str> WORD
%left <str> QUOTED
%left <str> QUALIFIERS
%left AND UCONCAT '(' '-'
%left OR

%token EQUALS CONTAINS SMALLEREQ SMALLER GREATEREQ GREATER RANGE
|
|
|
|
%%
|
|
|
|
topquery: query
{
    // The query may legitimately be empty (e.g. only a date filter was set
    // and there are no terms). Store an empty AND query in that case so
    // that there is something to set the global criteria on; this yields a
    // Xapian search like <alldocuments> FILTER xxx.
    if ($1) {
        d->m_result = $1;
    } else {
        d->m_result = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    }
}
|
|
|
|
query:
/* Juxtaposition: implicit AND of the two sub-queries. */
query query %prec UCONCAT
{
    LOGP("q: query query\n");
    Rcl::SearchData *both = nullptr;
    if ($1 || $2) {
        // addSubQuery() ignores a null side, so only real operands are added
        both = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
        addSubQuery(d, both, $1);
        addSubQuery(d, both, $2);
    }
    $$ = both;
}
/* Explicit AND. */
| query AND query
{
    LOGP("q: query AND query\n");
    Rcl::SearchData *both = nullptr;
    if ($1 || $3) {
        both = new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
        addSubQuery(d, both, $1);
        addSubQuery(d, both, $3);
    }
    $$ = both;
}
/* Explicit OR. */
| query OR query
{
    LOGP("query: query OR query\n");
    Rcl::SearchData *either = nullptr;
    if ($1 || $3) {
        either = new Rcl::SearchData(Rcl::SCLT_OR, d->m_stemlang);
        addSubQuery(d, either, $1);
        addSubQuery(d, either, $3);
    }
    $$ = either;
}
/* Parentheses only group: the inner query is passed through. */
| '(' query ')'
{
    LOGP("q: ( query )\n");
    $$ = $2;
}
/* A single field expression becomes an AND query holding one clause. */
| fieldexpr %prec UCONCAT
{
    LOGP("q: fieldexpr\n");
    Rcl::SearchData *wrapper =
        new Rcl::SearchData(Rcl::SCLT_AND, d->m_stemlang);
    if (!d->addClause(wrapper, $1)) {
        // The driver rejected the clause: yield an empty query
        delete wrapper;
        wrapper = nullptr;
    }
    $$ = wrapper;
}
;
|
|
|
|
/* A term, optionally bound to a field by a relation, optionally negated.
   In the field forms the field name string is consumed (deleted) here and
   the clause built for the right-hand side is passed up. */
fieldexpr: term
{
    LOGP("fe: simple fieldexpr: " << $1->gettext() << endl);
    $$ = $1;
}
| complexfieldname EQUALS term
{
    LOGP("fe: " << *$1 << " = " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_EQUALS);
    delete $1;
    $$ = $3;
}
| complexfieldname CONTAINS term
{
    LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
    delete $1;
    $$ = $3;
}
| complexfieldname CONTAINS range
{
    LOGP("fe: " << *$1 << " : " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_CONTAINS);
    delete $1;
    $$ = $3;
}
| complexfieldname SMALLER term
{
    LOGP("fe: " << *$1 << " < " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LT);
    delete $1;
    $$ = $3;
}
| complexfieldname SMALLEREQ term
{
    LOGP("fe: " << *$1 << " <= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_LTE);
    delete $1;
    $$ = $3;
}
| complexfieldname GREATER term
{
    LOGP("fe: " << *$1 << " > " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GT);
    delete $1;
    $$ = $3;
}
| complexfieldname GREATEREQ term
{
    LOGP("fe: " << *$1 << " >= " << $3->gettext() << endl);
    $3->setfield(*$1);
    $3->setrel(Rcl::SearchDataClause::REL_GTE);
    delete $1;
    $$ = $3;
}
| '-' fieldexpr
{
    LOGP("fe: - fieldexpr[" << $2->gettext() << "]" << endl);
    // Leading minus: mark the clause as an exclusion
    $2->setexclude(true);
    $$ = $2;
}
;
|
|
|
|
/* Field names, including colon-separated ones like dc:title. The result is
   a heap-allocated string owned by whichever rule consumes it. */
complexfieldname:
WORD
{
    LOGP("cfn: WORD" << endl);
    $$ = $1;
}
| complexfieldname CONTAINS WORD
{
    LOGP("cfn: complexfieldname ':' WORD" << endl);
    // Join both parts with ':' into a fresh string, then drop the parts
    string *joined = new string(*$1 + string(":") + *$3);
    delete $1;
    delete $3;
    $$ = joined;
}
|
|
|
|
/* Range values: "a..b", "..b" and "a..". A missing bound is passed to the
   range clause as an empty string. The WORD strings are consumed here. */
range:
WORD RANGE WORD
{
    LOGP("Range: " << *$1 << string(" .. ") << *$3 << endl);
    Rcl::SearchDataClauseRange *both = new Rcl::SearchDataClauseRange(*$1, *$3);
    delete $1;
    delete $3;
    $$ = both;
}
| RANGE WORD
{
    LOGP("Range: " << "" << string(" .. ") << *$2 << endl);
    Rcl::SearchDataClauseRange *upto = new Rcl::SearchDataClauseRange("", *$2);
    delete $2;
    $$ = upto;
}
| WORD RANGE
{
    LOGP("Range: " << *$1 << string(" .. ") << "" << endl);
    Rcl::SearchDataClauseRange *from = new Rcl::SearchDataClauseRange(*$1, "");
    delete $1;
    $$ = from;
}
;
|
|
|
|
/* A bare word becomes a simple AND clause; a quoted phrase (with optional
   qualifiers) is passed through unchanged. */
term:
WORD
{
    LOGP("term[" << *$1 << "]" << endl);
    Rcl::SearchDataClauseSimple *simple =
        new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, *$1);
    delete $1;
    $$ = simple;
}
| qualquote
{
    $$ = $1;
}
|
|
|
|
/* A quoted string becomes a phrase clause with slack 0. When a QUALIFIERS
   token follows, qualify() adjusts the clause according to its characters.
   The token strings are consumed here. */
qualquote:
QUOTED
{
    LOGP("QUOTED[" << *$1 << "]" << endl);
    Rcl::SearchDataClauseDist *phrase =
        new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    delete $1;
    $$ = phrase;
}
| QUOTED QUALIFIERS
{
    LOGP("QUOTED[" << *$1 << "] QUALIFIERS[" << *$2 << "]" << endl);
    Rcl::SearchDataClauseDist *phrase =
        new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, *$1, 0);
    qualify(phrase, *$2);
    delete $1;
    delete $2;
    $$ = phrase;
}
|
|
|
|
|
|
%%
|
|
|
|
#include <ctype.h>
|
|
|
|
// Parses the decimal integer that directly follows position cur in q (the
// character at cur itself is skipped).
// If a number is found, *pval receives its value and the returned index is
// the one of the last character consumed, so that a caller doing "i++" next
// moves past the number. Otherwise *pval is left untouched and cur is
// returned unchanged.
static unsigned int qualGetInt(const std::string& q, unsigned int cur, int *pval)
{
    // Nothing can follow the last character. The empty() test keeps
    // size() - 1 from wrapping around to a huge unsigned value.
    if (q.empty() || cur >= q.size() - 1)
        return cur;

    // c_str() is NUL-terminated, so strtol stops at the end of q at the latest
    const char *start = q.c_str() + cur + 1;
    char *endptr = nullptr;
    const long val = strtol(start, &endptr, 10);
    if (endptr == start)
        return cur;

    *pval = static_cast<int>(val);
    return cur + static_cast<unsigned int>(endptr - start);
}
|
|
|
|
// Applies the qualifier characters found after a quoted phrase to clause cl.
// Characters are processed left to right:
//   b       weight 10.0
//   C       case-sensitive
//   D       diacritics-sensitive
//   e       case-sensitive + diacritics-sensitive + no stemming
//   l       no stemming
//   o[N]    slack N (10 when no number follows)
//   p       turn the clause into a NEAR clause; slack becomes 10 if it is 0
//   s       no synonyms
//   number  weight factor (a factor of exactly 1.0 is ignored)
//   c d L S accepted, no effect
// Any other character is silently ignored.
static void qualify(Rcl::SearchDataClauseDist *cl, const string& quals)
{
    for (unsigned int i = 0; i < quals.length(); i++) {
        switch (quals[i]) {
        case 'b':
            cl->setWeight(10.0);
            break;
        case 'c':
            break;
        case 'C':
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            break;
        case 'd':
            break;
        case 'D':
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            break;
        case 'e':
            cl->addModifier(Rcl::SearchDataClause::SDCM_CASESENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_DIACSENS);
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'l':
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
            break;
        case 'L':
            break;
        case 'o': {
            // Optional integer after the letter; i ends up on its last digit
            int slack = 10;
            i = qualGetInt(quals, i, &slack);
            cl->setslack(slack);
            break;
        }
        case 'p':
            cl->setTp(Rcl::SCLT_NEAR);
            if (cl->getslack() == 0) {
                cl->setslack(10);
            }
            break;
        case 's':
            cl->addModifier(Rcl::SearchDataClause::SDCM_NOSYNS);
            break;
        case 'S':
            break;
        case '.':case '0':case '1':case '2':case '3':case '4':
        case '5':case '6':case '7':case '8':case '9': {
            int n = 0;
            float factor = 1.0;
            // sscanf returns EOF (non-zero) on input failure, so test for
            // exactly one converted item instead of plain truthiness.
            if (sscanf(&(quals[i]), "%f %n", &factor, &n) == 1) {
                if (factor != 1.0) {
                    cl->setWeight(factor);
                }
            }
            // Skip what was consumed; the loop increment accounts for one char
            if (n > 0)
                i += n - 1;
            break;
        }
        default:
            break;
        }
    }
}
|
|
|
|
|
|
// Characters which are operators only in the first position of a token:
// "doctor-who" stays a single term instead of two terms split at the '-'.
static const std::string specialstartchars = "-";
// Characters which are operators anywhere, except inside a quoted string.
static const std::string specialinchars = ":=<>()";
|
|
|
|
// Called with the first dquote already read
|
|
static int parseString(WasaParserDriver *d, yy::parser::semantic_type *yylval)
|
|
{
|
|
string* value = new string();
|
|
d->qualifiers().clear();
|
|
int c;
|
|
while ((c = d->GETCHAR())) {
|
|
switch (c) {
|
|
case '\\':
|
|
/* Escape: get next char */
|
|
c = d->GETCHAR();
|
|
if (c == 0) {
|
|
value->push_back(c);
|
|
goto out;
|
|
}
|
|
value->push_back(c);
|
|
break;
|
|
case '"':
|
|
/* End of string. Look for qualifiers */
|
|
while ((c = d->GETCHAR()) && (isalnum(c) || c == '.'))
|
|
d->qualifiers().push_back(c);
|
|
d->UNGETCHAR(c);
|
|
goto out;
|
|
default:
|
|
value->push_back(c);
|
|
}
|
|
}
|
|
out:
|
|
//cerr << "GOT QUOTED ["<<value<<"] quals [" << d->qualifiers() << "]" << endl;
|
|
yylval->str = value;
|
|
return yy::parser::token::QUOTED;
|
|
}
|
|
|
|
|
|
// isspace() restricted to the unsigned char range: passing any other value
// (e.g. a negative int coming from a sign-extended non-ASCII byte) to
// isspace() is undefined behaviour.
// NOTE(review): the exact range of GETCHAR() is not visible from here.
static bool lexIsSpace(int c)
{
    return c == static_cast<unsigned char>(c) && isspace(c) != 0;
}

// Lexer for the query language. Returns the next token code, or 0 at end of
// input. For WORD, QUOTED and QUALIFIERS tokens yylval->str receives a
// heap-allocated string which the parser side must release.
//
// Token recognition, in order:
//  - pending qualifiers left by parseString() are returned as QUALIFIERS
//  - '-' at the start of a token is returned as itself
//  - = : < <= > >= .. ( ) are operators
//  - a double quote starts a quoted string (see parseString())
//  - anything else starts a word, which ends at whitespace, at one of the
//    specialinchars or at "..". The words AND, &&, OR and || are returned
//    as the AND / OR tokens.
int yylex(yy::parser::semantic_type *yylval, yy::parser::location_type *,
          WasaParserDriver *d)
{
    if (!d->qualifiers().empty()) {
        // swap() moves the text out and leaves the driver's buffer empty
        yylval->str = new string();
        yylval->str->swap(d->qualifiers());
        return yy::parser::token::QUALIFIERS;
    }

    int c;

    /* Skip white space. */
    while ((c = d->GETCHAR()) && lexIsSpace(c))
        continue;

    if (c == 0)
        return 0;

    if (specialstartchars.find_first_of(c) != string::npos) {
        return c;
    }

    // field-term relations, and ranges
    switch (c) {
    case '=': return yy::parser::token::EQUALS;
    case ':': return yy::parser::token::CONTAINS;
    case '<': {
        int c1 = d->GETCHAR();
        if (c1 == '=') {
            return yy::parser::token::SMALLEREQ;
        }
        d->UNGETCHAR(c1);
        return yy::parser::token::SMALLER;
    }
    case '.': {
        int c1 = d->GETCHAR();
        if (c1 == '.') {
            return yy::parser::token::RANGE;
        }
        // A single dot is an ordinary word character: handled below
        d->UNGETCHAR(c1);
        break;
    }
    case '>': {
        int c1 = d->GETCHAR();
        if (c1 == '=') {
            return yy::parser::token::GREATEREQ;
        }
        d->UNGETCHAR(c1);
        return yy::parser::token::GREATER;
    }
    case '(': case ')':
        return c;
    }

    if (c == '"')
        return parseString(d, yylval);

    // Other chars start a term or field name or reserved word. Push the
    // first character back so that the loop below sees the whole word.
    d->UNGETCHAR(c);

    string word;
    while ((c = d->GETCHAR())) {
        if (lexIsSpace(c)) {
            // The whitespace is consumed
            break;
        }
        if (specialinchars.find_first_of(c) != string::npos) {
            // The operator belongs to the next token
            d->UNGETCHAR(c);
            break;
        }
        if (c == '.') {
            // ".." is the range operator and ends the word; a single dot
            // is part of it.
            int c1 = d->GETCHAR();
            d->UNGETCHAR(c1);
            if (c1 == '.') {
                d->UNGETCHAR(c);
                break;
            }
        }
        word.push_back(c);
    }

    if (word == "AND" || word == "&&") {
        return yy::parser::token::AND;
    }
    if (word == "OR" || word == "||") {
        return yy::parser::token::OR;
    }

    yylval->str = new string();
    yylval->str->swap(word);
    return yy::parser::token::WORD;
}
|