511 lines
14 KiB
C++
511 lines
14 KiB
C++
/* Copyright (C) 2006 J.F.Dockes
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the
|
|
* Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
#ifndef TEST_WASASTRINGTOQUERY
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <regex.h>
|
|
|
|
#include "smallut.h"
|
|
#include "wasastringtoquery.h"
|
|
|
|
#undef DEB_WASASTRINGTOQ
|
|
#ifdef DEB_WASASTRINGTOQ
|
|
#define DPRINT(X) fprintf X
|
|
#define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());}
|
|
#else
|
|
#define DPRINT(X)
|
|
#define DUMPQ(Q)
|
|
#endif
|
|
|
|
// Destructor. The sub-queries referenced from m_subs are deleted here, in
// vector order, then the vector itself is emptied.
WasaQuery::~WasaQuery()
{
    for (vector<WasaQuery*>::size_type idx = 0; idx < m_subs.size(); ++idx) {
        delete m_subs[idx];
    }
    m_subs.clear();
}
|
|
|
|
// Translate a relation code into the operator text used by
// WasaQuery::describe(). Any value which is not one of the six known
// relations is rendered as "?".
static const char* reltosrel(WasaQuery::Rel rel)
{
    if (rel == WasaQuery::REL_EQUALS)
        return "=";
    if (rel == WasaQuery::REL_CONTAINS)
        return ":";
    if (rel == WasaQuery::REL_LT)
        return "<";
    if (rel == WasaQuery::REL_LTE)
        return "<=";
    if (rel == WasaQuery::REL_GT)
        return ">";
    if (rel == WasaQuery::REL_GTE)
        return ">=";
    return "?";
}
|
|
|
|
// Append a printable representation of this query to desc.
//
// The output has the form "(<body>)<modifiers> " where:
//  - <body> is "NULL", "field<rel>value", "NOT (field<rel>value)" or, for
//    OR/AND nodes, the descriptions of the sub-queries joined by "OR " or
//    "AND ". The "field<rel>" part is only present when a field is set.
//  - <modifiers> is the '|'-separated list of the names of the set
//    modifier flags (nothing if no flag is set).
// desc is appended to, never cleared, which is what makes the recursion on
// sub-queries work.
void WasaQuery::describe(string &desc) const
{
    desc += "(";

    // Field name followed by the relation operator, empty if no field
    string prefix;
    if (!m_fieldspec.empty()) {
        prefix = m_fieldspec + reltosrel(m_rel);
    }

    if (m_op == OP_NULL) {
        desc += "NULL";
    } else if (m_op == OP_LEAF) {
        desc += prefix + m_value;
    } else if (m_op == OP_EXCL) {
        desc += string("NOT (" ) + prefix + m_value + ") ";
    } else if (m_op == OP_OR || m_op == OP_AND) {
        const char *joiner = (m_op == OP_OR) ? "OR " : "AND ";
        const vector<WasaQuery *>::size_type count = m_subs.size();
        for (vector<WasaQuery *>::size_type idx = 0; idx < count; ++idx) {
            m_subs[idx]->describe(desc);
            // Operator between sub-queries, not after the last one
            if (idx + 1 < count)
                desc += joiner;
        }
    }

    // Sub-descriptions (and the NOT form) end with a space: remove it so
    // that the closing parenthesis sticks to the text.
    const string::size_type lastpos = desc.length() - 1;
    if (desc[lastpos] == ' ')
        desc.erase(lastpos);
    desc += ")";

    if (m_modifiers != 0) {
        // Flag names in output order. A null label marks the phrase slack
        // flag, which is printed along with the slack value.
        static const struct {
            unsigned int bit;
            const char *label;
        } modifierLabels[] = {
            {WQM_BOOST, "BOOST|"},
            {WQM_CASESENS, "CASESENS|"},
            {WQM_DIACSENS, "DIACSENS|"},
            {WQM_FUZZY, "FUZZY|"},
            {WQM_NOSTEM, "NOSTEM|"},
            {WQM_PHRASESLACK, 0},
            {WQM_PROX, "PROX|"},
            {WQM_REGEX, "REGEX|"},
            {WQM_SLOPPY, "SLOPPY|"},
            {WQM_WORDS, "WORDS|"},
        };
        const unsigned int nlabels =
            sizeof(modifierLabels) / sizeof(modifierLabels[0]);

        for (unsigned int k = 0; k < nlabels; ++k) {
            if (!(m_modifiers & modifierLabels[k].bit))
                continue;
            if (modifierLabels[k].label != 0) {
                desc += modifierLabels[k].label;
            } else {
                char buf[100];
                sprintf(buf, "%d", m_slack);
                desc += "PHRASESLACK(" + string(buf) + string(")|");
            }
        }

        // Remove the separator left after the last flag name
        if (desc.length() > 0 && desc[desc.length()-1] == '|')
            desc.erase(desc.length()-1);
    }
    desc += " ";
}
|
|
|
|
// The string query parser code:
|
|
|
|
/* Shamelessly lifted from Beagle:
|
|
* This is our regular Expression Pattern:
|
|
* we expect something like this:
|
|
* -key:"Value String"modifiers
|
|
* key:Value
|
|
* or
|
|
* Value
|
|
*/
|
|
|
|
/* The master regular expression used to parse a query string
|
|
* Sub-expressions in parenthesis are numbered from 1. Each opening
|
|
* parenthesis increases the index, but we're not interested in all of them.
|
|
* Deviations from standard:
|
|
* Relation: the standard-conformant line read as (release<1.16):
|
|
"(:|=|<|>|<=|>=)" //7 Relation
|
|
but we are not actually making use of the relation type
|
|
(interpreting all as ":"), and this can produce unexpected results
|
|
as a (ie pasted) search for nonexfield=value will silently drop
|
|
the nonexfield part, while the user probably was not aware of
|
|
triggering a field search (expecting just ':' to do this).
|
|
*/
|
|
// The master (POSIX extended) regular expression. One match consumes one
// element of the query string: an OR operator, an AND operator, or a
// clause, each followed by optional white space. The numbers in the
// comments are the sub-expression indices.
static const char * parserExpr =
    // 1: OR operator, written OR or ||
    "(OR|\\|\\|)[[:space:]]*"
    // 2: AND operator, written AND or && (ignored, default)
    "|(AND|&&)[[:space:]]*"
    // 3: a clause
    "|("
        "([+-])?"                         // 4: force or exclude indicator
        "("                               // 5: optional field and relation
            "([[:alpha:]][[:alnum:]:]*)"  // 6: field spec: ie: "dc:title:letitre"
            "[[:space:]]*"
            "(:|=|>|<)"                   // 7: relation
            "[[:space:]]*"
        ")?"
        "("                               // 8: the value, quoted or not
            "(\"([^\"]+)\")"              // 9: "A quoted term", 10: its contents
            "([bcCdDeflLoprsw.0-9]*)"     // 11: modifiers
            "|"
            "([^[:space:]\"]+)"           // 12: ANormalTerm
        ")"
    ")[[:space:]]*"
    ;
|
|
|
|
// Labels for the sub-matches of the master regular expression, indexed by
// sub-match number and printed by the debugging trace. Sub-matches with an
// empty label are skipped by the trace.
// Beware that the size of this array also defines NMATCH.
static const char *matchNames[] = {
    "",            //  0
    "OR",          //  1
    "AND",         //  2
    "",            //  3
    "+-",          //  4
    "",            //  5
    "FIELD",       //  6
    "RELATION",    //  7
    "",            //  8
    "",            //  9
    "QUOTEDTERM",  // 10
    "MODIFIERS",   // 11
    "TERM",        // 12
};
// Number of sub-match slots used with regexec(): one per matchNames entry
#define NMATCH (sizeof(matchNames) / sizeof(matchNames[0]))
|
|
|
|
// Symbolic names for the sub-match indices which the parser actually
// reads. The values are the sub-expression numbers of parserExpr.
enum SbMatchIdx {
    SMI_OR = 1,       // OR operator
    SMI_AND = 2,      // AND operator
    SMI_PM = 4,       // + or - in front of a clause
    SMI_FIELD = 6,    // field specification
    SMI_REL = 7,      // relation between field and value
    SMI_QUOTED = 10,  // contents of a quoted value
    SMI_MODIF = 11,   // modifiers following a quoted value
    SMI_TERM = 12     // unquoted value
};
|
|
|
|
// Capacity, not counting the terminating null byte, of the buffers into
// which the text of a sub-match is copied.
static const int maxmatchlen = 1024;
// Buffer size passed to regerror() when retrieving an error message.
static const int errbuflen = 300;
|
|
|
|
// Private implementation of StringToWasaQuery: holds the compiled regular
// expression, the sub-match positions of the latest regexec() call and the
// current position in the string being parsed.
class StringToWasaQuery::Internal {
public:
    Internal()
        : m_cp(0), m_rxneedsfree(false)
    {}
    ~Internal()
    {
        if (m_rxneedsfree)
            regfree(&m_rx);
    }

    // Copy the text of sub-match number i of the latest regexec() call
    // into match, as a null-terminated string.
    //
    // @param i sub-match index (see SbMatchIdx).
    // @param match output buffer. It must have room for maxmatchlen + 1
    //   bytes: this is the size which is enforced here.
    // @param reason set to a message if the sub-match exists but can't be
    //   used (empty or negative length, or too long for the buffer).
    // @return true if the sub-match took part in the match and was copied,
    //   false in all other cases (match is then left untouched).
    bool checkSubMatch(int i, char *match, string& reason)
    {
        if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1) {
            //DPRINT((stderr, "checkSubMatch: no match: i %d rm_so %d\n",
            //i, m_pmatch[i].rm_so));
            return false;
        }
        const regoff_t len = m_pmatch[i].rm_eo - m_pmatch[i].rm_so;
        if (len <= 0) {
            // weird and fatal
            reason = "Internal regular expression handling error";
            return false;
        }
        if (len > maxmatchlen) {
            // The text comes straight from the user query: never write
            // past the end of the caller's buffer.
            reason = "Query element too long";
            return false;
        }
        //DPRINT((stderr, "checkSubMatch: so %d eo %d\n", m_pmatch[i].rm_so,
        //m_pmatch[i].rm_eo));
        memcpy(match, m_cp + m_pmatch[i].rm_so, len);
        match[len] = 0;
        return true;
    }

    WasaQuery *stringToQuery(const string& str, string& reason);

    friend class StringToWasaQuery;
private:
    // The object owns a compiled regex_t which must be freed exactly once:
    // copying is disabled (declared, not defined).
    Internal(const Internal&);
    Internal& operator=(const Internal&);

    // Current position in the string being parsed. The sub-match offsets
    // in m_pmatch are relative to it.
    const char *m_cp;
    // Compiled form of the master expression
    regex_t m_rx;
    // True while m_rx holds a compiled expression needing regfree()
    bool m_rxneedsfree;
    // Sub-match positions filled in by regexec()
    regmatch_t m_pmatch[NMATCH];
};
|
|
|
|
// Constructor: allocate the private implementation object. It is released
// by the destructor.
StringToWasaQuery::StringToWasaQuery()
    : internal(new Internal)
{
}
|
|
|
|
// Destructor: delete the private implementation object.
StringToWasaQuery::~StringToWasaQuery()
{
    delete internal;
}
|
|
|
|
// Parse a query string into a WasaQuery tree. The work is delegated to the
// private implementation object.
//
// @param str the query string.
// @param reason receives an error message when the parse fails.
// @return the query, or 0 on failure (or if there is no implementation
//   object). The returned object is allocated with new and no reference to
//   it is kept here, so deleting it is up to the caller.
WasaQuery *
StringToWasaQuery::stringToQuery(const string& str, string& reason)
{
    if (internal == 0)
        return 0;
    WasaQuery *wq = internal->stringToQuery(str, reason);
    // The debug dump dereferences its argument: skip it for a failed parse
    if (wq != 0) {
        DUMPQ(wq);
    }
    return wq;
}
|
|
|
|
// Parse a query string into a WasaQuery tree.
//
// The master regular expression (parserExpr) is applied repeatedly, each
// match consuming one element of the string: an OR operator, an AND
// operator (ignored, AND is the default), or a clause. The result is an
// OP_AND query whose children are the clauses, with the clauses linked by
// OR operators grouped inside OP_OR sub-queries.
//
// @param str the query string.
// @param reason receives a message when the parse fails.
// @return the query, allocated with new (nothing keeps a reference to it
//   here, so the caller has to delete it), or 0 on error. All partial
//   results are freed before an error return.
WasaQuery *
StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
{
    // Release an expression left compiled by a previous call which
    // returned early. The flag is reset at once so that a regcomp()
    // failure below can't lead to a second regfree() of the same data.
    if (m_rxneedsfree) {
        regfree(&m_rx);
        m_rxneedsfree = false;
    }

    char errbuf[errbuflen+1];
    int errcode;
    if ((errcode = regcomp(&m_rx, parserExpr, REG_EXTENDED)) != 0) {
        regerror(errcode, &m_rx, errbuf, errbuflen);
        reason = errbuf;
        return 0;
    }
    m_rxneedsfree = true;

    const char *cpe;
    m_cp = str.c_str();
    cpe = str.c_str() + str.length();

    // Sub-matches whose text gets copied to a maxmatchlen-sized buffer
    static const int copiedMatches[] = {
        SMI_OR, SMI_AND, SMI_PM, SMI_FIELD, SMI_REL,
        SMI_QUOTED, SMI_MODIF, SMI_TERM
    };
    static const unsigned int ncopied =
        sizeof(copiedMatches) / sizeof(copiedMatches[0]);

    // query is the top level AND. orChain is the OR sub-query being built,
    // if any: it is owned here until it gets appended to query, so that
    // both have to be deleted on error.
    WasaQuery *query = new WasaQuery;
    query->m_op = WasaQuery::OP_AND;
    WasaQuery *orChain = 0;
    bool prev_or = false;

    // Loop on repeated regexp matches on the main string.
    for (;;) {
        if ((errcode = regexec(&m_rx, m_cp, NMATCH, m_pmatch, 0))) {
            regerror(errcode, &m_rx, errbuf, errbuflen);
            reason = errbuf;
            delete orChain;
            delete query;
            return 0;
        }
        if (m_pmatch[0].rm_eo <= 0) {
            // weird and fatal
            reason = "Internal regular expression handling error";
            delete orChain;
            delete query;
            return 0;
        }

        // The query string is user input: refuse elements which would not
        // fit in the match buffer instead of overflowing it.
        for (unsigned int k = 0; k < ncopied; k++) {
            const regmatch_t& sub = m_pmatch[copiedMatches[k]];
            if (sub.rm_so != -1 && sub.rm_eo - sub.rm_so > maxmatchlen) {
                reason = "Query element too long";
                delete orChain;
                delete query;
                return 0;
            }
        }

#ifdef DEB_WASASTRINGTOQ
        DPRINT((stderr, "Next part:\n"));
        for (unsigned int i = 0; i < NMATCH; i++) {
            if (m_pmatch[i].rm_so == -1) continue;
            char match[maxmatchlen+1];
            // All sub-matches are dumped, including those not checked
            // above: truncate to the buffer size.
            regoff_t len = m_pmatch[i].rm_eo - m_pmatch[i].rm_so;
            if (len > maxmatchlen)
                len = maxmatchlen;
            memcpy(match, m_cp + m_pmatch[i].rm_so, len);
            match[len] = 0;
            if (matchNames[i][0])
                DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match,
                        (int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo));
        }
#endif

        char match[maxmatchlen+1];
        if (checkSubMatch(SMI_OR, match, reason)) {
            if (prev_or) {
                // Bad syntax
                reason = "Bad syntax: consecutive OR";
                delete orChain;
                delete query;
                return 0;
            }

            if (orChain == 0) {
                // First OR seen: start OR subclause.
                orChain = new WasaQuery();
                orChain->m_op = WasaQuery::OP_OR;
            }

            // For the first OR, we need to transfer the previous
            // query from the main vector to the OR subquery
            if (orChain->m_subs.empty() && !query->m_subs.empty()) {
                orChain->m_subs.push_back(query->m_subs.back());
                query->m_subs.pop_back();
            }
            prev_or = true;

        } else if (checkSubMatch(SMI_AND, match, reason)) {
            // Do nothing, AND is the default. We might want to check for
            // errors like consecutive ANDs, or OR AND

        } else {

            WasaQuery *nclause = new WasaQuery;

            // Check for quoted or unquoted value
            unsigned int mods = 0;
            if (checkSubMatch(SMI_QUOTED, match, reason)) {
                nclause->m_value = match;
            } else if (checkSubMatch(SMI_TERM, match, reason)) {
                nclause->m_value = match;
            }

            if (nclause->m_value.empty()) {
                // Isolated +- or fieldname: without a value. Ignore until
                // told otherwise.
                DPRINT((stderr, "Clause with empty value, skipping\n"));
                delete nclause;
                goto nextfield;
            }

            if (checkSubMatch(SMI_MODIF, match, reason)) {
                DPRINT((stderr, "Got modifiers: [%s]\n", match));
                const unsigned int modlen = strlen(match);
                for (unsigned int i = 0; i < modlen; i++) {
                    switch (match[i]) {
                    case 'b':
                        mods |= WasaQuery::WQM_BOOST;
                        nclause->m_weight = 10.0;
                        break;
                    case 'c': break;
                    case 'C': mods |= WasaQuery::WQM_CASESENS; break;
                    case 'd': break;
                    case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
                    case 'e': mods |= WasaQuery::WQM_CASESENS |
                            WasaQuery::WQM_DIACSENS |
                            WasaQuery::WQM_NOSTEM;
                        break;
                    case 'f': mods |= WasaQuery::WQM_FUZZY; break;
                    case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
                    case 'L': break;
                    case 'o':
                        mods |= WasaQuery::WQM_PHRASESLACK;
                        // Default slack if specified only by 'o' is 10.
                        nclause->m_slack = 10;
                        if (i < modlen - 1) {
                            // An explicit value may follow: use it and
                            // skip over its digits.
                            char *endptr;
                            int slack = strtol(match+i+1, &endptr, 10);
                            if (endptr != match+i+1) {
                                i += endptr - (match+i+1);
                                nclause->m_slack = slack;
                            }
                        }
                        break;
                    case 'p':
                        mods |= WasaQuery::WQM_PROX;
                        nclause->m_slack = 10;
                        break;
                    case 'r': mods |= WasaQuery::WQM_REGEX; break;
                    case 's': mods |= WasaQuery::WQM_SLOPPY; break;
                    case 'w': mods |= WasaQuery::WQM_WORDS; break;
                    case '.':case '0':case '1':case '2':case '3':case '4':
                    case '5':case '6':case '7':case '8':case '9':
                    {
                        // Weight factor. n stays 0 and the weight is left
                        // alone if no number can be converted (ie: for an
                        // isolated '.'), in which case only the current
                        // character is skipped.
                        int n = 0;
                        float factor = 0.0;
                        if (sscanf(match+i, "%f %n", &factor, &n) == 1) {
                            nclause->m_weight = factor;
                            DPRINT((stderr, "Got factor %.2f len %d\n",
                                    factor, n));
                        }
                        if (n > 0)
                            i += n-1;
                    }
                    break;
                    }
                }
            }
            nclause->m_modifiers = WasaQuery::Modifier(mods);

            // Field indicator ?
            if (checkSubMatch(SMI_FIELD, match, reason)) {
                // We used Check for special fields indicating sorting
                // etc. here but this went away from the spec. See 1.4
                // if it comes back
                nclause->m_fieldspec = match;
                if (checkSubMatch(SMI_REL, match, reason)) {
                    switch (match[0]) {
                    case '=':nclause->m_rel = WasaQuery::REL_EQUALS;break;
                    case ':':nclause->m_rel = WasaQuery::REL_CONTAINS;break;
                    case '<':
                        if (match[1] == '=')
                            nclause->m_rel = WasaQuery::REL_LTE;
                        else
                            nclause->m_rel = WasaQuery::REL_LT;
                        break;
                    case '>':
                        if (match[1] == '=')
                            nclause->m_rel = WasaQuery::REL_GTE;
                        else
                            nclause->m_rel = WasaQuery::REL_GT;
                        break;
                    default:
                        nclause->m_rel = WasaQuery::REL_CONTAINS;
                    }
                } else {
                    // ?? If field matched we should have a relation
                    nclause->m_rel = WasaQuery::REL_CONTAINS;
                }
            }

            // +- indicator ?
            if (checkSubMatch(SMI_PM, match, reason) && match[0] == '-') {
                nclause->m_op = WasaQuery::OP_EXCL;
            } else {
                nclause->m_op = WasaQuery::OP_LEAF;
            }

            if (prev_or) {
                // The precedent token was an OR, add new clause to or chain
                //DPRINT((stderr, "Adding to OR chain\n"));
                orChain->m_subs.push_back(nclause);
            } else {
                if (orChain) {
                    // Getting out of OR. Add the OR subquery to the main one
                    //DPRINT((stderr, "Adding OR chain to main\n"));
                    query->m_subs.push_back(orChain);
                    orChain = 0;
                }
                //DPRINT((stderr, "Adding to main chain\n"));
                // Add new clause to main query
                query->m_subs.push_back(nclause);
            }

            prev_or = false;
        }

    nextfield:
        // Advance current string position. We checked earlier that
        // the increment is strictly positive, so we won't loop
        // forever
        m_cp += m_pmatch[0].rm_eo;
        if (m_cp >= cpe)
            break;
    }

    if (orChain) {
        // Getting out of OR. Add the OR subquery to the main one
        DPRINT((stderr, "Adding OR chain to main.Before: \n"));
        DUMPQ(query);
        DUMPQ(orChain);
        query->m_subs.push_back(orChain);
    }

    regfree(&m_rx);
    m_rxneedsfree = false;
    return query;
}
|
|
|
|
#else // TEST
|
|
|
|
#include <stdio.h>
|
|
#include "wasastringtoquery.h"
|
|
|
|
static char *thisprog;
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
thisprog = argv[0];
|
|
argc--; argv++;
|
|
|
|
if (argc != 1) {
|
|
fprintf(stderr, "need one arg\n");
|
|
exit(1);
|
|
}
|
|
const string str = *argv++;argc--;
|
|
string reason;
|
|
StringToWasaQuery qparser;
|
|
WasaQuery *q = qparser.stringToQuery(str, reason);
|
|
if (q == 0) {
|
|
fprintf(stderr, "stringToQuery failed: %s\n", reason.c_str());
|
|
exit(1);
|
|
}
|
|
string desc;
|
|
q->describe(desc);
|
|
fprintf(stderr, "Finally: %s\n", desc.c_str());
|
|
exit(0);
|
|
}
|
|
|
|
#endif // TEST_WASASTRINGTOQUERY
|