try to parse the whole of Xesam user language 0.95
This commit is contained in:
parent
b43d8ff1a7
commit
a41eb8eef1
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.7 2008-07-01 11:51:51 dockes Exp $ (C) 2006 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.8 2008-08-26 13:47:21 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -41,10 +41,24 @@ WasaQuery::~WasaQuery()
|
|||||||
m_subs.clear();
|
m_subs.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Map a WasaQuery relation code to the operator text used when
 * building a textual description of a query.
 *
 * @param rel relation code to translate.
 * @return "=", ":", "<", "<=", ">" or ">=" for the matching relation;
 *         "?" for any other value.
 */
static const char* reltosrel(WasaQuery::Rel rel)
{
    // Fallback for any relation without a dedicated symbol.
    const char *symbol = "?";

    if (rel == WasaQuery::REL_EQUALS) {
        symbol = "=";
    } else if (rel == WasaQuery::REL_CONTAINS) {
        symbol = ":";
    } else if (rel == WasaQuery::REL_LT) {
        symbol = "<";
    } else if (rel == WasaQuery::REL_LTE) {
        symbol = "<=";
    } else if (rel == WasaQuery::REL_GT) {
        symbol = ">";
    } else if (rel == WasaQuery::REL_GTE) {
        symbol = ">=";
    }
    return symbol;
}
|
||||||
|
|
||||||
void WasaQuery::describe(string &desc) const
|
void WasaQuery::describe(string &desc) const
|
||||||
{
|
{
|
||||||
desc += "(";
|
desc += "(";
|
||||||
string fieldspec = m_fieldspec.empty() ? string() : m_fieldspec + ": ";
|
string fieldspec = m_fieldspec.empty() ? string() : m_fieldspec +
|
||||||
|
reltosrel(m_rel);
|
||||||
switch (m_op) {
|
switch (m_op) {
|
||||||
case OP_NULL:
|
case OP_NULL:
|
||||||
desc += "NULL";
|
desc += "NULL";
|
||||||
@ -71,6 +85,7 @@ void WasaQuery::describe(string &desc) const
|
|||||||
desc.erase(desc.length() - 1);
|
desc.erase(desc.length() - 1);
|
||||||
desc += ")";
|
desc += ")";
|
||||||
if (m_modifiers != 0) {
|
if (m_modifiers != 0) {
|
||||||
|
if (m_modifiers & WQM_BOOST) desc += "BOOST|";
|
||||||
if (m_modifiers & WQM_CASESENS) desc += "CASESENS|";
|
if (m_modifiers & WQM_CASESENS) desc += "CASESENS|";
|
||||||
if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|";
|
if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|";
|
||||||
if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|";
|
if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|";
|
||||||
@ -96,14 +111,6 @@ void WasaQuery::describe(string &desc) const
|
|||||||
* key:Value
|
* key:Value
|
||||||
* or
|
* or
|
||||||
* Value
|
* Value
|
||||||
([+-]?) # Required or Prohibited (optional)
|
|
||||||
(\w+:)? # Key (optional)
|
|
||||||
( # Query Text
|
|
||||||
(\"([^\"]*)\"?)# quoted
|
|
||||||
| # or
|
|
||||||
([^\s\"]+) # unquoted
|
|
||||||
)
|
|
||||||
";
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* The master regular expression used to parse a query string
|
/* The master regular expression used to parse a query string
|
||||||
@ -113,41 +120,47 @@ void WasaQuery::describe(string &desc) const
|
|||||||
static const char * parserExpr =
|
static const char * parserExpr =
|
||||||
"([oO][rR]|\\|\\|)[[:space:]]*" //1 OR,or,||
|
"([oO][rR]|\\|\\|)[[:space:]]*" //1 OR,or,||
|
||||||
"|"
|
"|"
|
||||||
"(" //2
|
"([Aa][Nn][Dd]|&&)[[:space:]]*" // 2 AND,and,&& (ignored, default)
|
||||||
"([+-])?" //3 Force or exclude indicator
|
"|"
|
||||||
"(" //4
|
"(" //3
|
||||||
"([[:alpha:]][[:alnum:]]*)" //5 Field spec: "fieldname:"
|
"([+-])?" //4 Force or exclude indicator
|
||||||
":)?"
|
"(" //5
|
||||||
"(" //6
|
"([[:alpha:]][[:alnum:]:]*)" //6 Field spec: ie: "dc:title:letitre"
|
||||||
"(\"" //7
|
"[[:space:]]*"
|
||||||
"([^\"]+)" //8 "A quoted term"
|
"(:|=|<|>|<=|>=)" //7 Relation
|
||||||
|
"[[:space:]]*)?"
|
||||||
|
"(" //8
|
||||||
|
"(\"" //9
|
||||||
|
"([^\"]+)" //10 "A quoted term"
|
||||||
"\")"
|
"\")"
|
||||||
"([a-zA-Z0-9]*)" //9 modifiers
|
"([a-zA-Z0-9]*)" //11 modifiers
|
||||||
"|"
|
"|"
|
||||||
"([^[:space:]\"]+)" //10 ANormalTerm
|
"([^[:space:]\"]+)" //12 ANormalTerm
|
||||||
")"
|
")"
|
||||||
")[[:space:]]*"
|
")[[:space:]]*"
|
||||||
;
|
;
|
||||||
|
|
||||||
// For debugging the parser. But see also NMATCH
|
// For debugging the parser. But see also NMATCH
|
||||||
static const char *matchNames[] = {
|
static const char *matchNames[] = {
|
||||||
/*0*/ "",
|
/* 0*/ "",
|
||||||
/*1*/ "OR",
|
/* 1*/ "OR",
|
||||||
/*2*/ "",
|
/* 2*/ "AND",
|
||||||
/*3*/ "+-",
|
/* 3*/ "",
|
||||||
/*4*/ "",
|
/* 4*/ "+-",
|
||||||
/*5*/ "FIELD",
|
/* 5*/ "",
|
||||||
/*6*/ "",
|
/* 6*/ "FIELD",
|
||||||
/*7*/ "",
|
/* 7*/ "RELATION",
|
||||||
/*8*/ "QUOTEDTERM",
|
/* 8*/ "",
|
||||||
/*9*/ "MODIIFIERS",
|
/* 9*/ "",
|
||||||
/*10*/ "TERM",
|
/*10*/ "QUOTEDTERM",
|
||||||
|
/*11*/ "MODIIFIERS",
|
||||||
|
/*12*/ "TERM",
|
||||||
};
|
};
|
||||||
#define NMATCH (sizeof(matchNames) / sizeof(char *))
|
#define NMATCH (sizeof(matchNames) / sizeof(char *))
|
||||||
|
|
||||||
// Symbolic names for the interesting submatch indices
|
// Symbolic names for the interesting submatch indices
|
||||||
enum SbMatchIdx {SMI_OR=1, SMI_PM=3, SMI_FIELD=5, SMI_QUOTED=8,
|
enum SbMatchIdx {SMI_OR=1, SMI_AND=2, SMI_PM=4, SMI_FIELD=6, SMI_REL=7,
|
||||||
SMI_MODIF=9, SMI_TERM=10};
|
SMI_QUOTED=10, SMI_MODIF=11, SMI_TERM=12};
|
||||||
|
|
||||||
static const int maxmatchlen = 1024;
|
static const int maxmatchlen = 1024;
|
||||||
static const int errbuflen = 300;
|
static const int errbuflen = 300;
|
||||||
@ -284,6 +297,10 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
|||||||
}
|
}
|
||||||
prev_or = true;
|
prev_or = true;
|
||||||
|
|
||||||
|
} else if (checkSubMatch(SMI_AND, match, reason)) {
|
||||||
|
// Do nothing, AND is the default. We might want to check for
|
||||||
|
// errors like consecutive ANDs, or OR AND
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
WasaQuery *nclause = new WasaQuery;
|
WasaQuery *nclause = new WasaQuery;
|
||||||
@ -312,19 +329,20 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
|||||||
unsigned int mods = 0;
|
unsigned int mods = 0;
|
||||||
for (unsigned int i = 0; i < strlen(match); i++) {
|
for (unsigned int i = 0; i < strlen(match); i++) {
|
||||||
switch (match[i]) {
|
switch (match[i]) {
|
||||||
case 'C': mods |= WasaQuery::WQM_CASESENS; break;
|
|
||||||
case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
|
|
||||||
case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
|
|
||||||
case 'e': mods |= WasaQuery::WQM_CASESENS |
|
|
||||||
WasaQuery::WQM_DIACSENS |
|
|
||||||
WasaQuery::WQM_NOSTEM; break;
|
|
||||||
case 'f': mods |= WasaQuery::WQM_FUZZY; break;
|
|
||||||
case 'b': mods |= WasaQuery::WQM_BOOST; break;
|
case 'b': mods |= WasaQuery::WQM_BOOST; break;
|
||||||
|
case 'c': break;
|
||||||
|
case 'C': mods |= WasaQuery::WQM_CASESENS; break;
|
||||||
|
case 'd': break;
|
||||||
|
case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
|
||||||
|
case 'e': mods |= WasaQuery::WQM_CASESENS | WasaQuery::WQM_DIACSENS | WasaQuery::WQM_NOSTEM; break;
|
||||||
|
case 'f': mods |= WasaQuery::WQM_FUZZY; break;
|
||||||
|
case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
|
||||||
|
case 'L': break;
|
||||||
|
case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break;
|
||||||
case 'p': mods |= WasaQuery::WQM_PROX; break;
|
case 'p': mods |= WasaQuery::WQM_PROX; break;
|
||||||
|
case 'r': mods |= WasaQuery::WQM_REGEX; break;
|
||||||
case 's': mods |= WasaQuery::WQM_SLOPPY; break;
|
case 's': mods |= WasaQuery::WQM_SLOPPY; break;
|
||||||
case 'w': mods |= WasaQuery::WQM_WORDS; break;
|
case 'w': mods |= WasaQuery::WQM_WORDS; break;
|
||||||
case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break;
|
|
||||||
case 'r': mods |= WasaQuery::WQM_REGEX; break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
nclause->m_modifiers = WasaQuery::Modifier(mods);
|
nclause->m_modifiers = WasaQuery::Modifier(mods);
|
||||||
@ -336,6 +354,29 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
|||||||
// etc. here but this went away from the spec. See 1.4
|
// etc. here but this went away from the spec. See 1.4
|
||||||
// if it comes back
|
// if it comes back
|
||||||
nclause->m_fieldspec = match;
|
nclause->m_fieldspec = match;
|
||||||
|
if (checkSubMatch(SMI_REL, match, reason)) {
|
||||||
|
switch (match[0]) {
|
||||||
|
case '=':nclause->m_rel = WasaQuery::REL_EQUALS;break;
|
||||||
|
case ':':nclause->m_rel = WasaQuery::REL_CONTAINS;break;
|
||||||
|
case '<':
|
||||||
|
if (match[1] == '=')
|
||||||
|
nclause->m_rel = WasaQuery::REL_LTE;
|
||||||
|
else
|
||||||
|
nclause->m_rel = WasaQuery::REL_LT;
|
||||||
|
break;
|
||||||
|
case '>':
|
||||||
|
if (match[1] == '=')
|
||||||
|
nclause->m_rel = WasaQuery::REL_GTE;
|
||||||
|
else
|
||||||
|
nclause->m_rel = WasaQuery::REL_GT;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
nclause->m_rel = WasaQuery::REL_CONTAINS;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// ?? If field matched we should have a relation
|
||||||
|
nclause->m_rel = WasaQuery::REL_CONTAINS;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// +- indicator ?
|
// +- indicator ?
|
||||||
@ -345,7 +386,6 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
|
|||||||
nclause->m_op = WasaQuery::OP_LEAF;
|
nclause->m_op = WasaQuery::OP_LEAF;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (prev_or) {
|
if (prev_or) {
|
||||||
// The preceding token was an OR, add new clause to OR chain
|
// The preceding token was an OR, add new clause to OR chain
|
||||||
//DPRINT((stderr, "Adding to OR chain\n"));
|
//DPRINT((stderr, "Adding to OR chain\n"));
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
|
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
|
||||||
#define _WASASTRINGTOQUERY_H_INCLUDED_
|
#define _WASASTRINGTOQUERY_H_INCLUDED_
|
||||||
/* @(#$Id: wasastringtoquery.h,v 1.6 2008-01-17 11:14:13 dockes Exp $ (C) 2006 J.F.Dockes */
|
/* @(#$Id: wasastringtoquery.h,v 1.7 2008-08-26 13:47:21 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License as published by
|
* it under the terms of the GNU General Public License as published by
|
||||||
@ -23,10 +23,12 @@
|
|||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
/* Note: Xesam used to be named wasabi. We changed the references to wasabi in
|
||||||
|
the comments, but not the code */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A simple class to represent a parsed wasabiSimple query element.
|
* A simple class to represent a parsed Xesam user language element.
|
||||||
* Can hold a string value or an array of subqueries.
|
* Can hold one leaf element or an array of subqueries to be joined by AND/OR
|
||||||
*
|
*
|
||||||
* The complete query is represented by a top WasaQuery holding a
|
* The complete query is represented by a top WasaQuery holding a
|
||||||
* chain of ANDed subclauses. Some of the subclauses may be themselves
|
* chain of ANDed subclauses. Some of the subclauses may be themselves
|
||||||
@ -35,14 +37,30 @@ using std::vector;
|
|||||||
*
|
*
|
||||||
* For LEAF elements, the value can hold one or several words. In the
|
* For LEAF elements, the value can hold one or several words. In the
|
||||||
* latter case, it should be interpreted as a phrase (comes from a
|
* latter case, it should be interpreted as a phrase (comes from a
|
||||||
* user-entered "quoted string").
|
* user-entered "quoted string"), except if the modifier flags say otherwise.
|
||||||
*
|
*
|
||||||
* Some fields only make sense either for compound or LEAF queries. This
|
* Some fields only make sense either for compound or LEAF queries. This
|
||||||
* is commented for each. We should subclass really.
|
* is commented for each. We should subclass really.
|
||||||
|
*
|
||||||
|
* Note that wasaStringToQuery supposedly parses the whole Xesam
|
||||||
|
* User Search Language v 0.95, but that some elements are dropped or
|
||||||
|
* ignored during the translation to a native Recoll query in wasaToRcl
|
||||||
*/
|
*/
|
||||||
class WasaQuery {
|
class WasaQuery {
|
||||||
public:
|
public:
|
||||||
|
/** Type of this element: leaf or AND/OR chain */
|
||||||
enum Op {OP_NULL, OP_LEAF, OP_EXCL, OP_OR, OP_AND};
|
enum Op {OP_NULL, OP_LEAF, OP_EXCL, OP_OR, OP_AND};
|
||||||
|
/** Relation to be searched between field and value. Recoll actually only
|
||||||
|
supports "contain" */
|
||||||
|
enum Rel {REL_NULL, REL_EQUALS, REL_CONTAINS, REL_LT, REL_LTE,
|
||||||
|
REL_GT, REL_GTE};
|
||||||
|
/** Modifiers for term handling: case/diacritics handling,
|
||||||
|
stemming control */
|
||||||
|
enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4,
|
||||||
|
WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20,
|
||||||
|
WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100,
|
||||||
|
WQM_FUZZY = 0x200};
|
||||||
|
|
||||||
typedef vector<WasaQuery*> subqlist_t;
|
typedef vector<WasaQuery*> subqlist_t;
|
||||||
|
|
||||||
WasaQuery()
|
WasaQuery()
|
||||||
@ -59,6 +77,8 @@ public:
|
|||||||
|
|
||||||
/** Field specification if any (e.g. title, author ...). Only for OP_LEAF */
|
/** Field specification if any (e.g. title, author ...). Only for OP_LEAF */
|
||||||
string m_fieldspec;
|
string m_fieldspec;
|
||||||
|
/** Relation between field and value: =, :, <,>,<=, >= */
|
||||||
|
WasaQuery::Rel m_rel;
|
||||||
|
|
||||||
/* String value. Valid for op == OP_LEAF or EXCL */
|
/* String value. Valid for op == OP_LEAF or EXCL */
|
||||||
string m_value;
|
string m_value;
|
||||||
@ -66,13 +86,7 @@ public:
|
|||||||
/** Subqueries. Valid for conjunctions */
|
/** Subqueries. Valid for conjunctions */
|
||||||
vector<WasaQuery*> m_subs;
|
vector<WasaQuery*> m_subs;
|
||||||
|
|
||||||
/** Modifiers for term handling: case/diacritics handling,
|
unsigned int m_modifiers;
|
||||||
stemming control */
|
|
||||||
enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4,
|
|
||||||
WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20,
|
|
||||||
WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100,
|
|
||||||
WQM_FUZZY = 0x200};
|
|
||||||
unsigned int m_modifiers;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.13 2008-01-16 11:14:38 dockes Exp $ (C) 2006 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.14 2008-08-26 13:47:21 dockes Exp $ (C) 2006 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -68,9 +68,11 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
|
|||||||
LOGINFO(("wasaQueryToRcl: found bad NULL or AND q type in list\n"));
|
LOGINFO(("wasaQueryToRcl: found bad NULL or AND q type in list\n"));
|
||||||
continue;
|
continue;
|
||||||
case WasaQuery::OP_LEAF:
|
case WasaQuery::OP_LEAF:
|
||||||
|
unsigned int mods = (unsigned int)(*it)->m_modifiers;
|
||||||
// Special cases (mime, category, dir filter ...). Not pretty.
|
// Special cases (mime, category, dir filter ...). Not pretty.
|
||||||
if (!stringicmp("mime", (*it)->m_fieldspec)) {
|
if (!stringicmp("mime", (*it)->m_fieldspec) ||
|
||||||
|
!stringicmp("format", (*it)->m_fieldspec)
|
||||||
|
) {
|
||||||
sdata->addFiletype((*it)->m_value);
|
sdata->addFiletype((*it)->m_value);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -95,8 +97,14 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
|
if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
|
||||||
nclause = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
|
int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0;
|
||||||
(*it)->m_value, 0,
|
Rcl::SClType tp = Rcl::SCLT_PHRASE;
|
||||||
|
if (mods & WasaQuery::WQM_PROX) {
|
||||||
|
tp = Rcl::SCLT_NEAR;
|
||||||
|
slack = 10;
|
||||||
|
}
|
||||||
|
nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
|
||||||
|
slack,
|
||||||
(*it)->m_fieldspec);
|
(*it)->m_fieldspec);
|
||||||
} else {
|
} else {
|
||||||
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
|
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user