try to parse the whole of Xesam user language 0.95

This commit is contained in:
dockes 2008-08-26 13:47:21 +00:00
parent b43d8ff1a7
commit a41eb8eef1
3 changed files with 121 additions and 59 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.7 2008-07-01 11:51:51 dockes Exp $ (C) 2006 J.F.Dockes"; static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.8 2008-08-26 13:47:21 dockes Exp $ (C) 2006 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -41,10 +41,24 @@ WasaQuery::~WasaQuery()
m_subs.clear(); m_subs.clear();
} }
/* Translate a field/value relation into the operator text appended to the
 * field name when describing a query. Relations with no symbol of their
 * own (REL_NULL or any unexpected value) are rendered as "?". */
static const char* reltosrel(WasaQuery::Rel rel)
{
    // Relation code -> operator symbol lookup table
    static const struct {
        WasaQuery::Rel code;
        const char *symbol;
    } relsymbols[] = {
        {WasaQuery::REL_EQUALS,   "="},
        {WasaQuery::REL_CONTAINS, ":"},
        {WasaQuery::REL_LT,       "<"},
        {WasaQuery::REL_LTE,      "<="},
        {WasaQuery::REL_GT,       ">"},
        {WasaQuery::REL_GTE,      ">="},
    };
    const unsigned int nsymbols = sizeof(relsymbols) / sizeof(relsymbols[0]);

    for (unsigned int idx = 0; idx < nsymbols; idx++) {
        if (relsymbols[idx].code == rel)
            return relsymbols[idx].symbol;
    }
    // No dedicated symbol for this relation
    return "?";
}
void WasaQuery::describe(string &desc) const void WasaQuery::describe(string &desc) const
{ {
desc += "("; desc += "(";
string fieldspec = m_fieldspec.empty() ? string() : m_fieldspec + ": "; string fieldspec = m_fieldspec.empty() ? string() : m_fieldspec +
reltosrel(m_rel);
switch (m_op) { switch (m_op) {
case OP_NULL: case OP_NULL:
desc += "NULL"; desc += "NULL";
@ -71,6 +85,7 @@ void WasaQuery::describe(string &desc) const
desc.erase(desc.length() - 1); desc.erase(desc.length() - 1);
desc += ")"; desc += ")";
if (m_modifiers != 0) { if (m_modifiers != 0) {
if (m_modifiers & WQM_BOOST) desc += "BOOST|";
if (m_modifiers & WQM_CASESENS) desc += "CASESENS|"; if (m_modifiers & WQM_CASESENS) desc += "CASESENS|";
if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|"; if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|";
if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|"; if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|";
@ -96,14 +111,6 @@ void WasaQuery::describe(string &desc) const
* key:Value * key:Value
* or * or
* Value * Value
([+-]?) # Required or Prohibited (optional)
(\w+:)? # Key (optional)
( # Query Text
(\"([^\"]*)\"?)# quoted
| # or
([^\s\"]+) # unquoted
)
";
*/ */
/* The master regular expression used to parse a query string /* The master regular expression used to parse a query string
@ -113,41 +120,47 @@ void WasaQuery::describe(string &desc) const
static const char * parserExpr = static const char * parserExpr =
"([oO][rR]|\\|\\|)[[:space:]]*" //1 OR,or,|| "([oO][rR]|\\|\\|)[[:space:]]*" //1 OR,or,||
"|" "|"
"(" //2 "([Aa][Nn][Dd]|&&)[[:space:]]*" // 2 AND,and,&& (ignored, default)
"([+-])?" //3 Force or exclude indicator "|"
"(" //4 "(" //3
"([[:alpha:]][[:alnum:]]*)" //5 Field spec: "fieldname:" "([+-])?" //4 Force or exclude indicator
":)?" "(" //5
"(" //6 "([[:alpha:]][[:alnum:]:]*)" //6 Field spec: ie: "dc:title:letitre"
"(\"" //7 "[[:space:]]*"
"([^\"]+)" //8 "A quoted term" "(:|=|<|>|<=|>=)" //7 Relation
"[[:space:]]*)?"
"(" //8
"(\"" //9
"([^\"]+)" //10 "A quoted term"
"\")" "\")"
"([a-zA-Z0-9]*)" //9 modifiers "([a-zA-Z0-9]*)" //11 modifiers
"|" "|"
"([^[:space:]\"]+)" //10 ANormalTerm "([^[:space:]\"]+)" //12 ANormalTerm
")" ")"
")[[:space:]]*" ")[[:space:]]*"
; ;
// For debugging the parser. But see also NMATCH // For debugging the parser. But see also NMATCH
static const char *matchNames[] = { static const char *matchNames[] = {
/*0*/ "", /* 0*/ "",
/*1*/ "OR", /* 1*/ "OR",
/*2*/ "", /* 2*/ "AND",
/*3*/ "+-", /* 3*/ "",
/*4*/ "", /* 4*/ "+-",
/*5*/ "FIELD", /* 5*/ "",
/*6*/ "", /* 6*/ "FIELD",
/*7*/ "", /* 7*/ "RELATION",
/*8*/ "QUOTEDTERM", /* 8*/ "",
/*9*/ "MODIIFIERS", /* 9*/ "",
/*10*/ "TERM", /*10*/ "QUOTEDTERM",
/*11*/ "MODIIFIERS",
/*12*/ "TERM",
}; };
#define NMATCH (sizeof(matchNames) / sizeof(char *)) #define NMATCH (sizeof(matchNames) / sizeof(char *))
// Symbolic names for the interesting submatch indices // Symbolic names for the interesting submatch indices
enum SbMatchIdx {SMI_OR=1, SMI_PM=3, SMI_FIELD=5, SMI_QUOTED=8, enum SbMatchIdx {SMI_OR=1, SMI_AND=2, SMI_PM=4, SMI_FIELD=6, SMI_REL=7,
SMI_MODIF=9, SMI_TERM=10}; SMI_QUOTED=10, SMI_MODIF=11, SMI_TERM=12};
static const int maxmatchlen = 1024; static const int maxmatchlen = 1024;
static const int errbuflen = 300; static const int errbuflen = 300;
@ -284,6 +297,10 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
} }
prev_or = true; prev_or = true;
} else if (checkSubMatch(SMI_AND, match, reason)) {
// Do nothing, AND is the default. We might want to check for
// errors like consecutive ANDs, or OR AND
} else { } else {
WasaQuery *nclause = new WasaQuery; WasaQuery *nclause = new WasaQuery;
@ -312,19 +329,20 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
unsigned int mods = 0; unsigned int mods = 0;
for (unsigned int i = 0; i < strlen(match); i++) { for (unsigned int i = 0; i < strlen(match); i++) {
switch (match[i]) { switch (match[i]) {
case 'C': mods |= WasaQuery::WQM_CASESENS; break;
case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
case 'e': mods |= WasaQuery::WQM_CASESENS |
WasaQuery::WQM_DIACSENS |
WasaQuery::WQM_NOSTEM; break;
case 'f': mods |= WasaQuery::WQM_FUZZY; break;
case 'b': mods |= WasaQuery::WQM_BOOST; break; case 'b': mods |= WasaQuery::WQM_BOOST; break;
case 'c': break;
case 'C': mods |= WasaQuery::WQM_CASESENS; break;
case 'd': break;
case 'D': mods |= WasaQuery::WQM_DIACSENS; break;
case 'e': mods |= WasaQuery::WQM_CASESENS | WasaQuery::WQM_DIACSENS | WasaQuery::WQM_NOSTEM; break;
case 'f': mods |= WasaQuery::WQM_FUZZY; break;
case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
case 'L': break;
case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break;
case 'p': mods |= WasaQuery::WQM_PROX; break; case 'p': mods |= WasaQuery::WQM_PROX; break;
case 'r': mods |= WasaQuery::WQM_REGEX; break;
case 's': mods |= WasaQuery::WQM_SLOPPY; break; case 's': mods |= WasaQuery::WQM_SLOPPY; break;
case 'w': mods |= WasaQuery::WQM_WORDS; break; case 'w': mods |= WasaQuery::WQM_WORDS; break;
case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break;
case 'r': mods |= WasaQuery::WQM_REGEX; break;
} }
} }
nclause->m_modifiers = WasaQuery::Modifier(mods); nclause->m_modifiers = WasaQuery::Modifier(mods);
@ -336,6 +354,29 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
// etc. here but this went away from the spec. See 1.4 // etc. here but this went away from the spec. See 1.4
// if it comes back // if it comes back
nclause->m_fieldspec = match; nclause->m_fieldspec = match;
if (checkSubMatch(SMI_REL, match, reason)) {
switch (match[0]) {
case '=':nclause->m_rel = WasaQuery::REL_EQUALS;break;
case ':':nclause->m_rel = WasaQuery::REL_CONTAINS;break;
case '<':
if (match[1] == '=')
nclause->m_rel = WasaQuery::REL_LTE;
else
nclause->m_rel = WasaQuery::REL_LT;
break;
case '>':
if (match[1] == '=')
nclause->m_rel = WasaQuery::REL_GTE;
else
nclause->m_rel = WasaQuery::REL_GT;
break;
default:
nclause->m_rel = WasaQuery::REL_CONTAINS;
}
} else {
// ?? If field matched we should have a relation
nclause->m_rel = WasaQuery::REL_CONTAINS;
}
} }
// +- indicator ? // +- indicator ?
@ -345,7 +386,6 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
nclause->m_op = WasaQuery::OP_LEAF; nclause->m_op = WasaQuery::OP_LEAF;
} }
if (prev_or) { if (prev_or) {
// The preceding token was an OR, add new clause to or chain // The preceding token was an OR, add new clause to or chain
//DPRINT((stderr, "Adding to OR chain\n")); //DPRINT((stderr, "Adding to OR chain\n"));

View File

@ -1,6 +1,6 @@
#ifndef _WASASTRINGTOQUERY_H_INCLUDED_ #ifndef _WASASTRINGTOQUERY_H_INCLUDED_
#define _WASASTRINGTOQUERY_H_INCLUDED_ #define _WASASTRINGTOQUERY_H_INCLUDED_
/* @(#$Id: wasastringtoquery.h,v 1.6 2008-01-17 11:14:13 dockes Exp $ (C) 2006 J.F.Dockes */ /* @(#$Id: wasastringtoquery.h,v 1.7 2008-08-26 13:47:21 dockes Exp $ (C) 2006 J.F.Dockes */
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@ -23,10 +23,12 @@
using std::string; using std::string;
using std::vector; using std::vector;
/* Note: Xesam used to be named wasabi. We changed the references to wasabi in
the comments, but not the code */
/** /**
* A simple class to represent a parsed wasabiSimple query element. * A simple class to represent a parsed Xesam user language element.
* Can hold a string value or an array of subqueries. * Can hold one leaf element or an array of subqueries to be joined by AND/OR
* *
* The complete query is represented by a top WasaQuery holding a * The complete query is represented by a top WasaQuery holding a
* chain of ANDed subclauses. Some of the subclauses may be themselves * chain of ANDed subclauses. Some of the subclauses may be themselves
@ -35,14 +37,30 @@ using std::vector;
* *
* For LEAF elements, the value can hold one or several words. In the * For LEAF elements, the value can hold one or several words. In the
* latter case, it should be interpreted as a phrase (comes from a * latter case, it should be interpreted as a phrase (comes from a
* user-entered "quoted string"). * user-entered "quoted string"), except if the modifier flags say otherwise.
* *
* Some fields only make sense either for compound or LEAF queries. This * Some fields only make sense either for compound or LEAF queries. This
* is commented for each. We should subclass really. * is commented for each. We should subclass really.
*
* Note that wasaStringToQuery supposedly parses the whole Xesam
* User Search Language v 0.95, but that some elements are dropped or
* ignored during the translation to a native Recoll query in wasaToRcl
*/ */
class WasaQuery { class WasaQuery {
public: public:
/** Type of this element: leaf or AND/OR chain */
enum Op {OP_NULL, OP_LEAF, OP_EXCL, OP_OR, OP_AND}; enum Op {OP_NULL, OP_LEAF, OP_EXCL, OP_OR, OP_AND};
/** Relation to be searched between field and value. Recoll actually only
supports "contain" */
enum Rel {REL_NULL, REL_EQUALS, REL_CONTAINS, REL_LT, REL_LTE,
REL_GT, REL_GTE};
/** Modifiers for term handling: case/diacritics handling,
stemming control */
enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4,
WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20,
WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100,
WQM_FUZZY = 0x200};
typedef vector<WasaQuery*> subqlist_t; typedef vector<WasaQuery*> subqlist_t;
WasaQuery() WasaQuery()
@ -59,6 +77,8 @@ public:
/** Field specification if any (ie: title, author ...) Only OP_LEAF */ /** Field specification if any (ie: title, author ...) Only OP_LEAF */
string m_fieldspec; string m_fieldspec;
/** Relation between field and value: =, :, <,>,<=, >= */
WasaQuery::Rel m_rel;
/* String value. Valid for op == OP_LEAF or EXCL */ /* String value. Valid for op == OP_LEAF or EXCL */
string m_value; string m_value;
@ -66,13 +86,7 @@ public:
/** Subqueries. Valid for conjunctions */ /** Subqueries. Valid for conjunctions */
vector<WasaQuery*> m_subs; vector<WasaQuery*> m_subs;
/** Modifiers for term handling: case/diacritics handling, unsigned int m_modifiers;
stemming control */
enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4,
WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20,
WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100,
WQM_FUZZY = 0x200};
unsigned int m_modifiers;
}; };
/** /**

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.13 2008-01-16 11:14:38 dockes Exp $ (C) 2006 J.F.Dockes"; static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.14 2008-08-26 13:47:21 dockes Exp $ (C) 2006 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -68,9 +68,11 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
LOGINFO(("wasaQueryToRcl: found bad NULL or AND q type in list\n")); LOGINFO(("wasaQueryToRcl: found bad NULL or AND q type in list\n"));
continue; continue;
case WasaQuery::OP_LEAF: case WasaQuery::OP_LEAF:
unsigned int mods = (unsigned int)(*it)->m_modifiers;
// Special cases (mime, category, dir filter ...). Not pretty. // Special cases (mime, category, dir filter ...). Not pretty.
if (!stringicmp("mime", (*it)->m_fieldspec)) { if (!stringicmp("mime", (*it)->m_fieldspec) ||
!stringicmp("format", (*it)->m_fieldspec)
) {
sdata->addFiletype((*it)->m_value); sdata->addFiletype((*it)->m_value);
break; break;
} }
@ -95,8 +97,14 @@ Rcl::SearchData *wasaQueryToRcl(WasaQuery *wasa)
} }
if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) { if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
nclause = new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE, int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0;
(*it)->m_value, 0, Rcl::SClType tp = Rcl::SCLT_PHRASE;
if (mods & WasaQuery::WQM_PROX) {
tp = Rcl::SCLT_NEAR;
slack = 10;
}
nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
slack,
(*it)->m_fieldspec); (*it)->m_fieldspec);
} else { } else {
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,