diff --git a/src/query/wasastringtoquery.cpp b/src/query/wasastringtoquery.cpp
new file mode 100644
index 00000000..d5fa8fe3
--- /dev/null
+++ b/src/query/wasastringtoquery.cpp
@@ -0,0 +1,383 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.1 2006-11-30 18:12:16 dockes Exp $ (C) 2006 J.F.Dockes";
+#endif
+/*
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#ifndef TEST_STRINGTOQUERY
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <regex.h>
+
+#include "wasastringtoquery.h"
+
+// A query node owns its sub-queries: delete them along with the node.
+WasaQuery::~WasaQuery()
+{
+    for (vector<WasaQuery*>::iterator it = m_subs.begin();
+         it != m_subs.end(); ++it) {
+        delete *it;
+    }
+    m_subs.clear();
+}
+
+// Append a parenthesized description of this node and, recursively, of
+// its sub-queries to desc. The previous contents of desc are kept.
+void WasaQuery::describe(string &desc) const
+{
+    desc += "(";
+    switch (m_op) {
+    case OP_NULL:
+        desc += "NULL";
+        break;
+    case OP_LEAF:
+        desc += m_fieldspec.empty() ?
+            m_value : m_fieldspec + ":" + m_value;
+        break;
+    case OP_EXCL:
+        desc += string("NOT (" ) + m_value + ") ";
+        break;
+    case OP_OR:
+    case OP_AND:
+        for (vector<WasaQuery*>::const_iterator it = m_subs.begin();
+             it != m_subs.end(); ++it) {
+            (*it)->describe(desc);
+            // Operator name goes between siblings, not after the last one
+            if (it + 1 != m_subs.end())
+                desc += m_op == OP_OR ? "OR ": "AND ";
+        }
+        break;
+    }
+    desc += ") ";
+}
+
+// The string query parser code:
+
+/* Shamelessly lifted from Beagle:
+ * This is our regular Expression Pattern:
+ * we expect something like this:
+ * -key:"Value String"
+ * key:Value
+ * or
+ * Value
+        ([+-]?)         # Required or Prohibited (optional)
+        (\w+:)?         # Key (optional)
+        (               # Query Text
+         (\"([^\"]*)\"?)# quoted
+         |              # or
+         ([^\s\"]+)     # unquoted
+        )
+        ";
+*/
+
+/* The master regular expression used to parse a query string
+ * Sub-expressions in parenthesis are numbered from 1. Each opening
+ * parenthesis increases the index, but we're not interested in all
+ */
+static const char * parserExpr =
+    "([oO][rR])"                    //1 OR is a special word
+    "|"
+    "("                             //2
+      "([+-])?"                     //3 Force or exclude indicator
+      "("                           //4
+        "([[:alpha:]][[:alnum:]]+)" //5 Field spec: "fieldname:"
+      ":)?"
+      "("                           //6
+        "(\""                       //7
+          "([^\"]+)"                //8 "A quoted term"
+        "\")"
+        "|"
+        "([^[:space:]]+)"           //9 ANormalTerm
+      ")"
+    ")"
+;
+
+// For debugging the parser. But see also NMATCH
+static const char *matchNames[] = {
+    /*0*/ "",
+    /*1*/ "OR",
+    /*2*/ "",
+    /*3*/ "+-",
+    /*4*/ "",
+    /*5*/ "FIELD",
+    /*6*/ "",
+    /*7*/ "",
+    /*8*/ "QUOTEDTERM",
+    /*9*/ "TERM",
+};
+#define NMATCH (sizeof(matchNames) / sizeof(char *))
+
+// Symbolic names for the interesting submatch indices
+enum SbMatchIdx {SMI_OR=1, SMI_PM=3, SMI_FIELD=5, SMI_QUOTED=8, SMI_TERM=9};
+
+static const int errbuflen = 300;
+
+class StringToWasaQuery::Internal {
+public:
+    Internal()
+        : m_cp(0), m_rxneedsfree(false)
+    {}
+    ~Internal()
+    {
+        if (m_rxneedsfree)
+            regfree(&m_rx);
+    }
+
+    // Copy the text of submatch i of the last regexec() call to match.
+    // Returns false, leaving match untouched, if i is out of range or
+    // if the sub-expression did not take part in the match. An empty
+    // submatch is an internal error, which is reported through reason.
+    // The text goes to a string, so that a long term can't overflow a
+    // fixed size buffer.
+    bool checkSubMatch(int i, string& match, string& reason)
+    {
+        if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1)
+            return false;
+        if (m_pmatch[i].rm_eo - m_pmatch[i].rm_so <= 0) {
+            // weird and fatal
+            reason = "Internal regular expression handling error";
+            return false;
+        }
+        match.assign(m_cp + m_pmatch[i].rm_so,
+                     m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
+        return true;
+    }
+
+    WasaQuery *stringToQuery(const string& str, string& reason);
+
+    friend class StringToWasaQuery;
+private:
+    const char *m_cp;
+    regex_t     m_rx;
+    bool        m_rxneedsfree;
+    regmatch_t  m_pmatch[NMATCH];
+};
+
+StringToWasaQuery::StringToWasaQuery()
+    : internal(new Internal)
+{
+}
+
+StringToWasaQuery::~StringToWasaQuery()
+{
+    delete internal;
+}
+
+
+// Parse str. Returns a newly allocated query tree, to be deleted by the
+// caller, or 0 on failure, in which case reason holds an explanation.
+WasaQuery *
+StringToWasaQuery::stringToQuery(const string& str, string& reason)
+{
+    return internal ? internal->stringToQuery(str, reason) : 0;
+}
+
+// Free the partially built trees when parsing fails. Always returns 0 so
+// that the error paths can just "return parseFailed(...)".
+static WasaQuery *parseFailed(WasaQuery *query, WasaQuery *orClause)
+{
+    delete query;
+    delete orClause;
+    return 0;
+}
+
+// The actual parser. The input is split by repeated matches of parserExpr
+// and turned into a two-level tree: a top OP_AND node holding terms
+// (OP_LEAF), exclusions (OP_EXCL) and OR groups (OP_OR). When an OR is
+// met, the term which precedes it is moved to the OR group, which then
+// also receives every term which directly follows an OR.
+// All nodes allocated here are freed if the parse fails.
+WasaQuery *
+StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
+{
+    // Reset the flag too: if regcomp() fails below, there is nothing left
+    // to free, neither in the next call nor in the destructor.
+    if (m_rxneedsfree)
+        regfree(&m_rx);
+    m_rxneedsfree = false;
+
+    char errbuf[errbuflen+1];
+    int errcode;
+    if ((errcode = regcomp(&m_rx, parserExpr, REG_EXTENDED)) != 0) {
+        regerror(errcode, &m_rx, errbuf, errbuflen);
+        reason = errbuf;
+        return 0;
+    }
+    m_rxneedsfree = true;
+
+    m_cp = str.c_str();
+    const char *cpe = m_cp + str.length();
+
+    WasaQuery *query = new WasaQuery;
+    query->m_op = WasaQuery::OP_AND;
+    WasaQuery *orClause = 0;
+    bool prev_or = false;
+
+    // Loop on repeated regexp matches on the main string.
+    for (int loop = 0;;loop++) {
+        errcode = regexec(&m_rx, m_cp, NMATCH, m_pmatch, 0);
+        if (errcode == REG_NOMATCH && loop > 0) {
+            // Any non-blank character would match as a term, so only
+            // white space is left (e.g. a trailing blank). This is the
+            // end of the query, not an error.
+            break;
+        }
+        if (errcode != 0) {
+            regerror(errcode, &m_rx, errbuf, errbuflen);
+            reason = errbuf;
+            return parseFailed(query, orClause);
+        }
+        if (m_pmatch[0].rm_eo <= 0) {
+            // weird and fatal
+            reason = "Internal regular expression handling error";
+            return parseFailed(query, orClause);
+        }
+#if 0
+        if (loop) printf("Next part:\n");
+        for (unsigned int i = 0; i < NMATCH; i++) {
+            if (m_pmatch[i].rm_so == -1) continue;
+            string dbg(m_cp + m_pmatch[i].rm_so,
+                       m_pmatch[i].rm_eo - m_pmatch[i].rm_so);
+            if (matchNames[i][0])
+                printf("%10s: [%s] (%d->%d)\n", matchNames[i], dbg.c_str(),
+                       (int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo);
+        }
+#endif
+        string match;
+        if (checkSubMatch(SMI_OR, match, reason)) {
+            if (prev_or) {
+                // Bad syntax
+                reason = "Bad syntax: consecutive OR";
+                return parseFailed(query, orClause);
+            }
+
+            if (orClause == 0) {
+                // First OR seen: start OR subclause. No need to test
+                // the result: new throws when out of memory.
+                orClause = new WasaQuery;
+                orClause->m_op = WasaQuery::OP_OR;
+            }
+
+            // We need to transfer the previous query from the main vector
+            // to the OR subquery
+            if (!query->m_subs.empty()) {
+                orClause->m_subs.push_back(query->m_subs.back());
+                query->m_subs.pop_back();
+            }
+            prev_or = true;
+
+        } else {
+
+            WasaQuery *nclause = new WasaQuery;
+
+            // Check for quoted or unquoted value
+            if (checkSubMatch(SMI_QUOTED, match, reason)) {
+                nclause->m_value = match;
+            } else if (checkSubMatch(SMI_TERM, match, reason)) {
+                nclause->m_value = match;
+            }
+            if (nclause->m_value.empty()) {
+                // Isolated +- or fieldname: without a value. The parse
+                // fails: make sure that the caller gets an explanation.
+                if (reason.empty())
+                    reason = "Bad syntax: missing value";
+                delete nclause;
+                return parseFailed(query, orClause);
+            }
+
+            // +- indicator ?
+            if (checkSubMatch(SMI_PM, match, reason) && match[0] == '-') {
+                nclause->m_op = WasaQuery::OP_EXCL;
+            } else {
+                nclause->m_op = WasaQuery::OP_LEAF;
+            }
+
+            // Field indicator ?
+            if (checkSubMatch(SMI_FIELD, match, reason)) {
+                nclause->m_fieldspec = match;
+            }
+
+            if (prev_or) {
+                // We're in an OR subquery, add new subquery
+                orClause->m_subs.push_back(nclause);
+            } else {
+                if (orClause) {
+                    // Getting out of OR. Add the OR subquery to the main one
+                    query->m_subs.push_back(orClause);
+                    orClause = 0;
+                }
+                // Add new subquery to main one.
+                query->m_subs.push_back(nclause);
+            }
+            prev_or = false;
+        }
+
+        // Advance current string position. We checked earlier that
+        // the increment is strictly positive, so we won't loop
+        // forever
+        m_cp += m_pmatch[0].rm_eo;
+        if (m_cp >= cpe)
+            break;
+    }
+
+    // The input may end while an OR group is still open ("a OR b"). Add
+    // the group to the main query now, else its terms would be lost.
+    if (orClause) {
+        if (orClause->m_subs.empty())
+            delete orClause;
+        else
+            query->m_subs.push_back(orClause);
+    }
+
+    regfree(&m_rx);
+    m_rxneedsfree = false;
+    return query;
+}
+
+#else // TEST
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "wasastringtoquery.h"
+
+static char *thisprog;
+
+int main(int argc, char **argv)
+{
+    thisprog = argv[0];
+    argc--; argv++;
+
+    if (argc != 1) {
+        fprintf(stderr, "need one arg\n");
+        exit(1);
+    }
+    const string str = *argv++;argc--;
+    string reason;
+    StringToWasaQuery qparser;
+    WasaQuery *q = qparser.stringToQuery(str, reason);
+    if (q == 0) {
+        fprintf(stderr, "stringToQuery failed: %s\n", reason.c_str());
+        exit(1);
+    }
+    string desc;
+    q->describe(desc);
+    printf("%s\n", desc.c_str());
+    delete q;
+    exit(0);
+}
+
+#endif // TEST_STRINGTOQUERY
diff --git a/src/query/wasastringtoquery.h b/src/query/wasastringtoquery.h
new file mode 100644
index 00000000..5afe231f
--- /dev/null
+++ b/src/query/wasastringtoquery.h
@@ -0,0 +1,72 @@
+#ifndef _WASASTRINGTOQUERY_H_INCLUDED_
+#define _WASASTRINGTOQUERY_H_INCLUDED_
+/* @(#$Id: wasastringtoquery.h,v 1.1 2006-11-30 18:12:16 dockes Exp $ (C) 2006 J.F.Dockes */
+/*
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <string>
+#include <vector>
+
+using std::string;
+using std::vector;
+
+// A simple class to represent a parsed wasabi query string. This is a
+// tree node: m_op tells how to interpret it. OP_LEAF and OP_EXCL nodes
+// carry a value (and possibly a field name), OP_AND and OP_OR nodes carry
+// sub-queries in m_subs. A node owns its sub-queries and deletes them
+// when it is destroyed.
+class WasaQuery {
+public:
+    enum Op {OP_NULL, OP_LEAF, OP_EXCL, OP_OR, OP_AND};
+    typedef vector<WasaQuery*> subqlist_t;
+
+    WasaQuery() : m_op(OP_NULL) {}
+    ~WasaQuery();
+
+    // Get string describing this query
+    void describe(string &desc) const;
+
+    WasaQuery::Op      m_op;
+    string             m_fieldspec;
+    vector<WasaQuery*> m_subs;
+    string             m_value;
+
+private:
+    // Not implemented: copying a node would lead to its sub-queries
+    // being deleted twice.
+    WasaQuery(const WasaQuery&);
+    WasaQuery& operator=(const WasaQuery&);
+};
+
+
+// Wasabi query string parser class.
+class StringToWasaQuery {
+public:
+    StringToWasaQuery();
+    ~StringToWasaQuery();
+    // Parse str. Returns a newly allocated query, which the caller must
+    // delete, or 0 on error, with an explanation in reason.
+    WasaQuery *stringToQuery(const string& str, string& reason);
+    class Internal;
+private:
+    Internal *internal;
+    // Not implemented: a copy would delete the same Internal twice.
+    StringToWasaQuery(const StringToWasaQuery&);
+    StringToWasaQuery& operator=(const StringToWasaQuery&);
+};
+
+#endif /* _WASASTRINGTOQUERY_H_INCLUDED_ */
diff --git a/src/query/wasatorcl.cpp b/src/query/wasatorcl.cpp
new file mode 100644
index 00000000..9ec86545
--- /dev/null
+++ b/src/query/wasatorcl.cpp
@@ -0,0 +1,162 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: wasatorcl.cpp,v 1.1 2006-11-30 18:12:16 dockes Exp $ (C) 2006 J.F.Dockes";
+#endif
+#ifndef TEST_WASATORCL
+
+#include "wasastringtoquery.h"
+#include "rcldb.h"
+#include "searchdata.h"
+#include "wasatorcl.h"
+
+// Translate a parsed wasabi query to a Rcl::SearchData object. Only the
+// children of the top node are used (its own operator is not checked),
+// and the field names are ignored for now. The returned object is
+// allocated with new; 0 is returned if wasa is 0.
+Rcl::SearchData *wasatorcl(WasaQuery *wasa)
+{
+    if (wasa == 0)
+        return 0;
+
+    Rcl::SearchData *sdata = new Rcl::SearchData(Rcl::SCLT_AND);
+
+    WasaQuery::subqlist_t::iterator it;
+    for (it = wasa->m_subs.begin(); it != wasa->m_subs.end(); ++it) {
+        switch ((*it)->m_op) {
+        case WasaQuery::OP_NULL:
+        case WasaQuery::OP_AND:
+        default:
+            // The parser does not generate these below the top node: skip
+            continue;
+        case WasaQuery::OP_LEAF:
+            // A value with embedded white space was a quoted string
+            if ((*it)->m_value.find_first_of(" \t\n\r") != string::npos) {
+                sdata->addClause
+                    (new Rcl::SearchDataClauseDist(Rcl::SCLT_PHRASE,
+                                                   (*it)->m_value, 0));
+            } else {
+                sdata->addClause
+                    (new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
+                                                     (*it)->m_value));
+            }
+            break;
+        case WasaQuery::OP_EXCL:
+            // Note: have to add dquotes which will be translated to
+            // phrase if there are several words in there. Not pretty
+            // but should work
+            sdata->addClause
+                (new Rcl::SearchDataClauseSimple(Rcl::SCLT_EXCL,
+                                                 string("\"") +
+                                                 (*it)->m_value + "\""));
+            break;
+        case WasaQuery::OP_OR:
+            // Concatenate all OR values as phrases. Hope there are no
+            // stray dquotes in there
+            {
+                string orvalue;
+                WasaQuery::subqlist_t::iterator orit;
+                for (orit = (*it)->m_subs.begin();
+                     orit != (*it)->m_subs.end(); ++orit) {
+                    orvalue += string("\"") + (*orit)->m_value + "\"";
+                }
+                sdata->addClause
+                    (new Rcl::SearchDataClauseSimple(Rcl::SCLT_OR,
+                                                     orvalue));
+            }
+            // Don't fall through if a case ever gets added below
+            break;
+        }
+    }
+
+    return sdata;
+}
+
+
+#else // TEST
+
+#ifdef HAVE_CONFIG_H
+#include "autoconfig.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+#include "debuglog.h"
+#include "rclinit.h"
+#include "rclconfig.h"
+#include "rcldb.h"
+#include "searchdata.h"
+#include "refcntr.h"
+#include "wasastringtoquery.h"
+#include "wasatorcl.h"
+
+static char *thisprog;
+
+int main(int argc, char *argv[])
+{
+    thisprog = argv[0];
+    argc--; argv++;
+
+    if (argc != 1) {
+        fprintf(stderr, "need one arg\n");
+        exit(1);
+    }
+    const string str = *argv++;argc--;
+    string reason;
+
+    RclConfig *config = recollinit(RCLINIT_NONE, 0, 0, reason, 0);
+    if (config == 0 || !config->ok()) {
+        cerr << "Configuration problem: " << reason << endl;
+        exit(1);
+    }
+    string dbdir = config->getDbDir();
+    if (dbdir.empty()) {
+        // Note: this will have to be replaced by a call to a
+        // configuration building dialog for initial configuration
+        cerr << "Configuration problem: " << "No dbdir" << endl;
+        exit(1);
+    }
+    Rcl::Db rcldb;
+    if (!rcldb.open(dbdir, Rcl::Db::DbRO, 0)) {
+        cerr << "Could not open database in " << dbdir << endl;
+        return 1;
+    }
+
+    StringToWasaQuery qparser;
+    WasaQuery *wq = qparser.stringToQuery(str, reason);
+    if (wq == 0) {
+        fprintf(stderr, "wasastringtoquery failed: %s\n", reason.c_str());
+        return 1;
+    }
+    string desc;
+    wq->describe(desc);
+    cout << endl << "Wasabi query description: " << desc << endl << endl;
+
+    Rcl::SearchData *sdata = wasatorcl(wq);
+    RefCntr<Rcl::SearchData> rq(sdata);
+    if (!rcldb.setQuery(rq)) {
+        cerr << "setQuery failed" << endl;
+        return 1;
+    }
+    int maxi = rcldb.getResCnt() > 10 ? 10 : rcldb.getResCnt();
+
+    cout << endl << "Rcl Query description: " << sdata->getDescription()
+         << endl << endl << "Results: " << endl;
+
+    for (int i = 0; i < maxi ; i++) {
+        Rcl::Doc doc;
+        if (!rcldb.getDoc(i, doc)) {
+            cerr << "getDoc failed" << endl;
+            return 1;
+        }
+        cout << i << ": " << doc.url << endl;
+    }
+    return 0;
+}
+#endif // TEST_WASATORCL
diff --git a/src/query/wasatorcl.h b/src/query/wasatorcl.h
new file mode 100644
index 00000000..13514673
--- /dev/null
+++ b/src/query/wasatorcl.h
@@ -0,0 +1,15 @@
+#ifndef _WASATORCL_H_INCLUDED_
+#define _WASATORCL_H_INCLUDED_
+/* @(#$Id: wasatorcl.h,v 1.1 2006-11-30 18:12:16 dockes Exp $ (C) 2006 J.F.Dockes */
+
+// Forward declarations, so that this file can be included on its own
+class WasaQuery;
+namespace Rcl {
+    class SearchData;
+}
+
+// Translate a parsed wasabi query to a newly allocated Rcl::SearchData.
+// Returns 0 if wasa is 0.
+extern Rcl::SearchData *wasatorcl(WasaQuery *wasa);
+
+#endif /* _WASATORCL_H_INCLUDED_ */