#ifndef lint static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.10 2008-12-16 08:53:49 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #ifndef TEST_WASASTRINGTOQUERY #include #include #include #include #include "wasastringtoquery.h" // #define DEB_WASASTRINGTOQ 1 #ifdef DEB_WASASTRINGTOQ #define DPRINT(X) fprintf X #define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());} #else #define DPRINT(X) #define DUMPQ(Q) #endif WasaQuery::~WasaQuery() { for (vector::iterator it = m_subs.begin(); it != m_subs.end(); it++) { delete *it; } m_subs.clear(); } static const char* reltosrel(WasaQuery::Rel rel) { switch (rel) { case WasaQuery::REL_EQUALS: return "="; case WasaQuery::REL_CONTAINS: return ":"; case WasaQuery::REL_LT: return "<"; case WasaQuery::REL_LTE: return "<="; case WasaQuery::REL_GT: return ">"; case WasaQuery::REL_GTE: return ">="; default: return "?"; } } void WasaQuery::describe(string &desc) const { desc += "("; string fieldspec = m_fieldspec.empty() ? string() : m_fieldspec + reltosrel(m_rel); switch (m_op) { case OP_NULL: desc += "NULL"; break; case OP_LEAF: desc += fieldspec + m_value; break; case OP_EXCL: desc += string("NOT (" ) + fieldspec + m_value + ") "; break; case OP_OR: case OP_AND: for (vector::const_iterator it = m_subs.begin(); it != m_subs.end(); it++) { (*it)->describe(desc); vector::const_iterator it1 = it; it1++; if (it1 != m_subs.end()) desc += m_op == OP_OR ? "OR ": "AND "; } break; } if (desc[desc.length() - 1] == ' ') desc.erase(desc.length() - 1); desc += ")"; if (m_modifiers != 0) { if (m_modifiers & WQM_BOOST) desc += "BOOST|"; if (m_modifiers & WQM_CASESENS) desc += "CASESENS|"; if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|"; if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|"; if (m_modifiers & WQM_BOOST) desc += "BOOST|"; if (m_modifiers & WQM_PROX) desc += "PROX|"; if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|"; if (m_modifiers & WQM_WORDS) desc += "WORDS|"; if (m_modifiers & WQM_PHRASESLACK) desc += "PHRASESLACK|"; if (m_modifiers & WQM_REGEX) desc += "REGEX|"; if (m_modifiers & WQM_FUZZY) desc += "FUZZY|"; if (desc.length() > 0 && desc[desc.length()-1] == '|') desc = desc.substr(0, desc.length()-1); } desc += " "; } // The string query parser code: /* Shamelessly lifted from Beagle: * This is our regular Expression Pattern: * we expect something like this: * -key:"Value String"modifiers * key:Value * or * Value */ /* The master regular expression used to parse a query string * Sub-expressions in parenthesis are numbered from 1. Each opening * parenthesis increases the index, but we're not interested in all */ static const char * parserExpr = "(OR|\\|\\|)[[:space:]]*" //1 OR,|| "|" "(AND|&&)[[:space:]]*" // 2 AND,&& (ignored, default) "|" "(" //3 "([+-])?" //4 Force or exclude indicator "(" //5 "([[:alpha:]][[:alnum:]:]*)" //6 Field spec: ie: "dc:title:letitre" "[[:space:]]*" "(:|=|<|>|<=|>=)" //7 Relation "[[:space:]]*)?" "(" //8 "(\"" //9 "([^\"]+)" //10 "A quoted term" "\")" "([a-zA-Z0-9]*)" //11 modifiers "|" "([^[:space:]\"]+)" //12 ANormalTerm ")" ")[[:space:]]*" ; // For debugging the parser. But see also NMATCH static const char *matchNames[] = { /* 0*/ "", /* 1*/ "OR", /* 2*/ "AND", /* 3*/ "", /* 4*/ "+-", /* 5*/ "", /* 6*/ "FIELD", /* 7*/ "RELATION", /* 8*/ "", /* 9*/ "", /*10*/ "QUOTEDTERM", /*11*/ "MODIIFIERS", /*12*/ "TERM", }; #define NMATCH (sizeof(matchNames) / sizeof(char *)) // Symbolic names for the interesting submatch indices enum SbMatchIdx {SMI_OR=1, SMI_AND=2, SMI_PM=4, SMI_FIELD=6, SMI_REL=7, SMI_QUOTED=10, SMI_MODIF=11, SMI_TERM=12}; static const int maxmatchlen = 1024; static const int errbuflen = 300; class StringToWasaQuery::Internal { public: Internal() : m_rxneedsfree(false) {} ~Internal() { if (m_rxneedsfree) regfree(&m_rx); } bool checkSubMatch(int i, char *match, string& reason) { if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1) { //DPRINT((stderr, "checkSubMatch: no match: i %d rm_so %d\n", //i, m_pmatch[i].rm_so)); return false; } if (m_pmatch[i].rm_eo - m_pmatch[i].rm_so <= 0) { // weird and fatal reason = "Internal regular expression handling error"; return false; } //DPRINT((stderr, "checkSubMatch: so %d eo %d\n", m_pmatch[i].rm_so, //m_pmatch[i].rm_eo)); memcpy(match, m_cp + m_pmatch[i].rm_so, m_pmatch[i].rm_eo - m_pmatch[i].rm_so); match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0; return true; } WasaQuery *stringToQuery(const string& str, string& reason); friend class StringToWasaQuery; private: const char *m_cp; regex_t m_rx; bool m_rxneedsfree; regmatch_t m_pmatch[NMATCH]; }; StringToWasaQuery::StringToWasaQuery() : internal(new Internal) { } StringToWasaQuery::~StringToWasaQuery() { delete internal; } WasaQuery * StringToWasaQuery::stringToQuery(const string& str, string& reason) { return internal ? internal->stringToQuery(str, reason) : 0; } WasaQuery * StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) { if (m_rxneedsfree) regfree(&m_rx); char errbuf[errbuflen+1]; int errcode; if ((errcode = regcomp(&m_rx, parserExpr, REG_EXTENDED)) != 0) { regerror(errcode, &m_rx, errbuf, errbuflen); reason = errbuf; return 0; } m_rxneedsfree = true; const char *cpe; m_cp = str.c_str(); cpe = str.c_str() + str.length(); WasaQuery *query = new WasaQuery; query->m_op = WasaQuery::OP_AND; WasaQuery *orChain = 0; bool prev_or = false; // Loop on repeated regexp matches on the main string. for (int loop = 0;;loop++) { if ((errcode = regexec(&m_rx, m_cp, NMATCH, m_pmatch, 0))) { regerror(errcode, &m_rx, errbuf, errbuflen); reason = errbuf; return 0; } if (m_pmatch[0].rm_eo <= 0) { // weird and fatal reason = "Internal regular expression handling error"; return 0; } #ifdef DEB_WASASTRINGTOQ DPRINT((stderr, "Next part:\n")); for (unsigned int i = 0; i < NMATCH; i++) { if (m_pmatch[i].rm_so == -1) continue; char match[maxmatchlen+1]; memcpy(match, m_cp + m_pmatch[i].rm_so, m_pmatch[i].rm_eo - m_pmatch[i].rm_so); match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0; if (matchNames[i][0]) DPRINT((stderr, "%10s: [%s] (%d->%d)\n", matchNames[i], match, (int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo)); } #endif char match[maxmatchlen+1]; if (checkSubMatch(SMI_OR, match, reason)) { if (prev_or) { // Bad syntax reason = "Bad syntax: consecutive OR"; return 0; } if (orChain == 0) { // Fist OR seen: start OR subclause. if ((orChain = new WasaQuery()) == 0) { reason = "Out of memory"; return 0; } orChain->m_op = WasaQuery::OP_OR; } // For the first OR, we need to transfer the previous // query from the main vector to the OR subquery if (orChain->m_subs.empty() && !query->m_subs.empty()) { orChain->m_subs.push_back(query->m_subs.back()); query->m_subs.pop_back(); } prev_or = true; } else if (checkSubMatch(SMI_AND, match, reason)) { // Do nothing, AND is the default. We might want to check for // errors like consecutive ANDs, or OR AND } else { WasaQuery *nclause = new WasaQuery; if (nclause == 0) { reason = "Out of memory"; return 0; } // Check for quoted or unquoted value if (checkSubMatch(SMI_QUOTED, match, reason)) { nclause->m_value = match; } else if (checkSubMatch(SMI_TERM, match, reason)) { nclause->m_value = match; } if (nclause->m_value.empty()) { // Isolated +- or fieldname: without a value. Ignore until // told otherwise. DPRINT((stderr, "Clause with empty value, skipping\n")); delete nclause; goto nextfield; } if (checkSubMatch(SMI_MODIF, match, reason)) { DPRINT((stderr, "Got modifiers: [%s]\n", match)); unsigned int mods = 0; for (unsigned int i = 0; i < strlen(match); i++) { switch (match[i]) { case 'b': mods |= WasaQuery::WQM_BOOST; break; case 'c': break; case 'C': mods |= WasaQuery::WQM_CASESENS; break; case 'd': break; case 'D': mods |= WasaQuery::WQM_DIACSENS; break; case 'e': mods |= WasaQuery::WQM_CASESENS | WasaQuery::WQM_DIACSENS | WasaQuery::WQM_NOSTEM; break; case 'f': mods |= WasaQuery::WQM_FUZZY; break; case 'l': mods |= WasaQuery::WQM_NOSTEM; break; case 'L': break; case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break; case 'p': mods |= WasaQuery::WQM_PROX; break; case 'r': mods |= WasaQuery::WQM_REGEX; break; case 's': mods |= WasaQuery::WQM_SLOPPY; break; case 'w': mods |= WasaQuery::WQM_WORDS; break; } } nclause->m_modifiers = WasaQuery::Modifier(mods); } // Field indicator ? if (checkSubMatch(SMI_FIELD, match, reason)) { // We used Check for special fields indicating sorting // etc. here but this went away from the spec. See 1.4 // if it comes back nclause->m_fieldspec = match; if (checkSubMatch(SMI_REL, match, reason)) { switch (match[0]) { case '=':nclause->m_rel = WasaQuery::REL_EQUALS;break; case ':':nclause->m_rel = WasaQuery::REL_CONTAINS;break; case '<': if (match[1] == '=') nclause->m_rel = WasaQuery::REL_LTE; else nclause->m_rel = WasaQuery::REL_LT; break; case '>': if (match[1] == '=') nclause->m_rel = WasaQuery::REL_GTE; else nclause->m_rel = WasaQuery::REL_GT; break; default: nclause->m_rel = WasaQuery::REL_CONTAINS; } } else { // ?? If field matched we should have a relation nclause->m_rel = WasaQuery::REL_CONTAINS; } } // +- indicator ? if (checkSubMatch(SMI_PM, match, reason) && match[0] == '-') { nclause->m_op = WasaQuery::OP_EXCL; } else { nclause->m_op = WasaQuery::OP_LEAF; } if (prev_or) { // The precedent token was an OR, add new clause to or chain //DPRINT((stderr, "Adding to OR chain\n")); orChain->m_subs.push_back(nclause); } else { if (orChain) { // Getting out of OR. Add the OR subquery to the main one //DPRINT((stderr, "Adding OR chain to main\n")); query->m_subs.push_back(orChain); orChain = 0; } //DPRINT((stderr, "Adding to main chain\n")); // Add new clause to main query query->m_subs.push_back(nclause); } prev_or = false; } nextfield: // Advance current string position. We checked earlier that // the increment is strictly positive, so we won't loop // forever m_cp += m_pmatch[0].rm_eo; if (m_cp >= cpe) break; } if (orChain) { // Getting out of OR. Add the OR subquery to the main one DPRINT((stderr, "Adding OR chain to main.Before: \n")); DUMPQ(query); DUMPQ(orChain); query->m_subs.push_back(orChain); } regfree(&m_rx); m_rxneedsfree = false; return query; } #else // TEST #include #include "wasastringtoquery.h" static char *thisprog; int main(int argc, char **argv) { thisprog = argv[0]; argc--; argv++; if (argc != 1) { fprintf(stderr, "need one arg\n"); exit(1); } const string str = *argv++;argc--; string reason; StringToWasaQuery qparser; WasaQuery *q = qparser.stringToQuery(str, reason); if (q == 0) { fprintf(stderr, "stringToQuery failed: %s\n", reason.c_str()); exit(1); } string desc; q->describe(desc); fprintf(stderr, "Finally: %s\n", desc.c_str()); exit(0); } #endif // TEST_WASASTRINGTOQUERY