From b6d4fd34bd6aa64be319e750db5a89a5ffba380b Mon Sep 17 00:00:00 2001 From: dockes Date: Mon, 12 Feb 2007 18:16:08 +0000 Subject: [PATCH] add wasabi modifiers --- src/query/wasastringtoquery.cpp | 168 +++++++++++++++----------------- src/query/wasastringtoquery.h | 40 ++++---- 2 files changed, 103 insertions(+), 105 deletions(-) diff --git a/src/query/wasastringtoquery.cpp b/src/query/wasastringtoquery.cpp index b8fc0adf..36528e3f 100644 --- a/src/query/wasastringtoquery.cpp +++ b/src/query/wasastringtoquery.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes"; +static char rcsid[] = "@(#$Id: wasastringtoquery.cpp,v 1.5 2007-02-12 18:16:08 dockes Exp $ (C) 2006 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -43,30 +43,6 @@ WasaQuery::~WasaQuery() void WasaQuery::describe(string &desc) const { - if (!m_types.empty()) { - desc += "type_restrict("; - for (vector::const_iterator it = m_types.begin(); - it != m_types.end(); it++) { - desc += *it + ", "; - } - desc.erase(desc.size() - 2); - desc += ")"; - } - if (m_sortSpec.size() > 1 || - (m_sortSpec.size() == 1 && m_sortSpec[0] != WQSK_REL)) { - desc += "sort_by("; - for (vector::const_iterator it = m_sortSpec.begin(); - it != m_sortSpec.end(); it++) { - switch (*it) { - case WQSK_DATE: desc += string("date") + ", ";break; - case WQSK_ALPHA: desc += string("name") + ", ";break; - case WQSK_GROUP: desc += string("group") + ", ";break; - case WQSK_REL: default: desc += string("relevance") + ", ";break; - } - } - desc.erase(desc.size() - 2); - desc += ")"; - } desc += "("; string fieldspec = m_fieldspec.empty() ? "" : m_fieldspec + ": "; switch (m_op) { @@ -93,7 +69,22 @@ void WasaQuery::describe(string &desc) const } if (desc[desc.length() - 1] == ' ') desc.erase(desc.length() - 1); - desc += ") "; + desc += ")"; + if (m_modifiers != 0) { + if (m_modifiers & WQM_CASESENS) desc += "CASESENS|"; + if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|"; + if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|"; + if (m_modifiers & WQM_BOOST) desc += "BOOST|"; + if (m_modifiers & WQM_PROX) desc += "PROX|"; + if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|"; + if (m_modifiers & WQM_WORDS) desc += "WORDS|"; + if (m_modifiers & WQM_PHRASESLACK) desc += "PHRASESLACK|"; + if (m_modifiers & WQM_REGEX) desc += "REGEX|"; + if (m_modifiers & WQM_FUZZY) desc += "FUZZY|"; + if (desc.length() > 0 && desc[desc.length()-1] == '|') + desc = desc.substr(0, desc.length()-1); + } + desc += " "; } // The string query parser code: @@ -101,7 +92,7 @@ void WasaQuery::describe(string &desc) const /* Shamelessly lifted from Beagle: * This is our regular Expression Pattern: * we expect something like this: - * -key:"Value String" + * -key:"Value String"modifiers * key:Value * or * Value @@ -125,14 +116,15 @@ static const char * parserExpr = "(" //2 "([+-])?" //3 Force or exclude indicator "(" //4 - "([[:alpha:]][[:alnum:]]+)" //5 Field spec: "fieldname:" + "([[:alpha:]][[:alnum:]]*)" //5 Field spec: "fieldname:" ":)?" "(" //6 "(\"" //7 "([^\"]+)" //8 "A quoted term" "\")" + "([a-zA-Z0-9]*)" //9 modifiers "|" - "([^[:space:]]+)" //9 ANormalTerm + "([^[:space:]\"]+)" //10 ANormalTerm ")" ")[[:space:]]*" ; @@ -148,12 +140,14 @@ static const char *matchNames[] = { /*6*/ "", /*7*/ "", /*8*/ "QUOTEDTERM", - /*9*/ "TERM", + /*9*/ "MODIIFIERS", + /*10*/ "TERM", }; #define NMATCH (sizeof(matchNames) / sizeof(char *)) // Symbolic names for the interesting submatch indices -enum SbMatchIdx {SMI_OR=1, SMI_PM=3, SMI_FIELD=5, SMI_QUOTED=8, SMI_TERM=9}; +enum SbMatchIdx {SMI_OR=1, SMI_PM=3, SMI_FIELD=5, SMI_QUOTED=8, + SMI_MODIF=9, SMI_TERM=10}; static const int maxmatchlen = 1024; static const int errbuflen = 300; @@ -170,13 +164,18 @@ public: } bool checkSubMatch(int i, char *match, string& reason) { - if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1) + if (i < 0 || i >= int(NMATCH) || m_pmatch[i].rm_so == -1) { + //DPRINT((stderr, "checkSubMatch: no match: i %d rm_so %d\n", + //i, m_pmatch[i].rm_so)); return false; + } if (m_pmatch[i].rm_eo - m_pmatch[i].rm_so <= 0) { // weird and fatal reason = "Internal regular expression handling error"; return false; } + //DPRINT((stderr, "checkSubMatch: so %d eo %d\n", m_pmatch[i].rm_so, + //m_pmatch[i].rm_eo)); memcpy(match, m_cp + m_pmatch[i].rm_so, m_pmatch[i].rm_eo - m_pmatch[i].rm_so); match[m_pmatch[i].rm_eo - m_pmatch[i].rm_so] = 0; @@ -230,7 +229,7 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) WasaQuery *query = new WasaQuery; query->m_op = WasaQuery::OP_AND; - WasaQuery *orClause = 0; + WasaQuery *orChain = 0; bool prev_or = false; // Loop on repeated regexp matches on the main string. @@ -247,7 +246,7 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) } #ifdef DEB_WASASTRINGTOQ - if (loop) DPRINT((stderr, "Next part:\n")); + DPRINT((stderr, "Next part:\n")); for (unsigned int i = 0; i < NMATCH; i++) { if (m_pmatch[i].rm_so == -1) continue; char match[maxmatchlen+1]; @@ -259,6 +258,7 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) (int)m_pmatch[i].rm_so, (int)m_pmatch[i].rm_eo)); } #endif + char match[maxmatchlen+1]; if (checkSubMatch(SMI_OR, match, reason)) { if (prev_or) { @@ -267,19 +267,19 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) return 0; } - if (orClause == 0) { + if (orChain == 0) { // Fist OR seen: start OR subclause. - if ((orClause = new WasaQuery()) == 0) { + if ((orChain = new WasaQuery()) == 0) { reason = "Out of memory"; return 0; } - orClause->m_op = WasaQuery::OP_OR; + orChain->m_op = WasaQuery::OP_OR; } // We need to transfer the previous query from the main vector // to the OR subquery if (!query->m_subs.empty()) { - orClause->m_subs.push_back(query->m_subs.back()); + orChain->m_subs.push_back(query->m_subs.back()); query->m_subs.pop_back(); } prev_or = true; @@ -298,53 +298,44 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) } else if (checkSubMatch(SMI_TERM, match, reason)) { nclause->m_value = match; } + if (nclause->m_value.empty()) { // Isolated +- or fieldname: without a value. Ignore until // told otherwise. + DPRINT((stderr, "Clause with empty value, skipping\n")); delete nclause; goto nextfield; } + + if (checkSubMatch(SMI_MODIF, match, reason)) { + DPRINT((stderr, "Got modifiers: [%s]\n", match)); + unsigned int mods = 0; + for (unsigned int i = 0; i < strlen(match); i++) { + switch (match[i]) { + case 'C': mods |= WasaQuery::WQM_CASESENS; break; + case 'D': mods |= WasaQuery::WQM_DIACSENS; break; + case 'l': mods |= WasaQuery::WQM_NOSTEM; break; + case 'e': mods |= WasaQuery::WQM_CASESENS | + WasaQuery::WQM_DIACSENS | + WasaQuery::WQM_NOSTEM; break; + case 'f': mods |= WasaQuery::WQM_FUZZY; break; + case 'b': mods |= WasaQuery::WQM_BOOST; break; + case 'p': mods |= WasaQuery::WQM_PROX; break; + case 's': mods |= WasaQuery::WQM_SLOPPY; break; + case 'w': mods |= WasaQuery::WQM_WORDS; break; + case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break; + case 'r': mods |= WasaQuery::WQM_REGEX; break; + } + } + nclause->m_modifiers = WasaQuery::Modifier(mods); + } // Field indicator ? if (checkSubMatch(SMI_FIELD, match, reason)) { - // Check for special fields - if (match == string("mime")) { - if (query->m_typeKind == WasaQuery::WQTK_NONE) - query->m_typeKind = WasaQuery::WQTK_MIME; - if (query->m_typeKind == WasaQuery::WQTK_MIME) - query->m_types.push_back(nclause->m_value); - delete nclause; - goto nextfield; - } else if (match == string("group")) { - if (query->m_typeKind == WasaQuery::WQTK_NONE) - query->m_typeKind = WasaQuery::WQTK_GROUP; - if (query->m_typeKind == WasaQuery::WQTK_GROUP) - query->m_types.push_back(nclause->m_value); - delete nclause; - goto nextfield; - } else if (match == string("filetype") || - match == string("ext")) { - if (query->m_typeKind == WasaQuery::WQTK_NONE) - query->m_typeKind = WasaQuery::WQTK_EXT; - if (query->m_typeKind == WasaQuery::WQTK_EXT) - query->m_types.push_back(nclause->m_value); - delete nclause; - goto nextfield; - } else if (match == string("sort")) { - if (nclause->m_value == "score") { - query->m_sortSpec.push_back(WasaQuery::WQSK_REL); - } else if (nclause->m_value == "date") { - query->m_sortSpec.push_back(WasaQuery::WQSK_DATE); - } else if (nclause->m_value == "alpha") { - query->m_sortSpec.push_back(WasaQuery::WQSK_ALPHA); - } else if (nclause->m_value == "group") { - query->m_sortSpec.push_back(WasaQuery::WQSK_GROUP); - } - delete nclause; - goto nextfield; - } else { - nclause->m_fieldspec = match; - } + // We used Check for special fields indicating sorting + // etc. here but this went away from the spec. See 1.4 + // if it comes back + nclause->m_fieldspec = match; } // +- indicator ? @@ -356,20 +347,21 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) if (prev_or) { - // We're in an OR subquery, add new subquery - orClause->m_subs.push_back(nclause); - DPRINT((stderr, "Adding to OR chain\n")); + // The precedent token was an OR, add new clause to or chain + //DPRINT((stderr, "Adding to OR chain\n")); + orChain->m_subs.push_back(nclause); } else { - if (orClause) { + if (orChain) { // Getting out of OR. Add the OR subquery to the main one - query->m_subs.push_back(orClause); - DPRINT((stderr, "Adding OR chain to main\n")); - orClause = 0; - } - // Add new subquery to main one. + //DPRINT((stderr, "Adding OR chain to main\n")); + query->m_subs.push_back(orChain); + orChain = 0; + } + //DPRINT((stderr, "Adding to main chain\n")); + // Add new clause to main query query->m_subs.push_back(nclause); - DPRINT((stderr, "Adding to main chain\n")); } + prev_or = false; } @@ -382,9 +374,9 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) break; } - if (orClause) { + if (orChain) { // Getting out of OR. Add the OR subquery to the main one - query->m_subs.push_back(orClause); + query->m_subs.push_back(orChain); DPRINT((stderr, "Adding OR chain to main\n")); } diff --git a/src/query/wasastringtoquery.h b/src/query/wasastringtoquery.h index a96be508..14418df8 100644 --- a/src/query/wasastringtoquery.h +++ b/src/query/wasastringtoquery.h @@ -1,6 +1,6 @@ #ifndef _WASASTRINGTOQUERY_H_INCLUDED_ #define _WASASTRINGTOQUERY_H_INCLUDED_ -/* @(#$Id: wasastringtoquery.h,v 1.4 2007-01-17 13:53:41 dockes Exp $ (C) 2006 J.F.Dockes */ +/* @(#$Id: wasastringtoquery.h,v 1.5 2007-02-12 18:16:08 dockes Exp $ (C) 2006 J.F.Dockes */ /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -25,10 +25,17 @@ using std::string; using std::vector; /** - * A simple class to represent a parsed wasabiSimple query - * string. Can hold a string value or an array of subqueries. - * The value can hold one or several words. In the latter case, it should - * be interpreted as a phrase (comes from a user-entered "quoted string"). + * A simple class to represent a parsed wasabiSimple query element. + * Can hold a string value or an array of subqueries. + * + * The complete query is represented by a top WasaQuery holding a + * chain of ANDed subclauses. Some of the subclauses may be themselves + * OR'ed lists (it doesn't go deeper). Entries in the AND list may be + * negated (AND NOT). + * + * For LEAF elements, the value can hold one or several words. In the + * latter case, it should be interpreted as a phrase (comes from a + * user-entered "quoted string"). */ class WasaQuery { public: @@ -36,34 +43,33 @@ public: typedef vector subqlist_t; WasaQuery() - : m_op(OP_NULL), m_typeKind(WQTK_NONE) + : m_op(OP_NULL), m_modifiers(0) {} + ~WasaQuery(); /** Get string describing the query tree from this point */ void describe(string &desc) const; - /** Op to be performed on either value or subqueries */ + /** Op to be performed on either value (may be LEAF or EXCL, or subqs */ WasaQuery::Op m_op; /** Field specification if any (ie: title, author ...) */ string m_fieldspec; - /* String value. Valid for op == OP_LEAF */ + /* String value. Valid for op == OP_LEAF or EXCL */ string m_value; /** Subqueries. Valid for conjunctions */ vector m_subs; - /** Restrict results to some file type, defined by either mime, - * app group, or extension */ - enum TypeKind {WQTK_NONE, WQTK_MIME, WQTK_GROUP, WQTK_EXT}; - TypeKind m_typeKind; - vector m_types; - - /** Sort on relevance, date, name or group */ - enum SortKind {WQSK_REL, WQSK_DATE, WQSK_ALPHA, WQSK_GROUP}; - vector m_sortSpec; + /** Modifiers for term handling: case/diacritics handling, + stemming control */ + enum Modifier {WQM_CASESENS = 1, WQM_DIACSENS = 2, WQM_NOSTEM = 4, + WQM_BOOST = 8, WQM_PROX = 0x10, WQM_SLOPPY = 0x20, + WQM_WORDS = 0x40, WQM_PHRASESLACK = 0x80, WQM_REGEX = 0x100, + WQM_FUZZY = 0x200}; + unsigned int m_modifiers; }; /**