diff --git a/src/query/wasastringtoquery.cpp b/src/query/wasastringtoquery.cpp index 1169a271..b7d5a2f3 100644 --- a/src/query/wasastringtoquery.cpp +++ b/src/query/wasastringtoquery.cpp @@ -20,9 +20,10 @@ #include #include +#include "smallut.h" #include "wasastringtoquery.h" -// #define DEB_WASASTRINGTOQ 1 +#undef DEB_WASASTRINGTOQ #ifdef DEB_WASASTRINGTOQ #define DPRINT(X) fprintf X #define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());} @@ -89,13 +90,18 @@ void WasaQuery::describe(string &desc) const if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|"; if (m_modifiers & WQM_FUZZY) desc += "FUZZY|"; if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|"; - if (m_modifiers & WQM_PHRASESLACK) desc += "PHRASESLACK|"; + if (m_modifiers & WQM_PHRASESLACK) { + char buf[100]; + sprintf(buf, "%d", m_slack); + desc += "PHRASESLACK(" + string(buf) + string(")|"); + } if (m_modifiers & WQM_PROX) desc += "PROX|"; if (m_modifiers & WQM_REGEX) desc += "REGEX|"; if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|"; if (m_modifiers & WQM_WORDS) desc += "WORDS|"; + if (desc.length() > 0 && desc[desc.length()-1] == '|') - desc = desc.substr(0, desc.length()-1); + desc.erase(desc.length()-1); } desc += " "; } @@ -224,7 +230,11 @@ StringToWasaQuery::~StringToWasaQuery() WasaQuery * StringToWasaQuery::stringToQuery(const string& str, string& reason) { - return internal ? internal->stringToQuery(str, reason) : 0; + if (internal == 0) + return 0; + WasaQuery *wq = internal->stringToQuery(str, reason); + DUMPQ(wq); + return wq; } WasaQuery * @@ -316,6 +326,7 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) } // Check for quoted or unquoted value + unsigned int mods = 0; if (checkSubMatch(SMI_QUOTED, match, reason)) { nclause->m_value = match; } else if (checkSubMatch(SMI_TERM, match, reason)) { @@ -332,7 +343,6 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) if (checkSubMatch(SMI_MODIF, match, reason)) { DPRINT((stderr, "Got modifiers: [%s]\n", match)); - unsigned int mods = 0; for (unsigned int i = 0; i < strlen(match); i++) { switch (match[i]) { case 'b': @@ -350,7 +360,19 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) case 'f': mods |= WasaQuery::WQM_FUZZY; break; case 'l': mods |= WasaQuery::WQM_NOSTEM; break; case 'L': break; - case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break; + case 'o': + mods |= WasaQuery::WQM_PHRASESLACK; + // Default slack if specified only by 'o' is 10. + nclause->m_slack = 10; + if (i < strlen(match) - 1) { + char *endptr; + int slack = strtol(match+i+1, &endptr, 10); + if (endptr != match+i+1) { + i += endptr - (match+i+1); + nclause->m_slack = slack; + } + } + break; case 'p': mods |= WasaQuery::WQM_PROX; break; case 'r': mods |= WasaQuery::WQM_REGEX; break; case 's': mods |= WasaQuery::WQM_SLOPPY; break; @@ -370,8 +392,8 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason) } } } - nclause->m_modifiers = WasaQuery::Modifier(mods); } + nclause->m_modifiers = WasaQuery::Modifier(mods); // Field indicator ? if (checkSubMatch(SMI_FIELD, match, reason)) { diff --git a/src/query/wasastringtoquery.h b/src/query/wasastringtoquery.h index baa30dcf..14e2ea77 100644 --- a/src/query/wasastringtoquery.h +++ b/src/query/wasastringtoquery.h @@ -63,7 +63,7 @@ public: typedef vector subqlist_t; WasaQuery() - : m_op(OP_NULL), m_modifiers(0), m_weight(1.0) + : m_op(OP_NULL), m_modifiers(0), m_slack(0), m_weight(1.0) {} ~WasaQuery(); @@ -86,6 +86,7 @@ public: vector m_subs; unsigned int m_modifiers; + int m_slack; float m_weight; }; diff --git a/src/query/wasatorcl.cpp b/src/query/wasatorcl.cpp index 6cfc3876..4a2088e4 100644 --- a/src/query/wasatorcl.cpp +++ b/src/query/wasatorcl.cpp @@ -134,8 +134,9 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa, continue; case WasaQuery::OP_LEAF: { - LOGDEB2(("wasaQueryToRcl: leaf clause [%s]:[%s]\n", - (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str())); + LOGDEB(("wasaQueryToRcl: leaf clause [%s]:[%s] slack %d\n", + (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(), + (*it)->m_slack)); // Change terms found in the "autosuffs" list into "ext" // field queries @@ -152,15 +153,17 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa, unsigned int mods = (unsigned int)(*it)->m_modifiers; - if (TextSplit::hasVisibleWhite((*it)->m_value)) { - int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0; + // I'm not sure I understand the phrase/near detection + // thereafter anymore, maybe it would be better to have an + // explicit flag. Mods can only be set after a double + // quote. + if (TextSplit::hasVisibleWhite((*it)->m_value) || mods) { Rcl::SClType tp = Rcl::SCLT_PHRASE; if (mods & WasaQuery::WQM_PROX) { tp = Rcl::SCLT_NEAR; - slack = 10; } nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value, - slack, + (*it)->m_slack, (*it)->m_fieldspec); } else { nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, @@ -173,7 +176,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa, return 0; } if (mods & WasaQuery::WQM_NOSTEM) { - nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING); + nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING); } if ((*it)->m_weight != 1.0) nclause->setWeight((*it)->m_weight); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 5bd8fb16..d391fc73 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -73,6 +73,9 @@ namespace Rcl { #endif const string pathelt_prefix = "XP"; +const string start_of_field_term = "XXST"; +const string end_of_field_term = "XXND"; + // This is used as a marker inside the abstract frag lists, but // normally doesn't remain in final output (which is built with a // custom sep. by our caller). @@ -831,6 +834,8 @@ class TextSplitDb : public TextSplit { Xapian::Document &d, StopList &_stops) : db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1) {} + // Reimplement text_to_words to add start and end special terms + virtual bool text_to_words(const string &in); bool takeword(const std::string &term, int pos, int, int); void setprefix(const string& pref) {prefix = pref;} void setwdfinc(int i) {wdfinc = i;} @@ -843,6 +848,38 @@ private: int wdfinc; }; + +bool TextSplitDb::text_to_words(const string &in) +{ + LOGDEB(("TextSplitDb::text_to_words\n")); + string ermsg; + try { + // Index the possibly prefixed start term. + doc.add_posting(prefix + start_of_field_term, basepos, wdfinc); + ++basepos; + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); + return false; + } + + if (!TextSplit::text_to_words(in)) { + LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n")); + return false; + } + + try { + // Index the possibly prefixed end term. + doc.add_posting(prefix + end_of_field_term, basepos+curpos+1, wdfinc); + ++basepos; + } XCATCHERROR(ermsg); + if (!ermsg.empty()) { + LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); + return false; + } + return true; +} + // Get one term from the doc, remove accents and lowercase, then add posting bool TextSplitDb::takeword(const std::string &_term, int pos, int, int) { diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index d8f35d4d..e4d03a38 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -287,6 +287,8 @@ private: string version_string(); extern const string pathelt_prefix; +extern const string start_of_field_term; +extern const string end_of_field_term; #ifndef NO_NAMESPACES } #endif // NO_NAMESPACES diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index efa1ae03..ad4b39d0 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -510,13 +510,13 @@ public: private: void expandTerm(bool dont, const string& term, list& exp, - string& sterm, string *prefix = 0); + string& sterm, const string& prefix); // After splitting entry on whitespace: process non-phrase element void processSimpleSpan(const string& span, bool nostemexp, list &pqueries); // Process phrase/near element void processPhraseOrNear(TextSplitQ *splitData, list &pqueries, - bool useNear, int slack); + bool useNear, int slack, int mods); Db& m_db; const string& m_field; @@ -554,7 +554,7 @@ static void listVector(const string& what, const vector&l) void StringToXapianQ::expandTerm(bool nostemexp, const string& term, list& exp, - string &sterm, string *prefix) + string &sterm, const string& prefix) { LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n", m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp)); @@ -571,29 +571,20 @@ void StringToXapianQ::expandTerm(bool nostemexp, nostemexp = true; if (nostemexp && !haswild) { - // Neither stemming nor wildcard expansion: just the word - string pfx; - const FieldTraits *ftp; - if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) { - pfx = ftp->pfx; - } - sterm = term; m_uterms.push_back(sterm); - exp.push_front(pfx+term); + exp.push_front(prefix + term); exp.resize(1); - if (prefix) - *prefix = pfx; } else { TermMatchResult res; if (haswild) { m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, - m_field, prefix); + m_field); } else { sterm = term; m_uterms.push_back(sterm); - m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field, - prefix); + m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, + m_field); } for (list::const_iterator it = res.entries.begin(); it != res.entries.end(); it++) { @@ -642,8 +633,15 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp, { list exp; string sterm; // dumb version of user term + string prefix; - expandTerm(nostemexp, span, exp, sterm, &prefix); + const FieldTraits *ftp; + if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) { + prefix = ftp->pfx; + } + + expandTerm(nostemexp, span, exp, sterm, prefix); + // m_terms is used for highlighting, we don't want prefixes in there. for (list::const_iterator it = exp.begin(); it != exp.end(); it++) { @@ -658,10 +656,9 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp, // less wqf). This does not happen if there are wildcards anywhere // in the search. if (m_doBoostUserTerms && !sterm.empty()) { - xq = Xapian::Query(Xapian::Query::OP_OR, - xq, - Xapian::Query(prefix+sterm, - original_term_wqf_booster)); + xq = Xapian::Query(Xapian::Query::OP_OR, xq, + Xapian::Query(prefix+sterm, + original_term_wqf_booster)); } pqueries.push_back(xq); } @@ -672,7 +669,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp, // don't do stemming for PHRASE though) void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, list &pqueries, - bool useNear, int slack) + bool useNear, int slack, int mods) { Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : Xapian::Query::OP_PHRASE; @@ -680,6 +677,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, bool hadmultiple = false; vector >groups; + string prefix; + const FieldTraits *ftp; + if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) { + prefix = ftp->pfx; + } + + if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) { + orqueries.push_back(Xapian::Query(prefix + start_of_field_term)); + slack++; + } + // Go through the list and perform stem/wildcard expansion for each element vector::iterator nxit = splitData->nostemexps.begin(); for (vector::iterator it = splitData->terms.begin(); @@ -691,8 +699,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, string sterm; listexp; - string prefix; - expandTerm(nostemexp, *it, exp, sterm, &prefix); + expandTerm(nostemexp, *it, exp, sterm, prefix); // groups is used for highlighting, we don't want prefixes in there. vector noprefs; @@ -709,6 +716,11 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, #endif } + if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) { + orqueries.push_back(Xapian::Query(prefix + end_of_field_term)); + slack++; + } + // Generate an appropriate PHRASE/NEAR query with adjusted slack // For phrases, give a relevance boost like we do for original terms LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", @@ -727,6 +739,23 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end()); } +// Trim string beginning with ^ or ending with $ and convert to flags +static int stringToMods(string& s) +{ + int mods = 0; + // Check for an anchored search + trimstring(s); + if (s.length() > 0 && s[0] == '^') { + mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART; + s.erase(0, 1); + } + if (s.length() > 0 && s[s.length()-1] == '$') { + mods |= Rcl::SearchDataClause::SDCM_ANCHOREND; + s.erase(s.length()-1); + } + return mods; +} + /** * Turn user entry string (NOT query language) into a list of xapian queries. * We just separate words and phrases, and do wildcard and stem expansion, @@ -772,7 +801,8 @@ bool StringToXapianQ::processUserString(const string &iq, for (list::iterator it = phrases.begin(); it != phrases.end(); it++) { LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str())); - + int mods = stringToMods(*it); + int terminc = mods != 0 ? 1 : 0; // If there are multiple spans in this element, including // at least one composite, we have to increase the slack // else a phrase query including a span would fail. @@ -803,7 +833,7 @@ bool StringToXapianQ::processUserString(const string &iq, } LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size())); - switch (splitter->terms.size()) { + switch (splitter->terms.size() + terminc) { case 0: continue;// ?? case 1: @@ -811,7 +841,7 @@ bool StringToXapianQ::processUserString(const string &iq, splitter->nostemexps.front(), pqueries); break; default: - processPhraseOrNear(splitter, pqueries, useNear, slack); + processPhraseOrNear(splitter, pqueries, useNear, slack, mods); } } } catch (const Xapian::Error &e) { diff --git a/src/rcldb/searchdata.h b/src/rcldb/searchdata.h index 95879c56..74c23e52 100644 --- a/src/rcldb/searchdata.h +++ b/src/rcldb/searchdata.h @@ -165,7 +165,8 @@ private: class SearchDataClause { public: - enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1}; + enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2, + SDCM_ANCHOREND=4}; SearchDataClause(SClType tp) : m_tp(tp), m_parentSearch(0), m_haveWildCards(0), @@ -182,6 +183,12 @@ public: SClType getTp() {return m_tp;} void setParent(SearchData *p) {m_parentSearch = p;} virtual void setModifiers(Modifier mod) {m_modifiers = mod;} + virtual int getModifiers() {return m_modifiers;} + virtual void addModifier(Modifier mod) { + int imod = getModifiers(); + imod |= mod; + setModifiers(Modifier(imod)); + } virtual void setWeight(float w) {m_weight = w;} friend class SearchData; diff --git a/tests/anchor/anchor.sh b/tests/anchor/anchor.sh new file mode 100755 index 00000000..c22d3f3e --- /dev/null +++ b/tests/anchor/anchor.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +topdir=`dirname $0`/.. +. $topdir/shared.sh + +initvariables $0 +( +for q in \ +'"^anchortermeaudebut"' \ +'"^ anchortermeunpeuplusloin"' \ +'"^anchortermeunpeuplusloin"o30' \ +'"^ anchortermeunpeuplusloin"o30' \ +'"anchortermenullepart"' \ +'"^anchortermenullepart"' \ +'"anchortermenullepart $"' \ +'"anchortermeunpeumoinsloin$"o30' \ +'"anchortermeunpeumoinsloin$"' \ +'"anchortermealafin$"' \ +'title:"^anchortitlebegin"' \ +'title:"^anchortitleend"' \ +'title:"anchortitleend$"' \ +; do + echo $q + recollq -q $q +done + +) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout + +diff -w ${myname}.txt $mystdout > $mydiffs 2>&1 + +checkresult diff --git a/tests/anchor/anchor.txt b/tests/anchor/anchor.txt new file mode 100644 index 00000000..4f17474f --- /dev/null +++ b/tests/anchor/anchor.txt @@ -0,0 +1,34 @@ +"^anchortermeaudebut" +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes +"^ anchortermeunpeuplusloin" +0 results +"^anchortermeunpeuplusloin"o30 +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes +"^ anchortermeunpeuplusloin"o30 +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes +"anchortermenullepart" +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes +"^anchortermenullepart" +0 results +"anchortermenullepart $" +0 results +"anchortermeunpeumoinsloin$"o30 +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes +"anchortermeunpeumoinsloin$" +0 results +"anchortermealafin$" +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes +title:"^anchortitlebegin" +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes +title:"^anchortitleend" +0 results +title:"anchortitleend$" +1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes