Implement anchored searches: terms to be found within a maximum distance from the start or end of the text

2011-09-20 16:42:56 +02:00 · 2011-09-20 16:42:56 +02:00 · ee0d602ab3
commit ee0d602ab3
parent 5a6534113b
9 changed files with 210 additions and 43 deletions
--- a/src/query/wasastringtoquery.cpp
+++ b/src/query/wasastringtoquery.cpp
@ -20,9 +20,10 @@
 #include <string.h>
 #include <regex.h>

+#include "smallut.h"
 #include "wasastringtoquery.h"

-// #define DEB_WASASTRINGTOQ 1
+#undef DEB_WASASTRINGTOQ
 #ifdef DEB_WASASTRINGTOQ
 #define DPRINT(X) fprintf X
 #define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());}
@ -89,13 +90,18 @@ void WasaQuery::describe(string &desc) const
 	if (m_modifiers & WQM_DIACSENS)  desc += "DIACSENS|";
 	if (m_modifiers & WQM_FUZZY)     desc += "FUZZY|";
 	if (m_modifiers & WQM_NOSTEM)    desc += "NOSTEM|";
-	if (m_modifiers & WQM_PHRASESLACK) desc += "PHRASESLACK|";
+	if (m_modifiers & WQM_PHRASESLACK) {
+	    char buf[100];
+	    sprintf(buf, "%d", m_slack);
+	    desc += "PHRASESLACK(" + string(buf) + string(")|");
+	}
 	if (m_modifiers & WQM_PROX)      desc += "PROX|";
 	if (m_modifiers & WQM_REGEX)     desc += "REGEX|";
 	if (m_modifiers & WQM_SLOPPY)    desc += "SLOPPY|";
 	if (m_modifiers & WQM_WORDS)     desc += "WORDS|";
+
 	if (desc.length() > 0 && desc[desc.length()-1] == '|')
-	    desc = desc.substr(0, desc.length()-1);
+	    desc.erase(desc.length()-1);
    }
    desc += " ";
 }
@ -224,7 +230,11 @@ StringToWasaQuery::~StringToWasaQuery()
 WasaQuery *
 StringToWasaQuery::stringToQuery(const string& str, string& reason)
 { // Forward the parse to the Internal object; returns 0 when there is none
-    return internal ? internal->stringToQuery(str, reason) : 0;
+    if (internal == 0) // no implementation object to delegate to
+	return 0;
+    WasaQuery *wq = internal->stringToQuery(str, reason);
+    DUMPQ(wq); // NOTE(review): with DEB_WASASTRINGTOQ defined, DUMPQ calls wq->describe(); confirm wq cannot be null here
+    return wq;
 }

 WasaQuery *
@ -316,6 +326,7 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
 	    }

 	    // Check for quoted or unquoted value
+	    unsigned int mods = 0;
 	    if (checkSubMatch(SMI_QUOTED, match, reason)) {
 		nclause->m_value = match;
 	    } else if (checkSubMatch(SMI_TERM, match, reason)) {
@ -332,7 +343,6 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
 	    
 	    if (checkSubMatch(SMI_MODIF, match, reason)) {
 		DPRINT((stderr, "Got modifiers: [%s]\n", match));
-		unsigned int mods = 0;
 		for (unsigned int i = 0; i < strlen(match); i++) {
 		    switch (match[i]) {
 		    case 'b': 
@ -350,7 +360,19 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
 		    case 'f': mods |= WasaQuery::WQM_FUZZY; break;
 		    case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
 		    case 'L': break;
-		    case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break;
+		    case 'o': 
+			mods |= WasaQuery::WQM_PHRASESLACK; 
+			// Default slack if specified only by 'o' is 10.
+			nclause->m_slack = 10;
+			if (i < strlen(match) - 1) {
+			    char *endptr;
+			    int slack = strtol(match+i+1, &endptr, 10);
+			    if (endptr != match+i+1) {
+				i += endptr - (match+i+1);
+				nclause->m_slack = slack;
+			    }
+			}
+			break;
 		    case 'p': mods |= WasaQuery::WQM_PROX; break;
 		    case 'r': mods |= WasaQuery::WQM_REGEX; break;
 		    case 's': mods |= WasaQuery::WQM_SLOPPY; break;
@ -370,8 +392,8 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
 		    }
 		    }
 		}
-		nclause->m_modifiers = WasaQuery::Modifier(mods);
 	    }
+	    nclause->m_modifiers = WasaQuery::Modifier(mods);

 	    // Field indicator ?
 	    if (checkSubMatch(SMI_FIELD, match, reason)) {
--- a/src/query/wasastringtoquery.h
+++ b/src/query/wasastringtoquery.h
@ -63,7 +63,7 @@ public:
    typedef vector<WasaQuery*> subqlist_t;

    WasaQuery() 
-	: m_op(OP_NULL), m_modifiers(0), m_weight(1.0)
+	: m_op(OP_NULL), m_modifiers(0), m_slack(0), m_weight(1.0)
    {}

    ~WasaQuery();
@ -86,6 +86,7 @@ public:
    vector<WasaQuery*> m_subs;
    
    unsigned int   m_modifiers;
+    int            m_slack;
    float          m_weight;
 };

--- a/src/query/wasatorcl.cpp
+++ b/src/query/wasatorcl.cpp
@ -134,8 +134,9 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
 	    continue;

 	case WasaQuery::OP_LEAF: {
-	    LOGDEB2(("wasaQueryToRcl: leaf clause [%s]:[%s]\n", 
-		     (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
+	    LOGDEB(("wasaQueryToRcl: leaf clause [%s]:[%s] slack %d\n", 
+		    (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
+		    (*it)->m_slack));

            // Change terms found in the "autosuffs" list into "ext"
            // field queries
@ -152,15 +153,17 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,

 	    unsigned int mods = (unsigned int)(*it)->m_modifiers;

-	    if (TextSplit::hasVisibleWhite((*it)->m_value)) {
-		int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0;
+	    // I'm not sure I understand the phrase/near detection
+	    // thereafter anymore, maybe it would be better to have an
+	    // explicit flag. Mods can only be set after a double
+	    // quote.
+	    if (TextSplit::hasVisibleWhite((*it)->m_value) || mods) {
 		Rcl::SClType tp = Rcl::SCLT_PHRASE;
 		if (mods & WasaQuery::WQM_PROX) {
 		    tp = Rcl::SCLT_NEAR;
-		    slack = 10;
 		}
 		nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
-							slack,
+							(*it)->m_slack,
 							(*it)->m_fieldspec);
 	    } else {
 		nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND, 
@ -173,7 +176,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
 		return 0;
 	    }
 	    if (mods & WasaQuery::WQM_NOSTEM) {
-		nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING);
+		nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
 	    }
 	    if ((*it)->m_weight != 1.0)
 		nclause->setWeight((*it)->m_weight);
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -73,6 +73,9 @@ namespace Rcl {
 #endif

 const string pathelt_prefix = "XP";
+const string start_of_field_term = "XXST";
+const string end_of_field_term = "XXND";
+
 // This is used as a marker inside the abstract frag lists, but
 // normally doesn't remain in final output (which is built with a
 // custom sep. by our caller).
@ -831,6 +834,8 @@ class TextSplitDb : public TextSplit {
 		Xapian::Document &d, StopList &_stops) 
 	: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
    {}
+    // Reimplement text_to_words to add start and end special terms
+    virtual bool text_to_words(const string &in);
    bool takeword(const std::string &term, int pos, int, int);
    void setprefix(const string& pref) {prefix = pref;}
    void setwdfinc(int i) {wdfinc = i;}
@ -843,6 +848,38 @@ private:
    int wdfinc;
 };

+// Index 'in' bracketed by the special start/end-of-field terms; returns false if adding a posting raises an error or the base TextSplit::text_to_words() fails.
+bool TextSplitDb::text_to_words(const string &in) 
+{
+    LOGDEB(("TextSplitDb::text_to_words\n"));
+    string ermsg;
+    try {
+	// Index the possibly prefixed start term.
+	doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
+	++basepos; // step past the position just given to the start term
+    } XCATCHERROR(ermsg); // presumably stores the exception text in ermsg, tested just below - confirm
+    if (!ermsg.empty()) {
+	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
+	return false;
+    }
+
+    if (!TextSplit::text_to_words(in)) { // base-class splitting of the actual text
+	LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
+	return false;
+    }
+
+    try {
+	// Index the possibly prefixed end term.
+	doc.add_posting(prefix + end_of_field_term, basepos+curpos+1, wdfinc); // NOTE(review): curpos is presumably the last word position seen by takeword() - confirm
+	++basepos;
+    } XCATCHERROR(ermsg);
+    if (!ermsg.empty()) {
+	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
+	return false;
+    }
+    return true;
+}
+
 // Get one term from the doc, remove accents and lowercase, then add posting
 bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
 {
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -287,6 +287,8 @@ private:
 string version_string();

 extern const string pathelt_prefix;
+extern const string start_of_field_term;
+extern const string end_of_field_term;
 #ifndef NO_NAMESPACES
 }
 #endif // NO_NAMESPACES
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -510,13 +510,13 @@ public:

 private:
    void expandTerm(bool dont, const string& term, list<string>& exp, 
-                    string& sterm, string *prefix = 0);
+                    string& sterm, const string& prefix);
    // After splitting entry on whitespace: process non-phrase element
    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
    // Process phrase/near element
    void processPhraseOrNear(TextSplitQ *splitData, 
 			     list<Xapian::Query> &pqueries,
-			     bool useNear, int slack);
+			     bool useNear, int slack, int mods);

    Db&           m_db;
    const string& m_field;
@ -554,7 +554,7 @@ static void listVector(const string& what, const vector<string>&l)
 void StringToXapianQ::expandTerm(bool nostemexp, 
                                 const string& term, 
                                 list<string>& exp,
-                                 string &sterm, string *prefix)
+                                 string &sterm, const string& prefix)
 {
    LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n", 
             m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
@ -571,29 +571,20 @@ void StringToXapianQ::expandTerm(bool nostemexp,
 	nostemexp = true;

    if (nostemexp && !haswild) {
-	// Neither stemming nor wildcard expansion: just the word
-        string pfx;
-	const FieldTraits *ftp;
-        if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
-	    pfx = ftp->pfx;
-	}
-	    
 	sterm = term;
        m_uterms.push_back(sterm);
-	exp.push_front(pfx+term);
+	exp.push_front(prefix + term);
 	exp.resize(1);
-        if (prefix)
-            *prefix = pfx;
    } else {
 	TermMatchResult res;
 	if (haswild) {
 	    m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, 
-                           m_field, prefix);
+                           m_field);
 	} else {
 	    sterm = term;
            m_uterms.push_back(sterm);
-	    m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field,
-                           prefix);
+	    m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, 
+			   m_field);
 	}
 	for (list<TermMatchEntry>::const_iterator it = res.entries.begin(); 
 	     it != res.entries.end(); it++) {
@ -642,8 +633,15 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
 {
    list<string> exp;  
    string sterm; // dumb version of user term
+
    string prefix;
-    expandTerm(nostemexp, span, exp, sterm, &prefix);
+    const FieldTraits *ftp;
+    if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
+	prefix = ftp->pfx;
+    }
+
+    expandTerm(nostemexp, span, exp, sterm, prefix);
+
    // m_terms is used for highlighting, we don't want prefixes in there.
    for (list<string>::const_iterator it = exp.begin(); 
 	 it != exp.end(); it++) {
@ -658,10 +656,9 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
    // less wqf). This does not happen if there are wildcards anywhere
    // in the search.
    if (m_doBoostUserTerms && !sterm.empty()) {
-        xq = Xapian::Query(Xapian::Query::OP_OR, 
-                           xq, 
-                           Xapian::Query(prefix+sterm, 
-                                         original_term_wqf_booster));
+        xq = Xapian::Query(Xapian::Query::OP_OR, xq, 
+			   Xapian::Query(prefix+sterm, 
+					 original_term_wqf_booster));
    }
    pqueries.push_back(xq);
 }
@ -672,7 +669,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
 // don't do stemming for PHRASE though)
 void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, 
 					  list<Xapian::Query> &pqueries,
-					  bool useNear, int slack)
+					  bool useNear, int slack, int mods)
 {
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
 	Xapian::Query::OP_PHRASE;
@ -680,6 +677,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
    bool hadmultiple = false;
    vector<vector<string> >groups;

+    string prefix;
+    const FieldTraits *ftp;
+    if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
+	prefix = ftp->pfx;
+    }
+
+    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
+	orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
+	slack++;
+    }
+
    // Go through the list and perform stem/wildcard expansion for each element
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
    for (vector<string>::iterator it = splitData->terms.begin();
@ -691,8 +699,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,

 	string sterm;
 	list<string>exp;
-	string prefix;
-	expandTerm(nostemexp, *it, exp, sterm, &prefix);
+	expandTerm(nostemexp, *it, exp, sterm, prefix);

 	// groups is used for highlighting, we don't want prefixes in there.
 	vector<string> noprefs;
@ -709,6 +716,11 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
 #endif
    }

+    if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
+	orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
+	slack++;
+    }
+
    // Generate an appropriate PHRASE/NEAR query with adjusted slack
    // For phrases, give a relevance boost like we do for original terms
    LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n", 
@ -727,6 +739,23 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
    m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
 }

+// Strip a leading '^' and/or a trailing '$' from s (modified in place, after trimstring()) and return the matching SDCM_ANCHORSTART / SDCM_ANCHOREND flags, 0 if neither is present
+static int stringToMods(string& s)
+{
+    int mods = 0;
+    // Check for an anchored search
+    trimstring(s); // presumably strips surrounding whitespace so that ^ and $ sit at the very ends - confirm
+    if (s.length() > 0 && s[0] == '^') {
+	mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
+	s.erase(0, 1); // drop the '^' marker itself
+    }
+    if (s.length() > 0 && s[s.length()-1] == '$') { // length is re-checked: s may have been just "^"
+	mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
+	s.erase(s.length()-1); // drop the '$' marker itself
+    }
+    return mods;
+}
+
 /** 
 * Turn user entry string (NOT query language) into a list of xapian queries.
 * We just separate words and phrases, and do wildcard and stem expansion,
@ -772,7 +801,8 @@ bool StringToXapianQ::processUserString(const string &iq,
 	for (list<string>::iterator it = phrases.begin(); 
 	     it != phrases.end(); it++) {
 	    LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
-
+	    int mods = stringToMods(*it);
+	    int terminc = mods != 0 ? 1 : 0;
 	    // If there are multiple spans in this element, including
 	    // at least one composite, we have to increase the slack
 	    // else a phrase query including a span would fail. 
@ -803,7 +833,7 @@ bool StringToXapianQ::processUserString(const string &iq,
 	    }

 	    LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
-	    switch (splitter->terms.size()) {
+	    switch (splitter->terms.size() + terminc) {
 	    case 0: 
 		continue;// ??
 	    case 1: 
@ -811,7 +841,7 @@ bool StringToXapianQ::processUserString(const string &iq,
                                  splitter->nostemexps.front(), pqueries);
 		break;
 	    default:
-		processPhraseOrNear(splitter, pqueries, useNear, slack);
+		processPhraseOrNear(splitter, pqueries, useNear, slack, mods);
 	    }
 	}
    } catch (const Xapian::Error &e) {
--- a/src/rcldb/searchdata.h
+++ b/src/rcldb/searchdata.h
@ -165,7 +165,8 @@ private:

 class SearchDataClause {
 public:
-    enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1};
+    enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
+		   SDCM_ANCHOREND=4};

    SearchDataClause(SClType tp) 
 	: m_tp(tp), m_parentSearch(0), m_haveWildCards(0), 
@ -182,6 +183,12 @@ public:
    SClType getTp() {return m_tp;}
    void setParent(SearchData *p) {m_parentSearch = p;}
    virtual void setModifiers(Modifier mod) {m_modifiers = mod;}
+    virtual int getModifiers() {return m_modifiers;} // current modifier bits, returned as an int
+    virtual void addModifier(Modifier mod) { // OR mod into the modifiers already set (setModifiers() alone replaces them)
+	int imod = getModifiers();
+	imod |= mod;
+	setModifiers(Modifier(imod)); // written back through the virtual setter
+    }
    virtual void setWeight(float w) {m_weight = w;}
    friend class SearchData;

--- a/tests/anchor/anchor.sh
+++ b/tests/anchor/anchor.sh
@ -0,0 +1,31 @@
+#!/bin/sh
+# Anchored-search test: run each query below through recollq and diff the collected output against the reference file.
+topdir=`dirname $0`/..
+. $topdir/shared.sh
+# initvariables comes from shared.sh; presumably it sets myname, mystdout, mystderr and mydiffs used below - confirm
+initvariables $0
+(
+for q in \
+'"^anchortermeaudebut"' \
+'"^ anchortermeunpeuplusloin"' \
+'"^anchortermeunpeuplusloin"o30' \
+'"^  anchortermeunpeuplusloin"o30' \
+'"anchortermenullepart"' \
+'"^anchortermenullepart"' \
+'"anchortermenullepart $"' \
+'"anchortermeunpeumoinsloin$"o30' \
+'"anchortermeunpeumoinsloin$"' \
+'"anchortermealafin$"' \
+'title:"^anchortitlebegin"' \
+'title:"^anchortitleend"' \
+'title:"anchortitleend$"' \
+; do 
+    echo $q # $q is deliberately left unquoted here and below: the shell word-splits it, so runs of spaces collapse
+    recollq -q $q
+done
+
+) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
+# Compare with the expected output, ignoring whitespace differences (diff -w)
+diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
+# checkresult is not defined in this script (presumably in shared.sh) - it is expected to report the outcome
+checkresult
--- a/tests/anchor/anchor.txt
+++ b/tests/anchor/anchor.txt
@ -0,0 +1,34 @@
+"^anchortermeaudebut"
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html]	[anchortitlebegin anchortitlemiddle anchortitleend]	1463	bytes	
+"^ anchortermeunpeuplusloin"
+0 results
+"^anchortermeunpeuplusloin"o30
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html]	[anchortitlebegin anchortitlemiddle anchortitleend]	1463	bytes	
+"^ anchortermeunpeuplusloin"o30
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html]	[anchortitlebegin anchortitlemiddle anchortitleend]	1463	bytes	
+"anchortermenullepart"
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html]	[anchortitlebegin anchortitlemiddle anchortitleend]	1463	bytes	
+"^anchortermenullepart"
+0 results
+"anchortermenullepart $"
+0 results
+"anchortermeunpeumoinsloin$"o30
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html]	[anchortitlebegin anchortitlemiddle anchortitleend]	1463	bytes	
+"anchortermeunpeumoinsloin$"
+0 results
+"anchortermealafin$"
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html]	[anchortitlebegin anchortitlemiddle anchortitleend]	1463	bytes	
+title:"^anchortitlebegin"
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html]	[anchortitlebegin anchortitlemiddle anchortitleend]	1463	bytes	
+title:"^anchortitleend"
+0 results
+title:"anchortitleend$"
+1 results
+text/html	[file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html]	[anchortitlebegin anchortitlemiddle anchortitleend]	1463	bytes