Implement anchored searches: terms to be found at a maximum distance from the start or end of the text
commit ee0d602ab3
parent 5a6534113b
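With this change, a quoted phrase in the query language can be anchored to the start of a field with a leading '^' and to the end with a trailing '$', and the 'o<N>' modifier after the closing quote sets the phrase slack (maximum number of intervening terms), as in '"^anchortermeunpeuplusloin"o30' from tests/anchor/anchor.sh below. On the indexing side, each field is bracketed with the new marker terms XXST and XXND so that anchored phrase queries can match against the field boundaries.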
@@ -20,9 +20,10 @@
 #include <string.h>
 #include <regex.h>
 
+#include "smallut.h"
 #include "wasastringtoquery.h"
 
-// #define DEB_WASASTRINGTOQ 1
+#undef DEB_WASASTRINGTOQ
 #ifdef DEB_WASASTRINGTOQ
 #define DPRINT(X) fprintf X
 #define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());}
@@ -89,13 +90,18 @@ void WasaQuery::describe(string &desc) const
         if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|";
         if (m_modifiers & WQM_FUZZY) desc += "FUZZY|";
         if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|";
-        if (m_modifiers & WQM_PHRASESLACK) desc += "PHRASESLACK|";
+        if (m_modifiers & WQM_PHRASESLACK) {
+            char buf[100];
+            sprintf(buf, "%d", m_slack);
+            desc += "PHRASESLACK(" + string(buf) + string(")|");
+        }
         if (m_modifiers & WQM_PROX) desc += "PROX|";
         if (m_modifiers & WQM_REGEX) desc += "REGEX|";
         if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|";
         if (m_modifiers & WQM_WORDS) desc += "WORDS|";
+
         if (desc.length() > 0 && desc[desc.length()-1] == '|')
-            desc = desc.substr(0, desc.length()-1);
+            desc.erase(desc.length()-1);
     }
     desc += " ";
 }
@@ -224,7 +230,11 @@ StringToWasaQuery::~StringToWasaQuery()
 WasaQuery *
 StringToWasaQuery::stringToQuery(const string& str, string& reason)
 {
-    return internal ? internal->stringToQuery(str, reason) : 0;
+    if (internal == 0)
+        return 0;
+    WasaQuery *wq = internal->stringToQuery(str, reason);
+    DUMPQ(wq);
+    return wq;
 }
 
 WasaQuery *
@@ -316,6 +326,7 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
         }
 
         // Check for quoted or unquoted value
+        unsigned int mods = 0;
         if (checkSubMatch(SMI_QUOTED, match, reason)) {
             nclause->m_value = match;
         } else if (checkSubMatch(SMI_TERM, match, reason)) {
@@ -332,7 +343,6 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
 
         if (checkSubMatch(SMI_MODIF, match, reason)) {
             DPRINT((stderr, "Got modifiers: [%s]\n", match));
-            unsigned int mods = 0;
             for (unsigned int i = 0; i < strlen(match); i++) {
                 switch (match[i]) {
                 case 'b':
@@ -350,7 +360,19 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
                 case 'f': mods |= WasaQuery::WQM_FUZZY; break;
                 case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
                 case 'L': break;
-                case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break;
+                case 'o':
+                    mods |= WasaQuery::WQM_PHRASESLACK;
+                    // Default slack if specified only by 'o' is 10.
+                    nclause->m_slack = 10;
+                    if (i < strlen(match) - 1) {
+                        char *endptr;
+                        int slack = strtol(match+i+1, &endptr, 10);
+                        if (endptr != match+i+1) {
+                            i += endptr - (match+i+1);
+                            nclause->m_slack = slack;
+                        }
+                    }
+                    break;
                 case 'p': mods |= WasaQuery::WQM_PROX; break;
                 case 'r': mods |= WasaQuery::WQM_REGEX; break;
                 case 's': mods |= WasaQuery::WQM_SLOPPY; break;
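As a worked example of the 'o' case above: for a query element such as "un deux trois"o8 (hypothetical input), WQM_PHRASESLACK is set and strtol() parses the digits following 'o', so nclause->m_slack becomes 8 and the loop index is advanced past them; a bare 'o' with no digits keeps the default slack of 10.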
@@ -370,8 +392,8 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
                     }
                 }
             }
-            nclause->m_modifiers = WasaQuery::Modifier(mods);
         }
+        nclause->m_modifiers = WasaQuery::Modifier(mods);
 
         // Field indicator ?
         if (checkSubMatch(SMI_FIELD, match, reason)) {
@@ -63,7 +63,7 @@ public:
     typedef vector<WasaQuery*> subqlist_t;
 
     WasaQuery()
-        : m_op(OP_NULL), m_modifiers(0), m_weight(1.0)
+        : m_op(OP_NULL), m_modifiers(0), m_slack(0), m_weight(1.0)
     {}
 
     ~WasaQuery();
@@ -86,6 +86,7 @@ public:
     vector<WasaQuery*> m_subs;
 
     unsigned int m_modifiers;
+    int m_slack;
     float m_weight;
 };
 
@@ -134,8 +134,9 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
             continue;
 
         case WasaQuery::OP_LEAF: {
-            LOGDEB2(("wasaQueryToRcl: leaf clause [%s]:[%s]\n",
-                     (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
+            LOGDEB(("wasaQueryToRcl: leaf clause [%s]:[%s] slack %d\n",
+                    (*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
+                    (*it)->m_slack));
 
             // Change terms found in the "autosuffs" list into "ext"
             // field queries
@@ -152,15 +153,17 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
 
             unsigned int mods = (unsigned int)(*it)->m_modifiers;
 
-            if (TextSplit::hasVisibleWhite((*it)->m_value)) {
-                int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0;
+            // I'm not sure I understand the phrase/near detection
+            // thereafter anymore, maybe it would be better to have an
+            // explicit flag. Mods can only be set after a double
+            // quote.
+            if (TextSplit::hasVisibleWhite((*it)->m_value) || mods) {
                 Rcl::SClType tp = Rcl::SCLT_PHRASE;
                 if (mods & WasaQuery::WQM_PROX) {
                     tp = Rcl::SCLT_NEAR;
-                    slack = 10;
                 }
                 nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
-                                                        slack,
+                                                        (*it)->m_slack,
                                                         (*it)->m_fieldspec);
             } else {
                 nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
@@ -173,7 +176,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
                 return 0;
             }
             if (mods & WasaQuery::WQM_NOSTEM) {
-                nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING);
+                nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
             }
             if ((*it)->m_weight != 1.0)
                 nclause->setWeight((*it)->m_weight);
@@ -73,6 +73,9 @@ namespace Rcl {
 #endif
 
 const string pathelt_prefix = "XP";
+const string start_of_field_term = "XXST";
+const string end_of_field_term = "XXND";
+
 // This is used as a marker inside the abstract frag lists, but
 // normally doesn't remain in final output (which is built with a
 // custom sep. by our caller).
@@ -831,6 +834,8 @@ class TextSplitDb : public TextSplit {
                 Xapian::Document &d, StopList &_stops)
         : db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
     {}
+    // Reimplement text_to_words to add start and end special terms
+    virtual bool text_to_words(const string &in);
     bool takeword(const std::string &term, int pos, int, int);
     void setprefix(const string& pref) {prefix = pref;}
     void setwdfinc(int i) {wdfinc = i;}
@@ -843,6 +848,38 @@ private:
     int wdfinc;
 };
 
+
+bool TextSplitDb::text_to_words(const string &in)
+{
+    LOGDEB(("TextSplitDb::text_to_words\n"));
+    string ermsg;
+    try {
+        // Index the possibly prefixed start term.
+        doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
+        ++basepos;
+    } XCATCHERROR(ermsg);
+    if (!ermsg.empty()) {
+        LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
+        return false;
+    }
+
+    if (!TextSplit::text_to_words(in)) {
+        LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
+        return false;
+    }
+
+    try {
+        // Index the possibly prefixed end term.
+        doc.add_posting(prefix + end_of_field_term, basepos+curpos+1, wdfinc);
+        ++basepos;
+    } XCATCHERROR(ermsg);
+    if (!ermsg.empty()) {
+        LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
+        return false;
+    }
+    return true;
+}
+
 // Get one term from the doc, remove accents and lowercase, then add posting
 bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
 {
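The TextSplitDb::text_to_words() override above brackets each indexed field with the new marker terms. The following standalone sketch is not code from this commit; it only illustrates, with the plain Xapian API, the mechanism that the indexing and query sides rely on (the database path and sample words are made up): a start marker is posted before the first word, an end marker after the last, and an anchored term is then matched by an OP_PHRASE query that includes the marker with a window of two.

// Standalone sketch, not part of the commit: the Xapian mechanism behind
// anchored searches. Database path and sample terms are illustrative only.
#include <xapian.h>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    Xapian::WritableDatabase db("/tmp/anchorsketch", Xapian::DB_CREATE_OR_OPEN);

    // Index one document, bracketing the field with the marker terms.
    Xapian::Document doc;
    Xapian::termpos pos = 1;
    doc.add_posting("XXST", pos++);   // start-of-field marker
    doc.add_posting("alpha", pos++);
    doc.add_posting("beta", pos++);
    doc.add_posting("gamma", pos++);
    doc.add_posting("XXND", pos++);   // end-of-field marker
    db.replace_document(1, doc);

    // An anchored search like "^alpha" becomes a phrase (XXST alpha) with a
    // window of 2: it only matches if "alpha" is the first word of the field.
    std::vector<std::string> terms;
    terms.push_back("XXST");
    terms.push_back("alpha");
    Xapian::Query query(Xapian::Query::OP_PHRASE, terms.begin(), terms.end(), 2);

    Xapian::Enquire enquire(db);
    enquire.set_query(query);
    Xapian::MSet matches = enquire.get_mset(0, 10);
    std::cout << "matches: " << matches.size() << std::endl;
    return 0;
}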
@@ -287,6 +287,8 @@ private:
 string version_string();
 
 extern const string pathelt_prefix;
+extern const string start_of_field_term;
+extern const string end_of_field_term;
 #ifndef NO_NAMESPACES
 }
 #endif // NO_NAMESPACES
@@ -510,13 +510,13 @@ public:
 
 private:
     void expandTerm(bool dont, const string& term, list<string>& exp,
-                    string& sterm, string *prefix = 0);
+                    string& sterm, const string& prefix);
     // After splitting entry on whitespace: process non-phrase element
     void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
     // Process phrase/near element
    void processPhraseOrNear(TextSplitQ *splitData,
                             list<Xapian::Query> &pqueries,
-                            bool useNear, int slack);
+                            bool useNear, int slack, int mods);
 
    Db& m_db;
    const string& m_field;
@@ -554,7 +554,7 @@ static void listVector(const string& what, const vector<string>&l)
 void StringToXapianQ::expandTerm(bool nostemexp,
                                  const string& term,
                                  list<string>& exp,
-                                 string &sterm, string *prefix)
+                                 string &sterm, const string& prefix)
 {
     LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
              m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
@@ -571,29 +571,20 @@ void StringToXapianQ::expandTerm(bool nostemexp,
         nostemexp = true;
 
     if (nostemexp && !haswild) {
-        // Neither stemming nor wildcard expansion: just the word
-        string pfx;
-        const FieldTraits *ftp;
-        if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
-            pfx = ftp->pfx;
-        }
-
         sterm = term;
         m_uterms.push_back(sterm);
-        exp.push_front(pfx+term);
+        exp.push_front(prefix + term);
         exp.resize(1);
-        if (prefix)
-            *prefix = pfx;
     } else {
         TermMatchResult res;
         if (haswild) {
             m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
-                           m_field, prefix);
+                           m_field);
         } else {
             sterm = term;
             m_uterms.push_back(sterm);
-            m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field,
-                           prefix);
+            m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1,
+                           m_field);
         }
         for (list<TermMatchEntry>::const_iterator it = res.entries.begin();
              it != res.entries.end(); it++) {
@@ -642,8 +633,15 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
 {
     list<string> exp;
     string sterm; // dumb version of user term
+
     string prefix;
-    expandTerm(nostemexp, span, exp, sterm, &prefix);
+    const FieldTraits *ftp;
+    if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
+        prefix = ftp->pfx;
+    }
+
+    expandTerm(nostemexp, span, exp, sterm, prefix);
+
     // m_terms is used for highlighting, we don't want prefixes in there.
     for (list<string>::const_iterator it = exp.begin();
          it != exp.end(); it++) {
@@ -658,10 +656,9 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
     // less wqf). This does not happen if there are wildcards anywhere
     // in the search.
     if (m_doBoostUserTerms && !sterm.empty()) {
-        xq = Xapian::Query(Xapian::Query::OP_OR,
-                           xq,
-                           Xapian::Query(prefix+sterm,
-                                         original_term_wqf_booster));
+        xq = Xapian::Query(Xapian::Query::OP_OR, xq,
+                           Xapian::Query(prefix+sterm,
+                                         original_term_wqf_booster));
     }
     pqueries.push_back(xq);
 }
@@ -672,7 +669,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
 // don't do stemming for PHRASE though)
 void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
                                           list<Xapian::Query> &pqueries,
-                                          bool useNear, int slack)
+                                          bool useNear, int slack, int mods)
 {
     Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
         Xapian::Query::OP_PHRASE;
@@ -680,6 +677,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
     bool hadmultiple = false;
     vector<vector<string> >groups;
 
+    string prefix;
+    const FieldTraits *ftp;
+    if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
+        prefix = ftp->pfx;
+    }
+
+    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
+        orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
+        slack++;
+    }
+
     // Go through the list and perform stem/wildcard expansion for each element
     vector<bool>::iterator nxit = splitData->nostemexps.begin();
     for (vector<string>::iterator it = splitData->terms.begin();
@@ -691,8 +699,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
 
         string sterm;
         list<string>exp;
-        string prefix;
-        expandTerm(nostemexp, *it, exp, sterm, &prefix);
+        expandTerm(nostemexp, *it, exp, sterm, prefix);
 
         // groups is used for highlighting, we don't want prefixes in there.
         vector<string> noprefs;
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
|
||||||
|
orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
|
||||||
|
slack++;
|
||||||
|
}
|
||||||
|
|
||||||
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
||||||
// For phrases, give a relevance boost like we do for original terms
|
// For phrases, give a relevance boost like we do for original terms
|
||||||
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
||||||
@ -727,6 +739,23 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
|||||||
m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
|
m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Trim string beginning with ^ or ending with $ and convert to flags
|
||||||
|
static int stringToMods(string& s)
|
||||||
|
{
|
||||||
|
int mods = 0;
|
||||||
|
// Check for an anchored search
|
||||||
|
trimstring(s);
|
||||||
|
if (s.length() > 0 && s[0] == '^') {
|
||||||
|
mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
|
||||||
|
s.erase(0, 1);
|
||||||
|
}
|
||||||
|
if (s.length() > 0 && s[s.length()-1] == '$') {
|
||||||
|
mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
|
||||||
|
s.erase(s.length()-1);
|
||||||
|
}
|
||||||
|
return mods;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Turn user entry string (NOT query language) into a list of xapian queries.
|
* Turn user entry string (NOT query language) into a list of xapian queries.
|
||||||
* We just separate words and phrases, and do wildcard and stem expansion,
|
* We just separate words and phrases, and do wildcard and stem expansion,
|
||||||
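As an example (hypothetical input), stringToMods() applied to ^some words$ strips both anchors, leaves "some words" in s and returns SDCM_ANCHORSTART|SDCM_ANCHOREND; a string without a leading '^' or trailing '$' comes back unchanged with a return value of 0.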
@@ -772,7 +801,8 @@ bool StringToXapianQ::processUserString(const string &iq,
         for (list<string>::iterator it = phrases.begin();
              it != phrases.end(); it++) {
             LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
+            int mods = stringToMods(*it);
+            int terminc = mods != 0 ? 1 : 0;
             // If there are multiple spans in this element, including
             // at least one composite, we have to increase the slack
             // else a phrase query including a span would fail.
@@ -803,7 +833,7 @@ bool StringToXapianQ::processUserString(const string &iq,
             }
 
             LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
-            switch (splitter->terms.size()) {
+            switch (splitter->terms.size() + terminc) {
             case 0:
                 continue;// ??
             case 1:
@@ -811,7 +841,7 @@ bool StringToXapianQ::processUserString(const string &iq,
                                   splitter->nostemexps.front(), pqueries);
                 break;
             default:
-                processPhraseOrNear(splitter, pqueries, useNear, slack);
+                processPhraseOrNear(splitter, pqueries, useNear, slack, mods);
             }
         }
     } catch (const Xapian::Error &e) {
@@ -165,7 +165,8 @@ private:
 
 class SearchDataClause {
 public:
-    enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1};
+    enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
+                   SDCM_ANCHOREND=4};
 
     SearchDataClause(SClType tp)
         : m_tp(tp), m_parentSearch(0), m_haveWildCards(0),
@@ -182,6 +183,12 @@ public:
     SClType getTp() {return m_tp;}
     void setParent(SearchData *p) {m_parentSearch = p;}
     virtual void setModifiers(Modifier mod) {m_modifiers = mod;}
+    virtual int getModifiers() {return m_modifiers;}
+    virtual void addModifier(Modifier mod) {
+        int imod = getModifiers();
+        imod |= mod;
+        setModifiers(Modifier(imod));
+    }
     virtual void setWeight(float w) {m_weight = w;}
     friend class SearchData;
 
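The new getModifiers()/addModifier() pair lets callers OR an extra flag into the existing modifier set instead of replacing it, which is presumably why wasaQueryToRcl() now calls addModifier(SDCM_NOSTEMMING): a clause can then carry the no-stemming flag together with the new anchor flags.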
tests/anchor/anchor.sh  (new executable file, 31 lines)
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+topdir=`dirname $0`/..
+. $topdir/shared.sh
+
+initvariables $0
+(
+for q in \
+    '"^anchortermeaudebut"' \
+    '"^ anchortermeunpeuplusloin"' \
+    '"^anchortermeunpeuplusloin"o30' \
+    '"^ anchortermeunpeuplusloin"o30' \
+    '"anchortermenullepart"' \
+    '"^anchortermenullepart"' \
+    '"anchortermenullepart $"' \
+    '"anchortermeunpeumoinsloin$"o30' \
+    '"anchortermeunpeumoinsloin$"' \
+    '"anchortermealafin$"' \
+    'title:"^anchortitlebegin"' \
+    'title:"^anchortitleend"' \
+    'title:"anchortitleend$"' \
+    ; do
+    echo $q
+    recollq -q $q
+done
+
+) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
+
+diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
+
+checkresult
tests/anchor/anchor.txt  (new file, 34 lines)
@@ -0,0 +1,34 @@
+"^anchortermeaudebut"
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
+"^ anchortermeunpeuplusloin"
+0 results
+"^anchortermeunpeuplusloin"o30
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
+"^ anchortermeunpeuplusloin"o30
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
+"anchortermenullepart"
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
+"^anchortermenullepart"
+0 results
+"anchortermenullepart $"
+0 results
+"anchortermeunpeumoinsloin$"o30
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
+"anchortermeunpeumoinsloin$"
+0 results
+"anchortermealafin$"
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
+title:"^anchortitlebegin"
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
+title:"^anchortitleend"
+0 results
+title:"anchortitleend$"
+1 results
+text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes