Implement anchored searches: terms must be found within a maximum distance from the start or end of the text

This commit is contained in:
Jean-Francois Dockes 2011-09-20 16:42:56 +02:00
parent 5a6534113b
commit ee0d602ab3
9 changed files with 210 additions and 43 deletions

View File

@ -20,9 +20,10 @@
#include <string.h>
#include <regex.h>
#include "smallut.h"
#include "wasastringtoquery.h"
// #define DEB_WASASTRINGTOQ 1
#undef DEB_WASASTRINGTOQ
#ifdef DEB_WASASTRINGTOQ
#define DPRINT(X) fprintf X
#define DUMPQ(Q) {string D;Q->describe(D);fprintf(stderr, "%s\n", D.c_str());}
@ -89,13 +90,18 @@ void WasaQuery::describe(string &desc) const
if (m_modifiers & WQM_DIACSENS) desc += "DIACSENS|";
if (m_modifiers & WQM_FUZZY) desc += "FUZZY|";
if (m_modifiers & WQM_NOSTEM) desc += "NOSTEM|";
if (m_modifiers & WQM_PHRASESLACK) desc += "PHRASESLACK|";
if (m_modifiers & WQM_PHRASESLACK) {
char buf[100];
sprintf(buf, "%d", m_slack);
desc += "PHRASESLACK(" + string(buf) + string(")|");
}
if (m_modifiers & WQM_PROX) desc += "PROX|";
if (m_modifiers & WQM_REGEX) desc += "REGEX|";
if (m_modifiers & WQM_SLOPPY) desc += "SLOPPY|";
if (m_modifiers & WQM_WORDS) desc += "WORDS|";
if (desc.length() > 0 && desc[desc.length()-1] == '|')
desc = desc.substr(0, desc.length()-1);
desc.erase(desc.length()-1);
}
desc += " ";
}
@ -224,7 +230,11 @@ StringToWasaQuery::~StringToWasaQuery()
WasaQuery *
StringToWasaQuery::stringToQuery(const string& str, string& reason)
{
return internal ? internal->stringToQuery(str, reason) : 0;
if (internal == 0)
return 0;
WasaQuery *wq = internal->stringToQuery(str, reason);
DUMPQ(wq);
return wq;
}
WasaQuery *
@ -316,6 +326,7 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
}
// Check for quoted or unquoted value
unsigned int mods = 0;
if (checkSubMatch(SMI_QUOTED, match, reason)) {
nclause->m_value = match;
} else if (checkSubMatch(SMI_TERM, match, reason)) {
@ -332,7 +343,6 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
if (checkSubMatch(SMI_MODIF, match, reason)) {
DPRINT((stderr, "Got modifiers: [%s]\n", match));
unsigned int mods = 0;
for (unsigned int i = 0; i < strlen(match); i++) {
switch (match[i]) {
case 'b':
@ -350,7 +360,19 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
case 'f': mods |= WasaQuery::WQM_FUZZY; break;
case 'l': mods |= WasaQuery::WQM_NOSTEM; break;
case 'L': break;
case 'o': mods |= WasaQuery::WQM_PHRASESLACK; break;
case 'o':
mods |= WasaQuery::WQM_PHRASESLACK;
// Default slack if specified only by 'o' is 10.
nclause->m_slack = 10;
if (i < strlen(match) - 1) {
char *endptr;
int slack = strtol(match+i+1, &endptr, 10);
if (endptr != match+i+1) {
i += endptr - (match+i+1);
nclause->m_slack = slack;
}
}
break;
case 'p': mods |= WasaQuery::WQM_PROX; break;
case 'r': mods |= WasaQuery::WQM_REGEX; break;
case 's': mods |= WasaQuery::WQM_SLOPPY; break;
@ -370,8 +392,8 @@ StringToWasaQuery::Internal::stringToQuery(const string& str, string& reason)
}
}
}
nclause->m_modifiers = WasaQuery::Modifier(mods);
}
nclause->m_modifiers = WasaQuery::Modifier(mods);
// Field indicator ?
if (checkSubMatch(SMI_FIELD, match, reason)) {

View File

@ -63,7 +63,7 @@ public:
typedef vector<WasaQuery*> subqlist_t;
WasaQuery()
: m_op(OP_NULL), m_modifiers(0), m_weight(1.0)
: m_op(OP_NULL), m_modifiers(0), m_slack(0), m_weight(1.0)
{}
~WasaQuery();
@ -86,6 +86,7 @@ public:
vector<WasaQuery*> m_subs;
unsigned int m_modifiers;
int m_slack;
float m_weight;
};

View File

@ -134,8 +134,9 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
continue;
case WasaQuery::OP_LEAF: {
LOGDEB2(("wasaQueryToRcl: leaf clause [%s]:[%s]\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str()));
LOGDEB(("wasaQueryToRcl: leaf clause [%s]:[%s] slack %d\n",
(*it)->m_fieldspec.c_str(), (*it)->m_value.c_str(),
(*it)->m_slack));
// Change terms found in the "autosuffs" list into "ext"
// field queries
@ -152,15 +153,17 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
unsigned int mods = (unsigned int)(*it)->m_modifiers;
if (TextSplit::hasVisibleWhite((*it)->m_value)) {
int slack = (mods & WasaQuery::WQM_PHRASESLACK) ? 10 : 0;
// I'm not sure I understand the phrase/near detection
// thereafter anymore, maybe it would be better to have an
// explicit flag. Mods can only be set after a double
// quote.
if (TextSplit::hasVisibleWhite((*it)->m_value) || mods) {
Rcl::SClType tp = Rcl::SCLT_PHRASE;
if (mods & WasaQuery::WQM_PROX) {
tp = Rcl::SCLT_NEAR;
slack = 10;
}
nclause = new Rcl::SearchDataClauseDist(tp, (*it)->m_value,
slack,
(*it)->m_slack,
(*it)->m_fieldspec);
} else {
nclause = new Rcl::SearchDataClauseSimple(Rcl::SCLT_AND,
@ -173,7 +176,7 @@ static Rcl::SearchData *wasaQueryToRcl(RclConfig *config, WasaQuery *wasa,
return 0;
}
if (mods & WasaQuery::WQM_NOSTEM) {
nclause->setModifiers(Rcl::SearchDataClause::SDCM_NOSTEMMING);
nclause->addModifier(Rcl::SearchDataClause::SDCM_NOSTEMMING);
}
if ((*it)->m_weight != 1.0)
nclause->setWeight((*it)->m_weight);

View File

@ -73,6 +73,9 @@ namespace Rcl {
#endif
const string pathelt_prefix = "XP";
const string start_of_field_term = "XXST";
const string end_of_field_term = "XXND";
// This is used as a marker inside the abstract frag lists, but
// normally doesn't remain in final output (which is built with a
// custom sep. by our caller).
@ -831,6 +834,8 @@ class TextSplitDb : public TextSplit {
Xapian::Document &d, StopList &_stops)
: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
{}
// Reimplement text_to_words to add start and end special terms
virtual bool text_to_words(const string &in);
bool takeword(const std::string &term, int pos, int, int);
void setprefix(const string& pref) {prefix = pref;}
void setwdfinc(int i) {wdfinc = i;}
@ -843,6 +848,38 @@ private:
int wdfinc;
};
// Reimplementation of TextSplit::text_to_words() which brackets the regular
// word splitting with two special postings: start_of_field_term before the
// split and end_of_field_term after it, both prepended with the current
// prefix.
// @param in the text to split, passed unchanged to TextSplit::text_to_words().
// @return false if one of the two add_posting() calls left an error message
//   in ermsg or if TextSplit::text_to_words() returned false, else true.
bool TextSplitDb::text_to_words(const string &in)
{
LOGDEB(("TextSplitDb::text_to_words\n"));
string ermsg;
try {
// Index the possibly prefixed start term.
// It is posted at the current basepos, which is then incremented.
doc.add_posting(prefix + start_of_field_term, basepos, wdfinc);
++basepos;
// NOTE(review): XCATCHERROR is defined outside this view; it presumably
// catches the exception and stores its text in ermsg -- confirm.
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
return false;
}
// Regular splitting, delegated to the base class.
if (!TextSplit::text_to_words(in)) {
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
return false;
}
try {
// Index the possibly prefixed end term.
// NOTE(review): basepos+curpos+1 presumably is the position just after
// the last word stored by takeword() -- confirm against takeword().
doc.add_posting(prefix + end_of_field_term, basepos+curpos+1, wdfinc);
++basepos;
} XCATCHERROR(ermsg);
if (!ermsg.empty()) {
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
return false;
}
return true;
}
// Get one term from the doc, remove accents and lowercase, then add posting
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
{

View File

@ -287,6 +287,8 @@ private:
string version_string();
extern const string pathelt_prefix;
extern const string start_of_field_term;
extern const string end_of_field_term;
#ifndef NO_NAMESPACES
}
#endif // NO_NAMESPACES

View File

@ -510,13 +510,13 @@ public:
private:
void expandTerm(bool dont, const string& term, list<string>& exp,
string& sterm, string *prefix = 0);
string& sterm, const string& prefix);
// After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
// Process phrase/near element
void processPhraseOrNear(TextSplitQ *splitData,
list<Xapian::Query> &pqueries,
bool useNear, int slack);
bool useNear, int slack, int mods);
Db& m_db;
const string& m_field;
@ -554,7 +554,7 @@ static void listVector(const string& what, const vector<string>&l)
void StringToXapianQ::expandTerm(bool nostemexp,
const string& term,
list<string>& exp,
string &sterm, string *prefix)
string &sterm, const string& prefix)
{
LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
@ -571,29 +571,20 @@ void StringToXapianQ::expandTerm(bool nostemexp,
nostemexp = true;
if (nostemexp && !haswild) {
// Neither stemming nor wildcard expansion: just the word
string pfx;
const FieldTraits *ftp;
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
pfx = ftp->pfx;
}
sterm = term;
m_uterms.push_back(sterm);
exp.push_front(pfx+term);
exp.push_front(prefix + term);
exp.resize(1);
if (prefix)
*prefix = pfx;
} else {
TermMatchResult res;
if (haswild) {
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1,
m_field, prefix);
m_field);
} else {
sterm = term;
m_uterms.push_back(sterm);
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field,
prefix);
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1,
m_field);
}
for (list<TermMatchEntry>::const_iterator it = res.entries.begin();
it != res.entries.end(); it++) {
@ -642,8 +633,15 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
{
list<string> exp;
string sterm; // dumb version of user term
string prefix;
expandTerm(nostemexp, span, exp, sterm, &prefix);
const FieldTraits *ftp;
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
prefix = ftp->pfx;
}
expandTerm(nostemexp, span, exp, sterm, prefix);
// m_terms is used for highlighting, we don't want prefixes in there.
for (list<string>::const_iterator it = exp.begin();
it != exp.end(); it++) {
@ -658,10 +656,9 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
// less wqf). This does not happen if there are wildcards anywhere
// in the search.
if (m_doBoostUserTerms && !sterm.empty()) {
xq = Xapian::Query(Xapian::Query::OP_OR,
xq,
Xapian::Query(prefix+sterm,
original_term_wqf_booster));
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
Xapian::Query(prefix+sterm,
original_term_wqf_booster));
}
pqueries.push_back(xq);
}
@ -672,7 +669,7 @@ void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
// don't do stemming for PHRASE though)
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
list<Xapian::Query> &pqueries,
bool useNear, int slack)
bool useNear, int slack, int mods)
{
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
Xapian::Query::OP_PHRASE;
@ -680,6 +677,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
bool hadmultiple = false;
vector<vector<string> >groups;
string prefix;
const FieldTraits *ftp;
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
prefix = ftp->pfx;
}
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
slack++;
}
// Go through the list and perform stem/wildcard expansion for each element
vector<bool>::iterator nxit = splitData->nostemexps.begin();
for (vector<string>::iterator it = splitData->terms.begin();
@ -691,8 +699,7 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
string sterm;
list<string>exp;
string prefix;
expandTerm(nostemexp, *it, exp, sterm, &prefix);
expandTerm(nostemexp, *it, exp, sterm, prefix);
// groups is used for highlighting, we don't want prefixes in there.
vector<string> noprefs;
@ -709,6 +716,11 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
#endif
}
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
slack++;
}
// Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
@ -727,6 +739,23 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
}
// Detect anchored-search markers in s and convert them to modifier flags.
// The string is trimmed first. A leading '^' is then removed and sets
// SDCM_ANCHORSTART; a trailing '$' is removed and sets SDCM_ANCHOREND.
// Returns the OR of the flags found (0 if the string is not anchored).
static int stringToMods(string& s)
{
    trimstring(s);

    int flags = 0;
    if (!s.empty() && s[0] == '^') {
        s.erase(0, 1);
        flags |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
    }
    if (!s.empty() && *s.rbegin() == '$') {
        s.erase(s.size() - 1);
        flags |= Rcl::SearchDataClause::SDCM_ANCHOREND;
    }
    return flags;
}
/**
* Turn user entry string (NOT query language) into a list of xapian queries.
* We just separate words and phrases, and do wildcard and stem expansion,
@ -772,7 +801,8 @@ bool StringToXapianQ::processUserString(const string &iq,
for (list<string>::iterator it = phrases.begin();
it != phrases.end(); it++) {
LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
int mods = stringToMods(*it);
int terminc = mods != 0 ? 1 : 0;
// If there are multiple spans in this element, including
// at least one composite, we have to increase the slack
// else a phrase query including a span would fail.
@ -803,7 +833,7 @@ bool StringToXapianQ::processUserString(const string &iq,
}
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
switch (splitter->terms.size()) {
switch (splitter->terms.size() + terminc) {
case 0:
continue;// ??
case 1:
@ -811,7 +841,7 @@ bool StringToXapianQ::processUserString(const string &iq,
splitter->nostemexps.front(), pqueries);
break;
default:
processPhraseOrNear(splitter, pqueries, useNear, slack);
processPhraseOrNear(splitter, pqueries, useNear, slack, mods);
}
}
} catch (const Xapian::Error &e) {

View File

@ -165,7 +165,8 @@ private:
class SearchDataClause {
public:
enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1};
enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=1, SDCM_ANCHORSTART=2,
SDCM_ANCHOREND=4};
SearchDataClause(SClType tp)
: m_tp(tp), m_parentSearch(0), m_haveWildCards(0),
@ -182,6 +183,12 @@ public:
SClType getTp() {return m_tp;}
void setParent(SearchData *p) {m_parentSearch = p;}
virtual void setModifiers(Modifier mod) {m_modifiers = mod;}
virtual int getModifiers() {return m_modifiers;}
// Set the bits of mod in addition to the modifiers already set, going
// through getModifiers()/setModifiers().
virtual void addModifier(Modifier mod) {
    setModifiers(Modifier(getModifiers() | mod));
}
virtual void setWeight(float w) {m_weight = w;}
friend class SearchData;

31
tests/anchor/anchor.sh Executable file
View File

@ -0,0 +1,31 @@
#!/bin/sh
# Anchored search test: run a fixed list of queries using the '^' (start) and
# '$' (end) anchors through recollq, filter out the 'Recoll query: ' lines and
# compare what remains with the reference output in ${myname}.txt.
# NOTE(review): myname, mystdout, mystderr and mydiffs are presumably set by
# initvariables, and checkresult defined in shared.sh -- confirm.

topdir=$(dirname "$0")/..
. "$topdir/shared.sh"

initvariables "$0"

(
    for q in \
        '"^anchortermeaudebut"' \
        '"^ anchortermeunpeuplusloin"' \
        '"^anchortermeunpeuplusloin"o30' \
        '"^ anchortermeunpeuplusloin"o30' \
        '"anchortermenullepart"' \
        '"^anchortermenullepart"' \
        '"anchortermenullepart $"' \
        '"anchortermeunpeumoinsloin$"o30' \
        '"anchortermeunpeumoinsloin$"' \
        '"anchortermealafin$"' \
        'title:"^anchortitlebegin"' \
        'title:"^anchortitleend"' \
        'title:"anchortitleend$"' \
        ; do
        # Quote the expansion so that each query reaches echo and recollq
        # exactly as written (no word splitting, no pathname expansion).
        echo "$q"
        recollq -q "$q"
    done
) 2> "$mystderr" | grep -E -v '^Recoll query: ' > "$mystdout"

diff -w "${myname}.txt" "$mystdout" > "$mydiffs" 2>&1

checkresult

34
tests/anchor/anchor.txt Normal file
View File

@ -0,0 +1,34 @@
"^anchortermeaudebut"
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
"^ anchortermeunpeuplusloin"
0 results
"^anchortermeunpeuplusloin"o30
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
"^ anchortermeunpeuplusloin"o30
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
"anchortermenullepart"
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
"^anchortermenullepart"
0 results
"anchortermenullepart $"
0 results
"anchortermeunpeumoinsloin$"o30
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
"anchortermeunpeumoinsloin$"
0 results
"anchortermealafin$"
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
title:"^anchortitlebegin"
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes
title:"^anchortitleend"
0 results
title:"anchortitleend$"
1 results
text/html [file:///home/dockes/projets/fulltext/testrecoll/anchor/tryanchor.html] [anchortitlebegin anchortitlemiddle anchortitleend] 1463 bytes