phrase queries with both spans and words must be split as words only

This commit is contained in:
dockes 2006-11-12 08:35:11 +00:00
parent e5725c9eb7
commit 31b348b736
3 changed files with 39 additions and 16 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.23 2006-09-21 05:59:02 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -144,7 +144,8 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
#endif
// Emit span. When splitting for query, we only emit final spans
if (spanerase) {
bool spanemitted = false;
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
// Maybe trim at end. These are chars that we would keep inside
// a span, but not at the end.
while (span.length() > 0) {
@ -162,13 +163,15 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
}
}
breakloop1:
spanemitted = true;
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
return false;
}
// Emit word if different from span and not query mode
if (!fq && (!spanerase || (word.length() != span.length())))
// Emit word if different from span and not 'no words' mode
if (!(m_flags & TXTS_ONLYSPANS) &&
(!spanemitted || word.length() != span.length()))
if (!emitterm(false, word, wordpos, bp-word.length(), bp))
return false;
@ -404,7 +407,8 @@ static string thisprog;
static string usage =
" textsplit [opts] [filename]\n"
" -q: query mode\n"
" -s: only spans\n"
" -w: only words\n"
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
" \n\n"
;
@ -417,7 +421,8 @@ Usage(void)
}
static int op_flags;
#define OPT_q 0x1
#define OPT_s 0x1
#define OPT_w 0x2
int main(int argc, char **argv)
{
@ -431,7 +436,8 @@ int main(int argc, char **argv)
Usage();
while (**argv)
switch (*(*argv)++) {
case 'q': op_flags |= OPT_q; break;
case 's': op_flags |= OPT_s; break;
case 'w': op_flags |= OPT_w; break;
default: Usage(); break;
}
argc--; argv++;
@ -439,7 +445,12 @@ int main(int argc, char **argv)
DebugLog::getdbl()->setloglevel(DEBDEB1);
DebugLog::setfilename("stderr");
mySplitterCB cb;
TextSplit splitter(&cb, (op_flags&OPT_q) ? true: false);
TextSplit::Flags flags = TextSplit::TXTS_NONE;
if (op_flags&OPT_s)
flags = TextSplit::TXTS_ONLYSPANS;
else if (op_flags&OPT_w)
flags = TextSplit::TXTS_NOSPANS;
TextSplit splitter(&cb, flags);
if (argc == 1) {
string data;
const char *filename = *argv++; argc--;

View File

@ -16,7 +16,7 @@
*/
#ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.11 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: textsplit.h,v 1.12 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#ifndef NO_NAMESPACES
@ -44,18 +44,19 @@ class TextSplitCB {
*/
class TextSplit {
public:
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
/**
* Constructor: just store callback object
*/
TextSplit(TextSplitCB *t, bool forquery = false)
: fq(forquery), cb(t), maxWordLength(40), prevpos(-1) {}
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
: m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {}
/**
* Split text, emit words and positions.
*/
bool text_to_words(const std::string &in);
private:
bool fq; // for query: Are we splitting for query or index ?
Flags m_flags;
TextSplitCB *cb;
int maxWordLength;

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.89 2006-11-10 17:18:01 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.90 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -961,9 +961,20 @@ static void stringToXapianQueries(const string &iq,
for (list<string>::iterator it=phrases.begin(); it !=phrases.end(); it++) {
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
wsQData splitData;
TextSplit splitter(&splitData, true);
splitter.text_to_words(*it);
// If there are both spans and single words in this element,
// we need to use a word split, else a phrase query including
// a span would fail if we didn't adjust the proximity to
// account for the additional span term which is complicated.
wsQData splitDataS, splitDataW;
TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
splitterS.text_to_words(*it);
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
splitterW.text_to_words(*it);
wsQData& splitData = splitDataS;
if (splitDataS.terms.size() > 1 && splitDataS.terms.size() !=
splitDataW.terms.size())
splitData = splitDataW;
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
splitData.terms.size()));
switch(splitData.terms.size()) {