phrase queries with both spans and words must be split as words only
This commit is contained in:
parent
e5725c9eb7
commit
31b348b736
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.23 2006-09-21 05:59:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -144,7 +144,8 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
|
||||
#endif
|
||||
|
||||
// Emit span. When splitting for query, we only emit final spans
|
||||
if (spanerase) {
|
||||
bool spanemitted = false;
|
||||
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
||||
// Maybe trim at end. These are chars that we would keep inside
|
||||
// a span, but not at the end
|
||||
while (span.length() > 0) {
|
||||
@ -162,13 +163,15 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
|
||||
}
|
||||
}
|
||||
breakloop1:
|
||||
spanemitted = true;
|
||||
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Emit word if different from span and not query mode
|
||||
if (!fq && (!spanerase || (word.length() != span.length())))
|
||||
// Emit word if different from span and not 'no words' mode
|
||||
if (!(m_flags & TXTS_ONLYSPANS) &&
|
||||
(!spanemitted || word.length() != span.length()))
|
||||
if (!emitterm(false, word, wordpos, bp-word.length(), bp))
|
||||
return false;
|
||||
|
||||
@ -404,7 +407,8 @@ static string thisprog;
|
||||
|
||||
static string usage =
|
||||
" textsplit [opts] [filename]\n"
|
||||
" -q: query mode\n"
|
||||
" -s: only spans\n"
|
||||
" -w: only words\n"
|
||||
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
||||
" \n\n"
|
||||
;
|
||||
@ -417,7 +421,8 @@ Usage(void)
|
||||
}
|
||||
|
||||
static int op_flags;
|
||||
#define OPT_q 0x1
|
||||
#define OPT_s 0x1
|
||||
#define OPT_w 0x2
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
@ -431,7 +436,8 @@ int main(int argc, char **argv)
|
||||
Usage();
|
||||
while (**argv)
|
||||
switch (*(*argv)++) {
|
||||
case 'q': op_flags |= OPT_q; break;
|
||||
case 's': op_flags |= OPT_s; break;
|
||||
case 'w': op_flags |= OPT_w; break;
|
||||
default: Usage(); break;
|
||||
}
|
||||
argc--; argv++;
|
||||
@ -439,7 +445,12 @@ int main(int argc, char **argv)
|
||||
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
||||
DebugLog::setfilename("stderr");
|
||||
mySplitterCB cb;
|
||||
TextSplit splitter(&cb, (op_flags&OPT_q) ? true: false);
|
||||
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
||||
if (op_flags&OPT_s)
|
||||
flags = TextSplit::TXTS_ONLYSPANS;
|
||||
else if (op_flags&OPT_w)
|
||||
flags = TextSplit::TXTS_NOSPANS;
|
||||
TextSplit splitter(&cb, flags);
|
||||
if (argc == 1) {
|
||||
string data;
|
||||
const char *filename = *argv++; argc--;
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||
#define _TEXTSPLIT_H_INCLUDED_
|
||||
/* @(#$Id: textsplit.h,v 1.11 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: textsplit.h,v 1.12 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#ifndef NO_NAMESPACES
|
||||
@ -44,18 +44,19 @@ class TextSplitCB {
|
||||
*/
|
||||
class TextSplit {
|
||||
public:
|
||||
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
|
||||
/**
|
||||
* Constructor: just store callback object
|
||||
*/
|
||||
TextSplit(TextSplitCB *t, bool forquery = false)
|
||||
: fq(forquery), cb(t), maxWordLength(40), prevpos(-1) {}
|
||||
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
|
||||
: m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {}
|
||||
/**
|
||||
* Split text, emit words and positions.
|
||||
*/
|
||||
bool text_to_words(const std::string &in);
|
||||
|
||||
private:
|
||||
bool fq; // for query: Are we splitting for query or index ?
|
||||
Flags m_flags;
|
||||
TextSplitCB *cb;
|
||||
int maxWordLength;
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.89 2006-11-10 17:18:01 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.90 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -961,9 +961,20 @@ static void stringToXapianQueries(const string &iq,
|
||||
for (list<string>::iterator it=phrases.begin(); it !=phrases.end(); it++) {
|
||||
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
|
||||
|
||||
wsQData splitData;
|
||||
TextSplit splitter(&splitData, true);
|
||||
splitter.text_to_words(*it);
|
||||
// If there are both spans and single words in this element,
|
||||
// we need to use a word split, else a phrase query including
|
||||
// a span would fail if we didn't adjust the proximity to
|
||||
// account for the additional span term which is complicated.
|
||||
wsQData splitDataS, splitDataW;
|
||||
TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
|
||||
splitterS.text_to_words(*it);
|
||||
TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
|
||||
splitterW.text_to_words(*it);
|
||||
wsQData& splitData = splitDataS;
|
||||
if (splitDataS.terms.size() > 1 && splitDataS.terms.size() !=
|
||||
splitDataW.terms.size())
|
||||
splitData = splitDataW;
|
||||
|
||||
LOGDEB1(("strToXapianQ: splitter term count: %d\n",
|
||||
splitData.terms.size()));
|
||||
switch(splitData.terms.size()) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user