phrase queries with both spans and words must be split as words only

2006-11-12 08:35:11 +00:00 · 2006-11-12 08:35:11 +00:00 · 31b348b736
commit 31b348b736
parent e5725c9eb7
3 changed files with 39 additions and 16 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.23 2006-09-21 05:59:02 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -144,7 +144,8 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
 #endif

    // Emit span. When splitting for query, we only emit final spans
-    if (spanerase) {
+    bool spanemitted = false;
+    if (spanerase && !(m_flags & TXTS_NOSPANS)) {
 	// Maybe trim at end These are chars that we would keep inside 
 	// a span, but not at the end
 	while (span.length() > 0) {
@ -162,13 +163,15 @@ inline bool TextSplit::doemit(bool spanerase, int bp)
 	    }
 	}
    breakloop1:
+	spanemitted = true;
 	if (!emitterm(true, span, spanpos, bp-span.length(), bp))
 	    return false;
    }


-    // Emit word if different from span and not query mode
-    if (!fq && (!spanerase || (word.length() != span.length())))
+    // Emit word if different from span and not 'no words' mode
+    if (!(m_flags & TXTS_ONLYSPANS) && 
+	(!spanemitted || word.length() != span.length()))
 	if (!emitterm(false, word, wordpos, bp-word.length(), bp))
 	    return false;

@ -404,7 +407,8 @@ static string thisprog;

 static string usage =
    " textsplit [opts] [filename]\n"
-    "   -q: query mode\n"
+    "   -s:  only spans\n"
+    "   -w:  only words\n"
    " if filename is 'stdin', will read stdin for data (end with ^D)\n"
    "  \n\n"
    ;
@ -417,7 +421,8 @@ Usage(void)
 }

 static int        op_flags;
-#define OPT_q	  0x1 
+#define OPT_s	  0x1 
+#define OPT_w	  0x2

 int main(int argc, char **argv)
 {
@ -431,7 +436,8 @@ int main(int argc, char **argv)
 	    Usage();
 	while (**argv)
 	    switch (*(*argv)++) {
-	    case 'q':	op_flags |= OPT_q; break;
+	    case 's':	op_flags |= OPT_s; break;
+	    case 'w':	op_flags |= OPT_w; break;
 	    default: Usage();	break;
 	    }
 	argc--; argv++;
@ -439,7 +445,12 @@ int main(int argc, char **argv)
    DebugLog::getdbl()->setloglevel(DEBDEB1);
    DebugLog::setfilename("stderr");
    mySplitterCB cb;
-    TextSplit splitter(&cb, (op_flags&OPT_q) ? true: false);
+    TextSplit::Flags flags = TextSplit::TXTS_NONE;
+    if (op_flags&OPT_s)
+	flags = TextSplit::TXTS_ONLYSPANS;
+    else if (op_flags&OPT_w)
+	flags = TextSplit::TXTS_NOSPANS;
+    TextSplit splitter(&cb,  flags);
    if (argc == 1) {
 	string data;
 	const char *filename = *argv++;	argc--;
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -16,7 +16,7 @@
 */
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
-/* @(#$Id: textsplit.h,v 1.11 2006-01-30 11:15:27 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: textsplit.h,v 1.12 2006-11-12 08:35:11 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
 #ifndef NO_NAMESPACES
@ -44,18 +44,19 @@ class TextSplitCB {
 */
 class TextSplit {
 public:
+    enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
    /**
     * Constructor: just store callback object
     */
-    TextSplit(TextSplitCB *t, bool forquery = false) 
-	: fq(forquery), cb(t), maxWordLength(40), prevpos(-1) {}
+    TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE) 
+	: m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {}
    /**
     * Split text, emit words and positions.
     */
    bool text_to_words(const std::string &in);

 private:
-    bool fq;        // for query:  Are we splitting for query or index ?
+    Flags m_flags;
    TextSplitCB *cb;
    int maxWordLength;

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.89 2006-11-10 17:18:01 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.90 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -961,9 +961,20 @@ static void stringToXapianQueries(const string &iq,
    for (list<string>::iterator it=phrases.begin(); it !=phrases.end(); it++) {
 	LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));

-	wsQData splitData;
-	TextSplit splitter(&splitData, true);
-	splitter.text_to_words(*it);
+	// If there are both spans and single words in this element,
+	// we need to use a word split, else a phrase query including
+	// a span would fail if we didn't adjust the proximity to
+	// account for the additional span term which is complicated.
+	wsQData splitDataS, splitDataW;
+	TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS);
+	splitterS.text_to_words(*it);
+	TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS);
+	splitterW.text_to_words(*it);
+	wsQData& splitData = splitDataS;
+	if (splitDataS.terms.size() > 1 && splitDataS.terms.size() != 
+	    splitDataW.terms.size())
+	    splitData = splitDataW;
+
 	LOGDEB1(("strToXapianQ: splitter term count: %d\n", 
 		splitData.terms.size()));
 	switch(splitData.terms.size()) {