diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index cf27ded9..7cf3cd97 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.23 2006-09-21 05:59:02 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -144,7 +144,8 @@ inline bool TextSplit::doemit(bool spanerase, int bp) #endif // Emit span. When splitting for query, we only emit final spans - if (spanerase) { + bool spanemitted = false; + if (spanerase && !(m_flags & TXTS_NOSPANS)) { // Maybe trim at end These are chars that we would keep inside // a span, but not at the end while (span.length() > 0) { @@ -162,13 +163,15 @@ inline bool TextSplit::doemit(bool spanerase, int bp) } } breakloop1: + spanemitted = true; if (!emitterm(true, span, spanpos, bp-span.length(), bp)) return false; } - // Emit word if different from span and not query mode - if (!fq && (!spanerase || (word.length() != span.length()))) + // Emit word if different from span and not 'no words' mode + if (!(m_flags & TXTS_ONLYSPANS) && + (!spanemitted || word.length() != span.length())) if (!emitterm(false, word, wordpos, bp-word.length(), bp)) return false; @@ -404,7 +407,8 @@ static string thisprog; static string usage = " textsplit [opts] [filename]\n" - " -q: query mode\n" + " -s: only spans\n" + " -w: only words\n" " if filename is 'stdin', will read stdin for data (end with ^D)\n" " \n\n" ; @@ -417,7 +421,8 @@ Usage(void) } static int op_flags; -#define OPT_q 0x1 +#define OPT_s 0x1 +#define OPT_w 0x2 int main(int argc, char **argv) { @@ -431,7 +436,8 @@ int main(int argc, char **argv) Usage(); while (**argv) switch (*(*argv)++) { - case 'q': op_flags |= OPT_q; break; + case 's': op_flags |= OPT_s; break; + case 'w': op_flags |= OPT_w; break; default: Usage(); break; } argc--; argv++; @@ -439,7 +445,12 @@ int main(int argc, char **argv) DebugLog::getdbl()->setloglevel(DEBDEB1); DebugLog::setfilename("stderr"); mySplitterCB cb; - TextSplit splitter(&cb, (op_flags&OPT_q) ? true: false); + TextSplit::Flags flags = TextSplit::TXTS_NONE; + if (op_flags&OPT_s) + flags = TextSplit::TXTS_ONLYSPANS; + else if (op_flags&OPT_w) + flags = TextSplit::TXTS_NOSPANS; + TextSplit splitter(&cb, flags); if (argc == 1) { string data; const char *filename = *argv++; argc--; diff --git a/src/common/textsplit.h b/src/common/textsplit.h index 3b175e41..05012ed4 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -16,7 +16,7 @@ */ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.11 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.12 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes */ #include #ifndef NO_NAMESPACES @@ -44,18 +44,19 @@ class TextSplitCB { */ class TextSplit { public: + enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2}; /** * Constructor: just store callback object */ - TextSplit(TextSplitCB *t, bool forquery = false) - : fq(forquery), cb(t), maxWordLength(40), prevpos(-1) {} + TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE) + : m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {} /** * Split text, emit words and positions. */ bool text_to_words(const std::string &in); private: - bool fq; // for query: Are we splitting for query or index ? + Flags m_flags; TextSplitCB *cb; int maxWordLength; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index b775953f..24a282d7 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.89 2006-11-10 17:18:01 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.90 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -961,9 +961,20 @@ static void stringToXapianQueries(const string &iq, for (list::iterator it=phrases.begin(); it !=phrases.end(); it++) { LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str())); - wsQData splitData; - TextSplit splitter(&splitData, true); - splitter.text_to_words(*it); + // If there are both spans and single words in this element, + // we need to use a word split, else a phrase query including + // a span would fail if we didn't adjust the proximity to + // account for the additional span term which is complicated. + wsQData splitDataS, splitDataW; + TextSplit splitterS(&splitDataS, TextSplit::TXTS_ONLYSPANS); + splitterS.text_to_words(*it); + TextSplit splitterW(&splitDataW, TextSplit::TXTS_NOSPANS); + splitterW.text_to_words(*it); + wsQData& splitData = splitDataS; + if (splitDataS.terms.size() > 1 && splitDataS.terms.size() != + splitDataW.terms.size()) + splitData = splitDataW; + LOGDEB1(("strToXapianQ: splitter term count: %d\n", splitData.terms.size())); switch(splitData.terms.size()) {