From 4588803281d888a5ac0e849446e5bd917d25e175 Mon Sep 17 00:00:00 2001 From: dockes Date: Tue, 8 Feb 2005 10:56:13 +0000 Subject: [PATCH] phrases ok except for preview position --- src/common/textsplit.cpp | 69 +++++++++++++++++++++------------------- src/common/textsplit.h | 10 ++++-- src/rcldb/rcldb.cpp | 53 +++++++++++++++++++++++------- 3 files changed, 84 insertions(+), 48 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 3999a9e1..c64cb86f 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.6 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.7 2005-02-08 10:56:13 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_TEXTSPLIT @@ -65,12 +65,11 @@ static void setcharclasses() // Do some cleanup (the kind which is simpler to do here than in the main loop, // then send term to our client. -bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase, +bool TextSplit::emitterm(bool isspan, string &w, int pos, int btstart, int btend) { LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos)); - if (fq && !isspan) - return true; + if (!cb) return false; @@ -113,13 +112,32 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase, } if (w.length() > 0 && w.length() < (unsigned)maxWordLength) { bool ret = cb->takeword(w, pos, btstart, btend); - if (doerase) - w.erase(); return ret; } return true; } +bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos, + bool spanerase, int bp) +{ + // When splitting for query, we only emit final spans + if (fq && !spanerase) { + wordpos++; + word.erase(); + return true; + } + if (!emitterm(true, span, spanpos, bp-span.length(), bp)) + return false; + if (word.length() != span.length() && !fq) + if (!emitterm(false, word, wordpos, bp-word.length(), bp)) + return false; + wordpos++; + if (spanerase) + span.erase(); + word.erase(); + return true; +} + /* * We basically emit a word every time we see a separator, but some chars are * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, @@ -143,11 +161,7 @@ bool TextSplit::text_to_words(const string &in) case SPACE: SPACE: if (word.length()) { - if (span.length() != word.length()) { - if (!emitterm(true, span, spanpos, true, i-span.length(), i)) - return false; - } - if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) + if (!doemit(word, wordpos, span, spanpos, true, i)) return false; number = false; } @@ -163,11 +177,7 @@ bool TextSplit::text_to_words(const string &in) span += c; } } else { - if (span.length() != word.length()) { - if (!emitterm(true, span, spanpos, false, i-span.length(), i)) - return false; - } - if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) + if (!doemit(word, wordpos, span, spanpos, false, i)) return false; number = false; span += c; @@ -175,11 +185,7 @@ bool TextSplit::text_to_words(const string &in) break; case '@': if (word.length()) { - if (span.length() != word.length()) { - if (!emitterm(true, span, spanpos, false, i-span.length(), i)) - return false; - } - if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) + if (!doemit(word, wordpos, span, spanpos, false, i)) return false; number = false; } else @@ -188,11 +194,7 @@ bool TextSplit::text_to_words(const string &in) break; case '\'': if (word.length()) { - if (span.length() != word.length()) { - if (!emitterm(true, span, spanpos, false, i-span.length(), i)) - return false; - } - if (!emitterm(false, word, wordpos++, true, i-word.length(), i)) + if (!doemit(word, wordpos, span, spanpos, false, i)) return false; number = false; span += c; @@ -202,8 +204,9 @@ bool TextSplit::text_to_words(const string &in) if (number) { word += c; } else { + //cerr<<"Got . span: '"<setloglevel(DEBDEB1); DebugLog::setfilename("stderr"); mySplitterCB cb; - TextSplit splitter(&cb); + TextSplit splitter(&cb, true); if (argc == 2) { string data; if (!file_to_string(argv[1], data)) exit(1); splitter.text_to_words(data); } else { - cout << teststring << endl; + cout << endl << teststring << endl << endl; splitter.text_to_words(teststring); } diff --git a/src/common/textsplit.h b/src/common/textsplit.h index adb9d4b0..762fe405 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -1,8 +1,11 @@ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.5 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.6 2005-02-08 10:56:13 dockes Exp $ (C) 2004 J.F.Dockes */ #include +#ifndef NO_NAMESPACES +using std::string; +#endif // Function class whose called for every detected word class TextSplitCB { @@ -25,8 +28,9 @@ class TextSplit { bool fq; // Are we splitting for query or index ? TextSplitCB *cb; int maxWordLength; - bool emitterm(bool isspan, std::string &term, int pos, bool doerase, - int bs, int be); + bool emitterm(bool isspan, std::string &term, int pos, int bs, int be); + bool doemit(string &word, int &wordpos, string &span, int spanpos, + bool spanerase, int bp); public: /** * Constructor: just store callback and client data diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index bbe0583e..2474d641 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.20 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.21 2005-02-08 10:56:12 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include #include @@ -438,33 +438,62 @@ bool Rcl::Db::purge() class wsQData : public TextSplitCB { public: vector terms; - + string catterms() { + string s; + for (unsigned int i=0;i phrases; + ConfTree::stringToStrings(qstring, phrases); + for (list::const_iterator i=phrases.begin(); + i != phrases.end();i++) { + LOGDEB(("Rcl::Db::setQuery: phrase: '%s'\n", i->c_str())); + } + list pqueries; + for (list::const_iterator it = phrases.begin(); + it != phrases.end(); it++) { - ndb->query = Xapian::Query(Xapian::Query::OP_OR, splitData.terms.begin(), - splitData.terms.end()); + wsQData splitData; + TextSplit splitter(&splitData, true); + splitter.text_to_words(*it); + LOGDEB(("Splitter term count: %d\n", splitData.terms.size())); + switch(splitData.terms.size()) { + case 0: continue;// ?? + case 1: + pqueries.push_back(Xapian::Query(splitData.terms.front())); + break; + default: + LOGDEB(("Pushing phrase: %s\n", splitData.catterms().c_str())); + pqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE, + splitData.terms.begin(), + splitData.terms.end())); + } + } + ndb->query = Xapian::Query(Xapian::Query::OP_OR, pqueries.begin(), + pqueries.end()); delete ndb->enquire; ndb->enquire = new Xapian::Enquire(ndb->db); ndb->enquire->set_query(ndb->query);