diff --git a/src/common/textsplit.h b/src/common/textsplit.h index a73f8895..2b58e977 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -19,6 +19,7 @@ #include #include + #ifndef NO_NAMESPACES using std::string; using std::list; @@ -26,7 +27,6 @@ using std::list; class Utf8Iter; - /** * Split text into words. * See comments at top of .cpp for more explanations. diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 7bc066f9..c1fd0ca8 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -52,6 +52,7 @@ using namespace std; #include "rclversion.h" #include "cancelcheck.h" #include "ptmutex.h" +#include "termproc.h" #ifndef MAX #define MAX(A,B) (A>B?A:B) @@ -858,7 +859,7 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp) // The splitter breaks text into words and adds postings to the Xapian // document. We use a single object to split all of the document // fields and position jumps to separate fields -class TextSplitDb : public TextSplit { +class TextSplitDb : public TextSplitP { public: Xapian::WritableDatabase db; Xapian::Document &doc; // Xapian document @@ -873,17 +874,18 @@ class TextSplitDb : public TextSplit { // to compute the first position of the next section. Xapian::termpos curpos; - StopList &stops; TextSplitDb(Xapian::WritableDatabase idb, - Xapian::Document &d, StopList &_stops) - : db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1) + Xapian::Document &d, TermProc *prc) + : TextSplitP(prc), + db(idb), doc(d), basepos(1), curpos(0), wdfinc(1) {} // Reimplement text_to_words to add start and end special terms virtual bool text_to_words(const string &in); - bool takeword(const std::string &term, int pos, int, int); void setprefix(const string& pref) {prefix = pref;} void setwdfinc(int i) {wdfinc = i;} + friend class TermProcIdx; + private: // If prefix is set, we also add a posting for the prefixed terms // (ie: for titles, add postings for both "term" and "Sterm") @@ -892,7 +894,7 @@ private: int wdfinc; }; - +// Reimplement text_to_words to insert the begin and end anchor terms. bool TextSplitDb::text_to_words(const string &in) { LOGDEB2(("TextSplitDb::text_to_words\n")); @@ -908,7 +910,7 @@ bool TextSplitDb::text_to_words(const string &in) return false; } - if (!TextSplit::text_to_words(in)) { + if (!TextSplitP::text_to_words(in)) { LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n")); basepos += curpos + 100; return false; @@ -924,51 +926,45 @@ bool TextSplitDb::text_to_words(const string &in) basepos += curpos + 100; return false; } + basepos += curpos + 100; return true; } -// Get one term from the doc, remove accents and lowercase, then add posting -bool TextSplitDb::takeword(const std::string &_term, int pos, int, int) -{ - LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str())); +class TermProcIdx : public TermProc { +public: + TermProcIdx() : TermProc(0), m_ts(0) {} + void setTSD(TextSplitDb *ts) {m_ts = ts;} - string term; - if (!unacmaybefold(_term, term, "UTF-8", true)) { - LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", - _term.c_str())); - term.clear(); - // We don't generate a fatal error because of a bad term - return true; - } - - if (stops.isStop(term)) { - LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str())); - return true; - } - - // Compute absolute position (pos is relative to current segment), - // and remember relative. - curpos = pos; - pos += basepos; - string ermsg; - try { - // Index without prefix, using the field-specific weighting - doc.add_posting(term, pos, wdfinc); + bool takeword(const std::string &term, int pos, int, int) + { + // Compute absolute position (pos is relative to current segment), + // and remember relative. + m_ts->curpos = pos; + pos += m_ts->basepos; + string ermsg; + try { + // Index without prefix, using the field-specific weighting + LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str())); + m_ts->doc.add_posting(term, pos, m_ts->wdfinc); #ifdef TESTING_XAPIAN_SPELL - if (Db::isSpellingCandidate(term)) { - db.add_spelling(term); - } + if (Db::isSpellingCandidate(term)) { + m_ts->db.add_spelling(term); + } #endif - // Index the prefixed term. - if (!prefix.empty()) { - doc.add_posting(prefix + term, pos, wdfinc); - } - return true; - } XCATCHERROR(ermsg); - LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); - return false; -} + // Index the prefixed term. + if (!m_ts->prefix.empty()) { + m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc); + } + return true; + } XCATCHERROR(ermsg); + LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str())); + return false; + } +private: + TextSplitDb *m_ts; +}; + #ifdef TESTING_XAPIAN_SPELL string Db::getSpellingSuggestion(const string& word) @@ -1032,8 +1028,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc doc = idoc; Xapian::Document newdocument; - TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops); - + TermProcIdx tpidx; +// TermProcStop tpstop(&tpidx, m_stops); + TermProcCommongrams tpstop(&tpidx, m_stops); + TermProcPrep tpprep(&tpstop); + TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep); + tpidx.setTSD(&splitter); // Split and index file name as document term(s) LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str())); if (!splitter.text_to_words(doc.utf8fn)) diff --git a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp index 0208f63b..be9f14ad 100644 --- a/src/rcldb/searchdata.cpp +++ b/src/rcldb/searchdata.cpp @@ -35,6 +35,7 @@ #include "utf8iter.h" #include "stoplist.h" #include "rclconfig.h" +#include "termproc.h" #ifndef NO_NAMESPACES using namespace std; @@ -474,36 +475,23 @@ void SearchData::getUTerms(vector& terms) const // phrases. This is for parts of the user entry which would appear as // a single word because there is no white space inside, but are // actually multiple terms to rcldb (ie term1,term2) -class TextSplitQ : public TextSplit { +class TextSplitQ : public TextSplitP { public: - TextSplitQ(Flags flags, const StopList &_stops) - : TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0) + TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc) + : TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0) {} - bool takeword(const std::string &interm, int pos, int, int) { - alltermcount++; - lastpos = pos - LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str())); + bool takeword(const std::string &term, int pos, int bs, int be) + { // Check if the first letter is a majuscule in which - // case we do not want to do stem expansion. - bool nostemexp = unaciscapital(interm); - string noaclowterm; - if (!unacmaybefold(interm, noaclowterm, "UTF-8", true)) { - LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", - interm.c_str())); - return true; - } + // case we do not want to do stem expansion. Need to do this + // before unac of course... + curnostemexp = unaciscapital(term); - if (stops.isStop(noaclowterm)) { - LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n", - noaclowterm.c_str())); - return true; - } - terms.push_back(noaclowterm); - nostemexps.push_back(nostemexp); - return true; + return TextSplitP::takeword(term, pos, bs, be); } + bool curnostemexp; vector terms; vector nostemexps; const StopList &stops; @@ -513,6 +501,26 @@ class TextSplitQ : public TextSplit { int lastpos; }; +class TermProcQ : public TermProc { +public: + TermProcQ() : TermProc(0), m_ts(0) {} + void setTSQ(TextSplitQ *ts) {m_ts = ts;} + + bool takeword(const std::string &term, int pos, int bs, int be) + { + m_ts->alltermcount++; + m_ts->lastpos = pos; + bool noexpand = be ? m_ts->curnostemexp : true; + LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n", + term.c_str(), noexpand)); + m_ts->terms.push_back(term); + m_ts->nostemexps.push_back(noexpand); + return true; + } +private: + TextSplitQ *m_ts; +}; + // A class used to translate a user compound string (*not* a query // language string) as may be entered in any_terms/all_terms search // entry fields, ex: [term1 "a phrase" term3] into a xapian query @@ -566,7 +574,7 @@ private: vector > m_groups; }; -#if 0 +#if 1 static void listVector(const string& what, const vector&l) { string a; @@ -575,6 +583,14 @@ static void listVector(const string& what, const vector&l) } LOGDEB(("%s: %s\n", what.c_str(), a.c_str())); } +static void listList(const string& what, const list& l) +{ + string a; + for (list::const_iterator it = l.begin(); it != l.end(); it++) { + a = a + *it + " "; + } + LOGDEB(("%s: %s\n", what.c_str(), a.c_str())); +} #endif /** Expand stem and wildcards @@ -734,15 +750,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, vector::iterator nxit = splitData->nostemexps.begin(); for (vector::iterator it = splitData->terms.begin(); it != splitData->terms.end(); it++, nxit++) { + LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str())); // Adjust when we do stem expansion. Not inside phrases, and // some versions of xapian will accept only one OR clause // inside NEAR, all others must be leafs. bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple; string sterm; - listexp; + list exp; expandTerm(nostemexp, *it, exp, sterm, prefix); - + LOGDEB0(("ProcessPhrase: exp size %d\n", exp.size())); + listList("", exp); // groups is used for highlighting, we don't want prefixes in there. vector noprefs; for (list::const_iterator it = exp.begin(); @@ -859,21 +877,32 @@ bool StringToXapianQ::processUserString(const string &iq, // We now adjust the phrase/near slack by the term count // difference (this is mainly better for cjk where this is a very // common occurrence because of the ngrams thing. + + TermProcQ tpq; + // TermProcStop tpstop(&tpidx, stops); + TermProcCommongrams tpstop(&tpq, stops); + tpstop.onlygrams(true); + TermProcPrep tpprep(&tpstop); + TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | - TextSplit::TXTS_KEEPWILD), - stops); + TextSplit::TXTS_KEEPWILD), + stops, &tpprep); + tpq.setTSQ(&splitterS); splitterS.text_to_words(*it); + LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size())); TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS | TextSplit::TXTS_KEEPWILD), - stops); + stops, &tpprep); + tpq.setTSQ(&splitterW); + tpstop.onlygrams(false); splitterW.text_to_words(*it); - TextSplitQ *splitter = &splitterS; + if (splitterS.terms.size() > 1 && splitterS.terms.size() != splitterW.terms.size()) { slack += splitterW.terms.size() - splitterS.terms.size(); - // used to: splitData = &splitDataW; } + TextSplitQ *splitter = &splitterS; LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size())); switch (splitter->terms.size() + terminc) { case 0: diff --git a/src/rcldb/termproc.h b/src/rcldb/termproc.h new file mode 100644 index 00000000..d7b6e777 --- /dev/null +++ b/src/rcldb/termproc.h @@ -0,0 +1,182 @@ +/* Copyright (C) 2011 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + + +#ifndef _TERMPROC_H_INCLUDED_ +#define _TERMPROC_H_INCLUDED_ + +#include "textsplit.h" +#include "stoplist.h" +namespace Rcl { +class TermProc { +public: + TermProc(TermProc* next) : m_next(next) {} + virtual ~TermProc() {} + virtual bool takeword(const string &term, int pos, int bs, int be) + { + if (m_next) + return m_next->takeword(term, pos, bs, be); + else + return true; + } + virtual bool flush() + { + if (m_next) + return m_next->flush(); + else + return true; + } +private: + TermProc *m_next; +}; + +class TextSplitP : public TextSplit { +public: + TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE)) + : TextSplit(flags), m_prc(prc) + {} + + virtual bool text_to_words(const string &in) + { + bool ret = TextSplit::text_to_words(in); + if (m_prc && !m_prc->flush()) + return false; + return ret; + } + + virtual bool takeword(const string& term, int pos, int bs, int be) + { + if (m_prc) + return m_prc->takeword(term, pos, bs, be); + else + return true; + } + +private: + TermProc *m_prc; +}; + +class TermProcPrep : public TermProc { +public: + TermProcPrep(TermProc *nxt) : TermProc(nxt) {} + + virtual bool takeword(const string& itrm, int pos, int bs, int be) + { + string otrm; + if (!unacmaybefold(itrm, otrm, "UTF-8", true)) { + LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str())); + // We don't generate a fatal error because of a bad term + return true; + } + return TermProc::takeword(otrm, pos, bs, be); + } +}; + +class TermProcStop : public TermProc { +public: + TermProcStop(TermProc *nxt, const Rcl::StopList& stops) + : TermProc(nxt), m_stops(stops) { } + virtual bool takeword(const string& term, int pos, int bts, int bte) + { + if (m_stops.isStop(term)) { + return true; + } + return TermProc::takeword(term, pos, bts, bte); + } +private: + const Rcl::StopList& m_stops; +}; + +class TermProcCommongrams : public TermProc { +public: + TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops) + : TermProc(nxt), m_stops(stops), m_onlygrams(false) { } + + virtual bool takeword(const string& term, int pos, int bs, int be) + { + LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", + pos, bs, be, term.c_str())); + bool isstop = m_stops.isStop(term); + bool twogramemit = false; + + if (!m_prevterm.empty() && (m_prevstop || isstop)) { + // create 2-gram. space unnecessary but improves + // lisibility of queries + string twogram; + twogram.swap(m_prevterm); + twogram.append(1, ' '); + twogram += term; + // When emitting a complex term we set the bps to 0. This may + // be used by our clients + if (!TermProc::takeword(twogram, m_prevpos, 0, 0)) + return false; + twogramemit = true; +#if 0 + if (m_stops.isStop(twogram)) { + firstword = twogram; + isstop = false; + } +#endif + } + + m_prevterm = term; + m_prevstop = isstop; + m_prevpos = pos; + m_prevsent = false; + m_prevbs = bs; + m_prevbe = be; + // If flags allow, emit the bare term at the current pos. + if (!m_onlygrams || (!isstop && !twogramemit)) { + if (!TermProc::takeword(term, pos, bs, be)) + return false; + m_prevsent = true; + } + + return true; + } + + bool flush() + { + if (!m_prevsent && !m_prevterm.empty()) + if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe)) + return false; + + m_prevterm.clear(); + m_prevsent = true; + return TermProc::flush(); + } + void onlygrams(bool on) + { + m_onlygrams = on; + } +private: + // The stoplist we're using + const Rcl::StopList& m_stops; + // Remembered data for the last processed term + string m_prevterm; + bool m_prevstop; + int m_prevpos; + int m_prevbs; + int m_prevbe; + bool m_prevsent; + // If this is set, we only emit longest grams + bool m_onlygrams; +}; + +} + +#endif /* _TERMPROC_H_INCLUDED_ */