New text to terms processing pipelines: results identical to 1.16 when used with empty stopfile

2011-10-07 07:53:49 +02:00 · 2011-10-07 07:53:49 +02:00 · 5fd31172f5
commit 5fd31172f5
parent 61bf17aa46
4 changed files with 290 additions and 79 deletions
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -19,6 +19,7 @@

 #include <string>
 #include <list>
+
 #ifndef NO_NAMESPACES
 using std::string;
 using std::list;
@ -26,7 +27,6 @@ using std::list;

 class Utf8Iter;

-
 /** 
 * Split text into words. 
 * See comments at top of .cpp for more explanations.
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -52,6 +52,7 @@ using namespace std;
 #include "rclversion.h"
 #include "cancelcheck.h"
 #include "ptmutex.h"
+#include "termproc.h"

 #ifndef MAX
 #define MAX(A,B) (A>B?A:B)
@ -858,7 +859,7 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
 // The splitter breaks text into words and adds postings to the Xapian
 // document. We use a single object to split all of the document
 // fields and position jumps to separate fields
-class TextSplitDb : public TextSplit {
+class TextSplitDb : public TextSplitP {
 public:
    Xapian::WritableDatabase db;
    Xapian::Document &doc;   // Xapian document 
@ -873,17 +874,18 @@ class TextSplitDb : public TextSplit {
    // to compute the first position of the next section.
    Xapian::termpos curpos;

-    StopList &stops;
    TextSplitDb(Xapian::WritableDatabase idb, 
-		Xapian::Document &d, StopList &_stops) 
-	: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
+		Xapian::Document &d, TermProc *prc)
+	: TextSplitP(prc), 
+	  db(idb), doc(d), basepos(1), curpos(0), wdfinc(1)
    {}
    // Reimplement text_to_words to add start and end special terms
    virtual bool text_to_words(const string &in);
-    bool takeword(const std::string &term, int pos, int, int);
    void setprefix(const string& pref) {prefix = pref;}
    void setwdfinc(int i) {wdfinc = i;}

+    friend class TermProcIdx;
+
 private:
    // If prefix is set, we also add a posting for the prefixed terms
    // (ie: for titles, add postings for both "term" and "Sterm")
@ -892,7 +894,7 @@ private:
    int wdfinc;
 };

-
+// Reimplement text_to_words to insert the begin and end anchor terms.
 bool TextSplitDb::text_to_words(const string &in) 
 {
    LOGDEB2(("TextSplitDb::text_to_words\n"));
@ -908,7 +910,7 @@ bool TextSplitDb::text_to_words(const string &in)
 	return false;
    }

-    if (!TextSplit::text_to_words(in)) {
+    if (!TextSplitP::text_to_words(in)) {
 	LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
 	basepos += curpos + 100;
 	return false;
@ -924,51 +926,45 @@ bool TextSplitDb::text_to_words(const string &in)
 	basepos += curpos + 100;
 	return false;
    }
+
    basepos += curpos + 100;
    return true;
 }

-// Get one term from the doc, remove accents and lowercase, then add posting
-bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
-{
-    LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
+class TermProcIdx : public TermProc {
+public:
+    TermProcIdx() : TermProc(0), m_ts(0) {}
+    void setTSD(TextSplitDb *ts) {m_ts = ts;}

-    string term;
-    if (!unacmaybefold(_term, term, "UTF-8", true)) {
-	LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", 
-                 _term.c_str()));
-	term.clear();
-	// We don't generate a fatal error because of a bad term
-	return true;
-    }
-
-    if (stops.isStop(term)) {
-	LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
-	return true;
-    }
-
-    // Compute absolute position (pos is relative to current segment),
-    // and remember relative.
-    curpos = pos;
-    pos += basepos;
-    string ermsg;
-    try {
-	// Index without prefix, using the field-specific weighting
-	doc.add_posting(term, pos, wdfinc);
+    bool takeword(const std::string &term, int pos, int, int)
+    {
+	// Compute absolute position (pos is relative to current segment),
+	// and remember relative.
+	m_ts->curpos = pos;
+	pos += m_ts->basepos;
+	string ermsg;
+	try {
+	    // Index without prefix, using the field-specific weighting
+	    LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
+	    m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
 #ifdef TESTING_XAPIAN_SPELL
-	if (Db::isSpellingCandidate(term)) {
-	    db.add_spelling(term);
-	}
+	    if (Db::isSpellingCandidate(term)) {
+		m_ts->db.add_spelling(term);
+	    }
 #endif
-	// Index the prefixed term.
-	if (!prefix.empty()) {
-	    doc.add_posting(prefix + term, pos, wdfinc);
-	}
-	return true;
-    } XCATCHERROR(ermsg);
-    LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
-    return false;
-}
+	    // Index the prefixed term.
+	    if (!m_ts->prefix.empty()) {
+		m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
+	    }
+	    return true;
+	} XCATCHERROR(ermsg);
+	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
+	return false;
+    }
+private:
+    TextSplitDb *m_ts;
+};
+

 #ifdef TESTING_XAPIAN_SPELL
 string Db::getSpellingSuggestion(const string& word)
@ -1032,8 +1028,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    Doc doc = idoc;

    Xapian::Document newdocument;
-    TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);
-
+    TermProcIdx tpidx;
+//    TermProcStop tpstop(&tpidx, m_stops);
+    TermProcCommongrams tpstop(&tpidx, m_stops);
+    TermProcPrep tpprep(&tpstop);
+    TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
+    tpidx.setTSD(&splitter);
    // Split and index file name as document term(s)
    LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
    if (!splitter.text_to_words(doc.utf8fn))
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -35,6 +35,7 @@
 #include "utf8iter.h"
 #include "stoplist.h"
 #include "rclconfig.h"
+#include "termproc.h"

 #ifndef NO_NAMESPACES
 using namespace std;
@ -474,36 +475,23 @@ void SearchData::getUTerms(vector<string>& terms) const
 // phrases. This is for parts of the user entry which would appear as
 // a single word because there is no white space inside, but are
 // actually multiple terms to rcldb (ie term1,term2)
-class TextSplitQ : public TextSplit {
+class TextSplitQ : public TextSplitP {
 public:
-    TextSplitQ(Flags flags, const StopList &_stops) 
-	: TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0)
+    TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
+	: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0)
    {}
-    bool takeword(const std::string &interm, int pos, int, int) {
-	alltermcount++;
-        lastpos = pos
-	LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));

+    bool takeword(const std::string &term, int pos, int bs, int be) 
+    {
 	// Check if the first letter is a majuscule in which
-	// case we do not want to do stem expansion. 
-	bool nostemexp = unaciscapital(interm);
-	string noaclowterm;
-	if (!unacmaybefold(interm, noaclowterm, "UTF-8", true)) {
-	    LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", 
-                     interm.c_str()));
-	    return true;
-	}
+	// case we do not want to do stem expansion. Need to do this
+	// before unac of course...
+	curnostemexp = unaciscapital(term);

-	if (stops.isStop(noaclowterm)) {
-	    LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n", 
-                     noaclowterm.c_str()));
-	    return true;
-	}
-	terms.push_back(noaclowterm);
-	nostemexps.push_back(nostemexp);
-	return true;
+	return TextSplitP::takeword(term, pos, bs, be);
    }

+    bool           curnostemexp;
    vector<string> terms;
    vector<bool>   nostemexps;
    const StopList &stops;
@ -513,6 +501,26 @@ class TextSplitQ : public TextSplit {
    int lastpos;
 };

+// Last stage of the query-side term pipeline: stores each term it
+// receives, with its stem expansion flag, into the TextSplitQ set with
+// setTSQ(). The splitter must be set before any text is processed.
+class TermProcQ : public TermProc {
+public:
+    TermProcQ() : TermProc(0), m_ts(0) {}
+
+    // Select the splitter object which will receive the terms
+    void setTSQ(TextSplitQ *ts) {m_ts = ts;}
+
+    bool takeword(const std::string &term, int pos, int bs, int be) 
+    {
+	// A null end offset disables stem expansion (TermProcCommongrams
+	// emits its two-grams with both offsets set to 0). In other
+	// cases, use the capitalization test result stored by the
+	// splitter.
+	const bool noexpand = (be == 0) || m_ts->curnostemexp;
+
+	m_ts->lastpos = pos;
+	m_ts->alltermcount++;
+	LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n", 
+		term.c_str(), noexpand));
+	m_ts->nostemexps.push_back(noexpand);
+	m_ts->terms.push_back(term);
+	return true;
+    }
+
+private:
+    // Destination for the terms. Not owned.
+    TextSplitQ *m_ts;
+};
+
 // A class used to translate a user compound string (*not* a query
 // language string) as may be entered in any_terms/all_terms search
 // entry fields, ex: [term1 "a phrase" term3] into a xapian query
@ -566,7 +574,7 @@ private:
    vector<vector<string> > m_groups; 
 };

-#if 0
+#if 1
 static void listVector(const string& what, const vector<string>&l)
 {
    string a;
@ -575,6 +583,14 @@ static void listVector(const string& what, const vector<string>&l)
    }
    LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
 }
+// Debug helper: log the elements of a string list on a single line,
+// each followed by a space, with 'what' as a label.
+static void listList(const string& what, const list<string>& l)
+{
+    string joined;
+    list<string>::const_iterator cur = l.begin();
+    while (cur != l.end()) {
+        joined += *cur;
+        joined += " ";
+        cur++;
+    }
+    LOGDEB(("%s: %s\n", what.c_str(), joined.c_str()));
+}
 #endif

 /** Expand stem and wildcards
@ -734,15 +750,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
    for (vector<string>::iterator it = splitData->terms.begin();
 	 it != splitData->terms.end(); it++, nxit++) {
+	LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
 	// Adjust when we do stem expansion. Not inside phrases, and
 	// some versions of xapian will accept only one OR clause
 	// inside NEAR, all others must be leafs.
 	bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;

 	string sterm;
-	list<string>exp;
+	list<string> exp;
 	expandTerm(nostemexp, *it, exp, sterm, prefix);
-
+	LOGDEB0(("ProcessPhrase: exp size %d\n", exp.size()));
+	listList("", exp);
 	// groups is used for highlighting, we don't want prefixes in there.
 	vector<string> noprefs;
 	for (list<string>::const_iterator it = exp.begin(); 
@ -859,21 +877,32 @@ bool StringToXapianQ::processUserString(const string &iq,
 	    // We now adjust the phrase/near slack by the term count
 	    // difference (this is mainly better for cjk where this is a very
 	    // common occurrence because of the ngrams thing.
+
+	    TermProcQ tpq;
+            //    TermProcStop tpstop(&tpidx, stops);
+	    TermProcCommongrams tpstop(&tpq, stops);
+	    tpstop.onlygrams(true);
+	    TermProcPrep tpprep(&tpstop);
+
 	    TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
-                                                  TextSplit::TXTS_KEEPWILD), 
-                                 stops);
+						  TextSplit::TXTS_KEEPWILD), 
+                                 stops, &tpprep);
+	    tpq.setTSQ(&splitterS);
 	    splitterS.text_to_words(*it);
+	    LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
 	    TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS | 
                                                  TextSplit::TXTS_KEEPWILD),
-                                 stops);
+                                 stops, &tpprep);
+	    tpq.setTSQ(&splitterW);
+	    tpstop.onlygrams(false);
 	    splitterW.text_to_words(*it);
-	    TextSplitQ *splitter = &splitterS;
+
 	    if (splitterS.terms.size() > 1 && 
 		splitterS.terms.size() != splitterW.terms.size()) {
 		slack += splitterW.terms.size() - splitterS.terms.size();
-		// used to: splitData = &splitDataW;
 	    }

+	    TextSplitQ *splitter = &splitterS;
 	    LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
 	    switch (splitter->terms.size() + terminc) {
 	    case 0: 
--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@ -0,0 +1,182 @@
+/* Copyright (C) 2011 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+
+#ifndef _TERMPROC_H_INCLUDED_
+#define _TERMPROC_H_INCLUDED_
+
+#include "textsplit.h"
+#include "stoplist.h"
+namespace Rcl {
+/**
+ * One stage of a term processing chain.
+ *
+ * Stages are linked through the pointer given to the constructor. The
+ * default takeword() and flush() only hand the call over to the next
+ * stage, and report success when there is none. Derived classes
+ * override them to filter or transform the terms. The next stage is
+ * not owned: it is never deleted here.
+ */
+class TermProc {
+public:
+    TermProc(TermProc* next) : m_next(next) {}
+    virtual ~TermProc() {}
+
+    /** Hand a term to the next stage.
+     * @return true if there is no next stage, else the value returned
+     *   by the next stage. */
+    virtual bool takeword(const string &term, int pos, int bs, int be)
+    {
+	return m_next ? m_next->takeword(term, pos, bs, be) : true;
+    }
+
+    /** Propagate a flush request. Same return convention as takeword() */
+    virtual bool flush()
+    {
+	return m_next ? m_next->flush() : true;
+    }
+
+private:
+    // Following stage, null for the last one in the chain.
+    TermProc *m_next;
+};
+
+/**
+ * Text splitter sending the words it receives to a TermProc chain, and
+ * flushing the chain after each call to text_to_words(). The chain is
+ * optional (null pointer) and not owned.
+ */
+class TextSplitP : public TextSplit {
+public:
+    TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
+	: TextSplit(flags), m_prc(prc)
+    {}
+
+    /** Split the input, then flush the chain. The flush is performed
+     * even if the split failed.
+     * @return false if the flush fails, else the split status. */
+    virtual bool text_to_words(const string &in)
+    {
+	const bool splitok = TextSplit::text_to_words(in);
+	const bool flushok = m_prc ? m_prc->flush() : true;
+	return flushok ? splitok : false;
+    }
+
+    /** Forward the word to the chain, or just return true if there is
+     * none. */
+    virtual bool takeword(const string& term, int pos, int bs, int be)
+    {
+	return m_prc ? m_prc->takeword(term, pos, bs, be) : true;
+    }
+
+private:
+    // First stage of the processing chain, may be null.
+    TermProc *m_prc;
+};
+
+/**
+ * Pipeline stage passing each term through unacmaybefold() (presumably
+ * accent removal and case folding, see that function) before handing
+ * the result to the next stage.
+ */
+class TermProcPrep : public TermProc {
+public:
+    TermProcPrep(TermProc *nxt) : TermProc(nxt) {}
+
+    virtual bool takeword(const string& itrm, int pos, int bs, int be)
+    {
+	string prepped;
+	if (unacmaybefold(itrm, prepped, "UTF-8", true))
+	    return TermProc::takeword(prepped, pos, bs, be);
+
+	// A term which can't be converted is dropped, but this is not
+	// reported as an error to the caller.
+	LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
+	return true;
+    }
+};
+
+/**
+ * Pipeline stage discarding the terms found in the stop list. Other
+ * terms go unchanged to the next stage. The stop list is held by
+ * reference.
+ */
+class TermProcStop : public TermProc {
+public:
+    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
+	: TermProc(nxt), m_stops(stops) 
+    {}
+
+    /** @return true for a dropped stop word, else the status from the
+     *   next stage. */
+    virtual bool takeword(const string& term, int pos, int bts, int bte)
+    {
+	return m_stops.isStop(term) ? 
+	    true : TermProc::takeword(term, pos, bts, bte);
+    }
+
+private:
+    // Stop words to be filtered out
+    const Rcl::StopList& m_stops;
+};
+
+/**
+ * Pipeline stage generating two-word compound terms around stop words.
+ *
+ * Each time either the previous or the current term is in the stop
+ * list, a compound term made of both, separated by a space, is sent
+ * to the next stage at the position of the previous term, with both
+ * byte offsets set to 0.
+ *
+ * By default, every bare term is also sent at its own position. When
+ * onlygrams(true) was called, a bare term is only sent at once if it is
+ * not a stop word and did not just complete a compound term; the last
+ * term, if it was held back, is sent by flush().
+ *
+ * The stop list is held by reference.
+ */
+class TermProcCommongrams : public TermProc {
+public:
+    // All the remembered values are initialized: flush() tests
+    // m_prevsent, and it may be called before any term was seen
+    // (empty input text).
+    TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
+	: TermProc(nxt), m_stops(stops), 
+	  m_prevstop(false), m_prevpos(0), m_prevbs(0), m_prevbe(0),
+	  m_prevsent(true), m_onlygrams(false) 
+    { }
+
+    /** Process one term: emit the compound term if appropriate, then,
+     * if the mode allows it, the bare term.
+     * @return false as soon as the next stage reports an error. */
+    virtual bool takeword(const string& term, int pos, int bs, int be)
+    {
+	LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", 
+		 pos, bs, be, term.c_str()));
+	bool isstop = m_stops.isStop(term);
+	bool twogramemit = false;
+
+	if (!m_prevterm.empty() && (m_prevstop || isstop)) {
+	    // create 2-gram. space unnecessary but improves
+	    // readability of queries
+	    string twogram;
+	    twogram.swap(m_prevterm);
+	    twogram.append(1, ' ');
+	    twogram += term;
+	    // When emitting a complex term we set the bps to 0. This may
+	    // be used by our clients
+	    if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
+		return false;
+	    twogramemit = true;
+#if 0
+	    if (m_stops.isStop(twogram)) {
+		firstword = twogram;
+		isstop = false;
+	    }
+#endif
+	}
+
+	// Remember the current term: it may be the first half of the
+	// next compound term, or have to be sent by flush()
+	m_prevterm = term;
+	m_prevstop = isstop;
+	m_prevpos = pos;
+	m_prevsent = false;
+	m_prevbs = bs;
+	m_prevbe = be;
+	// If flags allow, emit the bare term at the current pos.
+	if (!m_onlygrams || (!isstop && !twogramemit)) {
+	    if (!TermProc::takeword(term, pos, bs, be))
+		return false;
+	    m_prevsent = true;
+	} 
+
+	return true;
+    }
+
+    /** Send the last term if it was held back, reset the remembered
+     * state, then propagate the flush.
+     * @return false if the pending term could not be sent (the flush is
+     *   not propagated in this case), else the status from the next
+     *   stage. */
+    virtual bool flush()
+    {
+	bool ok = true;
+	if (!m_prevsent && !m_prevterm.empty()) {
+	    ok = TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe);
+	}
+
+	// The state is reset even on error, so that a term from this
+	// text can never be paired with the first one of the next text.
+	m_prevterm.clear();
+	m_prevsent = true;
+	if (!ok)
+	    return false;
+	return TermProc::flush();
+    }
+
+    /** Choose between sending all the bare terms (false, the default)
+     * and holding back those covered by a compound term (true). */
+    void onlygrams(bool on)
+    {
+	m_onlygrams = on;
+    }
+
+private:
+    // The stoplist we're using
+    const Rcl::StopList& m_stops;
+    // Remembered data for the last processed term
+    string m_prevterm;
+    bool   m_prevstop;
+    int    m_prevpos;
+    int    m_prevbs;
+    int    m_prevbe;
+    bool   m_prevsent;
+    // If this is set, we only emit longest grams
+    bool   m_onlygrams;
+};
+
+}
+
+#endif /* _TERMPROC_H_INCLUDED_ */