New text-to-terms processing pipelines: results are identical to 1.16 when used with an empty stop file

2011-10-07 07:53:49 +02:00 · 2011-10-07 07:53:49 +02:00 · 5fd31172f5
commit 5fd31172f5
parent 61bf17aa46
4 changed files with 290 additions and 79 deletions
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -19,6 +19,7 @@
 #include <string>
 #include <list>
 #ifndef NO_NAMESPACES
 using std::string;
 using std::list;
@ -26,7 +27,6 @@ using std::list;
 class Utf8Iter;
 /** 
 * Split text into words. 
 * See comments at top of .cpp for more explanations.
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -52,6 +52,7 @@ using namespace std;
 #include "rclversion.h"
 #include "cancelcheck.h"
 #include "ptmutex.h"
 #include "termproc.h"
 #ifndef MAX
 #define MAX(A,B) (A>B?A:B)
@ -858,7 +859,7 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
 // The splitter breaks text into words and adds postings to the Xapian
 // document. We use a single object to split all of the document
 // fields and position jumps to separate fields
-class TextSplitDb : public TextSplit {
+class TextSplitDb : public TextSplitP {
 public:
    Xapian::WritableDatabase db;
    Xapian::Document &doc;   // Xapian document 
@ -873,17 +874,18 @@ class TextSplitDb : public TextSplit {
    // to compute the first position of the next section.
    Xapian::termpos curpos;
    StopList &stops;
    TextSplitDb(Xapian::WritableDatabase idb, 
-		Xapian::Document &d, StopList &_stops) 
+		Xapian::Document &d, TermProc *prc)
-	: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
+	: TextSplitP(prc), 
 	  db(idb), doc(d), basepos(1), curpos(0), wdfinc(1)
    {}
    // Reimplement text_to_words to add start and end special terms
    virtual bool text_to_words(const string &in);
    bool takeword(const std::string &term, int pos, int, int);
    void setprefix(const string& pref) {prefix = pref;}
    void setwdfinc(int i) {wdfinc = i;}
    friend class TermProcIdx;
 private:
    // If prefix is set, we also add a posting for the prefixed terms
    // (ie: for titles, add postings for both "term" and "Sterm")
@ -892,7 +894,7 @@ private:
    int wdfinc;
 };
-
+// Reimplement text_to_words to insert the begin and end anchor terms.
 bool TextSplitDb::text_to_words(const string &in) 
 {
    LOGDEB2(("TextSplitDb::text_to_words\n"));
@ -908,7 +910,7 @@ bool TextSplitDb::text_to_words(const string &in)
 	return false;
    }
-    if (!TextSplit::text_to_words(in)) {
+    if (!TextSplitP::text_to_words(in)) {
 	LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
 	basepos += curpos + 100;
 	return false;
@ -924,51 +926,45 @@ bool TextSplitDb::text_to_words(const string &in)
 	basepos += curpos + 100;
 	return false;
    }
    basepos += curpos + 100;
    return true;
 }
-// Get one term from the doc, remove accents and lowercase, then add posting
+class TermProcIdx : public TermProc {
-bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
+public:
-{
+    TermProcIdx() : TermProc(0), m_ts(0) {}
-    LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
+    void setTSD(TextSplitDb *ts) {m_ts = ts;}
-    string term;
+    bool takeword(const std::string &term, int pos, int, int)
-    if (!unacmaybefold(_term, term, "UTF-8", true)) {
+    {
-	LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", 
+	// Compute absolute position (pos is relative to current segment),
-                 _term.c_str()));
+	// and remember relative.
-	term.clear();
+	m_ts->curpos = pos;
-	// We don't generate a fatal error because of a bad term
+	pos += m_ts->basepos;
-	return true;
+	string ermsg;
-    }
+	try {
-
+	    // Index without prefix, using the field-specific weighting
-    if (stops.isStop(term)) {
+	    LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
-	LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
+	    m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
 	return true;
    }
    // Compute absolute position (pos is relative to current segment),
    // and remember relative.
    curpos = pos;
    pos += basepos;
    string ermsg;
    try {
 	// Index without prefix, using the field-specific weighting
 	doc.add_posting(term, pos, wdfinc);
 #ifdef TESTING_XAPIAN_SPELL
-	if (Db::isSpellingCandidate(term)) {
+	    if (Db::isSpellingCandidate(term)) {
-	    db.add_spelling(term);
+		m_ts->db.add_spelling(term);
-	}
+	    }
 #endif
-	// Index the prefixed term.
+	    // Index the prefixed term.
-	if (!prefix.empty()) {
+	    if (!m_ts->prefix.empty()) {
-	    doc.add_posting(prefix + term, pos, wdfinc);
+		m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
-	}
+	    }
-	return true;
+	    return true;
-    } XCATCHERROR(ermsg);
+	} XCATCHERROR(ermsg);
-    LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
+	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
-    return false;
+	return false;
-}
+    }
 private:
    TextSplitDb *m_ts;
 };
 #ifdef TESTING_XAPIAN_SPELL
 string Db::getSpellingSuggestion(const string& word)
@ -1032,8 +1028,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
    Doc doc = idoc;
    Xapian::Document newdocument;
-    TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);
+    TermProcIdx tpidx;
-
+//    TermProcStop tpstop(&tpidx, m_stops);
    TermProcCommongrams tpstop(&tpidx, m_stops);
    TermProcPrep tpprep(&tpstop);
    TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
    tpidx.setTSD(&splitter);
    // Split and index file name as document term(s)
    LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
    if (!splitter.text_to_words(doc.utf8fn))
--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@ -35,6 +35,7 @@
 #include "utf8iter.h"
 #include "stoplist.h"
 #include "rclconfig.h"
 #include "termproc.h"
 #ifndef NO_NAMESPACES
 using namespace std;
@ -474,36 +475,23 @@ void SearchData::getUTerms(vector<string>& terms) const
 // phrases. This is for parts of the user entry which would appear as
 // a single word because there is no white space inside, but are
 // actually multiple terms to rcldb (ie term1,term2)
-class TextSplitQ : public TextSplit {
+class TextSplitQ : public TextSplitP {
 public:
-    TextSplitQ(Flags flags, const StopList &_stops) 
+    TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
-	: TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0)
+	: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0)
    {}
    bool takeword(const std::string &interm, int pos, int, int) {
 	alltermcount++;
        lastpos = pos
 	LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
    bool takeword(const std::string &term, int pos, int bs, int be) 
    {
 	// Check if the first letter is a majuscule in which
-	// case we do not want to do stem expansion. 
+	// case we do not want to do stem expansion. Need to do this
-	bool nostemexp = unaciscapital(interm);
+	// before unac of course...
-	string noaclowterm;
+	curnostemexp = unaciscapital(term);
 	if (!unacmaybefold(interm, noaclowterm, "UTF-8", true)) {
 	    LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", 
                     interm.c_str()));
 	    return true;
 	}
-	if (stops.isStop(noaclowterm)) {
+	return TextSplitP::takeword(term, pos, bs, be);
 	    LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n", 
                     noaclowterm.c_str()));
 	    return true;
 	}
 	terms.push_back(noaclowterm);
 	nostemexps.push_back(nostemexp);
 	return true;
    }
    bool           curnostemexp;
    vector<string> terms;
    vector<bool>   nostemexps;
    const StopList &stops;
@ -513,6 +501,26 @@ class TextSplitQ : public TextSplit {
    int lastpos;
 };
 class TermProcQ : public TermProc {
 public:
    TermProcQ() : TermProc(0), m_ts(0) {}
    void setTSQ(TextSplitQ *ts) {m_ts = ts;}
    bool takeword(const std::string &term, int pos, int bs, int be) 
    {
 	m_ts->alltermcount++;
        m_ts->lastpos = pos;
 	bool noexpand = be ? m_ts->curnostemexp : true;
 	LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n", 
 		term.c_str(), noexpand));
 	m_ts->terms.push_back(term);
 	m_ts->nostemexps.push_back(noexpand);
 	return true;
    }
 private:
    TextSplitQ *m_ts;
 };
 // A class used to translate a user compound string (*not* a query
 // language string) as may be entered in any_terms/all_terms search
 // entry fields, ex: [term1 "a phrase" term3] into a xapian query
@ -566,7 +574,7 @@ private:
    vector<vector<string> > m_groups; 
 };
-#if 0
+#if 1
 static void listVector(const string& what, const vector<string>&l)
 {
    string a;
@ -575,6 +583,14 @@ static void listVector(const string& what, const vector<string>&l)
    }
    LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
 }
 /**
  * Debug helper: log the elements of a string list on a single line.
  * @param what label printed in front of the elements.
  * @param l the list to print. Each element is followed by one space.
  */
 static void listList(const string& what, const list<string>& l)
 {
     string joined;
     list<string>::const_iterator cur = l.begin();
     while (cur != l.end()) {
         joined += *cur;
         joined += " ";
         ++cur;
     }
     LOGDEB(("%s: %s\n", what.c_str(), joined.c_str()));
 }
 #endif
 /** Expand stem and wildcards
@ -734,15 +750,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
    for (vector<string>::iterator it = splitData->terms.begin();
 	 it != splitData->terms.end(); it++, nxit++) {
 	LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
 	// Adjust when we do stem expansion. Not inside phrases, and
 	// some versions of xapian will accept only one OR clause
 	// inside NEAR, all others must be leafs.
 	bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
 	string sterm;
-	list<string>exp;
+	list<string> exp;
 	expandTerm(nostemexp, *it, exp, sterm, prefix);
-
+	LOGDEB0(("ProcessPhrase: exp size %d\n", exp.size()));
 	listList("", exp);
 	// groups is used for highlighting, we don't want prefixes in there.
 	vector<string> noprefs;
 	for (list<string>::const_iterator it = exp.begin(); 
@ -859,21 +877,32 @@ bool StringToXapianQ::processUserString(const string &iq,
 	    // We now adjust the phrase/near slack by the term count
 	    // difference (this is mainly better for cjk where this is a very
 	    // common occurrence because of the ngrams thing.
 	    TermProcQ tpq;
            //    TermProcStop tpstop(&tpidx, stops);
 	    TermProcCommongrams tpstop(&tpq, stops);
 	    tpstop.onlygrams(true);
 	    TermProcPrep tpprep(&tpstop);
 	    TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
-                                                  TextSplit::TXTS_KEEPWILD), 
+						  TextSplit::TXTS_KEEPWILD), 
-                                 stops);
+                                 stops, &tpprep);
 	    tpq.setTSQ(&splitterS);
 	    splitterS.text_to_words(*it);
 	    LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
 	    TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS | 
                                                  TextSplit::TXTS_KEEPWILD),
-                                 stops);
+                                 stops, &tpprep);
 	    tpq.setTSQ(&splitterW);
 	    tpstop.onlygrams(false);
 	    splitterW.text_to_words(*it);
-	    TextSplitQ *splitter = &splitterS;
+
 	    if (splitterS.terms.size() > 1 && 
 		splitterS.terms.size() != splitterW.terms.size()) {
 		slack += splitterW.terms.size() - splitterS.terms.size();
 		// used to: splitData = &splitDataW;
 	    }
 	    TextSplitQ *splitter = &splitterS;
 	    LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
 	    switch (splitter->terms.size() + terminc) {
 	    case 0: 
--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@ -0,0 +1,182 @@
 /* Copyright (C) 2011 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
 #ifndef _TERMPROC_H_INCLUDED_
 #define _TERMPROC_H_INCLUDED_
 #include "textsplit.h"
 #include "stoplist.h"
 namespace Rcl {
 /**
  * Base class for one stage of a term processing chain.
  *
  * A stage holds a pointer to the following stage, which may be null for
  * the last one. The pointer is only used for forwarding; this class never
  * deletes it. The default takeword() and flush() implementations pass the
  * call to the next stage and report success when there is none.
  */
 class TermProc {
 public:
     /** @param next the following stage, or 0 if this is the last one. */
     TermProc(TermProc* next) : m_next(next) {}
     virtual ~TermProc() {}

     /**
      * Hand one term over to the next stage, arguments unchanged.
      * @return the next stage's result, or true at the end of the chain.
      */
     virtual bool takeword(const string &term, int pos, int bs, int be)
     {
         return m_next ? m_next->takeword(term, pos, bs, be) : true;
     }

     /**
      * Propagate a flush request to the next stage.
      * @return the next stage's result, or true at the end of the chain.
      */
     virtual bool flush()
     {
         return m_next ? m_next->flush() : true;
     }
 private:
     // Following stage in the chain. May be null.
     TermProc *m_next;
 };
 /**
  * Text splitter which feeds the words it produces into a TermProc chain.
  *
  * takeword() hands each word to the first stage of the chain, and
  * text_to_words() flushes the chain once the base class splitting call
  * has returned. With a null chain pointer, words are accepted and dropped.
  */
 class TextSplitP : public TextSplit {
 public:
     /**
      * @param prc first stage of the processing chain. May be 0.
      * @param flags splitter flags, passed to the TextSplit constructor.
      */
     TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
         : TextSplit(flags), m_prc(prc)
     {}

     /**
      * Split the input, then flush the processing chain.
      * The flush is performed whatever the result of the split.
      * @return false if the flush failed, else the result of the split.
      */
     virtual bool text_to_words(const string &in)
     {
         const bool splitok = TextSplit::text_to_words(in);
         if (m_prc == 0)
             return splitok;
         return m_prc->flush() ? splitok : false;
     }

     /**
      * Pass one word to the chain.
      * @return the chain's result, or true if there is no chain.
      */
     virtual bool takeword(const string& term, int pos, int bs, int be)
     {
         if (m_prc == 0)
             return true;
         return m_prc->takeword(term, pos, bs, be);
     }
 private:
     // First stage of the term processing chain. May be null.
     TermProc *m_prc;
 };
 /**
  * Chain stage which transforms each term through unacmaybefold() before
  * passing the result to the next stage.
  * NOTE(review): unacmaybefold() presumably removes accents and folds
  * case; confirm against its declaration.
  */
 class TermProcPrep : public TermProc {
 public:
     TermProcPrep(TermProc *nxt)	: TermProc(nxt) {}

     /**
      * Transform the term and forward the transformed value.
      * @return the next stage's result. A term for which the
      *   transformation fails is logged and skipped, and true is
      *   returned: a bad term is not a fatal error.
      */
     virtual bool takeword(const string& itrm, int pos, int bs, int be)
     {
         string otrm;
         if (unacmaybefold(itrm, otrm, "UTF-8", true)) {
             return TermProc::takeword(otrm, pos, bs, be);
         }
         LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
         return true;
     }
 };
 /**
  * Chain stage which drops the terms found in a stop list and forwards
  * all others unchanged.
  */
 class TermProcStop : public TermProc {
 public:
     /**
      * @param nxt next stage of the chain.
      * @param stops the stop list. Only a reference is kept, so the
      *   list must outlive this object.
      */
     TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
         : TermProc(nxt), m_stops(stops) { }

     /**
      * @return true without forwarding if the term is a stop word,
      *   else the result from the next stage.
      */
     virtual bool takeword(const string& term, int pos, int bts, int bte)
     {
         const bool dropped = m_stops.isStop(term);
         return dropped ? true : TermProc::takeword(term, pos, bts, bte);
     }
 private:
     // The stop list used for filtering
     const Rcl::StopList& m_stops;
 };
 /**
  * Chain stage which generates 2-grams around stop words.
  *
  * For each incoming term, if a previous term is remembered and either the
  * previous or the current term is in the stop list, a compound term made
  * of "previous current" (space separated) is emitted at the position of
  * the previous term, with both byte offsets set to 0.
  *
  * The bare term is then emitted at its own position, except when
  * onlygrams is set and the term is a stop word or was just used as the
  * second part of a 2-gram. A term held back in this way stays pending and
  * is emitted by flush() if it is still the last term seen.
  */
 class TermProcCommongrams : public TermProc {
 public:
     /**
      * @param nxt next stage of the chain.
      * @param stops the stop list. Only a reference is kept, so the
      *   list must outlive this object.
      *
      * All the "previous term" members are given defined values here:
      * flush() tests m_prevsent, and it can be called before any term was
      * seen (ie: for an empty input).
      */
     TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
         : TermProc(nxt), m_stops(stops), m_prevstop(false), m_prevpos(0),
           m_prevbs(0), m_prevbe(0), m_prevsent(true), m_onlygrams(false) { }

     /**
      * Process one term: possibly emit a 2-gram, then possibly the term.
      * @return false as soon as the next stage reports a failure,
      *   else true.
      */
     virtual bool takeword(const string& term, int pos, int bs, int be)
     {
         LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", 
                  pos, bs, be, term.c_str()));
         bool isstop = m_stops.isStop(term);
         bool twogramemit = false;
         if (!m_prevterm.empty() && (m_prevstop || isstop)) {
             // create 2-gram. space unnecessary but improves
             // readability of queries
             string twogram;
             twogram.swap(m_prevterm);
             twogram.append(1, ' ');
             twogram += term;
             // When emitting a complex term we set the bps to 0. This may
             // be used by our clients
             if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
                 return false;
             twogramemit = true;
 #if 0
             if (m_stops.isStop(twogram)) {
                 firstword = twogram;
                 isstop = false;
             }
 #endif
         }
         // Remember the current term: it may become the first part of
         // the next 2-gram, or have to be emitted by flush().
         m_prevterm = term;
         m_prevstop = isstop;
         m_prevpos = pos;
         m_prevsent = false;
         m_prevbs = bs;
         m_prevbe = be;
         // If flags allow, emit the bare term at the current pos.
         if (!m_onlygrams || (!isstop && !twogramemit)) {
             if (!TermProc::takeword(term, pos, bs, be))
                 return false;
             m_prevsent = true;
         } 
         return true;
     }

     /**
      * Emit the last term if it was held back, forget it, then flush the
      * rest of the chain.
      * @return false if the next stage fails, else true.
      */
     bool flush()
     {
         if (!m_prevsent && !m_prevterm.empty())
             if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
                 return false;
         m_prevterm.clear();
         m_prevsent = true;
         return TermProc::flush();
     }

     /** Set or clear the "only emit longest grams" mode. */
     void onlygrams(bool on)
     {
         m_onlygrams = on;
     }
 private:
     // The stoplist we're using
     const Rcl::StopList& m_stops;
     // Remembered data for the last processed term
     string m_prevterm;
     bool   m_prevstop;
     int    m_prevpos;
     int    m_prevbs;
     int    m_prevbe;
     bool   m_prevsent;
     // If this is set, we only emit longest grams
     bool   m_onlygrams;
 };
 }
 #endif /* _TERMPROC_H_INCLUDED_ */