New text-to-terms processing pipelines: results are identical to 1.16 when used with an empty stopfile

This commit is contained in:
Jean-Francois Dockes 2011-10-07 07:53:49 +02:00
parent 61bf17aa46
commit 5fd31172f5
4 changed files with 290 additions and 79 deletions

View File

@ -19,6 +19,7 @@
#include <string>
#include <list>
#ifndef NO_NAMESPACES
using std::string;
using std::list;
@ -26,7 +27,6 @@ using std::list;
class Utf8Iter;
/**
* Split text into words.
* See comments at top of .cpp for more explanations.

View File

@ -52,6 +52,7 @@ using namespace std;
#include "rclversion.h"
#include "cancelcheck.h"
#include "ptmutex.h"
#include "termproc.h"
#ifndef MAX
#define MAX(A,B) (A>B?A:B)
@ -858,7 +859,7 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
// The splitter breaks text into words and adds postings to the Xapian
// document. We use a single object to split all of the document
// fields and position jumps to separate fields
class TextSplitDb : public TextSplit {
class TextSplitDb : public TextSplitP {
public:
Xapian::WritableDatabase db;
Xapian::Document &doc; // Xapian document
@ -873,17 +874,18 @@ class TextSplitDb : public TextSplit {
// to compute the first position of the next section.
Xapian::termpos curpos;
StopList &stops;
TextSplitDb(Xapian::WritableDatabase idb,
Xapian::Document &d, StopList &_stops)
: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
Xapian::Document &d, TermProc *prc)
: TextSplitP(prc),
db(idb), doc(d), basepos(1), curpos(0), wdfinc(1)
{}
// Reimplement text_to_words to add start and end special terms
virtual bool text_to_words(const string &in);
bool takeword(const std::string &term, int pos, int, int);
void setprefix(const string& pref) {prefix = pref;}
void setwdfinc(int i) {wdfinc = i;}
friend class TermProcIdx;
private:
// If prefix is set, we also add a posting for the prefixed terms
// (ie: for titles, add postings for both "term" and "Sterm")
@ -892,7 +894,7 @@ private:
int wdfinc;
};
// Reimplement text_to_words to insert the begin and end anchor terms.
bool TextSplitDb::text_to_words(const string &in)
{
LOGDEB2(("TextSplitDb::text_to_words\n"));
@ -908,7 +910,7 @@ bool TextSplitDb::text_to_words(const string &in)
return false;
}
if (!TextSplit::text_to_words(in)) {
if (!TextSplitP::text_to_words(in)) {
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
basepos += curpos + 100;
return false;
@ -924,51 +926,45 @@ bool TextSplitDb::text_to_words(const string &in)
basepos += curpos + 100;
return false;
}
basepos += curpos + 100;
return true;
}
// Get one term from the doc, remove accents and lowercase, then add posting
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
{
LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
class TermProcIdx : public TermProc {
public:
TermProcIdx() : TermProc(0), m_ts(0) {}
void setTSD(TextSplitDb *ts) {m_ts = ts;}
string term;
if (!unacmaybefold(_term, term, "UTF-8", true)) {
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n",
_term.c_str()));
term.clear();
// We don't generate a fatal error because of a bad term
return true;
}
if (stops.isStop(term)) {
LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
return true;
}
// Compute absolute position (pos is relative to current segment),
// and remember relative.
curpos = pos;
pos += basepos;
string ermsg;
try {
// Index without prefix, using the field-specific weighting
doc.add_posting(term, pos, wdfinc);
bool takeword(const std::string &term, int pos, int, int)
{
// Compute absolute position (pos is relative to current segment),
// and remember relative.
m_ts->curpos = pos;
pos += m_ts->basepos;
string ermsg;
try {
// Index without prefix, using the field-specific weighting
LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
#ifdef TESTING_XAPIAN_SPELL
if (Db::isSpellingCandidate(term)) {
db.add_spelling(term);
}
if (Db::isSpellingCandidate(term)) {
m_ts->db.add_spelling(term);
}
#endif
// Index the prefixed term.
if (!prefix.empty()) {
doc.add_posting(prefix + term, pos, wdfinc);
}
return true;
} XCATCHERROR(ermsg);
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
return false;
}
// Index the prefixed term.
if (!m_ts->prefix.empty()) {
m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
}
return true;
} XCATCHERROR(ermsg);
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
return false;
}
private:
TextSplitDb *m_ts;
};
#ifdef TESTING_XAPIAN_SPELL
string Db::getSpellingSuggestion(const string& word)
@ -1032,8 +1028,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
Doc doc = idoc;
Xapian::Document newdocument;
TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);
TermProcIdx tpidx;
// TermProcStop tpstop(&tpidx, m_stops);
TermProcCommongrams tpstop(&tpidx, m_stops);
TermProcPrep tpprep(&tpstop);
TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
tpidx.setTSD(&splitter);
// Split and index file name as document term(s)
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
if (!splitter.text_to_words(doc.utf8fn))

View File

@ -35,6 +35,7 @@
#include "utf8iter.h"
#include "stoplist.h"
#include "rclconfig.h"
#include "termproc.h"
#ifndef NO_NAMESPACES
using namespace std;
@ -474,36 +475,23 @@ void SearchData::getUTerms(vector<string>& terms) const
// phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are
// actually multiple terms to rcldb (ie term1,term2)
class TextSplitQ : public TextSplit {
class TextSplitQ : public TextSplitP {
public:
TextSplitQ(Flags flags, const StopList &_stops)
: TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0)
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0)
{}
bool takeword(const std::string &interm, int pos, int, int) {
alltermcount++;
lastpos = pos
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
bool takeword(const std::string &term, int pos, int bs, int be)
{
// Check if the first letter is a majuscule in which
// case we do not want to do stem expansion.
bool nostemexp = unaciscapital(interm);
string noaclowterm;
if (!unacmaybefold(interm, noaclowterm, "UTF-8", true)) {
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
interm.c_str()));
return true;
}
// case we do not want to do stem expansion. Need to do this
// before unac of course...
curnostemexp = unaciscapital(term);
if (stops.isStop(noaclowterm)) {
LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n",
noaclowterm.c_str()));
return true;
}
terms.push_back(noaclowterm);
nostemexps.push_back(nostemexp);
return true;
return TextSplitP::takeword(term, pos, bs, be);
}
bool curnostemexp;
vector<string> terms;
vector<bool> nostemexps;
const StopList &stops;
@ -513,6 +501,26 @@ class TextSplitQ : public TextSplit {
int lastpos;
};
class TermProcQ : public TermProc {
public:
TermProcQ() : TermProc(0), m_ts(0) {}
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
bool takeword(const std::string &term, int pos, int bs, int be)
{
m_ts->alltermcount++;
m_ts->lastpos = pos;
bool noexpand = be ? m_ts->curnostemexp : true;
LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n",
term.c_str(), noexpand));
m_ts->terms.push_back(term);
m_ts->nostemexps.push_back(noexpand);
return true;
}
private:
TextSplitQ *m_ts;
};
// A class used to translate a user compound string (*not* a query
// language string) as may be entered in any_terms/all_terms search
// entry fields, ex: [term1 "a phrase" term3] into a xapian query
@ -566,7 +574,7 @@ private:
vector<vector<string> > m_groups;
};
#if 0
#if 1
static void listVector(const string& what, const vector<string>&l)
{
string a;
@ -575,6 +583,14 @@ static void listVector(const string& what, const vector<string>&l)
}
LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
}
static void listList(const string& what, const list<string>& l)
{
string a;
for (list<string>::const_iterator it = l.begin(); it != l.end(); it++) {
a = a + *it + " ";
}
LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
}
#endif
/** Expand stem and wildcards
@ -734,15 +750,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
vector<bool>::iterator nxit = splitData->nostemexps.begin();
for (vector<string>::iterator it = splitData->terms.begin();
it != splitData->terms.end(); it++, nxit++) {
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
// Adjust when we do stem expansion. Not inside phrases, and
// some versions of xapian will accept only one OR clause
// inside NEAR, all others must be leafs.
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
string sterm;
list<string>exp;
list<string> exp;
expandTerm(nostemexp, *it, exp, sterm, prefix);
LOGDEB0(("ProcessPhrase: exp size %d\n", exp.size()));
listList("", exp);
// groups is used for highlighting, we don't want prefixes in there.
vector<string> noprefs;
for (list<string>::const_iterator it = exp.begin();
@ -859,21 +877,32 @@ bool StringToXapianQ::processUserString(const string &iq,
// We now adjust the phrase/near slack by the term count
// difference (this is mainly better for cjk where this is a very
// common occurrence because of the ngrams thing).
TermProcQ tpq;
// TermProcStop tpstop(&tpidx, stops);
TermProcCommongrams tpstop(&tpq, stops);
tpstop.onlygrams(true);
TermProcPrep tpprep(&tpstop);
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),
stops);
TextSplit::TXTS_KEEPWILD),
stops, &tpprep);
tpq.setTSQ(&splitterS);
splitterS.text_to_words(*it);
LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
TextSplit::TXTS_KEEPWILD),
stops);
stops, &tpprep);
tpq.setTSQ(&splitterW);
tpstop.onlygrams(false);
splitterW.text_to_words(*it);
TextSplitQ *splitter = &splitterS;
if (splitterS.terms.size() > 1 &&
splitterS.terms.size() != splitterW.terms.size()) {
slack += splitterW.terms.size() - splitterS.terms.size();
// used to: splitData = &splitDataW;
}
TextSplitQ *splitter = &splitterS;
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
switch (splitter->terms.size() + terminc) {
case 0:

182
src/rcldb/termproc.h Normal file
View File

@ -0,0 +1,182 @@
/* Copyright (C) 2011 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _TERMPROC_H_INCLUDED_
#define _TERMPROC_H_INCLUDED_
#include "textsplit.h"
#include "stoplist.h"
namespace Rcl {
// Base class for the elements of a term processing pipeline. An element
// receives terms through takeword() and is told about the end of the
// input through flush(). The default implementations just forward the
// call to the next element, or report success if there is none.
class TermProc {
public:
    // next: following element in the pipeline (not owned), or 0 if this
    // is the last one.
    TermProc(TermProc* next) : m_next(next) {}
    virtual ~TermProc() {}

    // Pass one term down the pipeline.
    // pos is the term position. bs and be are forwarded unchanged
    // (presumably the byte start/end offsets of the term in the input,
    // to be confirmed against TextSplit).
    // Returns the status from the next element, true if there is none.
    virtual bool takeword(const string &term, int pos, int bs, int be)
    {
        if (!m_next)
            return true;
        return m_next->takeword(term, pos, bs, be);
    }

    // Signal the end of the input down the pipeline.
    // Returns the status from the next element, true if there is none.
    virtual bool flush()
    {
        if (!m_next)
            return true;
        return m_next->flush();
    }

private:
    TermProc *m_next;
};
// A text splitter which sends the words it produces to a TermProc
// pipeline instead of processing them itself.
class TextSplitP : public TextSplit {
public:
    // prc: first element of the pipeline (not owned), may be 0 in which
    // case the words are just dropped.
    TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
        : TextSplit(flags), m_prc(prc)
    {}

    // Split the input, then flush the pipeline. The flush is performed
    // whatever the status of the split. Returns false if the flush
    // failed, else the status of the split.
    virtual bool text_to_words(const string &in)
    {
        const bool splitok = TextSplit::text_to_words(in);
        const bool flushok = !m_prc || m_prc->flush();
        return flushok && splitok;
    }

    // Called for each word: hand it over to the pipeline.
    virtual bool takeword(const string& term, int pos, int bs, int be)
    {
        if (!m_prc)
            return true;
        return m_prc->takeword(term, pos, bs, be);
    }

private:
    TermProc *m_prc;
};
// Pipeline element which runs each term through unacmaybefold() before
// passing the result on (presumably accent removal and case folding,
// to be confirmed against the unac module).
class TermProcPrep : public TermProc {
public:
    TermProcPrep(TermProc *nxt) : TermProc(nxt) {}

    // Transform the term and forward the result with unchanged
    // positions. A term for which the transformation fails is logged
    // and dropped.
    virtual bool takeword(const string& itrm, int pos, int bs, int be)
    {
        string prepped;
        if (unacmaybefold(itrm, prepped, "UTF-8", true))
            return TermProc::takeword(prepped, pos, bs, be);

        LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
        // A bad term is not a reason to fail the whole processing
        return true;
    }
};
// Pipeline element which drops the terms found in the stop list and
// lets all others through unchanged.
class TermProcStop : public TermProc {
public:
    // The stop list is kept by reference: it must outlive this object.
    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
        : TermProc(nxt), m_stops(stops) { }

    // Returns true without forwarding for a stop word, else the status
    // from the rest of the pipeline.
    virtual bool takeword(const string& term, int pos, int bts, int bte)
    {
        return m_stops.isStop(term) ||
            TermProc::takeword(term, pos, bts, bte);
    }

private:
    const Rcl::StopList& m_stops;
};
// Pipeline element handling stop words by generating compound terms
// (2-grams): each time a term or the one before it is in the stop list,
// a term made of both words separated by a space is emitted at the
// position of the first one.
// By default, all bare terms are emitted too. When onlygrams is set, a
// bare term only goes through if it is not a stop word and was not just
// used as the second part of a 2-gram. A term held back this way is
// emitted by flush() if it is still the last one seen at this point.
class TermProcCommongrams : public TermProc {
public:
    // The stop list is kept by reference: it must outlive this object.
    // All the remembered state is explicitly initialized, so that
    // calling flush() before any term was seen (empty input) does not
    // read indeterminate values.
    TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
        : TermProc(nxt), m_stops(stops), m_prevstop(false), m_prevpos(0),
          m_prevbs(0), m_prevbe(0), m_prevsent(true), m_onlygrams(false) { }

    // Process one term: possibly emit a 2-gram made from the previous
    // term and this one, remember the term data, then emit the bare
    // term if the flags allow.
    // Returns false as soon as the rest of the pipeline returns an
    // error, else true.
    virtual bool takeword(const string& term, int pos, int bs, int be)
    {
        LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
                 pos, bs, be, term.c_str()));
        bool isstop = m_stops.isStop(term);
        bool twogramemit = false;

        if (!m_prevterm.empty() && (m_prevstop || isstop)) {
            // Create the 2-gram. The space is unnecessary but improves
            // the readability of queries. We steal the previous term
            // storage, it is going to be replaced further down anyway.
            string twogram;
            twogram.swap(m_prevterm);
            twogram.append(1, ' ');
            twogram += term;
            // When emitting a compound term we set the bs and be
            // values to 0. This may be used by our clients.
            if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
                return false;
            twogramemit = true;
#if 0
            // Disabled experiment, kept for reference (does not compile
            // as is).
            if (m_stops.isStop(twogram)) {
                firstword = twogram;
                isstop = false;
            }
#endif
        }

        // Remember the current term: it may become the first part of
        // the next 2-gram, or have to be emitted by flush().
        m_prevterm = term;
        m_prevstop = isstop;
        m_prevpos = pos;
        m_prevsent = false;
        m_prevbs = bs;
        m_prevbe = be;

        // If flags allow, emit the bare term at the current pos.
        if (!m_onlygrams || (!isstop && !twogramemit)) {
            if (!TermProc::takeword(term, pos, bs, be))
                return false;
            m_prevsent = true;
        }
        return true;
    }

    // End of input: emit the last term if it was held back, reset the
    // remembered state, then flush the rest of the pipeline.
    virtual bool flush()
    {
        if (!m_prevsent && !m_prevterm.empty()) {
            if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
                return false;
        }
        m_prevterm.clear();
        m_prevsent = true;
        return TermProc::flush();
    }

    // Choose between emitting all bare terms in addition to the 2-grams
    // (off, the default) or restricting them as described above (on).
    void onlygrams(bool on)
    {
        m_onlygrams = on;
    }

private:
    // The stoplist we're using
    const Rcl::StopList& m_stops;
    // Remembered data for the last processed term
    string m_prevterm;
    bool m_prevstop;
    int m_prevpos;
    int m_prevbs;
    int m_prevbe;
    bool m_prevsent;
    // If this is set, we only emit longest grams
    bool m_onlygrams;
};
}
#endif /* _TERMPROC_H_INCLUDED_ */