New text to terms processing pipelines: results identical to 1.16 when used with empty stopfile
This commit is contained in:
parent
61bf17aa46
commit
5fd31172f5
@ -19,6 +19,7 @@
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using std::string;
|
||||
using std::list;
|
||||
@ -26,7 +27,6 @@ using std::list;
|
||||
|
||||
class Utf8Iter;
|
||||
|
||||
|
||||
/**
|
||||
* Split text into words.
|
||||
* See comments at top of .cpp for more explanations.
|
||||
|
||||
@ -52,6 +52,7 @@ using namespace std;
|
||||
#include "rclversion.h"
|
||||
#include "cancelcheck.h"
|
||||
#include "ptmutex.h"
|
||||
#include "termproc.h"
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(A,B) (A>B?A:B)
|
||||
@ -858,7 +859,7 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
|
||||
// The splitter breaks text into words and adds postings to the Xapian
|
||||
// document. We use a single object to split all of the document
|
||||
// fields and position jumps to separate fields
|
||||
class TextSplitDb : public TextSplit {
|
||||
class TextSplitDb : public TextSplitP {
|
||||
public:
|
||||
Xapian::WritableDatabase db;
|
||||
Xapian::Document &doc; // Xapian document
|
||||
@ -873,17 +874,18 @@ class TextSplitDb : public TextSplit {
|
||||
// to compute the first position of the next section.
|
||||
Xapian::termpos curpos;
|
||||
|
||||
StopList &stops;
|
||||
TextSplitDb(Xapian::WritableDatabase idb,
|
||||
Xapian::Document &d, StopList &_stops)
|
||||
: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
|
||||
Xapian::Document &d, TermProc *prc)
|
||||
: TextSplitP(prc),
|
||||
db(idb), doc(d), basepos(1), curpos(0), wdfinc(1)
|
||||
{}
|
||||
// Reimplement text_to_words to add start and end special terms
|
||||
virtual bool text_to_words(const string &in);
|
||||
bool takeword(const std::string &term, int pos, int, int);
|
||||
void setprefix(const string& pref) {prefix = pref;}
|
||||
void setwdfinc(int i) {wdfinc = i;}
|
||||
|
||||
friend class TermProcIdx;
|
||||
|
||||
private:
|
||||
// If prefix is set, we also add a posting for the prefixed terms
|
||||
// (ie: for titles, add postings for both "term" and "Sterm")
|
||||
@ -892,7 +894,7 @@ private:
|
||||
int wdfinc;
|
||||
};
|
||||
|
||||
|
||||
// Reimplement text_to_words to insert the begin and end anchor terms.
|
||||
bool TextSplitDb::text_to_words(const string &in)
|
||||
{
|
||||
LOGDEB2(("TextSplitDb::text_to_words\n"));
|
||||
@ -908,7 +910,7 @@ bool TextSplitDb::text_to_words(const string &in)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!TextSplit::text_to_words(in)) {
|
||||
if (!TextSplitP::text_to_words(in)) {
|
||||
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
|
||||
basepos += curpos + 100;
|
||||
return false;
|
||||
@ -924,51 +926,45 @@ bool TextSplitDb::text_to_words(const string &in)
|
||||
basepos += curpos + 100;
|
||||
return false;
|
||||
}
|
||||
|
||||
basepos += curpos + 100;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Get one term from the doc, remove accents and lowercase, then add posting
|
||||
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
|
||||
{
|
||||
LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
|
||||
class TermProcIdx : public TermProc {
|
||||
public:
|
||||
TermProcIdx() : TermProc(0), m_ts(0) {}
|
||||
void setTSD(TextSplitDb *ts) {m_ts = ts;}
|
||||
|
||||
string term;
|
||||
if (!unacmaybefold(_term, term, "UTF-8", true)) {
|
||||
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n",
|
||||
_term.c_str()));
|
||||
term.clear();
|
||||
// We don't generate a fatal error because of a bad term
|
||||
return true;
|
||||
}
|
||||
|
||||
if (stops.isStop(term)) {
|
||||
LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Compute absolute position (pos is relative to current segment),
|
||||
// and remember relative.
|
||||
curpos = pos;
|
||||
pos += basepos;
|
||||
string ermsg;
|
||||
try {
|
||||
// Index without prefix, using the field-specific weighting
|
||||
doc.add_posting(term, pos, wdfinc);
|
||||
bool takeword(const std::string &term, int pos, int, int)
|
||||
{
|
||||
// Compute absolute position (pos is relative to current segment),
|
||||
// and remember relative.
|
||||
m_ts->curpos = pos;
|
||||
pos += m_ts->basepos;
|
||||
string ermsg;
|
||||
try {
|
||||
// Index without prefix, using the field-specific weighting
|
||||
LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
|
||||
m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
|
||||
#ifdef TESTING_XAPIAN_SPELL
|
||||
if (Db::isSpellingCandidate(term)) {
|
||||
db.add_spelling(term);
|
||||
}
|
||||
if (Db::isSpellingCandidate(term)) {
|
||||
m_ts->db.add_spelling(term);
|
||||
}
|
||||
#endif
|
||||
// Index the prefixed term.
|
||||
if (!prefix.empty()) {
|
||||
doc.add_posting(prefix + term, pos, wdfinc);
|
||||
}
|
||||
return true;
|
||||
} XCATCHERROR(ermsg);
|
||||
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
// Index the prefixed term.
|
||||
if (!m_ts->prefix.empty()) {
|
||||
m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
|
||||
}
|
||||
return true;
|
||||
} XCATCHERROR(ermsg);
|
||||
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
||||
return false;
|
||||
}
|
||||
private:
|
||||
TextSplitDb *m_ts;
|
||||
};
|
||||
|
||||
|
||||
#ifdef TESTING_XAPIAN_SPELL
|
||||
string Db::getSpellingSuggestion(const string& word)
|
||||
@ -1032,8 +1028,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||
Doc doc = idoc;
|
||||
|
||||
Xapian::Document newdocument;
|
||||
TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);
|
||||
|
||||
TermProcIdx tpidx;
|
||||
// TermProcStop tpstop(&tpidx, m_stops);
|
||||
TermProcCommongrams tpstop(&tpidx, m_stops);
|
||||
TermProcPrep tpprep(&tpstop);
|
||||
TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
|
||||
tpidx.setTSD(&splitter);
|
||||
// Split and index file name as document term(s)
|
||||
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
||||
if (!splitter.text_to_words(doc.utf8fn))
|
||||
|
||||
@ -35,6 +35,7 @@
|
||||
#include "utf8iter.h"
|
||||
#include "stoplist.h"
|
||||
#include "rclconfig.h"
|
||||
#include "termproc.h"
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
using namespace std;
|
||||
@ -474,36 +475,23 @@ void SearchData::getUTerms(vector<string>& terms) const
|
||||
// phrases. This is for parts of the user entry which would appear as
|
||||
// a single word because there is no white space inside, but are
|
||||
// actually multiple terms to rcldb (ie term1,term2)
|
||||
class TextSplitQ : public TextSplit {
|
||||
class TextSplitQ : public TextSplitP {
|
||||
public:
|
||||
TextSplitQ(Flags flags, const StopList &_stops)
|
||||
: TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0)
|
||||
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
|
||||
: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0)
|
||||
{}
|
||||
bool takeword(const std::string &interm, int pos, int, int) {
|
||||
alltermcount++;
|
||||
lastpos = pos;
|
||||
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
|
||||
|
||||
bool takeword(const std::string &term, int pos, int bs, int be)
|
||||
{
|
||||
// Check if the first letter is a majuscule in which
|
||||
// case we do not want to do stem expansion.
|
||||
bool nostemexp = unaciscapital(interm);
|
||||
string noaclowterm;
|
||||
if (!unacmaybefold(interm, noaclowterm, "UTF-8", true)) {
|
||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
|
||||
interm.c_str()));
|
||||
return true;
|
||||
}
|
||||
// case we do not want to do stem expansion. Need to do this
|
||||
// before unac of course...
|
||||
curnostemexp = unaciscapital(term);
|
||||
|
||||
if (stops.isStop(noaclowterm)) {
|
||||
LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n",
|
||||
noaclowterm.c_str()));
|
||||
return true;
|
||||
}
|
||||
terms.push_back(noaclowterm);
|
||||
nostemexps.push_back(nostemexp);
|
||||
return true;
|
||||
return TextSplitP::takeword(term, pos, bs, be);
|
||||
}
|
||||
|
||||
bool curnostemexp;
|
||||
vector<string> terms;
|
||||
vector<bool> nostemexps;
|
||||
const StopList &stops;
|
||||
@ -513,6 +501,26 @@ class TextSplitQ : public TextSplit {
|
||||
int lastpos;
|
||||
};
|
||||
|
||||
class TermProcQ : public TermProc {
|
||||
public:
|
||||
TermProcQ() : TermProc(0), m_ts(0) {}
|
||||
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
|
||||
|
||||
bool takeword(const std::string &term, int pos, int bs, int be)
|
||||
{
|
||||
m_ts->alltermcount++;
|
||||
m_ts->lastpos = pos;
|
||||
bool noexpand = be ? m_ts->curnostemexp : true;
|
||||
LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n",
|
||||
term.c_str(), noexpand));
|
||||
m_ts->terms.push_back(term);
|
||||
m_ts->nostemexps.push_back(noexpand);
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
TextSplitQ *m_ts;
|
||||
};
|
||||
|
||||
// A class used to translate a user compound string (*not* a query
|
||||
// language string) as may be entered in any_terms/all_terms search
|
||||
// entry fields, ex: [term1 "a phrase" term3] into a xapian query
|
||||
@ -566,7 +574,7 @@ private:
|
||||
vector<vector<string> > m_groups;
|
||||
};
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
static void listVector(const string& what, const vector<string>&l)
|
||||
{
|
||||
string a;
|
||||
@ -575,6 +583,14 @@ static void listVector(const string& what, const vector<string>&l)
|
||||
}
|
||||
LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
|
||||
}
|
||||
static void listList(const string& what, const list<string>& l)
|
||||
{
|
||||
string a;
|
||||
for (list<string>::const_iterator it = l.begin(); it != l.end(); it++) {
|
||||
a = a + *it + " ";
|
||||
}
|
||||
LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
|
||||
}
|
||||
#endif
|
||||
|
||||
/** Expand stem and wildcards
|
||||
@ -734,15 +750,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
||||
vector<bool>::iterator nxit = splitData->nostemexps.begin();
|
||||
for (vector<string>::iterator it = splitData->terms.begin();
|
||||
it != splitData->terms.end(); it++, nxit++) {
|
||||
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
|
||||
// Adjust when we do stem expansion. Not inside phrases, and
|
||||
// some versions of xapian will accept only one OR clause
|
||||
// inside NEAR, all others must be leafs.
|
||||
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
|
||||
|
||||
string sterm;
|
||||
list<string>exp;
|
||||
list<string> exp;
|
||||
expandTerm(nostemexp, *it, exp, sterm, prefix);
|
||||
|
||||
LOGDEB0(("ProcessPhrase: exp size %d\n", exp.size()));
|
||||
listList("", exp);
|
||||
// groups is used for highlighting, we don't want prefixes in there.
|
||||
vector<string> noprefs;
|
||||
for (list<string>::const_iterator it = exp.begin();
|
||||
@ -859,21 +877,32 @@ bool StringToXapianQ::processUserString(const string &iq,
|
||||
// We now adjust the phrase/near slack by the term count
|
||||
// difference (this is mainly better for cjk where this is a very
|
||||
// common occurrence because of the ngrams thing.
|
||||
|
||||
TermProcQ tpq;
|
||||
// TermProcStop tpstop(&tpidx, stops);
|
||||
TermProcCommongrams tpstop(&tpq, stops);
|
||||
tpstop.onlygrams(true);
|
||||
TermProcPrep tpprep(&tpstop);
|
||||
|
||||
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||
TextSplit::TXTS_KEEPWILD),
|
||||
stops);
|
||||
TextSplit::TXTS_KEEPWILD),
|
||||
stops, &tpprep);
|
||||
tpq.setTSQ(&splitterS);
|
||||
splitterS.text_to_words(*it);
|
||||
LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
|
||||
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
|
||||
TextSplit::TXTS_KEEPWILD),
|
||||
stops);
|
||||
stops, &tpprep);
|
||||
tpq.setTSQ(&splitterW);
|
||||
tpstop.onlygrams(false);
|
||||
splitterW.text_to_words(*it);
|
||||
TextSplitQ *splitter = &splitterS;
|
||||
|
||||
if (splitterS.terms.size() > 1 &&
|
||||
splitterS.terms.size() != splitterW.terms.size()) {
|
||||
slack += splitterW.terms.size() - splitterS.terms.size();
|
||||
// used to: splitData = &splitDataW;
|
||||
}
|
||||
|
||||
TextSplitQ *splitter = &splitterS;
|
||||
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
|
||||
switch (splitter->terms.size() + terminc) {
|
||||
case 0:
|
||||
|
||||
182
src/rcldb/termproc.h
Normal file
182
src/rcldb/termproc.h
Normal file
@ -0,0 +1,182 @@
|
||||
/* Copyright (C) 2011 J.F.Dockes
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc.,
|
||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef _TERMPROC_H_INCLUDED_
|
||||
#define _TERMPROC_H_INCLUDED_
|
||||
|
||||
#include "textsplit.h"
|
||||
#include "stoplist.h"
|
||||
namespace Rcl {
|
||||
// Base class for the stages of a term processing pipeline. Each stage
// holds a (non-owning) pointer to the next one; the default
// implementations just forward to the successor, or report success
// when the stage is the last of the chain.
class TermProc {
public:
    // @param next following stage, or null if this is the last one
    TermProc(TermProc* next) : m_next(next) {}
    virtual ~TermProc() {}

    // Hand one term to the next stage.
    // @return the successor's result, true if there is no successor
    virtual bool takeword(const string &term, int pos, int bs, int be)
    {
        return m_next ? m_next->takeword(term, pos, bs, be) : true;
    }

    // Propagate an end-of-input notification down the chain.
    // @return the successor's result, true if there is no successor
    virtual bool flush()
    {
        return m_next ? m_next->flush() : true;
    }

private:
    TermProc *m_next;
};
|
||||
|
||||
class TextSplitP : public TextSplit {
|
||||
public:
|
||||
TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
|
||||
: TextSplit(flags), m_prc(prc)
|
||||
{}
|
||||
|
||||
virtual bool text_to_words(const string &in)
|
||||
{
|
||||
bool ret = TextSplit::text_to_words(in);
|
||||
if (m_prc && !m_prc->flush())
|
||||
return false;
|
||||
return ret;
|
||||
}
|
||||
|
||||
virtual bool takeword(const string& term, int pos, int bs, int be)
|
||||
{
|
||||
if (m_prc)
|
||||
return m_prc->takeword(term, pos, bs, be);
|
||||
else
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
TermProc *m_prc;
|
||||
};
|
||||
|
||||
class TermProcPrep : public TermProc {
|
||||
public:
|
||||
TermProcPrep(TermProc *nxt) : TermProc(nxt) {}
|
||||
|
||||
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
||||
{
|
||||
string otrm;
|
||||
if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
|
||||
LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
|
||||
// We don't generate a fatal error because of a bad term
|
||||
return true;
|
||||
}
|
||||
return TermProc::takeword(otrm, pos, bs, be);
|
||||
}
|
||||
};
|
||||
|
||||
class TermProcStop : public TermProc {
|
||||
public:
|
||||
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
||||
: TermProc(nxt), m_stops(stops) { }
|
||||
virtual bool takeword(const string& term, int pos, int bts, int bte)
|
||||
{
|
||||
if (m_stops.isStop(term)) {
|
||||
return true;
|
||||
}
|
||||
return TermProc::takeword(term, pos, bts, bte);
|
||||
}
|
||||
private:
|
||||
const Rcl::StopList& m_stops;
|
||||
};
|
||||
|
||||
class TermProcCommongrams : public TermProc {
|
||||
public:
|
||||
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
|
||||
: TermProc(nxt), m_stops(stops), m_onlygrams(false) { }
|
||||
|
||||
virtual bool takeword(const string& term, int pos, int bs, int be)
|
||||
{
|
||||
LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
|
||||
pos, bs, be, term.c_str()));
|
||||
bool isstop = m_stops.isStop(term);
|
||||
bool twogramemit = false;
|
||||
|
||||
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
||||
// create 2-gram. space unnecessary but improves
|
||||
// lisibility of queries
|
||||
string twogram;
|
||||
twogram.swap(m_prevterm);
|
||||
twogram.append(1, ' ');
|
||||
twogram += term;
|
||||
// When emitting a complex term we set the bps to 0. This may
|
||||
// be used by our clients
|
||||
if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
|
||||
return false;
|
||||
twogramemit = true;
|
||||
#if 0
|
||||
if (m_stops.isStop(twogram)) {
|
||||
firstword = twogram;
|
||||
isstop = false;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
m_prevterm = term;
|
||||
m_prevstop = isstop;
|
||||
m_prevpos = pos;
|
||||
m_prevsent = false;
|
||||
m_prevbs = bs;
|
||||
m_prevbe = be;
|
||||
// If flags allow, emit the bare term at the current pos.
|
||||
if (!m_onlygrams || (!isstop && !twogramemit)) {
|
||||
if (!TermProc::takeword(term, pos, bs, be))
|
||||
return false;
|
||||
m_prevsent = true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool flush()
|
||||
{
|
||||
if (!m_prevsent && !m_prevterm.empty())
|
||||
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
||||
return false;
|
||||
|
||||
m_prevterm.clear();
|
||||
m_prevsent = true;
|
||||
return TermProc::flush();
|
||||
}
|
||||
void onlygrams(bool on)
|
||||
{
|
||||
m_onlygrams = on;
|
||||
}
|
||||
private:
|
||||
// The stoplist we're using
|
||||
const Rcl::StopList& m_stops;
|
||||
// Remembered data for the last processed term
|
||||
string m_prevterm;
|
||||
bool m_prevstop;
|
||||
int m_prevpos;
|
||||
int m_prevbs;
|
||||
int m_prevbe;
|
||||
bool m_prevsent;
|
||||
// If this is set, we only emit longest grams
|
||||
bool m_onlygrams;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif /* _TERMPROC_H_INCLUDED_ */
|
||||
Loading…
x
Reference in New Issue
Block a user