New text to terms processing pipelines: results identical to 1.16 when used with empty stopfile
This commit is contained in:
parent
61bf17aa46
commit
5fd31172f5
@ -19,6 +19,7 @@
|
|||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::list;
|
using std::list;
|
||||||
@ -26,7 +27,6 @@ using std::list;
|
|||||||
|
|
||||||
class Utf8Iter;
|
class Utf8Iter;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Split text into words.
|
* Split text into words.
|
||||||
* See comments at top of .cpp for more explanations.
|
* See comments at top of .cpp for more explanations.
|
||||||
|
|||||||
@ -52,6 +52,7 @@ using namespace std;
|
|||||||
#include "rclversion.h"
|
#include "rclversion.h"
|
||||||
#include "cancelcheck.h"
|
#include "cancelcheck.h"
|
||||||
#include "ptmutex.h"
|
#include "ptmutex.h"
|
||||||
|
#include "termproc.h"
|
||||||
|
|
||||||
#ifndef MAX
|
#ifndef MAX
|
||||||
#define MAX(A,B) (A>B?A:B)
|
#define MAX(A,B) (A>B?A:B)
|
||||||
@ -858,7 +859,7 @@ bool Db::fieldToTraits(const string& fld, const FieldTraits **ftpp)
|
|||||||
// The splitter breaks text into words and adds postings to the Xapian
|
// The splitter breaks text into words and adds postings to the Xapian
|
||||||
// document. We use a single object to split all of the document
|
// document. We use a single object to split all of the document
|
||||||
// fields and position jumps to separate fields
|
// fields and position jumps to separate fields
|
||||||
class TextSplitDb : public TextSplit {
|
class TextSplitDb : public TextSplitP {
|
||||||
public:
|
public:
|
||||||
Xapian::WritableDatabase db;
|
Xapian::WritableDatabase db;
|
||||||
Xapian::Document &doc; // Xapian document
|
Xapian::Document &doc; // Xapian document
|
||||||
@ -873,17 +874,18 @@ class TextSplitDb : public TextSplit {
|
|||||||
// to compute the first position of the next section.
|
// to compute the first position of the next section.
|
||||||
Xapian::termpos curpos;
|
Xapian::termpos curpos;
|
||||||
|
|
||||||
StopList &stops;
|
|
||||||
TextSplitDb(Xapian::WritableDatabase idb,
|
TextSplitDb(Xapian::WritableDatabase idb,
|
||||||
Xapian::Document &d, StopList &_stops)
|
Xapian::Document &d, TermProc *prc)
|
||||||
: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
|
: TextSplitP(prc),
|
||||||
|
db(idb), doc(d), basepos(1), curpos(0), wdfinc(1)
|
||||||
{}
|
{}
|
||||||
// Reimplement text_to_words to add start and end special terms
|
// Reimplement text_to_words to add start and end special terms
|
||||||
virtual bool text_to_words(const string &in);
|
virtual bool text_to_words(const string &in);
|
||||||
bool takeword(const std::string &term, int pos, int, int);
|
|
||||||
void setprefix(const string& pref) {prefix = pref;}
|
void setprefix(const string& pref) {prefix = pref;}
|
||||||
void setwdfinc(int i) {wdfinc = i;}
|
void setwdfinc(int i) {wdfinc = i;}
|
||||||
|
|
||||||
|
friend class TermProcIdx;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// If prefix is set, we also add a posting for the prefixed terms
|
// If prefix is set, we also add a posting for the prefixed terms
|
||||||
// (ie: for titles, add postings for both "term" and "Sterm")
|
// (ie: for titles, add postings for both "term" and "Sterm")
|
||||||
@ -892,7 +894,7 @@ private:
|
|||||||
int wdfinc;
|
int wdfinc;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Reimplement text_to_words to insert the begin and end anchor terms.
|
||||||
bool TextSplitDb::text_to_words(const string &in)
|
bool TextSplitDb::text_to_words(const string &in)
|
||||||
{
|
{
|
||||||
LOGDEB2(("TextSplitDb::text_to_words\n"));
|
LOGDEB2(("TextSplitDb::text_to_words\n"));
|
||||||
@ -908,7 +910,7 @@ bool TextSplitDb::text_to_words(const string &in)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!TextSplit::text_to_words(in)) {
|
if (!TextSplitP::text_to_words(in)) {
|
||||||
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
|
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
|
||||||
basepos += curpos + 100;
|
basepos += curpos + 100;
|
||||||
return false;
|
return false;
|
||||||
@ -924,51 +926,45 @@ bool TextSplitDb::text_to_words(const string &in)
|
|||||||
basepos += curpos + 100;
|
basepos += curpos + 100;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
basepos += curpos + 100;
|
basepos += curpos + 100;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get one term from the doc, remove accents and lowercase, then add posting
|
class TermProcIdx : public TermProc {
|
||||||
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
|
public:
|
||||||
{
|
TermProcIdx() : TermProc(0), m_ts(0) {}
|
||||||
LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
|
void setTSD(TextSplitDb *ts) {m_ts = ts;}
|
||||||
|
|
||||||
string term;
|
bool takeword(const std::string &term, int pos, int, int)
|
||||||
if (!unacmaybefold(_term, term, "UTF-8", true)) {
|
{
|
||||||
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n",
|
// Compute absolute position (pos is relative to current segment),
|
||||||
_term.c_str()));
|
// and remember relative.
|
||||||
term.clear();
|
m_ts->curpos = pos;
|
||||||
// We don't generate a fatal error because of a bad term
|
pos += m_ts->basepos;
|
||||||
return true;
|
string ermsg;
|
||||||
}
|
try {
|
||||||
|
// Index without prefix, using the field-specific weighting
|
||||||
if (stops.isStop(term)) {
|
LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
|
||||||
LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
|
m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute absolute position (pos is relative to current segment),
|
|
||||||
// and remember relative.
|
|
||||||
curpos = pos;
|
|
||||||
pos += basepos;
|
|
||||||
string ermsg;
|
|
||||||
try {
|
|
||||||
// Index without prefix, using the field-specific weighting
|
|
||||||
doc.add_posting(term, pos, wdfinc);
|
|
||||||
#ifdef TESTING_XAPIAN_SPELL
|
#ifdef TESTING_XAPIAN_SPELL
|
||||||
if (Db::isSpellingCandidate(term)) {
|
if (Db::isSpellingCandidate(term)) {
|
||||||
db.add_spelling(term);
|
m_ts->db.add_spelling(term);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
// Index the prefixed term.
|
// Index the prefixed term.
|
||||||
if (!prefix.empty()) {
|
if (!m_ts->prefix.empty()) {
|
||||||
doc.add_posting(prefix + term, pos, wdfinc);
|
m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
} XCATCHERROR(ermsg);
|
} XCATCHERROR(ermsg);
|
||||||
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
TextSplitDb *m_ts;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
#ifdef TESTING_XAPIAN_SPELL
|
#ifdef TESTING_XAPIAN_SPELL
|
||||||
string Db::getSpellingSuggestion(const string& word)
|
string Db::getSpellingSuggestion(const string& word)
|
||||||
@ -1032,8 +1028,12 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
|||||||
Doc doc = idoc;
|
Doc doc = idoc;
|
||||||
|
|
||||||
Xapian::Document newdocument;
|
Xapian::Document newdocument;
|
||||||
TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);
|
TermProcIdx tpidx;
|
||||||
|
// TermProcStop tpstop(&tpidx, m_stops);
|
||||||
|
TermProcCommongrams tpstop(&tpidx, m_stops);
|
||||||
|
TermProcPrep tpprep(&tpstop);
|
||||||
|
TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
|
||||||
|
tpidx.setTSD(&splitter);
|
||||||
// Split and index file name as document term(s)
|
// Split and index file name as document term(s)
|
||||||
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
||||||
if (!splitter.text_to_words(doc.utf8fn))
|
if (!splitter.text_to_words(doc.utf8fn))
|
||||||
|
|||||||
@ -35,6 +35,7 @@
|
|||||||
#include "utf8iter.h"
|
#include "utf8iter.h"
|
||||||
#include "stoplist.h"
|
#include "stoplist.h"
|
||||||
#include "rclconfig.h"
|
#include "rclconfig.h"
|
||||||
|
#include "termproc.h"
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
using namespace std;
|
using namespace std;
|
||||||
@ -474,36 +475,23 @@ void SearchData::getUTerms(vector<string>& terms) const
|
|||||||
// phrases. This is for parts of the user entry which would appear as
|
// phrases. This is for parts of the user entry which would appear as
|
||||||
// a single word because there is no white space inside, but are
|
// a single word because there is no white space inside, but are
|
||||||
// actually multiple terms to rcldb (ie term1,term2)
|
// actually multiple terms to rcldb (ie term1,term2)
|
||||||
class TextSplitQ : public TextSplit {
|
class TextSplitQ : public TextSplitP {
|
||||||
public:
|
public:
|
||||||
TextSplitQ(Flags flags, const StopList &_stops)
|
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
|
||||||
: TextSplit(flags), stops(_stops), alltermcount(0), lastpos(0)
|
: TextSplitP(prc, flags), stops(_stops), alltermcount(0), lastpos(0)
|
||||||
{}
|
{}
|
||||||
bool takeword(const std::string &interm, int pos, int, int) {
|
|
||||||
alltermcount++;
|
|
||||||
lastpos = pos
|
|
||||||
LOGDEB1(("TextSplitQ::takeword: %s\n", interm.c_str()));
|
|
||||||
|
|
||||||
|
bool takeword(const std::string &term, int pos, int bs, int be)
|
||||||
|
{
|
||||||
// Check if the first letter is a majuscule in which
|
// Check if the first letter is a majuscule in which
|
||||||
// case we do not want to do stem expansion.
|
// case we do not want to do stem expansion. Need to do this
|
||||||
bool nostemexp = unaciscapital(interm);
|
// before unac of course...
|
||||||
string noaclowterm;
|
curnostemexp = unaciscapital(term);
|
||||||
if (!unacmaybefold(interm, noaclowterm, "UTF-8", true)) {
|
|
||||||
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n",
|
|
||||||
interm.c_str()));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (stops.isStop(noaclowterm)) {
|
return TextSplitP::takeword(term, pos, bs, be);
|
||||||
LOGDEB1(("TextSplitQ::takeword [%s] in stop list\n",
|
|
||||||
noaclowterm.c_str()));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
terms.push_back(noaclowterm);
|
|
||||||
nostemexps.push_back(nostemexp);
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool curnostemexp;
|
||||||
vector<string> terms;
|
vector<string> terms;
|
||||||
vector<bool> nostemexps;
|
vector<bool> nostemexps;
|
||||||
const StopList &stops;
|
const StopList &stops;
|
||||||
@ -513,6 +501,26 @@ class TextSplitQ : public TextSplit {
|
|||||||
int lastpos;
|
int lastpos;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class TermProcQ : public TermProc {
|
||||||
|
public:
|
||||||
|
TermProcQ() : TermProc(0), m_ts(0) {}
|
||||||
|
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
|
||||||
|
|
||||||
|
bool takeword(const std::string &term, int pos, int bs, int be)
|
||||||
|
{
|
||||||
|
m_ts->alltermcount++;
|
||||||
|
m_ts->lastpos = pos;
|
||||||
|
bool noexpand = be ? m_ts->curnostemexp : true;
|
||||||
|
LOGDEB(("TermProcQ::takeword: pushing [%s] noexp %d\n",
|
||||||
|
term.c_str(), noexpand));
|
||||||
|
m_ts->terms.push_back(term);
|
||||||
|
m_ts->nostemexps.push_back(noexpand);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
TextSplitQ *m_ts;
|
||||||
|
};
|
||||||
|
|
||||||
// A class used to translate a user compound string (*not* a query
|
// A class used to translate a user compound string (*not* a query
|
||||||
// language string) as may be entered in any_terms/all_terms search
|
// language string) as may be entered in any_terms/all_terms search
|
||||||
// entry fields, ex: [term1 "a phrase" term3] into a xapian query
|
// entry fields, ex: [term1 "a phrase" term3] into a xapian query
|
||||||
@ -566,7 +574,7 @@ private:
|
|||||||
vector<vector<string> > m_groups;
|
vector<vector<string> > m_groups;
|
||||||
};
|
};
|
||||||
|
|
||||||
#if 0
|
#if 1
|
||||||
static void listVector(const string& what, const vector<string>&l)
|
static void listVector(const string& what, const vector<string>&l)
|
||||||
{
|
{
|
||||||
string a;
|
string a;
|
||||||
@ -575,6 +583,14 @@ static void listVector(const string& what, const vector<string>&l)
|
|||||||
}
|
}
|
||||||
LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
|
LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
|
||||||
}
|
}
|
||||||
|
static void listList(const string& what, const list<string>& l)
|
||||||
|
{
|
||||||
|
string a;
|
||||||
|
for (list<string>::const_iterator it = l.begin(); it != l.end(); it++) {
|
||||||
|
a = a + *it + " ";
|
||||||
|
}
|
||||||
|
LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/** Expand stem and wildcards
|
/** Expand stem and wildcards
|
||||||
@ -734,15 +750,17 @@ void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData,
|
|||||||
vector<bool>::iterator nxit = splitData->nostemexps.begin();
|
vector<bool>::iterator nxit = splitData->nostemexps.begin();
|
||||||
for (vector<string>::iterator it = splitData->terms.begin();
|
for (vector<string>::iterator it = splitData->terms.begin();
|
||||||
it != splitData->terms.end(); it++, nxit++) {
|
it != splitData->terms.end(); it++, nxit++) {
|
||||||
|
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
|
||||||
// Adjust when we do stem expansion. Not inside phrases, and
|
// Adjust when we do stem expansion. Not inside phrases, and
|
||||||
// some versions of xapian will accept only one OR clause
|
// some versions of xapian will accept only one OR clause
|
||||||
// inside NEAR, all others must be leafs.
|
// inside NEAR, all others must be leafs.
|
||||||
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
|
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
|
||||||
|
|
||||||
string sterm;
|
string sterm;
|
||||||
list<string>exp;
|
list<string> exp;
|
||||||
expandTerm(nostemexp, *it, exp, sterm, prefix);
|
expandTerm(nostemexp, *it, exp, sterm, prefix);
|
||||||
|
LOGDEB0(("ProcessPhrase: exp size %d\n", exp.size()));
|
||||||
|
listList("", exp);
|
||||||
// groups is used for highlighting, we don't want prefixes in there.
|
// groups is used for highlighting, we don't want prefixes in there.
|
||||||
vector<string> noprefs;
|
vector<string> noprefs;
|
||||||
for (list<string>::const_iterator it = exp.begin();
|
for (list<string>::const_iterator it = exp.begin();
|
||||||
@ -859,21 +877,32 @@ bool StringToXapianQ::processUserString(const string &iq,
|
|||||||
// We now adjust the phrase/near slack by the term count
|
// We now adjust the phrase/near slack by the term count
|
||||||
// difference (this is mainly better for cjk where this is a very
|
// difference (this is mainly better for cjk where this is a very
|
||||||
// common occurrence because of the ngrams thing.
|
// common occurrence because of the ngrams thing.
|
||||||
|
|
||||||
|
TermProcQ tpq;
|
||||||
|
// TermProcStop tpstop(&tpidx, stops);
|
||||||
|
TermProcCommongrams tpstop(&tpq, stops);
|
||||||
|
tpstop.onlygrams(true);
|
||||||
|
TermProcPrep tpprep(&tpstop);
|
||||||
|
|
||||||
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
TextSplitQ splitterS(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
||||||
TextSplit::TXTS_KEEPWILD),
|
TextSplit::TXTS_KEEPWILD),
|
||||||
stops);
|
stops, &tpprep);
|
||||||
|
tpq.setTSQ(&splitterS);
|
||||||
splitterS.text_to_words(*it);
|
splitterS.text_to_words(*it);
|
||||||
|
LOGDEB(("SplitterS has %d terms\n", splitterS.terms.size()));
|
||||||
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
|
TextSplitQ splitterW(TextSplit::Flags(TextSplit::TXTS_NOSPANS |
|
||||||
TextSplit::TXTS_KEEPWILD),
|
TextSplit::TXTS_KEEPWILD),
|
||||||
stops);
|
stops, &tpprep);
|
||||||
|
tpq.setTSQ(&splitterW);
|
||||||
|
tpstop.onlygrams(false);
|
||||||
splitterW.text_to_words(*it);
|
splitterW.text_to_words(*it);
|
||||||
TextSplitQ *splitter = &splitterS;
|
|
||||||
if (splitterS.terms.size() > 1 &&
|
if (splitterS.terms.size() > 1 &&
|
||||||
splitterS.terms.size() != splitterW.terms.size()) {
|
splitterS.terms.size() != splitterW.terms.size()) {
|
||||||
slack += splitterW.terms.size() - splitterS.terms.size();
|
slack += splitterW.terms.size() - splitterS.terms.size();
|
||||||
// used to: splitData = &splitDataW;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TextSplitQ *splitter = &splitterS;
|
||||||
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
|
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter->terms.size()));
|
||||||
switch (splitter->terms.size() + terminc) {
|
switch (splitter->terms.size() + terminc) {
|
||||||
case 0:
|
case 0:
|
||||||
|
|||||||
182
src/rcldb/termproc.h
Normal file
182
src/rcldb/termproc.h
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
/* Copyright (C) 2011 J.F.Dockes
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the
|
||||||
|
* Free Software Foundation, Inc.,
|
||||||
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef _TERMPROC_H_INCLUDED_
|
||||||
|
#define _TERMPROC_H_INCLUDED_
|
||||||
|
|
||||||
|
#include "textsplit.h"
|
||||||
|
#include "stoplist.h"
|
||||||
|
namespace Rcl {
|
||||||
|
/**
 * One stage of a term processing chain.
 *
 * A stage holds a pointer to the following stage (0 for the last one).
 * The default takeword() and flush() just hand the call over to the
 * next stage. Derived classes override them to transform, filter or
 * consume the terms.
 */
class TermProc {
public:
    /** @param next the stage to forward to, or 0 at the end of the chain.
     *  The pointer is stored, it is not deleted by this class. */
    TermProc(TermProc* next) : m_next(next) {}
    virtual ~TermProc() {}

    /**
     * Receive one term. The arguments are passed unchanged to the next
     * stage.
     * @return the next stage's result, or true if there is no next stage.
     */
    virtual bool takeword(const string &term, int pos, int bs, int be)
    {
        return !m_next || m_next->takeword(term, pos, bs, be);
    }

    /**
     * Ask the chain to output anything it may be holding back.
     * @return the next stage's result, or true if there is no next stage.
     */
    virtual bool flush()
    {
        return !m_next || m_next->flush();
    }

private:
    // Following stage, 0 if we are the last one.
    TermProc *m_next;
};
|
||||||
|
|
||||||
|
class TextSplitP : public TextSplit {
|
||||||
|
public:
|
||||||
|
TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
|
||||||
|
: TextSplit(flags), m_prc(prc)
|
||||||
|
{}
|
||||||
|
|
||||||
|
virtual bool text_to_words(const string &in)
|
||||||
|
{
|
||||||
|
bool ret = TextSplit::text_to_words(in);
|
||||||
|
if (m_prc && !m_prc->flush())
|
||||||
|
return false;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool takeword(const string& term, int pos, int bs, int be)
|
||||||
|
{
|
||||||
|
if (m_prc)
|
||||||
|
return m_prc->takeword(term, pos, bs, be);
|
||||||
|
else
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
TermProc *m_prc;
|
||||||
|
};
|
||||||
|
|
||||||
|
class TermProcPrep : public TermProc {
|
||||||
|
public:
|
||||||
|
TermProcPrep(TermProc *nxt) : TermProc(nxt) {}
|
||||||
|
|
||||||
|
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
||||||
|
{
|
||||||
|
string otrm;
|
||||||
|
if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
|
||||||
|
LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
|
||||||
|
// We don't generate a fatal error because of a bad term
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return TermProc::takeword(otrm, pos, bs, be);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class TermProcStop : public TermProc {
|
||||||
|
public:
|
||||||
|
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
||||||
|
: TermProc(nxt), m_stops(stops) { }
|
||||||
|
virtual bool takeword(const string& term, int pos, int bts, int bte)
|
||||||
|
{
|
||||||
|
if (m_stops.isStop(term)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return TermProc::takeword(term, pos, bts, bte);
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
const Rcl::StopList& m_stops;
|
||||||
|
};
|
||||||
|
|
||||||
|
class TermProcCommongrams : public TermProc {
|
||||||
|
public:
|
||||||
|
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
|
||||||
|
: TermProc(nxt), m_stops(stops), m_onlygrams(false) { }
|
||||||
|
|
||||||
|
virtual bool takeword(const string& term, int pos, int bs, int be)
|
||||||
|
{
|
||||||
|
LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
|
||||||
|
pos, bs, be, term.c_str()));
|
||||||
|
bool isstop = m_stops.isStop(term);
|
||||||
|
bool twogramemit = false;
|
||||||
|
|
||||||
|
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
||||||
|
// create 2-gram. space unnecessary but improves
|
||||||
|
// lisibility of queries
|
||||||
|
string twogram;
|
||||||
|
twogram.swap(m_prevterm);
|
||||||
|
twogram.append(1, ' ');
|
||||||
|
twogram += term;
|
||||||
|
// When emitting a complex term we set the bps to 0. This may
|
||||||
|
// be used by our clients
|
||||||
|
if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
|
||||||
|
return false;
|
||||||
|
twogramemit = true;
|
||||||
|
#if 0
|
||||||
|
if (m_stops.isStop(twogram)) {
|
||||||
|
firstword = twogram;
|
||||||
|
isstop = false;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
m_prevterm = term;
|
||||||
|
m_prevstop = isstop;
|
||||||
|
m_prevpos = pos;
|
||||||
|
m_prevsent = false;
|
||||||
|
m_prevbs = bs;
|
||||||
|
m_prevbe = be;
|
||||||
|
// If flags allow, emit the bare term at the current pos.
|
||||||
|
if (!m_onlygrams || (!isstop && !twogramemit)) {
|
||||||
|
if (!TermProc::takeword(term, pos, bs, be))
|
||||||
|
return false;
|
||||||
|
m_prevsent = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool flush()
|
||||||
|
{
|
||||||
|
if (!m_prevsent && !m_prevterm.empty())
|
||||||
|
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
m_prevterm.clear();
|
||||||
|
m_prevsent = true;
|
||||||
|
return TermProc::flush();
|
||||||
|
}
|
||||||
|
void onlygrams(bool on)
|
||||||
|
{
|
||||||
|
m_onlygrams = on;
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
// The stoplist we're using
|
||||||
|
const Rcl::StopList& m_stops;
|
||||||
|
// Remembered data for the last processed term
|
||||||
|
string m_prevterm;
|
||||||
|
bool m_prevstop;
|
||||||
|
int m_prevpos;
|
||||||
|
int m_prevbs;
|
||||||
|
int m_prevbe;
|
||||||
|
bool m_prevsent;
|
||||||
|
// If this is set, we only emit longest grams
|
||||||
|
bool m_onlygrams;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* _TERMPROC_H_INCLUDED_ */
|
||||||
Loading…
x
Reference in New Issue
Block a user