comments and indent
This commit is contained in:
parent
0755f4f4e2
commit
94b94593e3
@ -25,14 +25,19 @@
|
|||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Termproc objects take a stream of term tokens as input and do something
|
* Termproc objects take term tokens as input and do something
|
||||||
* with them: transform to lowercase, filter out stop words, generate n-grams,
|
* with them: transform to lowercase, filter out stop words, generate n-grams,
|
||||||
* finally index or generate search clauses, etc. They are chained and can
|
* finally index or generate search clauses, etc. They are chained and can
|
||||||
* be arranged to form different pipelines depending on the desired processing
|
* be arranged to form different pipelines depending on the desired processing
|
||||||
* steps: for example, optional stoplist or commongram processing.
|
* steps: for example, optional stoplist or commongram processing.
|
||||||
*
|
*
|
||||||
* Shared processing steps are defined in this file. The first and last steps
|
* Shared processing steps are defined in this file. The first and last steps
|
||||||
* (ie: adding index term) are usually defined in the specific module.
|
* are usually defined in the specific module.
|
||||||
|
* - The front TermProc is typically chained from a TextSplit object
|
||||||
|
* which generates the original terms, and calls takeword() from its
|
||||||
|
* own takeword() method.
|
||||||
|
* - The last TermProc does something with the finalized terms, e.g. adds
|
||||||
|
* them to the index.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -45,106 +50,110 @@ public:
|
|||||||
virtual ~TermProc() {}
|
virtual ~TermProc() {}
|
||||||
virtual bool takeword(const string &term, int pos, int bs, int be)
|
virtual bool takeword(const string &term, int pos, int bs, int be)
|
||||||
{
|
{
|
||||||
if (m_next)
|
if (m_next)
|
||||||
return m_next->takeword(term, pos, bs, be);
|
return m_next->takeword(term, pos, bs, be);
|
||||||
else
|
else
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
// newpage() is like takeword(), but for page breaks.
|
||||||
virtual void newpage(int pos)
|
virtual void newpage(int pos)
|
||||||
{
|
{
|
||||||
if (m_next)
|
if (m_next)
|
||||||
m_next->newpage(pos);
|
m_next->newpage(pos);
|
||||||
}
|
}
|
||||||
virtual bool flush()
|
virtual bool flush()
|
||||||
{
|
{
|
||||||
if (m_next)
|
if (m_next)
|
||||||
return m_next->flush();
|
return m_next->flush();
|
||||||
else
|
else
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
TermProc *m_next;
|
TermProc *m_next;
|
||||||
/* Copy constructor and assignment are private and forbidden */
|
/* Copy constructor and assignment are private and forbidden */
|
||||||
TermProc(const TermProc &) {}
|
TermProc(const TermProc &) {}
|
||||||
TermProc& operator=(const TermProc &) {return *this;};
|
TermProc& operator=(const TermProc &) {
|
||||||
|
return *this;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Specialized TextSplit class: this will probably replace the base
|
* Helper specialized TextSplit class, feeds the pipeline:
|
||||||
* TextSplit when we've converted all the code. The takeword() routine in this
|
* - The takeword() method calls a TermProc->takeword().
|
||||||
* calls a TermProc's instead of being overridden in a user derived class.
|
* - The text_to_words() method also takes care of flushing.
|
||||||
* The text_to_words() method also takes care of flushing.
|
* Both methods can be further specialized by the user (they should then call
|
||||||
|
* the base methods when they've done the local processing).
|
||||||
*/
|
*/
|
||||||
class TextSplitP : public TextSplit {
|
class TextSplitP : public TextSplit {
|
||||||
public:
|
public:
|
||||||
TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
|
TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
|
||||||
: TextSplit(flags), m_prc(prc) {}
|
: TextSplit(flags), m_prc(prc) {}
|
||||||
|
|
||||||
virtual bool text_to_words(const string &in)
|
virtual bool text_to_words(const string &in) {
|
||||||
{
|
bool ret = TextSplit::text_to_words(in);
|
||||||
bool ret = TextSplit::text_to_words(in);
|
if (m_prc && !m_prc->flush())
|
||||||
if (m_prc && !m_prc->flush())
|
return false;
|
||||||
return false;
|
return ret;
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool takeword(const string& term, int pos, int bs, int be)
|
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
||||||
{
|
if (m_prc)
|
||||||
if (m_prc)
|
return m_prc->takeword(term, pos, bs, be);
|
||||||
return m_prc->takeword(term, pos, bs, be);
|
else
|
||||||
else
|
return true;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
virtual void newpage(int pos)
|
|
||||||
{
|
virtual void newpage(int pos) {
|
||||||
if (m_prc)
|
if (m_prc)
|
||||||
return m_prc->newpage(pos);
|
return m_prc->newpage(pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
TermProc *m_prc;
|
TermProc *m_prc;
|
||||||
};
|
};
|
||||||
|
|
||||||
/** Unaccent and lowercase term. This is usually the first in the pipeline */
|
/** Unaccent and lowercase term. If the index is
|
||||||
|
* not case/diac-sensitive, this is usually the first step in the pipeline
|
||||||
|
*/
|
||||||
class TermProcPrep : public TermProc {
|
class TermProcPrep : public TermProc {
|
||||||
public:
|
public:
|
||||||
TermProcPrep(TermProc *nxt)
|
TermProcPrep(TermProc *nxt)
|
||||||
: TermProc(nxt), m_totalterms(0), m_unacerrors(0)
|
: TermProc(nxt), m_totalterms(0), m_unacerrors(0)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
||||||
{
|
{
|
||||||
m_totalterms++;
|
m_totalterms++;
|
||||||
string otrm;
|
string otrm;
|
||||||
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
|
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
|
||||||
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
|
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
|
||||||
m_unacerrors++;
|
m_unacerrors++;
|
||||||
// We don't generate a fatal error because of a bad term,
|
// We don't generate a fatal error because of a bad term,
|
||||||
// but one has to put the limit somewhere
|
// but one has to put the limit somewhere
|
||||||
if (m_unacerrors > 500 &&
|
if (m_unacerrors > 500 &&
|
||||||
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
|
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
|
||||||
// More than 1 error for every other term
|
// More than 1 error for every other term
|
||||||
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
|
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
|
||||||
m_unacerrors, m_totalterms));
|
m_unacerrors, m_totalterms));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// It may happen in some weird cases that the output from unac is
|
// It may happen in some weird cases that the output from unac is
|
||||||
// empty (if the word actually consisted entirely of diacritics ...)
|
// empty (if the word actually consisted entirely of diacritics ...)
|
||||||
// The consequence is that a phrase search won't work without additional
|
// The consequence is that a phrase search won't work without additional
|
||||||
// slack.
|
// slack.
|
||||||
if (otrm.empty())
|
if (otrm.empty())
|
||||||
return true;
|
return true;
|
||||||
else
|
else
|
||||||
return TermProc::takeword(otrm, pos, bs, be);
|
return TermProc::takeword(otrm, pos, bs, be);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool flush()
|
virtual bool flush()
|
||||||
{
|
{
|
||||||
m_totalterms = m_unacerrors = 0;
|
m_totalterms = m_unacerrors = 0;
|
||||||
return TermProc::flush();
|
return TermProc::flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -156,16 +165,16 @@ private:
|
|||||||
class TermProcStop : public TermProc {
|
class TermProcStop : public TermProc {
|
||||||
public:
|
public:
|
||||||
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
||||||
: TermProc(nxt), m_stops(stops)
|
: TermProc(nxt), m_stops(stops)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool takeword(const string& term, int pos, int bs, int be)
|
virtual bool takeword(const string& term, int pos, int bs, int be)
|
||||||
{
|
{
|
||||||
if (m_stops.isStop(term)) {
|
if (m_stops.isStop(term)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return TermProc::takeword(term, pos, bs, be);
|
return TermProc::takeword(term, pos, bs, be);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -181,66 +190,66 @@ private:
|
|||||||
class TermProcCommongrams : public TermProc {
|
class TermProcCommongrams : public TermProc {
|
||||||
public:
|
public:
|
||||||
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
|
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
|
||||||
: TermProc(nxt), m_stops(stops), m_onlygrams(false)
|
: TermProc(nxt), m_stops(stops), m_onlygrams(false)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool takeword(const string& term, int pos, int bs, int be)
|
virtual bool takeword(const string& term, int pos, int bs, int be)
|
||||||
{
|
{
|
||||||
LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
|
LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
|
||||||
pos, bs, be, term.c_str()));
|
pos, bs, be, term.c_str()));
|
||||||
bool isstop = m_stops.isStop(term);
|
bool isstop = m_stops.isStop(term);
|
||||||
bool twogramemit = false;
|
bool twogramemit = false;
|
||||||
|
|
||||||
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
||||||
// create 2-gram. space unnecessary but improves
|
// create 2-gram. space unnecessary but improves
|
||||||
// the readability of queries
|
// the readability of queries
|
||||||
string twogram;
|
string twogram;
|
||||||
twogram.swap(m_prevterm);
|
twogram.swap(m_prevterm);
|
||||||
twogram.append(1, ' ');
|
twogram.append(1, ' ');
|
||||||
twogram += term;
|
twogram += term;
|
||||||
// When emitting a complex term we set the bps to 0. This may
|
// When emitting a complex term we set the bps to 0. This may
|
||||||
// be used by our clients
|
// be used by our clients
|
||||||
if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
|
if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
|
||||||
return false;
|
return false;
|
||||||
twogramemit = true;
|
twogramemit = true;
|
||||||
#if 0
|
#if 0
|
||||||
if (m_stops.isStop(twogram)) {
|
if (m_stops.isStop(twogram)) {
|
||||||
firstword = twogram;
|
firstword = twogram;
|
||||||
isstop = false;
|
isstop = false;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
m_prevterm = term;
|
m_prevterm = term;
|
||||||
m_prevstop = isstop;
|
m_prevstop = isstop;
|
||||||
m_prevpos = pos;
|
m_prevpos = pos;
|
||||||
m_prevsent = false;
|
m_prevsent = false;
|
||||||
m_prevbs = bs;
|
m_prevbs = bs;
|
||||||
m_prevbe = be;
|
m_prevbe = be;
|
||||||
// If flags allow, emit the bare term at the current pos.
|
// If flags allow, emit the bare term at the current pos.
|
||||||
if (!m_onlygrams || (!isstop && !twogramemit)) {
|
if (!m_onlygrams || (!isstop && !twogramemit)) {
|
||||||
if (!TermProc::takeword(term, pos, bs, be))
|
if (!TermProc::takeword(term, pos, bs, be))
|
||||||
return false;
|
return false;
|
||||||
m_prevsent = true;
|
m_prevsent = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual bool flush()
|
virtual bool flush()
|
||||||
{
|
{
|
||||||
if (!m_prevsent && !m_prevterm.empty())
|
if (!m_prevsent && !m_prevterm.empty())
|
||||||
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
m_prevterm.clear();
|
m_prevterm.clear();
|
||||||
m_prevsent = true;
|
m_prevsent = true;
|
||||||
return TermProc::flush();
|
return TermProc::flush();
|
||||||
}
|
}
|
||||||
void onlygrams(bool on)
|
void onlygrams(bool on)
|
||||||
{
|
{
|
||||||
m_onlygrams = on;
|
m_onlygrams = on;
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
// The stoplist we're using
|
// The stoplist we're using
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user