comments and indent

This commit is contained in:
Jean-Francois Dockes 2015-06-09 19:34:15 +02:00
parent 0755f4f4e2
commit 94b94593e3

View File

@ -25,14 +25,19 @@
namespace Rcl { namespace Rcl {
/** /**
* Termproc objects take a stream of term tokens as input and do something * Termproc objects take term tokens as input and do something
* with them: transform to lowercase, filter out stop words, generate n-grams, * with them: transform to lowercase, filter out stop words, generate n-grams,
* finally index or generate search clauses, etc. They are chained and can * finally index or generate search clauses, etc. They are chained and can
* be arranged to form different pipelines depending on the desired processing * be arranged to form different pipelines depending on the desired processing
* steps: for example, optional stoplist or commongram processing. * steps: for example, optional stoplist or commongram processing.
* *
* Shared processing steps are defined in this file. The first and last steps * Shared processing steps are defined in this file. The first and last steps
* (ie: adding index term) are usually defined in the specific module. * are usually defined in the specific module.
* - The front TermProc is typically chained from a TextSplit object
* which generates the original terms, and calls takeword() from its
* own takeword() method.
* - The last TermProc does something with the finalized terms, e.g. adds
* them to the index.
*/ */
/** /**
@ -45,106 +50,110 @@ public:
virtual ~TermProc() {} virtual ~TermProc() {}
virtual bool takeword(const string &term, int pos, int bs, int be) virtual bool takeword(const string &term, int pos, int bs, int be)
{ {
if (m_next) if (m_next)
return m_next->takeword(term, pos, bs, be); return m_next->takeword(term, pos, bs, be);
else else
return true; return true;
} }
// newpage() is like takeword(), but for page breaks.
virtual void newpage(int pos) virtual void newpage(int pos)
{ {
if (m_next) if (m_next)
m_next->newpage(pos); m_next->newpage(pos);
} }
virtual bool flush() virtual bool flush()
{ {
if (m_next) if (m_next)
return m_next->flush(); return m_next->flush();
else else
return true; return true;
} }
private: private:
TermProc *m_next; TermProc *m_next;
/* Copyconst and assignment private and forbidden */ /* Copyconst and assignment private and forbidden */
TermProc(const TermProc &) {} TermProc(const TermProc &) {}
TermProc& operator=(const TermProc &) {return *this;}; TermProc& operator=(const TermProc &) {
return *this;
};
}; };
/** /**
* Specialized TextSplit class: this will probably replace the base * Helper specialized TextSplit class, feeds the pipeline:
* TextSplit when we've converted all the code. The takeword() routine in this * - The takeword() method calls a TermProc->takeword().
* calls a TermProc's instead of being overridden in a user derived class. * The text_to_words() method also takes care of flushing.
* The text_to_words() method also takes care of flushing. * Both methods can be further specialized by the user (they should then call
* the base methods when they've done the local processing).
*/ */
class TextSplitP : public TextSplit { class TextSplitP : public TextSplit {
public: public:
TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE)) TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
: TextSplit(flags), m_prc(prc) {} : TextSplit(flags), m_prc(prc) {}
virtual bool text_to_words(const string &in) virtual bool text_to_words(const string &in) {
{ bool ret = TextSplit::text_to_words(in);
bool ret = TextSplit::text_to_words(in); if (m_prc && !m_prc->flush())
if (m_prc && !m_prc->flush()) return false;
return false; return ret;
return ret;
} }
virtual bool takeword(const string& term, int pos, int bs, int be) virtual bool takeword(const string& term, int pos, int bs, int be) {
{ if (m_prc)
if (m_prc) return m_prc->takeword(term, pos, bs, be);
return m_prc->takeword(term, pos, bs, be); else
else return true;
return true;
} }
virtual void newpage(int pos)
{ virtual void newpage(int pos) {
if (m_prc) if (m_prc)
return m_prc->newpage(pos); return m_prc->newpage(pos);
} }
private: private:
TermProc *m_prc; TermProc *m_prc;
}; };
/** Unaccent and lowercase term. This is usually the first in the pipeline */ /** Unaccent and lowercase term. If the index is
* not case/diac-sensitive, this is usually the first step in the pipeline
*/
class TermProcPrep : public TermProc { class TermProcPrep : public TermProc {
public: public:
TermProcPrep(TermProc *nxt) TermProcPrep(TermProc *nxt)
: TermProc(nxt), m_totalterms(0), m_unacerrors(0) : TermProc(nxt), m_totalterms(0), m_unacerrors(0)
{ {
} }
virtual bool takeword(const string& itrm, int pos, int bs, int be) virtual bool takeword(const string& itrm, int pos, int bs, int be)
{ {
m_totalterms++; m_totalterms++;
string otrm; string otrm;
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) { if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str())); LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
m_unacerrors++; m_unacerrors++;
// We don't generate a fatal error because of a bad term, // We don't generate a fatal error because of a bad term,
// but one has to put the limit somewhere // but one has to put the limit somewhere
if (m_unacerrors > 500 && if (m_unacerrors > 500 &&
(double(m_totalterms) / double(m_unacerrors)) < 2.0) { (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
// More than 1 error for every other term // More than 1 error for every other term
LOGERR(("splitter::takeword: too many unac errors %d/%d\n", LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
m_unacerrors, m_totalterms)); m_unacerrors, m_totalterms));
return false; return false;
} }
return true; return true;
} }
// It may happen in some weird cases that the output from unac is // It may happen in some weird cases that the output from unac is
// empty (if the word actually consisted entirely of diacritics ...) // empty (if the word actually consisted entirely of diacritics ...)
// The consequence is that a phrase search won't work without additional // The consequence is that a phrase search won't work without additional
// slack. // slack.
if (otrm.empty()) if (otrm.empty())
return true; return true;
else else
return TermProc::takeword(otrm, pos, bs, be); return TermProc::takeword(otrm, pos, bs, be);
} }
virtual bool flush() virtual bool flush()
{ {
m_totalterms = m_unacerrors = 0; m_totalterms = m_unacerrors = 0;
return TermProc::flush(); return TermProc::flush();
} }
private: private:
@ -156,16 +165,16 @@ private:
class TermProcStop : public TermProc { class TermProcStop : public TermProc {
public: public:
TermProcStop(TermProc *nxt, const Rcl::StopList& stops) TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops) : TermProc(nxt), m_stops(stops)
{ {
} }
virtual bool takeword(const string& term, int pos, int bs, int be) virtual bool takeword(const string& term, int pos, int bs, int be)
{ {
if (m_stops.isStop(term)) { if (m_stops.isStop(term)) {
return true; return true;
} }
return TermProc::takeword(term, pos, bs, be); return TermProc::takeword(term, pos, bs, be);
} }
private: private:
@ -181,66 +190,66 @@ private:
class TermProcCommongrams : public TermProc { class TermProcCommongrams : public TermProc {
public: public:
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops) TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
: TermProc(nxt), m_stops(stops), m_onlygrams(false) : TermProc(nxt), m_stops(stops), m_onlygrams(false)
{ {
} }
virtual bool takeword(const string& term, int pos, int bs, int be) virtual bool takeword(const string& term, int pos, int bs, int be)
{ {
LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
pos, bs, be, term.c_str())); pos, bs, be, term.c_str()));
bool isstop = m_stops.isStop(term); bool isstop = m_stops.isStop(term);
bool twogramemit = false; bool twogramemit = false;
if (!m_prevterm.empty() && (m_prevstop || isstop)) { if (!m_prevterm.empty() && (m_prevstop || isstop)) {
// create 2-gram. space unnecessary but improves // create 2-gram. space unnecessary but improves
// the readability of queries // the readability of queries
string twogram; string twogram;
twogram.swap(m_prevterm); twogram.swap(m_prevterm);
twogram.append(1, ' '); twogram.append(1, ' ');
twogram += term; twogram += term;
// When emitting a complex term we set the bps to 0. This may // When emitting a complex term we set the bps to 0. This may
// be used by our clients // be used by our clients
if (!TermProc::takeword(twogram, m_prevpos, 0, 0)) if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
return false; return false;
twogramemit = true; twogramemit = true;
#if 0 #if 0
if (m_stops.isStop(twogram)) { if (m_stops.isStop(twogram)) {
firstword = twogram; firstword = twogram;
isstop = false; isstop = false;
} }
#endif #endif
} }
m_prevterm = term; m_prevterm = term;
m_prevstop = isstop; m_prevstop = isstop;
m_prevpos = pos; m_prevpos = pos;
m_prevsent = false; m_prevsent = false;
m_prevbs = bs; m_prevbs = bs;
m_prevbe = be; m_prevbe = be;
// If flags allow, emit the bare term at the current pos. // If flags allow, emit the bare term at the current pos.
if (!m_onlygrams || (!isstop && !twogramemit)) { if (!m_onlygrams || (!isstop && !twogramemit)) {
if (!TermProc::takeword(term, pos, bs, be)) if (!TermProc::takeword(term, pos, bs, be))
return false; return false;
m_prevsent = true; m_prevsent = true;
} }
return true; return true;
} }
virtual bool flush() virtual bool flush()
{ {
if (!m_prevsent && !m_prevterm.empty()) if (!m_prevsent && !m_prevterm.empty())
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe)) if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
return false; return false;
m_prevterm.clear(); m_prevterm.clear();
m_prevsent = true; m_prevsent = true;
return TermProc::flush(); return TermProc::flush();
} }
void onlygrams(bool on) void onlygrams(bool on)
{ {
m_onlygrams = on; m_onlygrams = on;
} }
private: private:
// The stoplist we're using // The stoplist we're using