diff --git a/src/rcldb/termproc.h b/src/rcldb/termproc.h index 33c18f89..eb7cd4a8 100644 --- a/src/rcldb/termproc.h +++ b/src/rcldb/termproc.h @@ -24,19 +24,24 @@ namespace Rcl { -/** - * Termproc objects take a stream of term tokens as input and do something +/** + * Termproc objects take term tokens as input and do something * with them: transform to lowercase, filter out stop words, generate n-grams, - * finally index or generate search clauses, etc. They are chained and can + * finally index or generate search clauses, etc. They are chained and can * be arranged to form different pipelines depending on the desired processing * steps: for example, optional stoplist or commongram processing. * * Shared processing steps are defined in this file. The first and last steps - * (ie: adding index term) are usually defined in the specific module. + * are usually defined in the specific module. + * - The front TermProc is typically chained from a TextSplit object + * which generates the original terms, and calls takeword() from its + * own takeword() method. + * - The last TermProc does something with the finalized terms, e.g. adds + * them to the index. */ -/** - * The base class takes care of chaining: all derived classes call its +/** + * The base class takes care of chaining: all derived classes call its * takeword() and flush() methods to ensure that terms go through the pipe. */ class TermProc { @@ -45,106 +50,110 @@ public: virtual ~TermProc() {} virtual bool takeword(const string &term, int pos, int bs, int be) { - if (m_next) - return m_next->takeword(term, pos, bs, be); - else - return true; + if (m_next) + return m_next->takeword(term, pos, bs, be); + else + return true; } + // newpage() is like takeword(), but for page breaks. 
virtual void newpage(int pos) { - if (m_next) - m_next->newpage(pos); + if (m_next) + m_next->newpage(pos); } virtual bool flush() { - if (m_next) - return m_next->flush(); - else - return true; + if (m_next) + return m_next->flush(); + else + return true; } private: TermProc *m_next; /* Copyconst and assignment private and forbidden */ TermProc(const TermProc &) {} - TermProc& operator=(const TermProc &) {return *this;}; + TermProc& operator=(const TermProc &) { + return *this; + }; }; -/** - * Specialized TextSplit class: this will probably replace the base - * TextSplit when we've converted all the code. The takeword() routine in this - * calls a TermProc's instead of being overriden in a user derived class. - * The text_to_words() method also takes care of flushing. +/** + * Helper specialized TextSplit class, feeds the pipeline: + * - The takeword() method calls a TermProc->takeword(). + * - The text_to_words() method also takes care of flushing. + * Both methods can be further specialized by the user (they should then call + * the base methods when they've done the local processing). 
*/ class TextSplitP : public TextSplit { public: TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE)) - : TextSplit(flags), m_prc(prc) {} + : TextSplit(flags), m_prc(prc) {} - virtual bool text_to_words(const string &in) - { - bool ret = TextSplit::text_to_words(in); - if (m_prc && !m_prc->flush()) - return false; - return ret; + virtual bool text_to_words(const string &in) { + bool ret = TextSplit::text_to_words(in); + if (m_prc && !m_prc->flush()) + return false; + return ret; } - virtual bool takeword(const string& term, int pos, int bs, int be) - { - if (m_prc) - return m_prc->takeword(term, pos, bs, be); - else - return true; + virtual bool takeword(const string& term, int pos, int bs, int be) { + if (m_prc) + return m_prc->takeword(term, pos, bs, be); + else + return true; } - virtual void newpage(int pos) - { - if (m_prc) - return m_prc->newpage(pos); + + virtual void newpage(int pos) { + if (m_prc) + return m_prc->newpage(pos); } private: TermProc *m_prc; }; -/** Unaccent and lowercase term. This is usually the first in the pipeline */ +/** Unaccent and lowercase term. 
If the index is + * not case/diac-sensitive, this is usually the first step in the pipeline + */ class TermProcPrep : public TermProc { public: - TermProcPrep(TermProc *nxt) - : TermProc(nxt), m_totalterms(0), m_unacerrors(0) + TermProcPrep(TermProc *nxt) + : TermProc(nxt), m_totalterms(0), m_unacerrors(0) { } virtual bool takeword(const string& itrm, int pos, int bs, int be) { - m_totalterms++; - string otrm; - if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) { - LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str())); - m_unacerrors++; - // We don't generate a fatal error because of a bad term, - // but one has to put the limit somewhere - if (m_unacerrors > 500 && - (double(m_totalterms) / double(m_unacerrors)) < 2.0) { - // More than 1 error for every other term - LOGERR(("splitter::takeword: too many unac errors %d/%d\n", - m_unacerrors, m_totalterms)); - return false; - } - return true; - } - // It may happen in some weird cases that the output from unac is - // empty (if the word actually consisted entirely of diacritics ...) - // The consequence is that a phrase search won't work without addional - // slack. - if (otrm.empty()) - return true; - else - return TermProc::takeword(otrm, pos, bs, be); + m_totalterms++; + string otrm; + if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) { + LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str())); + m_unacerrors++; + // We don't generate a fatal error because of a bad term, + // but one has to put the limit somewhere + if (m_unacerrors > 500 && + (double(m_totalterms) / double(m_unacerrors)) < 2.0) { + // More than 1 error for every other term + LOGERR(("splitter::takeword: too many unac errors %d/%d\n", + m_unacerrors, m_totalterms)); + return false; + } + return true; + } + // It may happen in some weird cases that the output from unac is + // empty (if the word actually consisted entirely of diacritics ...) 
+ // The consequence is that a phrase search won't work without additional + // slack. + if (otrm.empty()) + return true; + else + return TermProc::takeword(otrm, pos, bs, be); } virtual bool flush() { - m_totalterms = m_unacerrors = 0; - return TermProc::flush(); + m_totalterms = m_unacerrors = 0; + return TermProc::flush(); } private: @@ -156,16 +165,16 @@ private: class TermProcStop : public TermProc { public: TermProcStop(TermProc *nxt, const Rcl::StopList& stops) - : TermProc(nxt), m_stops(stops) + : TermProc(nxt), m_stops(stops) { } virtual bool takeword(const string& term, int pos, int bs, int be) { - if (m_stops.isStop(term)) { - return true; - } - return TermProc::takeword(term, pos, bs, be); + if (m_stops.isStop(term)) { + return true; + } + return TermProc::takeword(term, pos, bs, be); } private: @@ -174,73 +183,73 @@ private: /** Handle common-gram generation: combine frequent terms with neighbours to * shorten the positions lists for phrase searches. - * NOTE: This does not currently work because of bad interaction with the + * NOTE: This does not currently work because of bad interaction with the * spans (ie john@domain.com) generation in textsplit. Not used, kept for * testing only */ class TermProcCommongrams : public TermProc { public: TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops) - : TermProc(nxt), m_stops(stops), m_onlygrams(false) + : TermProc(nxt), m_stops(stops), m_onlygrams(false) { } virtual bool takeword(const string& term, int pos, int bs, int be) { - LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", - pos, bs, be, term.c_str())); - bool isstop = m_stops.isStop(term); - bool twogramemit = false; + LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", + pos, bs, be, term.c_str())); + bool isstop = m_stops.isStop(term); + bool twogramemit = false; - if (!m_prevterm.empty() && (m_prevstop || isstop)) { - // create 2-gram. 
space unnecessary but improves - // the readability of queries - string twogram; - twogram.swap(m_prevterm); - twogram.append(1, ' '); - twogram += term; - // When emitting a complex term we set the bps to 0. This may - // be used by our clients - if (!TermProc::takeword(twogram, m_prevpos, 0, 0)) - return false; - twogramemit = true; + if (!m_prevterm.empty() && (m_prevstop || isstop)) { + // create 2-gram. space unnecessary but improves + // the readability of queries + string twogram; + twogram.swap(m_prevterm); + twogram.append(1, ' '); + twogram += term; + // When emitting a complex term we set the bps to 0. This may + // be used by our clients + if (!TermProc::takeword(twogram, m_prevpos, 0, 0)) + return false; + twogramemit = true; #if 0 - if (m_stops.isStop(twogram)) { - firstword = twogram; - isstop = false; - } + if (m_stops.isStop(twogram)) { + firstword = twogram; + isstop = false; + } #endif - } - - m_prevterm = term; - m_prevstop = isstop; - m_prevpos = pos; - m_prevsent = false; - m_prevbs = bs; - m_prevbe = be; - // If flags allow, emit the bare term at the current pos. - if (!m_onlygrams || (!isstop && !twogramemit)) { - if (!TermProc::takeword(term, pos, bs, be)) - return false; - m_prevsent = true; - } + } - return true; + m_prevterm = term; + m_prevstop = isstop; + m_prevpos = pos; + m_prevsent = false; + m_prevbs = bs; + m_prevbe = be; + // If flags allow, emit the bare term at the current pos. 
+ if (!m_onlygrams || (!isstop && !twogramemit)) { + if (!TermProc::takeword(term, pos, bs, be)) + return false; + m_prevsent = true; + } + + return true; } virtual bool flush() { - if (!m_prevsent && !m_prevterm.empty()) - if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe)) - return false; - - m_prevterm.clear(); - m_prevsent = true; - return TermProc::flush(); + if (!m_prevsent && !m_prevterm.empty()) + if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe)) + return false; + + m_prevterm.clear(); + m_prevsent = true; + return TermProc::flush(); } void onlygrams(bool on) { - m_onlygrams = on; + m_onlygrams = on; } private: // The stoplist we're using