comments and indent

2015-06-09 19:34:15 +02:00 · 2015-06-09 19:34:15 +02:00 · 94b94593e3
commit 94b94593e3
parent 0755f4f4e2
1 changed files with 127 additions and 118 deletions
--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@ -25,14 +25,19 @@
 namespace Rcl {
 /**
- * Termproc objects take a stream of term tokens as input and do something
+ * TermProc objects take term tokens as input and do something
 * with them: transform to lowercase, filter out stop words, generate n-grams,
 * finally index or generate search clauses, etc. They are chained and can
 * be arranged to form different pipelines depending on the desired processing
 * steps: for example, optional stoplist or commongram processing.
 *
 * Shared processing steps are defined in this file. The first and last steps
- * (ie: adding index term) are usually defined in the specific module.
+ * are usually defined in the specific module.
 * - The front TermProc is typically chained from a TextSplit object
 *   which generates the original terms, and calls takeword() from its
 *   own takeword() method.
 * - The last TermProc does something with the finalized terms, e.g. adds
 *   them to the index.
 */
 /**
@ -45,106 +50,110 @@ public:
    virtual ~TermProc() {}
    virtual bool takeword(const string &term, int pos, int bs, int be)
    {
-	if (m_next)
+        if (m_next)
-	    return m_next->takeword(term, pos, bs, be);
+            return m_next->takeword(term, pos, bs, be);
-	else
+        else
-	    return true;
+            return true;
    }
    // newpage() is like takeword(), but for page breaks.
    virtual void newpage(int pos)
    {
-	if (m_next)
+        if (m_next)
-	    m_next->newpage(pos);
+            m_next->newpage(pos);
    }
    virtual bool flush()
    {
-	if (m_next)
+        if (m_next)
-	    return m_next->flush();
+            return m_next->flush();
-	else
+        else
-	    return true;
+            return true;
    }
 private:
    TermProc *m_next;
    /* Copy constructor and assignment are private and forbidden */
    TermProc(const TermProc &) {}
-    TermProc& operator=(const TermProc &) {return *this;};
+    TermProc& operator=(const TermProc &) {
        return *this;
    };
 };
 /**
- * Specialized TextSplit class: this will probably replace the base
+ * Helper specialized TextSplit class, feeds the pipeline:
- * TextSplit when we've converted all the code. The takeword() routine in this
+ * - The takeword() method calls a TermProc->takeword().
- * calls a TermProc's instead of being overriden in a user derived class.
+ * - The text_to_words() method also takes care of flushing.
- * The text_to_words() method also takes care of flushing.
+ * Both methods can be further specialized by the user (they should then call
 * the base methods when they've done the local processing).
 */
 class TextSplitP : public TextSplit {
 public:
    TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
-	: TextSplit(flags), m_prc(prc)  {}
+        : TextSplit(flags), m_prc(prc)  {}
-    virtual bool text_to_words(const string &in)
+    virtual bool text_to_words(const string &in) {
-    {
+        bool ret = TextSplit::text_to_words(in);
-	bool ret = TextSplit::text_to_words(in);
+        if (m_prc && !m_prc->flush())
-	if (m_prc && !m_prc->flush())
+            return false;
-	    return false;
+        return ret;
 	return ret;
    }
-    virtual bool takeword(const string& term, int pos, int bs, int be)
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
-    {
+        if (m_prc)
-	if (m_prc)
+            return m_prc->takeword(term, pos, bs, be);
-	    return m_prc->takeword(term, pos, bs, be);
+        else
-	else
+            return true;
 	    return true;
    }
-    virtual void newpage(int pos)
+
-    {
+    virtual void newpage(int pos) {
-	if (m_prc)
+        if (m_prc)
-	    return m_prc->newpage(pos);
+            return m_prc->newpage(pos);
    }
 private:
    TermProc *m_prc;
 };
-/** Unaccent and lowercase term. This is usually the first in the pipeline */
+/** Unaccent and lowercase term. If the index is
 *  not case/diac-sensitive, this is usually the first step in the pipeline
 */
 class TermProcPrep : public TermProc {
 public:
    TermProcPrep(TermProc *nxt)
-	: TermProc(nxt), m_totalterms(0), m_unacerrors(0) 
+        : TermProc(nxt), m_totalterms(0), m_unacerrors(0)
    {
    }
    virtual bool takeword(const string& itrm, int pos, int bs, int be)
    {
-	m_totalterms++;
+        m_totalterms++;
-	string otrm;
+        string otrm;
-	if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
+        if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
-	    LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
+            LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
-	    m_unacerrors++;
+            m_unacerrors++;
-	    // We don't generate a fatal error because of a bad term,
+            // We don't generate a fatal error because of a bad term,
-	    // but one has to put the limit somewhere
+            // but one has to put the limit somewhere
-	    if (m_unacerrors > 500 && 
+            if (m_unacerrors > 500 &&
-		(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
+                    (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
-		// More than 1 error for every other term
+                // More than 1 error for every other term
-		LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
+                LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
-			m_unacerrors, m_totalterms));
+                        m_unacerrors, m_totalterms));
-		return false;
+                return false;
-	    }
+            }
-	    return true;
+            return true;
-	}
+        }
-	// It may happen in some weird cases that the output from unac is 
+        // It may happen in some weird cases that the output from unac is
-	// empty (if the word actually consisted entirely of diacritics ...)
+        // empty (if the word actually consisted entirely of diacritics ...)
-	// The consequence is that a phrase search won't work without addional
+        // The consequence is that a phrase search won't work without additional
-	// slack. 
+        // slack.
-	if (otrm.empty())
+        if (otrm.empty())
-	    return true;
+            return true;
-	else
+        else
-	    return TermProc::takeword(otrm, pos, bs, be);
+            return TermProc::takeword(otrm, pos, bs, be);
    }
    virtual bool flush()
    {
-	m_totalterms = m_unacerrors = 0;
+        m_totalterms = m_unacerrors = 0;
-	return TermProc::flush();
+        return TermProc::flush();
    }
 private:
@ -156,16 +165,16 @@ private:
 class TermProcStop : public TermProc {
 public:
    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
-	: TermProc(nxt), m_stops(stops) 
+        : TermProc(nxt), m_stops(stops)
    {
    }
    virtual bool takeword(const string& term, int pos, int bs, int be)
    {
-	if (m_stops.isStop(term)) {
+        if (m_stops.isStop(term)) {
-	    return true;
+            return true;
-	}
+        }
-	return TermProc::takeword(term, pos, bs, be);
+        return TermProc::takeword(term, pos, bs, be);
    }
 private:
@ -181,66 +190,66 @@ private:
 class TermProcCommongrams : public TermProc {
 public:
    TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
-	: TermProc(nxt), m_stops(stops), m_onlygrams(false) 
+        : TermProc(nxt), m_stops(stops), m_onlygrams(false)
    {
    }
    virtual bool takeword(const string& term, int pos, int bs, int be)
    {
-	LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", 
+        LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
-		 pos, bs, be, term.c_str()));
+                 pos, bs, be, term.c_str()));
-	bool isstop = m_stops.isStop(term);
+        bool isstop = m_stops.isStop(term);
-	bool twogramemit = false;
+        bool twogramemit = false;
-	if (!m_prevterm.empty() && (m_prevstop || isstop)) {
+        if (!m_prevterm.empty() && (m_prevstop || isstop)) {
-	    // create 2-gram. space unnecessary but improves
+            // create 2-gram. space unnecessary but improves
-	    // the readability of queries
+            // the readability of queries
-	    string twogram;
+            string twogram;
-	    twogram.swap(m_prevterm);
+            twogram.swap(m_prevterm);
-	    twogram.append(1, ' ');
+            twogram.append(1, ' ');
-	    twogram += term;
+            twogram += term;
-	    // When emitting a complex term we set the bps to 0. This may
+            // When emitting a complex term we set the bps to 0. This may
-	    // be used by our clients
+            // be used by our clients
-	    if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
+            if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
-		return false;
+                return false;
-	    twogramemit = true;
+            twogramemit = true;
 #if 0
-	    if (m_stops.isStop(twogram)) {
+            if (m_stops.isStop(twogram)) {
-		firstword = twogram;
+                firstword = twogram;
-		isstop = false;
+                isstop = false;
-	    }
+            }
 #endif
-	}
+        }
-	m_prevterm = term;
+        m_prevterm = term;
-	m_prevstop = isstop;
+        m_prevstop = isstop;
-	m_prevpos = pos;
+        m_prevpos = pos;
-	m_prevsent = false;
+        m_prevsent = false;
-	m_prevbs = bs;
+        m_prevbs = bs;
-	m_prevbe = be;
+        m_prevbe = be;
-	// If flags allow, emit the bare term at the current pos.
+        // If flags allow, emit the bare term at the current pos.
-	if (!m_onlygrams || (!isstop && !twogramemit)) {
+        if (!m_onlygrams || (!isstop && !twogramemit)) {
-	    if (!TermProc::takeword(term, pos, bs, be))
+            if (!TermProc::takeword(term, pos, bs, be))
-		return false;
+                return false;
-	    m_prevsent = true;
+            m_prevsent = true;
-	} 
+        }
-	return true;
+        return true;
    }
    virtual bool flush()
    {
-	if (!m_prevsent && !m_prevterm.empty())
+        if (!m_prevsent && !m_prevterm.empty())
-	    if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
+            if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
-		return false;
+                return false;
-	m_prevterm.clear();
+        m_prevterm.clear();
-	m_prevsent = true;
+        m_prevsent = true;
-	return TermProc::flush();
+        return TermProc::flush();
    }
    void onlygrams(bool on)
    {
-	m_onlygrams = on;
+        m_onlygrams = on;
    }
 private:
    // The stoplist we're using