From b4c7efe490abdd149195181075ec62e577cf7284 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Sat, 27 Apr 2013 08:29:55 +0200
Subject: [PATCH] Added (unifdefd) code to detect garbage data like undecoded
 base64 by looking at word length stats

---
 src/common/textsplit.cpp | 44 +++++++++++++++++++----
 src/common/textsplit.h   | 76 ++++++++++++++++++++++++++++++++++++++++
 src/rcldb/rcldb.cpp      | 19 ++++++++++
 3 files changed, 133 insertions(+), 6 deletions(-)

diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 7faa4afb..7380c144 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -205,6 +205,14 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
     LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
 
     unsigned int l = w.length();
+
+#ifdef TEXTSPLIT_STATS
+    // Update word length statistics. Do this before we filter out
+    // long words because stats are used to detect bad text
+    if (!isspan || m_wordLen == m_span.length())
+	m_stats.newsamp(m_wordChars);
+#endif
+
     if (l > 0 && l < (unsigned)m_maxWordLength) {
 	// 1 byte word: we index single ascii letters and digits, but
 	// nothing else. We might want to turn this into a test for a
@@ -316,7 +324,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
     // Adjust state
     if (m_wordLen) {
 	m_wordpos++;
-	m_wordLen = 0;
+	m_wordLen = m_wordChars = 0;
     }
     if (spanerase) {
 	discardspan();
@@ -332,7 +340,7 @@ void TextSplit::discardspan()
     m_span.erase();
     m_spanpos = m_wordpos;
     m_wordStart = 0;
-    m_wordLen = 0;
+    m_wordLen = m_wordChars = 0;
 }
 
 static inline bool isalphanum(int what, unsigned int flgs)
@@ -346,6 +354,12 @@ static inline bool isdigit(int what, unsigned int flgs)
     return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
 }
 
+#ifdef TEXTSPLIT_STATS
+#define INC_WORDCHARS ++m_wordChars
+#else /* !TEXTSPLIT_STATS: expands to nothing, characters are not counted */
+#define INC_WORDCHARS
+#endif /* TEXTSPLIT_STATS */
+
 /** 
  * Splitting a text into terms to be indexed.
  * We basically emit a word every time we see a separator, but some chars are
@@ -366,7 +380,8 @@ bool TextSplit::text_to_words(const string &in)
 
     m_span.erase();
     m_inNumber = false;
-    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
+    m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos 
+	= m_spanpos = 0;
     int curspanglue = 0;
     bool pagepending = false;
     bool softhyphenpending = false;
@@ -423,6 +438,7 @@ bool TextSplit::text_to_words(const string &in)
 	    if (m_wordLen == 0)
 		m_inNumber = true;
 	    m_wordLen += it.appendchartostring(m_span);
+	    INC_WORDCHARS;
 	    nonalnumcnt = 0;
 	    break;
 
@@ -458,6 +474,7 @@ bool TextSplit::text_to_words(const string &in)
 		    // -10
 		    m_inNumber = true;
 		    m_wordLen += it.appendchartostring(m_span);
+		    INC_WORDCHARS;
 		} else {
 		    goto SPACE;
 		} 
@@ -465,6 +482,7 @@ bool TextSplit::text_to_words(const string &in)
 				      m_span[m_span.length() - 1] == 'E')) {
 		if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
 		    m_wordLen += it.appendchartostring(m_span);
+		    INC_WORDCHARS;
 		} else {
 		    goto SPACE;
 		}
@@ -482,6 +500,7 @@ bool TextSplit::text_to_words(const string &in)
 		if (!isdigit(nextwhat, m_flags))
 		    goto SPACE;
 		m_wordLen += it.appendchartostring(m_span);
+		INC_WORDCHARS;
 		curspanglue = cc;
 		break;
 	    } else {
@@ -501,6 +520,7 @@ bool TextSplit::text_to_words(const string &in)
                     if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
                         m_inNumber = true;
                         m_wordLen += it.appendchartostring(m_span);
+			INC_WORDCHARS;
                         curspanglue = cc;
                         break;
                     }
@@ -567,6 +587,7 @@ bool TextSplit::text_to_words(const string &in)
 		int w = whatcc(it[it.getCpos()+1]);
 		if (w == SPACE || w == '\n' || w == '\r') {
 		    m_wordLen += it.appendchartostring(m_span);
+		    INC_WORDCHARS;
 		    break;
 		}
 	    }
@@ -639,6 +660,7 @@ bool TextSplit::text_to_words(const string &in)
                 m_inNumber = false;
             }
 	    m_wordLen += it.appendchartostring(m_span);
+	    INC_WORDCHARS;
 	    nonalnumcnt = 0;
 	    break;
 	}
@@ -738,7 +760,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
 
     m_span.erase();
     m_inNumber = false;
-    m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;
+    m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0;
     m_spanpos = m_wordpos;
     *cp = c;
     return true;
@@ -864,6 +886,7 @@ bool TextSplit::stringToStrings(const string &s, vector<string> &tokens)
 #include <errno.h>
 #include <fcntl.h>
 #include <string.h>
+#include <math.h>
 
 #include <iostream>
 
@@ -880,7 +903,7 @@ class myTermProc : public Rcl::TermProc {
     int first;
     bool nooutput;
 public:
-    myTermProc() : TermProc(0), first(1), nooutput(false)  {}
+    myTermProc() : TermProc(0), first(1), nooutput(false) {}
     void setNoOut(bool val) {nooutput = val;}
     virtual bool takeword(const string &term, int pos, int bs, int be)
     {
@@ -1058,7 +1081,16 @@ int main(int argc, char **argv)
             printproc.setNoOut(true);
 
 	splitter.text_to_words(data);
-
+#ifdef TEXTSPLIT_STATS
+	TextSplit::Stats::Values v = splitter.getStats();
+	cout << "Average length: " 
+	     <<  v.avglen
+	     << " Standard deviation: " 
+	     << v.sigma
+	     << " Coef of variation "
+	     << v.sigma / v.avglen
+	     << endl;
+#endif
     }    
 }
 #endif // TEST
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index b5be2be2..fd30ea9a 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -17,6 +17,8 @@
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
 
+#include <math.h>
+
 #include <string>
 #include <vector>
 
@@ -66,6 +68,10 @@ public:
     }
     virtual ~TextSplit() {}
 
+    /** Set the length limit that emitterm() compares word sizes against. */
+    virtual void setMaxWordLength(int l) {
+	m_maxWordLength = l;
+    }
     /** Split text, emit words and positions. */
     virtual bool text_to_words(const string &in);
 
@@ -103,6 +109,67 @@ public:
     /** Is char CJK ? */
     static bool isCJK(int c);
 
+    /** Statistics about word length (average and dispersion) can
+     * detect bad data like undecoded base64 or other mis-identified
+     * pieces of data taken as text. In practice, this keeps some junk out
+     * of the index, but does not decrease the index size much, and is
+     * probably not worth the trouble in general. Code kept because it
+     * probably can be useful in special cases. Base64 data does have
+     * word separators in it (+/) and is characterised by high average
+     * word length (>10, often close to 20) and high word length
+     * dispersion (sigma/avg > 0.8). In my tests, most natural
+     * language text has average word lengths around 5-8 and sigma/avg
+     * < 0.7
+     */
+#ifdef TEXTSPLIT_STATS
+    class Stats {
+    public:
+	Stats() { reset(); }
+	/** Forget all samples recorded so far. */
+	void reset() { count = 0; totlen = 0; sigma_acc = 0; }
+	/** Record the length (in characters) of one more word. */
+	void newsamp(unsigned int len)
+	{
+	    // Welford update: adding (x - oldmean) * (x - newmean) keeps
+	    // sigma_acc equal to the sum of squared deviations from the mean.
+	    double oldavg = count ? double(totlen) / double(count) : 0.0;
+	    ++count;
+	    totlen += len;
+	    double avglen = double(totlen) / double(count);
+	    sigma_acc += (len - oldavg) * (len - avglen);
+	}
+	/** Snapshot of the statistics, as returned by get(). */
+	struct Values {
+	    int count;     // Number of samples
+	    double avglen; // Mean sample value
+	    double sigma;  // Standard deviation (population) of the samples
+	};
+	/** Current values. All zero (not NaN) if nothing was recorded. */
+	Values get()
+	{
+	    Values v = {count, 0.0, 0.0};
+	    if (count > 0) {
+		v.avglen = double(totlen) / double(count);
+		v.sigma = sqrt(sigma_acc / count);
+	    }
+	    return v;
+	}
+    private:
+	int count;        // Number of samples
+	int totlen;       // Sum of all sample values
+	double sigma_acc; // Sum of squared deviations from the mean
+    };
+
+    /** Return the values currently held by the word length statistics. */
+    Stats::Values getStats() {
+	return m_stats.get();
+    }
+    /** Clear the word length statistics. */
+    void resetStats() {
+	m_stats.reset();
+    }
+#endif // TEXTSPLIT_STATS
+
 private:
     Flags         m_flags;
     int           m_maxWordLength;
@@ -127,6 +194,15 @@ private:
     int           m_prevpos;
     unsigned int  m_prevlen;
 
+#ifdef TEXTSPLIT_STATS
+    // Stats counters. These are processed in TextSplit rather than by a 
+    // TermProc so that we can take very long words (not emitted) into
+    // account.
+    Stats         m_stats;
+#endif
+    // Word length in characters. Reset but not incremented if !TEXTSPLIT_STATS
+    unsigned int  m_wordChars;
+
     // This processes cjk text:
     bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
 
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 3ca55fd5..a381740f 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1192,9 +1192,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
 
     // Split and index body text
     LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
+
+#ifdef TEXTSPLIT_STATS
+    splitter.resetStats();
+#endif
     if (!splitter.text_to_words(doc.text))
         LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
 
+#ifdef TEXTSPLIT_STATS
+    // Reject bad data. unrecognized base64 text is characterized by
+    // high avg word length and high variation (because there are
+    // word-splitters like +/ inside the data).
+    TextSplit::Stats::Values v = splitter.getStats();
+    // v.avglen > 15 && v.sigma > 12 
+    if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
+	LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
+	 "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
+		 v.count, v.avglen, v.sigma, doc.url.c_str(), 
+		 doc.ipath.c_str(), doc.text.c_str()));
+	return true;
+    }
+#endif
+
     ////// Special terms for other metadata. No positions for these.
     // Mime type
     newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);