Added (unifdefd) code to detect garbage data like undecoded base64 by looking at word length stats

2013-04-27 08:29:55 +02:00 · 2013-04-27 08:29:55 +02:00 · b4c7efe490
commit b4c7efe490
parent b75859b046
3 changed files with 133 additions and 6 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -205,6 +205,14 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
    LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));

    unsigned int l = w.length();
+
+#ifdef TEXTSPLIT_STATS
+    // Update word length statistics. Do this before we filter out
+    // long words because stats are used to detect bad text
+    if (!isspan || m_wordLen == m_span.length())
+	m_stats.newsamp(m_wordChars);
+#endif
+
    if (l > 0 && l < (unsigned)m_maxWordLength) {
 	// 1 byte word: we index single ascii letters and digits, but
 	// nothing else. We might want to turn this into a test for a
@ -316,7 +324,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
    // Adjust state
    if (m_wordLen) {
 	m_wordpos++;
-	m_wordLen = 0;
+	m_wordLen = m_wordChars = 0;
    }
    if (spanerase) {
 	discardspan();
@ -332,7 +340,7 @@ void TextSplit::discardspan()
    m_span.erase();
    m_spanpos = m_wordpos;
    m_wordStart = 0;
-    m_wordLen = 0;
+    m_wordLen = m_wordChars = 0;
 }

 static inline bool isalphanum(int what, unsigned int flgs)
@ -346,6 +354,12 @@ static inline bool isdigit(int what, unsigned int flgs)
    return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
 }

+// Count one more character in the current word. Expands to nothing
+// unless word length statistics are compiled in.
+#ifndef TEXTSPLIT_STATS
+#define INC_WORDCHARS
+#else
+#define INC_WORDCHARS ++m_wordChars
+#endif
+
 /** 
 * Splitting a text into terms to be indexed.
 * We basically emit a word every time we see a separator, but some chars are
@ -366,7 +380,8 @@ bool TextSplit::text_to_words(const string &in)

    m_span.erase();
    m_inNumber = false;
-    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
+    m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos 
+	= m_spanpos = 0;
    int curspanglue = 0;
    bool pagepending = false;
    bool softhyphenpending = false;
@ -423,6 +438,7 @@ bool TextSplit::text_to_words(const string &in)
 	    if (m_wordLen == 0)
 		m_inNumber = true;
 	    m_wordLen += it.appendchartostring(m_span);
+	    INC_WORDCHARS;
 	    nonalnumcnt = 0;
 	    break;

@ -458,6 +474,7 @@ bool TextSplit::text_to_words(const string &in)
 		    // -10
 		    m_inNumber = true;
 		    m_wordLen += it.appendchartostring(m_span);
+		    INC_WORDCHARS;
 		} else {
 		    goto SPACE;
 		} 
@ -465,6 +482,7 @@ bool TextSplit::text_to_words(const string &in)
 				      m_span[m_span.length() - 1] == 'E')) {
 		if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
 		    m_wordLen += it.appendchartostring(m_span);
+		    INC_WORDCHARS;
 		} else {
 		    goto SPACE;
 		}
@ -482,6 +500,7 @@ bool TextSplit::text_to_words(const string &in)
 		if (!isdigit(nextwhat, m_flags))
 		    goto SPACE;
 		m_wordLen += it.appendchartostring(m_span);
+		INC_WORDCHARS;
 		curspanglue = cc;
 		break;
 	    } else {
@ -501,6 +520,7 @@ bool TextSplit::text_to_words(const string &in)
                    if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
                        m_inNumber = true;
                        m_wordLen += it.appendchartostring(m_span);
+			INC_WORDCHARS;
                        curspanglue = cc;
                        break;
                    }
@ -567,6 +587,7 @@ bool TextSplit::text_to_words(const string &in)
 		int w = whatcc(it[it.getCpos()+1]);
 		if (w == SPACE || w == '\n' || w == '\r') {
 		    m_wordLen += it.appendchartostring(m_span);
+		    INC_WORDCHARS;
 		    break;
 		}
 	    }
@ -639,6 +660,7 @@ bool TextSplit::text_to_words(const string &in)
                m_inNumber = false;
            }
 	    m_wordLen += it.appendchartostring(m_span);
+	    INC_WORDCHARS;
 	    nonalnumcnt = 0;
 	    break;
 	}
@ -738,7 +760,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)

    m_span.erase();
    m_inNumber = false;
-    m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;
+    m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0;
    m_spanpos = m_wordpos;
    *cp = c;
    return true;
@ -864,6 +886,7 @@ bool TextSplit::stringToStrings(const string &s, vector<string> &tokens)
 #include <errno.h>
 #include <fcntl.h>
 #include <string.h>
+#include <math.h>

 #include <iostream>

@ -880,7 +903,7 @@ class myTermProc : public Rcl::TermProc {
    int first;
    bool nooutput;
 public:
-    myTermProc() : TermProc(0), first(1), nooutput(false)  {}
+    myTermProc() : TermProc(0), first(1), nooutput(false) {}
    void setNoOut(bool val) {nooutput = val;}
    virtual bool takeword(const string &term, int pos, int bs, int be)
    {
@ -1058,7 +1081,16 @@ int main(int argc, char **argv)
            printproc.setNoOut(true);

 	splitter.text_to_words(data);
-
+#ifdef TEXTSPLIT_STATS
+	TextSplit::Stats::Values v = splitter.getStats();
+	cout << "Average length: " 
+	     <<  v.avglen
+	     << " Standard deviation: " 
+	     << v.sigma
+	     << " Coef of variation "
+	     << v.sigma / v.avglen
+	     << endl;
+#endif
    }    
 }
 #endif // TEST
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -17,6 +17,8 @@
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_

+#include <math.h>
+
 #include <string>
 #include <vector>

@ -66,6 +68,10 @@ public:
    }
    virtual ~TextSplit() {}

+    /** Change the word length limit. emitterm() compares the byte
+     * length of each candidate term against this value. */
+    virtual void setMaxWordLength(int l) { m_maxWordLength = l; }
    /** Split text, emit words and positions. */
    virtual bool text_to_words(const string &in);

@ -103,6 +109,67 @@ public:
    /** Is char CJK ? */
    static bool isCJK(int c);

+    /** Statistics about word length (average and dispersion) can
+     * detect bad data like undecoded base64 or other mis-identified
+     * pieces of data taken as text. In practice, this keeps some junk out
+     * of the index, but does not decrease the index size much, and is
+     * probably not worth the trouble in general. Code kept because it
+     * probably can be useful in special cases. Base64 data does have
+     * word separators in it (+/) and is characterised by high average
+     * word length (>10, often close to 20) and high word length
+     * dispersion (sigma/avg > 0.8). In my tests, most natural
+     * language text has average word lengths around 5-8 and sigma/avg
+     * < 0.7
+     */
+#ifdef TEXTSPLIT_STATS
+    /**
+     * Accumulator for word length statistics: number of samples, mean
+     * length and standard deviation of the lengths given to newsamp().
+     */
+    class Stats {
+    public:
+	Stats()
+	{
+	    reset();
+	}
+	/** Forget all recorded samples. */
+	void reset()
+	{
+	    count = 0;
+	    totlen = 0;
+	    sigma_acc = 0;
+	}
+	/** Record one sample.
+	 * @param len length of the new word.
+	 */
+	void newsamp(unsigned int len)
+	{
+	    // Welford's update: adding (x - mean_before) * (x - mean_after)
+	    // makes sigma_acc the exact sum of squared deviations from the
+	    // final mean. Squaring the distance to the running mean alone
+	    // underestimates it. The first sample contributes 0.
+	    double oldavg = count ? double(totlen) / double(count) : 0.0;
+	    ++count;
+	    totlen += len;
+	    double avglen = double(totlen) / double(count);
+	    sigma_acc += (len - oldavg) * (len - avglen);
+	}
+	/** Snapshot of the statistics, as returned by get(). */
+	struct Values {
+	    int count;      // Number of samples
+	    double avglen;  // Mean sample length
+	    double sigma;   // Standard deviation of the sample lengths
+	};
+	/** Compute the current values.
+	 * @return count, mean and standard deviation. The mean and
+	 *   deviation are 0 when no sample was recorded, rather than
+	 *   the NaN which a 0/0 division would produce.
+	 */
+	Values get() const
+	{
+	    Values v;
+	    v.count = count;
+	    if (count > 0) {
+		v.avglen = double(totlen) / double(count);
+		v.sigma = sqrt(sigma_acc / count);
+	    } else {
+		v.avglen = 0;
+		v.sigma = 0;
+	    }
+	    return v;
+	}
+    private:
+	int count;
+	int totlen;
+	double sigma_acc;
+    };
+
+    /** Return the word length statistics currently held in m_stats. */
+    Stats::Values getStats() { return m_stats.get(); }
+    /** Clear the word length statistics held in m_stats. */
+    void resetStats() { m_stats.reset(); }
+#endif // TEXTSPLIT_STATS
+
 private:
    Flags         m_flags;
    int           m_maxWordLength;
@ -127,6 +194,15 @@ private:
    int           m_prevpos;
    unsigned int  m_prevlen;

+#ifdef TEXTSPLIT_STATS
+    // Stats counters. These are processed in TextSplit rather than by a 
+    // TermProc so that we can take very long words (not emitted) into
+    // account.
+    Stats         m_stats;
+#endif
+    // Word length in chars. Always reset, but only counted if TEXTSPLIT_STATS
+    unsigned int  m_wordChars;
+
    // This processes cjk text:
    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1192,9 +1192,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)

    // Split and index body text
    LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
+
+#ifdef TEXTSPLIT_STATS
+    splitter.resetStats();
+#endif
    if (!splitter.text_to_words(doc.text))
        LOGDEB(("Db::addOrUpdate: split failed for main text\n"));

+#ifdef TEXTSPLIT_STATS
+    // Reject bad data. unrecognized base64 text is characterized by
+    // high avg word length and high variation (because there are
+    // word-splitters like +/ inside the data).
+    TextSplit::Stats::Values v = splitter.getStats();
+    // v.avglen > 15 && v.sigma > 12 
+    if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
+	LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
+	 "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
+		 v.count, v.avglen, v.sigma, doc.url.c_str(), 
+		 doc.ipath.c_str(), doc.text.c_str()));
+	return true;
+    }
+#endif
+
    ////// Special terms for other metadata. No positions for these.
    // Mime type
    newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);