From b4c7efe490abdd149195181075ec62e577cf7284 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 27 Apr 2013 08:29:55 +0200 Subject: [PATCH] Added (unifdefd) code to detect garbage data like undecoded base64 by looking at word length stats --- src/common/textsplit.cpp | 44 +++++++++++++++++++---- src/common/textsplit.h | 76 ++++++++++++++++++++++++++++++++++++++++ src/rcldb/rcldb.cpp | 19 ++++++++++ 3 files changed, 133 insertions(+), 6 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 7faa4afb..7380c144 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -205,6 +205,14 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos)); unsigned int l = w.length(); + +#ifdef TEXTSPLIT_STATS + // Update word length statistics. Do this before we filter out + // long words because stats are used to detect bad text + if (!isspan || m_wordLen == m_span.length()) + m_stats.newsamp(m_wordChars); +#endif + if (l > 0 && l < (unsigned)m_maxWordLength) { // 1 byte word: we index single ascii letters and digits, but // nothing else. We might want to turn this into a test for a @@ -316,7 +324,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) // Adjust state if (m_wordLen) { m_wordpos++; - m_wordLen = 0; + m_wordLen = m_wordChars = 0; } if (spanerase) { discardspan(); @@ -332,7 +340,7 @@ void TextSplit::discardspan() m_span.erase(); m_spanpos = m_wordpos; m_wordStart = 0; - m_wordLen = 0; + m_wordLen = m_wordChars = 0; } static inline bool isalphanum(int what, unsigned int flgs) @@ -346,6 +354,12 @@ static inline bool isdigit(int what, unsigned int flgs) return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD); } +#ifdef TEXTSPLIT_STATS +#define INC_WORDCHARS ++m_wordChars +#else +#define INC_WORDCHARS +#endif + /** * Splitting a text into terms to be indexed. 
* We basically emit a word every time we see a separator, but some chars are @@ -366,7 +380,8 @@ bool TextSplit::text_to_words(const string &in) m_span.erase(); m_inNumber = false; - m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0; + m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos + = m_spanpos = 0; int curspanglue = 0; bool pagepending = false; bool softhyphenpending = false; @@ -423,6 +438,7 @@ bool TextSplit::text_to_words(const string &in) if (m_wordLen == 0) m_inNumber = true; m_wordLen += it.appendchartostring(m_span); + INC_WORDCHARS; nonalnumcnt = 0; break; @@ -458,6 +474,7 @@ bool TextSplit::text_to_words(const string &in) // -10 m_inNumber = true; m_wordLen += it.appendchartostring(m_span); + INC_WORDCHARS; } else { goto SPACE; } @@ -465,6 +482,7 @@ bool TextSplit::text_to_words(const string &in) m_span[m_span.length() - 1] == 'E')) { if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) { m_wordLen += it.appendchartostring(m_span); + INC_WORDCHARS; } else { goto SPACE; } @@ -482,6 +500,7 @@ bool TextSplit::text_to_words(const string &in) if (!isdigit(nextwhat, m_flags)) goto SPACE; m_wordLen += it.appendchartostring(m_span); + INC_WORDCHARS; curspanglue = cc; break; } else { @@ -501,6 +520,7 @@ bool TextSplit::text_to_words(const string &in) if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) { m_inNumber = true; m_wordLen += it.appendchartostring(m_span); + INC_WORDCHARS; curspanglue = cc; break; } @@ -567,6 +587,7 @@ bool TextSplit::text_to_words(const string &in) int w = whatcc(it[it.getCpos()+1]); if (w == SPACE || w == '\n' || w == '\r') { m_wordLen += it.appendchartostring(m_span); + INC_WORDCHARS; break; } } @@ -639,6 +660,7 @@ bool TextSplit::text_to_words(const string &in) m_inNumber = false; } m_wordLen += it.appendchartostring(m_span); + INC_WORDCHARS; nonalnumcnt = 0; break; } @@ -738,7 +760,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) m_span.erase(); m_inNumber = 
false; - m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0; + m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0; m_spanpos = m_wordpos; *cp = c; return true; @@ -864,6 +886,7 @@ bool TextSplit::stringToStrings(const string &s, vector &tokens) #include #include #include +#include #include @@ -880,7 +903,7 @@ class myTermProc : public Rcl::TermProc { int first; bool nooutput; public: - myTermProc() : TermProc(0), first(1), nooutput(false) {} + myTermProc() : TermProc(0), first(1), nooutput(false) {} void setNoOut(bool val) {nooutput = val;} virtual bool takeword(const string &term, int pos, int bs, int be) { @@ -1058,7 +1081,16 @@ int main(int argc, char **argv) printproc.setNoOut(true); splitter.text_to_words(data); - +#ifdef TEXTSPLIT_STATS + TextSplit::Stats::Values v = splitter.getStats(); + cout << "Average length: " + << v.avglen + << " Standard deviation: " + << v.sigma + << " Coef of variation " + << v.sigma / v.avglen + << endl; +#endif } } #endif // TEST diff --git a/src/common/textsplit.h b/src/common/textsplit.h index b5be2be2..fd30ea9a 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -17,6 +17,8 @@ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ +#include + #include #include @@ -66,6 +68,10 @@ public: } virtual ~TextSplit() {} + virtual void setMaxWordLength(int l) + { + m_maxWordLength = l; + } /** Split text, emit words and positions. */ virtual bool text_to_words(const string &in); @@ -103,6 +109,67 @@ public: /** Is char CJK ? */ static bool isCJK(int c); + /** Statistics about word length (average and dispersion) can + * detect bad data like undecoded base64 or other mis-identified + * pieces of data taken as text. In practise, this keeps some junk out + * of the index, but does not decrease the index size much, and is + * probably not worth the trouble in general. Code kept because it + * probably can be useful in special cases. 
Base64 data does have + * word separators in it (+/) and is characterised by high average + * word length (>10, often close to 20) and high word length + * dispersion (sigma/avg > 0.8). In my tests, most natural + * language text has average word lengths around 5-8 and sigma/avg + * < 0.7 + */ +#ifdef TEXTSPLIT_STATS + class Stats { + public: + Stats() + { + reset(); + } + void reset() + { + count = 0; + totlen = 0; + sigma_acc = 0; + } + void newsamp(unsigned int len) + { + ++count; + totlen += len; + double avglen = double(totlen) / double(count); + sigma_acc += (avglen - len) * (avglen - len); + } + struct Values { + int count; + double avglen; + double sigma; + }; + Values get() + { + Values v; + v.count = count; + v.avglen = double(totlen) / double(count); + v.sigma = sqrt(sigma_acc / count); + return v; + } + private: + int count; + int totlen; + double sigma_acc; + }; + + Stats::Values getStats() + { + return m_stats.get(); + } + void resetStats() + { + m_stats.reset(); + } +#endif // TEXTSPLIT_STATS + private: Flags m_flags; int m_maxWordLength; @@ -127,6 +194,15 @@ private: int m_prevpos; unsigned int m_prevlen; +#ifdef TEXTSPLIT_STATS + // Stats counters. These are processed in TextSplit rather than by a + // TermProc so that we can take very long words (not emitted) into + // account. + Stats m_stats; +#endif + // Word length in characters. 
Declared and reset, but never incremented, if !TEXTSPLIT_STATS + unsigned int m_wordChars; + // This processes cjk text: bool cjk_to_words(Utf8Iter *it, unsigned int *cp); diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 3ca55fd5..a381740f 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1192,9 +1192,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc) // Split and index body text LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str())); + +#ifdef TEXTSPLIT_STATS + splitter.resetStats(); +#endif if (!splitter.text_to_words(doc.text)) LOGDEB(("Db::addOrUpdate: split failed for main text\n")); +#ifdef TEXTSPLIT_STATS + // Reject bad data. Unrecognized base64 text is characterized by + // high avg word length and high variation (because there are + // word-splitters like +/ inside the data). + TextSplit::Stats::Values v = splitter.getStats(); + // v.avglen > 15 && v.sigma > 12 + if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) { + LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats " + "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n", + v.count, v.avglen, v.sigma, doc.url.c_str(), + doc.ipath.c_str(), doc.text.c_str())); + return true; + } +#endif + ////// Special terms for other metadata. No positions for these. // Mime type newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);