Added (unifdefd) code to detect garbage data like undecoded base64 by looking at word length stats

This commit is contained in:
Jean-Francois Dockes 2013-04-27 08:29:55 +02:00
parent b75859b046
commit b4c7efe490
3 changed files with 133 additions and 6 deletions

View File

@ -205,6 +205,14 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
unsigned int l = w.length();
#ifdef TEXTSPLIT_STATS
// Update word length statistics. Do this before we filter out
// long words because stats are used to detect bad text
if (!isspan || m_wordLen == m_span.length())
m_stats.newsamp(m_wordChars);
#endif
if (l > 0 && l < (unsigned)m_maxWordLength) {
// 1 byte word: we index single ascii letters and digits, but
// nothing else. We might want to turn this into a test for a
@ -316,7 +324,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
// Adjust state
if (m_wordLen) {
m_wordpos++;
m_wordLen = 0;
m_wordLen = m_wordChars = 0;
}
if (spanerase) {
discardspan();
@ -332,7 +340,7 @@ void TextSplit::discardspan()
m_span.erase();
m_spanpos = m_wordpos;
m_wordStart = 0;
m_wordLen = 0;
m_wordLen = m_wordChars = 0;
}
static inline bool isalphanum(int what, unsigned int flgs)
@ -346,6 +354,12 @@ static inline bool isdigit(int what, unsigned int flgs)
return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
}
// INC_WORDCHARS increments the m_wordChars counter of the current word
// when TEXTSPLIT_STATS is defined, and expands to nothing otherwise, so
// that call sites do not need their own #ifdef.
#ifdef TEXTSPLIT_STATS
#define INC_WORDCHARS ++m_wordChars
#else
#define INC_WORDCHARS
#endif
/**
* Splitting a text into terms to be indexed.
* We basically emit a word every time we see a separator, but some chars are
@ -366,7 +380,8 @@ bool TextSplit::text_to_words(const string &in)
m_span.erase();
m_inNumber = false;
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos
= m_spanpos = 0;
int curspanglue = 0;
bool pagepending = false;
bool softhyphenpending = false;
@ -423,6 +438,7 @@ bool TextSplit::text_to_words(const string &in)
if (m_wordLen == 0)
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
INC_WORDCHARS;
nonalnumcnt = 0;
break;
@ -458,6 +474,7 @@ bool TextSplit::text_to_words(const string &in)
// -10
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
INC_WORDCHARS;
} else {
goto SPACE;
}
@ -465,6 +482,7 @@ bool TextSplit::text_to_words(const string &in)
m_span[m_span.length() - 1] == 'E')) {
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
m_wordLen += it.appendchartostring(m_span);
INC_WORDCHARS;
} else {
goto SPACE;
}
@ -482,6 +500,7 @@ bool TextSplit::text_to_words(const string &in)
if (!isdigit(nextwhat, m_flags))
goto SPACE;
m_wordLen += it.appendchartostring(m_span);
INC_WORDCHARS;
curspanglue = cc;
break;
} else {
@ -501,6 +520,7 @@ bool TextSplit::text_to_words(const string &in)
if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
INC_WORDCHARS;
curspanglue = cc;
break;
}
@ -567,6 +587,7 @@ bool TextSplit::text_to_words(const string &in)
int w = whatcc(it[it.getCpos()+1]);
if (w == SPACE || w == '\n' || w == '\r') {
m_wordLen += it.appendchartostring(m_span);
INC_WORDCHARS;
break;
}
}
@ -639,6 +660,7 @@ bool TextSplit::text_to_words(const string &in)
m_inNumber = false;
}
m_wordLen += it.appendchartostring(m_span);
INC_WORDCHARS;
nonalnumcnt = 0;
break;
}
@ -738,7 +760,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
m_span.erase();
m_inNumber = false;
m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;
m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0;
m_spanpos = m_wordpos;
*cp = c;
return true;
@ -864,6 +886,7 @@ bool TextSplit::stringToStrings(const string &s, vector<string> &tokens)
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <math.h>
#include <iostream>
@ -880,7 +903,7 @@ class myTermProc : public Rcl::TermProc {
int first;
bool nooutput;
public:
myTermProc() : TermProc(0), first(1), nooutput(false) {}
myTermProc() : TermProc(0), first(1), nooutput(false) {}
void setNoOut(bool val) {nooutput = val;}
virtual bool takeword(const string &term, int pos, int bs, int be)
{
@ -1058,7 +1081,16 @@ int main(int argc, char **argv)
printproc.setNoOut(true);
splitter.text_to_words(data);
#ifdef TEXTSPLIT_STATS
TextSplit::Stats::Values v = splitter.getStats();
cout << "Average length: "
<< v.avglen
<< " Standard deviation: "
<< v.sigma
<< " Coef of variation "
<< v.sigma / v.avglen
<< endl;
#endif
}
}
#endif // TEST

View File

@ -17,6 +17,8 @@
#ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_
#include <math.h>
#include <string>
#include <vector>
@ -66,6 +68,10 @@ public:
}
virtual ~TextSplit() {}
virtual void setMaxWordLength(int l)
{
m_maxWordLength = l;
}
/** Split text, emit words and positions. */
virtual bool text_to_words(const string &in);
@ -103,6 +109,67 @@ public:
/** Is char CJK ? */
static bool isCJK(int c);
/** Statistics about word length (average and dispersion) can
* detect bad data like undecoded base64 or other mis-identified
* pieces of data taken as text. In practice, this keeps some junk out
* of the index, but does not decrease the index size much, and is
* probably not worth the trouble in general. Code kept because it
* can probably be useful in special cases. Base64 data does have
* word separators in it (+/) and is characterised by high average
* word length (>10, often close to 20) and high word length
* dispersion (sigma/avg > 0.8). In my tests, most natural
* language text has average word lengths around 5-8 and sigma/avg
* < 0.7
*/
#ifdef TEXTSPLIT_STATS
/**
 * Running word-length statistics: sample count, mean length and
 * population standard deviation of the lengths fed to newsamp().
 */
class Stats {
public:
    Stats()
    {
        reset();
    }

    /** Forget all samples recorded so far. */
    void reset()
    {
        count = 0;
        totlen = 0;
        sigma_acc = 0;
    }

    /**
     * Record one word length.
     *
     * The squared-deviation accumulator uses the Welford update
     * (x - mean_before) * (x - mean_after). Summed over all samples this
     * equals the sum of squared deviations from the final mean, which
     * accumulating (x - running_mean)^2 would underestimate.
     */
    void newsamp(unsigned int len)
    {
        const double x = double(len);
        // Mean of the samples seen before this one (0 if there are none).
        const double mean_before = count ? double(totlen) / double(count) : 0.0;
        ++count;
        totlen += len;
        const double mean_after = double(totlen) / double(count);
        sigma_acc += (x - mean_before) * (x - mean_after);
    }

    /** Snapshot of the statistics, as returned by get(). */
    struct Values {
        int count;      // Number of samples recorded
        double avglen;  // Mean sample length
        double sigma;   // Population standard deviation of the lengths
    };

    /**
     * Compute the current statistics.
     * @return count, mean and standard deviation. All three are 0 when
     *   no sample has been recorded, instead of the NaN values that a
     *   division by a zero count would produce.
     */
    Values get() const
    {
        Values v;
        v.count = count;
        if (count <= 0) {
            v.avglen = 0.0;
            v.sigma = 0.0;
            return v;
        }
        v.avglen = double(totlen) / double(count);
        v.sigma = sqrt(sigma_acc / count);
        return v;
    }

private:
    int count;         // Number of samples
    int totlen;        // Sum of all sample lengths
    double sigma_acc;  // Sum of squared deviations from the mean
};
/** Return the current values computed by the m_stats accumulator. */
Stats::Values getStats()
{
    Stats::Values current = m_stats.get();
    return current;
}
void resetStats()
{
m_stats.reset();
}
#endif // TEXTSPLIT_STATS
private:
Flags m_flags;
int m_maxWordLength;
@ -127,6 +194,15 @@ private:
int m_prevpos;
unsigned int m_prevlen;
#ifdef TEXTSPLIT_STATS
// Stats counters. These are processed in TextSplit rather than by a
// TermProc so that we can take very long words (not emitted) into
// account.
Stats m_stats;
#endif
// Word length in characters. Declared but not updated if !TEXTSPLIT_STATS
unsigned int m_wordChars;
// This processes cjk text:
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);

View File

@ -1192,9 +1192,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
// Split and index body text
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
#ifdef TEXTSPLIT_STATS
splitter.resetStats();
#endif
if (!splitter.text_to_words(doc.text))
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
#ifdef TEXTSPLIT_STATS
// Reject bad data. unrecognized base64 text is characterized by
// high avg word length and high variation (because there are
// word-splitters like +/ inside the data).
TextSplit::Stats::Values v = splitter.getStats();
// v.avglen > 15 && v.sigma > 12
if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
"count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
v.count, v.avglen, v.sigma, doc.url.c_str(),
doc.ipath.c_str(), doc.text.c_str()));
return true;
}
#endif
////// Special terms for other metadata. No positions for these.
// Mime type
newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);