Added (unifdefd) code to detect garbage data like undecoded base64 by looking at word length stats
This commit is contained in:
parent
b75859b046
commit
b4c7efe490
@ -205,6 +205,14 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
||||
|
||||
unsigned int l = w.length();
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
// Update word length statistics. Do this before we filter out
|
||||
// long words because stats are used to detect bad text
|
||||
if (!isspan || m_wordLen == m_span.length())
|
||||
m_stats.newsamp(m_wordChars);
|
||||
#endif
|
||||
|
||||
if (l > 0 && l < (unsigned)m_maxWordLength) {
|
||||
// 1 byte word: we index single ascii letters and digits, but
|
||||
// nothing else. We might want to turn this into a test for a
|
||||
@ -316,7 +324,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
||||
// Adjust state
|
||||
if (m_wordLen) {
|
||||
m_wordpos++;
|
||||
m_wordLen = 0;
|
||||
m_wordLen = m_wordChars = 0;
|
||||
}
|
||||
if (spanerase) {
|
||||
discardspan();
|
||||
@ -332,7 +340,7 @@ void TextSplit::discardspan()
|
||||
m_span.erase();
|
||||
m_spanpos = m_wordpos;
|
||||
m_wordStart = 0;
|
||||
m_wordLen = 0;
|
||||
m_wordLen = m_wordChars = 0;
|
||||
}
|
||||
|
||||
static inline bool isalphanum(int what, unsigned int flgs)
|
||||
@ -346,6 +354,12 @@ static inline bool isdigit(int what, unsigned int flgs)
|
||||
return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
|
||||
}
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
#define INC_WORDCHARS ++m_wordChars
|
||||
#else
|
||||
#define INC_WORDCHARS
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Splitting a text into terms to be indexed.
|
||||
* We basically emit a word every time we see a separator, but some chars are
|
||||
@ -366,7 +380,8 @@ bool TextSplit::text_to_words(const string &in)
|
||||
|
||||
m_span.erase();
|
||||
m_inNumber = false;
|
||||
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
||||
m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos
|
||||
= m_spanpos = 0;
|
||||
int curspanglue = 0;
|
||||
bool pagepending = false;
|
||||
bool softhyphenpending = false;
|
||||
@ -423,6 +438,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
if (m_wordLen == 0)
|
||||
m_inNumber = true;
|
||||
m_wordLen += it.appendchartostring(m_span);
|
||||
INC_WORDCHARS;
|
||||
nonalnumcnt = 0;
|
||||
break;
|
||||
|
||||
@ -458,6 +474,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
// -10
|
||||
m_inNumber = true;
|
||||
m_wordLen += it.appendchartostring(m_span);
|
||||
INC_WORDCHARS;
|
||||
} else {
|
||||
goto SPACE;
|
||||
}
|
||||
@ -465,6 +482,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
m_span[m_span.length() - 1] == 'E')) {
|
||||
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
|
||||
m_wordLen += it.appendchartostring(m_span);
|
||||
INC_WORDCHARS;
|
||||
} else {
|
||||
goto SPACE;
|
||||
}
|
||||
@ -482,6 +500,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
if (!isdigit(nextwhat, m_flags))
|
||||
goto SPACE;
|
||||
m_wordLen += it.appendchartostring(m_span);
|
||||
INC_WORDCHARS;
|
||||
curspanglue = cc;
|
||||
break;
|
||||
} else {
|
||||
@ -501,6 +520,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
|
||||
m_inNumber = true;
|
||||
m_wordLen += it.appendchartostring(m_span);
|
||||
INC_WORDCHARS;
|
||||
curspanglue = cc;
|
||||
break;
|
||||
}
|
||||
@ -567,6 +587,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
int w = whatcc(it[it.getCpos()+1]);
|
||||
if (w == SPACE || w == '\n' || w == '\r') {
|
||||
m_wordLen += it.appendchartostring(m_span);
|
||||
INC_WORDCHARS;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -639,6 +660,7 @@ bool TextSplit::text_to_words(const string &in)
|
||||
m_inNumber = false;
|
||||
}
|
||||
m_wordLen += it.appendchartostring(m_span);
|
||||
INC_WORDCHARS;
|
||||
nonalnumcnt = 0;
|
||||
break;
|
||||
}
|
||||
@ -738,7 +760,7 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
||||
|
||||
m_span.erase();
|
||||
m_inNumber = false;
|
||||
m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;
|
||||
m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0;
|
||||
m_spanpos = m_wordpos;
|
||||
*cp = c;
|
||||
return true;
|
||||
@ -864,6 +886,7 @@ bool TextSplit::stringToStrings(const string &s, vector<string> &tokens)
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
@ -880,7 +903,7 @@ class myTermProc : public Rcl::TermProc {
|
||||
int first;
|
||||
bool nooutput;
|
||||
public:
|
||||
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
||||
myTermProc() : TermProc(0), first(1), nooutput(false) {}
|
||||
void setNoOut(bool val) {nooutput = val;}
|
||||
virtual bool takeword(const string &term, int pos, int bs, int be)
|
||||
{
|
||||
@ -1058,7 +1081,16 @@ int main(int argc, char **argv)
|
||||
printproc.setNoOut(true);
|
||||
|
||||
splitter.text_to_words(data);
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
TextSplit::Stats::Values v = splitter.getStats();
|
||||
cout << "Average length: "
|
||||
<< v.avglen
|
||||
<< " Standard deviation: "
|
||||
<< v.sigma
|
||||
<< " Coef of variation "
|
||||
<< v.sigma / v.avglen
|
||||
<< endl;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif // TEST
|
||||
|
||||
@ -17,6 +17,8 @@
|
||||
#ifndef _TEXTSPLIT_H_INCLUDED_
|
||||
#define _TEXTSPLIT_H_INCLUDED_
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@ -66,6 +68,10 @@ public:
|
||||
}
|
||||
virtual ~TextSplit() {}
|
||||
|
||||
virtual void setMaxWordLength(int l)
|
||||
{
|
||||
m_maxWordLength = l;
|
||||
}
|
||||
/** Split text, emit words and positions. */
|
||||
virtual bool text_to_words(const string &in);
|
||||
|
||||
@ -103,6 +109,67 @@ public:
|
||||
/** Is char CJK ? */
|
||||
static bool isCJK(int c);
|
||||
|
||||
/** Statistics about word length (average and dispersion) can
|
||||
* detect bad data like undecoded base64 or other mis-identified
|
||||
* pieces of data taken as text. In practice, this keeps some junk out
|
||||
* of the index, but does not decrease the index size much, and is
|
||||
* probably not worth the trouble in general. Code kept because it
|
||||
* probably can be useful in special cases. Base64 data does have
|
||||
* word separators in it (+/) and is characterised by high average
|
||||
* word length (>10, often close to 20) and high word length
|
||||
* dispersion (sigma/avg > 0.8). In my tests, most natural
|
||||
* language text has average word lengths around 5-8 and sigma/avg
|
||||
* < 0.7
|
||||
*/
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
/**
 * Running word-length statistics: sample count, average length and
 * standard deviation.
 *
 * Samples are fed one at a time through newsamp(); get() returns a
 * snapshot. The sum of squared deviations is maintained with Welford's
 * online update, so sigma is the exact population standard deviation of
 * the samples seen so far, not an approximation based on the running
 * mean at the time each sample arrived.
 */
class Stats {
public:
    /** Snapshot of the statistics gathered since the last reset(). */
    struct Values {
        int count;      // Number of samples
        double avglen;  // Mean sample value (0 if there are no samples)
        double sigma;   // Population standard deviation (0 if no samples)
    };

    Stats()
    {
        reset();
    }

    /** Forget all samples. */
    void reset()
    {
        count = 0;
        totlen = 0;
        sigma_acc = 0;
    }

    /** Account for one more word of length len. */
    void newsamp(unsigned int len)
    {
        // Mean before this sample. Its value is irrelevant for the
        // first sample because the second factor below is then zero.
        const double oldavg =
            count > 0 ? double(totlen) / double(count) : 0.0;
        ++count;
        totlen += len;
        const double newavg = double(totlen) / double(count);
        // Welford: both factors always have the same sign, so the
        // accumulator can never become negative.
        sigma_acc += (double(len) - oldavg) * (double(len) - newavg);
    }

    /** Compute the current values. All zeros if no sample was seen
     *  (avoids returning NaN from a 0/0 division). */
    Values get() const
    {
        Values v;
        v.count = count;
        if (count > 0) {
            v.avglen = double(totlen) / double(count);
            v.sigma = sqrt(sigma_acc / double(count));
        } else {
            v.avglen = 0.0;
            v.sigma = 0.0;
        }
        return v;
    }

private:
    int count;         // Number of samples
    int totlen;        // Sum of sample values
    double sigma_acc;  // Sum of squared deviations from the mean
};
|
||||
|
||||
/** Return the word length statistics accumulated in m_stats. */
Stats::Values getStats()
{
    Stats::Values snapshot = m_stats.get();
    return snapshot;
}
|
||||
void resetStats()
|
||||
{
|
||||
m_stats.reset();
|
||||
}
|
||||
#endif // TEXTSPLIT_STATS
|
||||
|
||||
private:
|
||||
Flags m_flags;
|
||||
int m_maxWordLength;
|
||||
@ -127,6 +194,15 @@ private:
|
||||
int m_prevpos;
|
||||
unsigned int m_prevlen;
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
// Stats counters. These are processed in TextSplit rather than by a
|
||||
// TermProc so that we can take very long words (not emitted) into
|
||||
// account.
|
||||
Stats m_stats;
|
||||
#endif
|
||||
// Word length in characters. Declared but not updated if !TEXTSPLIT_STATS
|
||||
unsigned int m_wordChars;
|
||||
|
||||
// This processes cjk text:
|
||||
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
||||
|
||||
|
||||
@ -1192,9 +1192,28 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
||||
|
||||
// Split and index body text
|
||||
LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
splitter.resetStats();
|
||||
#endif
|
||||
if (!splitter.text_to_words(doc.text))
|
||||
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
|
||||
|
||||
#ifdef TEXTSPLIT_STATS
|
||||
// Reject bad data. unrecognized base64 text is characterized by
|
||||
// high avg word length and high variation (because there are
|
||||
// word-splitters like +/ inside the data).
|
||||
TextSplit::Stats::Values v = splitter.getStats();
|
||||
// v.avglen > 15 && v.sigma > 12
|
||||
if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
|
||||
LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
|
||||
"count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
|
||||
v.count, v.avglen, v.sigma, doc.url.c_str(),
|
||||
doc.ipath.c_str(), doc.text.c_str()));
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
////// Special terms for other metadata. No positions for these.
|
||||
// Mime type
|
||||
newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user