textsplit: eliminate some garbage terms (i.e. long sequences of dashes)

This commit is contained in:
Jean-Francois Dockes 2011-07-06 16:20:32 +02:00
parent 3b6870f133
commit cb0794e92c
2 changed files with 52 additions and 20 deletions

View File

@ -208,11 +208,13 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
*/ */
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
{ {
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d " LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
"innum %d\n", m_span.c_str(), m_spanpos, m_wordStart, "inn %d span [%s]\n",
m_wordLen, spanerase, bp, m_inNumber)); spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
m_inNumber, m_span.c_str()));
// Emit span. When splitting for query, we only emit final spans // Emit span? When splitting for query, we only emit final spans
// (spanerase)
bool spanemitted = false; bool spanemitted = false;
if (!(m_flags & TXTS_NOSPANS) && if (!(m_flags & TXTS_NOSPANS) &&
!((m_wordLen == m_span.length()) && !((m_wordLen == m_span.length()) &&
@ -223,6 +225,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
while (m_span.length() > 0) { while (m_span.length() > 0) {
switch (m_span[m_span.length()-1]) { switch (m_span[m_span.length()-1]) {
case '.': case '.':
case '-':
case ',': case ',':
case '@': case '@':
case '\'': case '\'':
@ -250,12 +253,12 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
} }
// Adjust state // Adjust state
m_wordpos++; if (m_wordLen) {
m_wordLen = 0; m_wordpos++;
m_wordLen = 0;
}
if (spanerase) { if (spanerase) {
m_span.erase(); discardspan();
m_spanpos = m_wordpos;
m_wordStart = 0;
} else { } else {
m_wordStart = m_span.length(); m_wordStart = m_span.length();
} }
@ -263,6 +266,14 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
return true; return true;
} }
/**
 * Throw away the span currently being accumulated and restart word tracking.
 * Zeroes m_wordLen and m_wordStart, sets the span position to the current
 * word position, and empties m_span.
 */
void TextSplit::discardspan()
{
    // No word is in progress any more.
    m_wordLen = 0;
    m_wordStart = 0;
    // The next span begins at the current word position.
    m_spanpos = m_wordpos;
    m_span.erase();
}
/** /**
* Splitting a text into terms to be indexed. * Splitting a text into terms to be indexed.
* We basically emit a word every time we see a separator, but some chars are * We basically emit a word every time we see a separator, but some chars are
@ -283,10 +294,14 @@ bool TextSplit::text_to_words(const string &in)
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0; m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
int curspanglue = 0; int curspanglue = 0;
// Running count of consecutive non-alphanum chars. Reset to 0 when we see a word character (digit or letter) or whitespace.
int nonalnumcnt = 0;
Utf8Iter it(in); Utf8Iter it(in);
for (; !it.eof(); it++) { for (; !it.eof(); it++) {
unsigned int c = *it; unsigned int c = *it;
nonalnumcnt++;
if (c == (unsigned int)-1) { if (c == (unsigned int)-1) {
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n")); LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
@ -319,11 +334,13 @@ bool TextSplit::text_to_words(const string &in)
if (m_wordLen == 0) if (m_wordLen == 0)
m_inNumber = true; m_inNumber = true;
m_wordLen += it.appendchartostring(m_span); m_wordLen += it.appendchartostring(m_span);
nonalnumcnt = 0;
break; break;
case SPACE: case SPACE:
SPACE: SPACE:
curspanglue = 0; curspanglue = 0;
nonalnumcnt = 0;
if (m_wordLen || m_span.length()) { if (m_wordLen || m_span.length()) {
if (!doemit(true, it.getBpos())) if (!doemit(true, it.getBpos()))
return false; return false;
@ -338,20 +355,33 @@ bool TextSplit::text_to_words(const string &in)
break; break;
case '-': case '-':
case '+': case '+':
if (m_wordLen == 0 || curspanglue = cc;
(m_inNumber && (m_span[m_span.length() - 1] == 'e' || if (m_wordLen == 0) {
m_span[m_span.length() - 1] == 'E'))) { if (cc == '-') {
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
// -10
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
} else {
goto SPACE;
}
} else {
if (nonalnumcnt > 2) {
discardspan();
} else {
m_wordStart += it.appendchartostring(m_span);
}
}
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
m_span[m_span.length() - 1] == 'E')) {
if (whatcc(it[it.getCpos()+1]) == DIGIT) { if (whatcc(it[it.getCpos()+1]) == DIGIT) {
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span); m_wordLen += it.appendchartostring(m_span);
} else { } else {
m_wordStart += it.appendchartostring(m_span); goto SPACE;
} }
curspanglue = cc;
} else { } else {
if (!doemit(false, it.getBpos())) if (!doemit(false, it.getBpos()))
return false; return false;
curspanglue = cc;
m_inNumber = false; m_inNumber = false;
m_wordStart += it.appendchartostring(m_span); m_wordStart += it.appendchartostring(m_span);
} }
@ -367,13 +397,13 @@ bool TextSplit::text_to_words(const string &in)
curspanglue = cc; curspanglue = cc;
break; break;
} else { } else {
// If . inside a word, keep it, else, this is whitespace. // If . inside a word, it's spanglue, else, it's whitespace.
// We also keep an initial '.' for catching .net, but this adds // We also keep an initial '.' for catching .net, but this adds
// quite a few spurious terms ! // quite a few spurious terms !
// Another problem is that something like .x-errs // Another problem is that something like .x-errs
// will be split as .x-errs, x, errs but not x-errs // will be split as .x-errs, x, errs but not x-errs
// A final comma in a word will be removed by doemit // A final comma in a word will be removed by doemit
if (cc == '.') { if (cc == '.' && it[it.getCpos()+1] != '.') {
// Check for number like .1 // Check for number like .1
if (m_span.length() == 0 && if (m_span.length() == 0 &&
whatcc(it[it.getCpos()+1]) == DIGIT) { whatcc(it[it.getCpos()+1]) == DIGIT) {
@ -386,7 +416,7 @@ bool TextSplit::text_to_words(const string &in)
if (m_wordLen) { if (m_wordLen) {
// Disputable special case: set spanemit to // Disputable special case: set spanemit to
// true when encountering a '.' while spanglue // true when encountering a '.' while spanglue
// is '_'. Think of a_b.c Done because to // is '_'. Think of a_b.c Done to
// avoid breaking stuff after changing '_' // avoid breaking stuff after changing '_'
// from wordchar to spanglue // from wordchar to spanglue
if (!doemit(false, it.getBpos(), curspanglue == '_')) if (!doemit(false, it.getBpos(), curspanglue == '_'))
@ -509,6 +539,7 @@ bool TextSplit::text_to_words(const string &in)
m_inNumber = false; m_inNumber = false;
} }
m_wordLen += it.appendchartostring(m_span); m_wordLen += it.appendchartostring(m_span);
nonalnumcnt = 0;
break; break;
} }
} }

View File

@ -69,7 +69,7 @@ public:
virtual ~TextSplit() {} virtual ~TextSplit() {}
/** Split text, emit words and positions. */ /** Split text, emit words and positions. */
bool text_to_words(const string &in); virtual bool text_to_words(const string &in);
/** Process one output word: to be implemented by the actual user class */ /** Process one output word: to be implemented by the actual user class */
virtual bool takeword(const string& term, virtual bool takeword(const string& term,
@ -128,6 +128,7 @@ private:
bool emitterm(bool isspan, string &term, int pos, int bs, int be); bool emitterm(bool isspan, string &term, int pos, int bs, int be);
bool doemit(bool spanerase, int bp, bool spanemit=false); bool doemit(bool spanerase, int bp, bool spanemit=false);
void discardspan();
}; };
#endif /* _TEXTSPLIT_H_INCLUDED_ */ #endif /* _TEXTSPLIT_H_INCLUDED_ */