textsplit: eliminate some garbage terms (e.g. long sequences of dashes)
This commit is contained in:
parent
3b6870f133
commit
cb0794e92c
@ -208,11 +208,13 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
*/
|
*/
|
||||||
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
||||||
{
|
{
|
||||||
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d "
|
LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
|
||||||
"innum %d\n", m_span.c_str(), m_spanpos, m_wordStart,
|
"inn %d span [%s]\n",
|
||||||
m_wordLen, spanerase, bp, m_inNumber));
|
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
|
||||||
|
m_inNumber, m_span.c_str()));
|
||||||
|
|
||||||
// Emit span. When splitting for query, we only emit final spans
|
// Emit span? When splitting for query, we only emit final spans
|
||||||
|
// (spanerase)
|
||||||
bool spanemitted = false;
|
bool spanemitted = false;
|
||||||
if (!(m_flags & TXTS_NOSPANS) &&
|
if (!(m_flags & TXTS_NOSPANS) &&
|
||||||
!((m_wordLen == m_span.length()) &&
|
!((m_wordLen == m_span.length()) &&
|
||||||
@ -223,6 +225,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
|||||||
while (m_span.length() > 0) {
|
while (m_span.length() > 0) {
|
||||||
switch (m_span[m_span.length()-1]) {
|
switch (m_span[m_span.length()-1]) {
|
||||||
case '.':
|
case '.':
|
||||||
|
case '-':
|
||||||
case ',':
|
case ',':
|
||||||
case '@':
|
case '@':
|
||||||
case '\'':
|
case '\'':
|
||||||
@ -250,12 +253,12 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Adjust state
|
// Adjust state
|
||||||
m_wordpos++;
|
if (m_wordLen) {
|
||||||
m_wordLen = 0;
|
m_wordpos++;
|
||||||
|
m_wordLen = 0;
|
||||||
|
}
|
||||||
if (spanerase) {
|
if (spanerase) {
|
||||||
m_span.erase();
|
discardspan();
|
||||||
m_spanpos = m_wordpos;
|
|
||||||
m_wordStart = 0;
|
|
||||||
} else {
|
} else {
|
||||||
m_wordStart = m_span.length();
|
m_wordStart = m_span.length();
|
||||||
}
|
}
|
||||||
@ -263,6 +266,14 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Drop the span currently being accumulated and reset the per-span word
 * state. Nothing is emitted from here.
 *
 * doemit() calls this when asked to erase the span, and text_to_words()
 * calls it to throw away a span after a run of non-alphanumeric characters.
 */
void TextSplit::discardspan()
|
||||||
|
{
|
||||||
|
m_span.erase(); // forget the accumulated span text
|
||||||
|
m_spanpos = m_wordpos; // span position restarts at the current word position
|
||||||
|
m_wordStart = 0; // word offset is back at the start of the now empty span
|
||||||
|
m_wordLen = 0; // no word in progress
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Splitting a text into terms to be indexed.
|
* Splitting a text into terms to be indexed.
|
||||||
* We basically emit a word every time we see a separator, but some chars are
|
* We basically emit a word every time we see a separator, but some chars are
|
||||||
@ -283,10 +294,14 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
||||||
int curspanglue = 0;
|
int curspanglue = 0;
|
||||||
|
|
||||||
|
// Running count of consecutive non-alphanum chars. Incremented for every char, then reset to 0 when the char turns out to be a word/number char or whitespace.
|
||||||
|
int nonalnumcnt = 0;
|
||||||
|
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
|
|
||||||
for (; !it.eof(); it++) {
|
for (; !it.eof(); it++) {
|
||||||
unsigned int c = *it;
|
unsigned int c = *it;
|
||||||
|
nonalnumcnt++;
|
||||||
|
|
||||||
if (c == (unsigned int)-1) {
|
if (c == (unsigned int)-1) {
|
||||||
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
||||||
@ -319,11 +334,13 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (m_wordLen == 0)
|
if (m_wordLen == 0)
|
||||||
m_inNumber = true;
|
m_inNumber = true;
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
|
nonalnumcnt = 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SPACE:
|
case SPACE:
|
||||||
SPACE:
|
SPACE:
|
||||||
curspanglue = 0;
|
curspanglue = 0;
|
||||||
|
nonalnumcnt = 0;
|
||||||
if (m_wordLen || m_span.length()) {
|
if (m_wordLen || m_span.length()) {
|
||||||
if (!doemit(true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
@ -338,20 +355,33 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
case '-':
|
case '-':
|
||||||
case '+':
|
case '+':
|
||||||
if (m_wordLen == 0 ||
|
curspanglue = cc;
|
||||||
(m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
if (m_wordLen == 0) {
|
||||||
m_span[m_span.length() - 1] == 'E'))) {
|
if (cc == '-') {
|
||||||
|
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
||||||
|
// -10
|
||||||
|
m_inNumber = true;
|
||||||
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
|
} else {
|
||||||
|
goto SPACE;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (nonalnumcnt > 2) {
|
||||||
|
discardspan();
|
||||||
|
} else {
|
||||||
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
||||||
|
m_span[m_span.length() - 1] == 'E')) {
|
||||||
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
if (whatcc(it[it.getCpos()+1]) == DIGIT) {
|
||||||
m_inNumber = true;
|
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
} else {
|
} else {
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
goto SPACE;
|
||||||
}
|
}
|
||||||
curspanglue = cc;
|
|
||||||
} else {
|
} else {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
curspanglue = cc;
|
|
||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
}
|
}
|
||||||
@ -367,13 +397,13 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
curspanglue = cc;
|
curspanglue = cc;
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// If . inside a word, keep it, else, this is whitespace.
|
// If . inside a word, it's spanglue, else, it's whitespace.
|
||||||
// We also keep an initial '.' for catching .net, but this adds
|
// We also keep an initial '.' for catching .net, but this adds
|
||||||
// quite a few spurious terms !
|
// quite a few spurious terms !
|
||||||
// Another problem is that something like .x-errs
|
// Another problem is that something like .x-errs
|
||||||
// will be split as .x-errs, x, errs but not x-errs
|
// will be split as .x-errs, x, errs but not x-errs
|
||||||
// A final comma in a word will be removed by doemit
|
// A final comma in a word will be removed by doemit
|
||||||
if (cc == '.') {
|
if (cc == '.' && it[it.getCpos()+1] != '.') {
|
||||||
// Check for number like .1
|
// Check for number like .1
|
||||||
if (m_span.length() == 0 &&
|
if (m_span.length() == 0 &&
|
||||||
whatcc(it[it.getCpos()+1]) == DIGIT) {
|
whatcc(it[it.getCpos()+1]) == DIGIT) {
|
||||||
@ -386,7 +416,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (m_wordLen) {
|
if (m_wordLen) {
|
||||||
// Disputable special case: set spanemit to
|
// Disputable special case: set spanemit to
|
||||||
// true when encountering a '.' while spanglue
|
// true when encountering a '.' while spanglue
|
||||||
// is '_'. Think of a_b.c Done because to
|
// is '_'. Think of a_b.c Done to
|
||||||
// avoid breaking stuff after changing '_'
|
// avoid breaking stuff after changing '_'
|
||||||
// from wordchar to spanglue
|
// from wordchar to spanglue
|
||||||
if (!doemit(false, it.getBpos(), curspanglue == '_'))
|
if (!doemit(false, it.getBpos(), curspanglue == '_'))
|
||||||
@ -509,6 +539,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
}
|
}
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
|
nonalnumcnt = 0;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -69,7 +69,7 @@ public:
|
|||||||
virtual ~TextSplit() {}
|
virtual ~TextSplit() {}
|
||||||
|
|
||||||
/** Split text, emit words and positions. */
|
/** Split text, emit words and positions. */
|
||||||
bool text_to_words(const string &in);
|
virtual bool text_to_words(const string &in);
|
||||||
|
|
||||||
/** Process one output word: to be implemented by the actual user class */
|
/** Process one output word: to be implemented by the actual user class */
|
||||||
virtual bool takeword(const string& term,
|
virtual bool takeword(const string& term,
|
||||||
@ -128,6 +128,7 @@ private:
|
|||||||
|
|
||||||
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
||||||
bool doemit(bool spanerase, int bp, bool spanemit=false);
|
bool doemit(bool spanerase, int bp, bool spanemit=false);
|
||||||
|
void discardspan();
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user