From cb0794e92c7728a5a5b1ead151c0dedb537a3851 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 6 Jul 2011 16:20:32 +0200 Subject: [PATCH] textsplit: eliminate some garbage terms (ie long sequences of dashes) --- src/common/textsplit.cpp | 69 +++++++++++++++++++++++++++++----------- src/common/textsplit.h | 3 +- 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 637b263c..8a7f2578 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -208,11 +208,13 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, */ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) { - LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d " - "innum %d\n", m_span.c_str(), m_spanpos, m_wordStart, - m_wordLen, spanerase, bp, m_inNumber)); + LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d " + "inn %d span [%s]\n", + spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen, + m_inNumber, m_span.c_str())); - // Emit span. When splitting for query, we only emit final spans + // Emit span? When splitting for query, we only emit final spans + // (spanerase) bool spanemitted = false; if (!(m_flags & TXTS_NOSPANS) && !((m_wordLen == m_span.length()) && @@ -223,6 +225,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) while (m_span.length() > 0) { switch (m_span[m_span.length()-1]) { case '.': + case '-': case ',': case '@': case '\'': @@ -250,12 +253,12 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) } // Adjust state - m_wordpos++; - m_wordLen = 0; + if (m_wordLen) { + m_wordpos++; + m_wordLen = 0; + } if (spanerase) { - m_span.erase(); - m_spanpos = m_wordpos; - m_wordStart = 0; + discardspan(); } else { m_wordStart = m_span.length(); } @@ -263,6 +266,14 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) return true; } +void TextSplit::discardspan() +{ + m_span.erase(); + m_spanpos = m_wordpos; + m_wordStart = 0; + m_wordLen = 0; +} + /** * Splitting a text into terms to be indexed. * We basically emit a word every time we see a separator, but some chars are @@ -283,10 +294,14 @@ bool TextSplit::text_to_words(const string &in) m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0; int curspanglue = 0; + // Running count of non-alphanum chars. Reset when we see one; + int nonalnumcnt = 0; + Utf8Iter it(in); for (; !it.eof(); it++) { unsigned int c = *it; + nonalnumcnt++; if (c == (unsigned int)-1) { LOGERR(("Textsplit: error occured while scanning UTF-8 string\n")); @@ -319,11 +334,13 @@ bool TextSplit::text_to_words(const string &in) if (m_wordLen == 0) m_inNumber = true; m_wordLen += it.appendchartostring(m_span); + nonalnumcnt = 0; break; case SPACE: SPACE: curspanglue = 0; + nonalnumcnt = 0; if (m_wordLen || m_span.length()) { if (!doemit(true, it.getBpos())) return false; @@ -338,20 +355,33 @@ bool TextSplit::text_to_words(const string &in) break; case '-': case '+': - if (m_wordLen == 0 || - (m_inNumber && (m_span[m_span.length() - 1] == 'e' || - m_span[m_span.length() - 1] == 'E'))) { + curspanglue = cc; + if (m_wordLen == 0) { + if (cc == '-') { + if (whatcc(it[it.getCpos()+1]) == DIGIT) { + // -10 + m_inNumber = true; + m_wordLen += it.appendchartostring(m_span); + } else { + goto SPACE; + } + } else { + if (nonalnumcnt > 2) { + discardspan(); + } else { + m_wordStart += it.appendchartostring(m_span); + } + } + } else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' || + m_span[m_span.length() - 1] == 'E')) { if (whatcc(it[it.getCpos()+1]) == DIGIT) { - m_inNumber = true; m_wordLen += it.appendchartostring(m_span); } else { - m_wordStart += it.appendchartostring(m_span); + goto SPACE; } - curspanglue = cc; } else { if (!doemit(false, it.getBpos())) return false; - curspanglue = cc; m_inNumber = false; m_wordStart += it.appendchartostring(m_span); } @@ -367,13 +397,13 @@ bool TextSplit::text_to_words(const string &in) curspanglue = cc; break; } else { - // If . inside a word, keep it, else, this is whitespace. + // If . inside a word, it's spanglue, else, it's whitespace. // We also keep an initial '.' for catching .net, but this adds // quite a few spurious terms ! // Another problem is that something like .x-errs // will be split as .x-errs, x, errs but not x-errs // A final comma in a word will be removed by doemit - if (cc == '.') { + if (cc == '.' && it[it.getCpos()+1] != '.') { // Check for number like .1 if (m_span.length() == 0 && whatcc(it[it.getCpos()+1]) == DIGIT) { @@ -386,7 +416,7 @@ bool TextSplit::text_to_words(const string &in) if (m_wordLen) { // Disputable special case: set spanemit to // true when encountering a '.' while spanglue - // is '_'. Think of a_b.c Done because to + // is '_'. Think of a_b.c Done to // avoid breaking stuff after changing '_' // from wordchar to spanglue if (!doemit(false, it.getBpos(), curspanglue == '_')) @@ -509,6 +539,7 @@ bool TextSplit::text_to_words(const string &in) m_inNumber = false; } m_wordLen += it.appendchartostring(m_span); + nonalnumcnt = 0; break; } } diff --git a/src/common/textsplit.h b/src/common/textsplit.h index d9613a35..a73f8895 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -69,7 +69,7 @@ public: virtual ~TextSplit() {} /** Split text, emit words and positions. */ - bool text_to_words(const string &in); + virtual bool text_to_words(const string &in); /** Process one output word: to be implemented by the actual user class */ virtual bool takeword(const string& term, @@ -128,6 +128,7 @@ private: bool emitterm(bool isspan, string &term, int pos, int bs, int be); bool doemit(bool spanerase, int bp, bool spanemit=false); + void discardspan(); }; #endif /* _TEXTSPLIT_H_INCLUDED_ */