Emit a_b intermediary span when splitting a_b.c
This commit is contained in:
parent
7dcc7c61c8
commit
6169fdec4b
@ -201,15 +201,17 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
* @return true if ok, false for error. Splitting should stop in this case.
|
* @return true if ok, false for error. Splitting should stop in this case.
|
||||||
* @param spanerase Set if the current span is at its end. Reset it.
|
* @param spanerase Set if the current span is at its end. Reset it.
|
||||||
* @param bp The current BYTE position in the stream
|
* @param bp The current BYTE position in the stream
|
||||||
|
* @param spanemit Set to request emitting an intermediate span because the glue char changed.
|
||||||
*/
|
*/
|
||||||
inline bool TextSplit::doemit(bool spanerase, int bp)
|
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
||||||
{
|
{
|
||||||
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n",
|
LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d\n",
|
||||||
span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
|
span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
|
||||||
|
|
||||||
// Emit span. When splitting for query, we only emit final spans
|
// Emit span. When splitting for query, we only emit final spans
|
||||||
bool spanemitted = false;
|
bool spanemitted = false;
|
||||||
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
if (!(m_flags & TXTS_NOSPANS) &&
|
||||||
|
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
||||||
// Maybe trim at end. These are chars that we would keep inside
|
// Maybe trim at end. These are chars that we would keep inside
|
||||||
// a span, but not at the end
|
// a span, but not at the end
|
||||||
while (m_span.length() > 0) {
|
while (m_span.length() > 0) {
|
||||||
@ -274,6 +276,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
m_span.erase();
|
m_span.erase();
|
||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
||||||
|
int curspanglue = 0;
|
||||||
|
|
||||||
Utf8Iter it(in);
|
Utf8Iter it(in);
|
||||||
|
|
||||||
@ -319,6 +322,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
|
|
||||||
case SPACE:
|
case SPACE:
|
||||||
SPACE:
|
SPACE:
|
||||||
|
curspanglue = 0;
|
||||||
if (m_wordLen || m_span.length()) {
|
if (m_wordLen || m_span.length()) {
|
||||||
if (!doemit(true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
@ -340,9 +344,11 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
} else {
|
} else {
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
}
|
}
|
||||||
|
curspanglue = cc;
|
||||||
} else {
|
} else {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
|
curspanglue = cc;
|
||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
}
|
}
|
||||||
@ -354,6 +360,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (whatcc(it[it.getCpos()+1]) != DIGIT)
|
if (whatcc(it[it.getCpos()+1]) != DIGIT)
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
|
curspanglue = cc;
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// If . inside a word, keep it, else, this is whitespace.
|
// If . inside a word, keep it, else, this is whitespace.
|
||||||
@ -364,8 +371,13 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
// A final comma in a word will be removed by doemit
|
// A final comma in a word will be removed by doemit
|
||||||
if (cc == '.') {
|
if (cc == '.') {
|
||||||
if (m_wordLen) {
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
// Disputable special case: set spanemit to
|
||||||
|
// true when encountering a '.' while spanglue is '_'. Think of
|
||||||
|
// a_b.c. Done to avoid breaking stuff after changing
|
||||||
|
// '_' from wordchar to spanglue
|
||||||
|
if (!doemit(false, it.getBpos(), curspanglue == '_'))
|
||||||
return false;
|
return false;
|
||||||
|
curspanglue = cc;
|
||||||
// span length could have been adjusted by trimming
|
// span length could have been adjusted by trimming
|
||||||
// inside doemit
|
// inside doemit
|
||||||
if (m_span.length())
|
if (m_span.length())
|
||||||
@ -373,6 +385,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
|
curspanglue = cc;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -383,6 +396,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (m_wordLen) {
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
|
curspanglue = cc;
|
||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
}
|
}
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
@ -391,6 +405,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (m_wordLen) {
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
|
curspanglue = cc;
|
||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
}
|
}
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
@ -401,6 +416,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (m_wordLen) {
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
|
curspanglue = cc;
|
||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -129,7 +129,7 @@ private:
|
|||||||
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
||||||
|
|
||||||
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
||||||
bool doemit(bool spanerase, int bp);
|
bool doemit(bool spanerase, int bp, bool spanemit=false);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user