Generate acronyms for dotted abbreviations, e.g. O.E.C.D -> OECD

This commit is contained in:
Jean-Francois Dockes 2011-10-20 13:24:29 +02:00
parent 348421eae7
commit 6c72454396

View File

@ -208,10 +208,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
*/
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
{
LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
"inn %d span [%s]\n",
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
m_inNumber, m_span.c_str()));
LOGDEB2(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
"inn %d span [%s]\n",
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
m_inNumber, m_span.c_str()));
// Emit span? When splitting for query, we only emit final spans
// (spanerase)
@ -220,6 +220,28 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
!((m_wordLen == m_span.length()) &&
(o_noNumbers) && m_inNumber) &&
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
// Check for a dotted acronym/abbreviation, e.g. I.B.M.
if (spanerase && m_wordLen != m_span.length() && m_span.length() > 2
&& m_span.length() <= 20) {
bool acron = true;
for (unsigned int i = 1 ; i < m_span.length(); i += 2) {
if (m_span[i] != '.') {
acron = false;
break;
}
}
if (acron) {
string acronym;
for (unsigned int i = 0; i < m_span.length(); i += 2) {
acronym += m_span[i];
}
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(),
bp))
return false;
}
}
// Maybe trim at end. These are chars that we would keep inside
// a span, but not at the end
while (m_span.length() > 0) {