From 6c7245439641eebd1bc71b063ba64ffa2c15d22a Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 20 Oct 2011 13:24:29 +0200 Subject: [PATCH] generate acronyms for dotted abbrevs. ie O.E.C.D -> OECD --- src/common/textsplit.cpp | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 4e2b7c40..2c0cf283 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -208,10 +208,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, */ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) { - LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d " - "inn %d span [%s]\n", - spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen, - m_inNumber, m_span.c_str())); + LOGDEB2(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d " + "inn %d span [%s]\n", + spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen, + m_inNumber, m_span.c_str())); // Emit span? When splitting for query, we only emit final spans // (spanerase) @@ -220,6 +220,28 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit) !((m_wordLen == m_span.length()) && (o_noNumbers) && m_inNumber) && ((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) { + + // Check for an acronym/abbreviation ie I.B.M. + if (spanerase && m_wordLen != m_span.length() && m_span.length() > 2 + && m_span.length() <= 20) { + bool acron = true; + for (unsigned int i = 1 ; i < m_span.length(); i += 2) { + if (m_span[i] != '.') { + acron = false; + break; + } + } + if (acron) { + string acronym; + for (unsigned int i = 0; i < m_span.length(); i += 2) { + acronym += m_span[i]; + } + if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(), + bp)) + return false; + } + } + // Maybe trim at end. These are chars that we would keep inside // a span, but not at the end while (m_span.length() > 0) {