Generate acronyms for dotted abbreviations, e.g. O.E.C.D -> OECD
This commit is contained in:
parent
348421eae7
commit
6c72454396
@ -208,10 +208,10 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
*/
|
*/
|
||||||
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
||||||
{
|
{
|
||||||
LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
|
LOGDEB2(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
|
||||||
"inn %d span [%s]\n",
|
"inn %d span [%s]\n",
|
||||||
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
|
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
|
||||||
m_inNumber, m_span.c_str()));
|
m_inNumber, m_span.c_str()));
|
||||||
|
|
||||||
// Emit span? When splitting for query, we only emit final spans
|
// Emit span? When splitting for query, we only emit final spans
|
||||||
// (spanerase)
|
// (spanerase)
|
||||||
@ -220,6 +220,28 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
|||||||
!((m_wordLen == m_span.length()) &&
|
!((m_wordLen == m_span.length()) &&
|
||||||
(o_noNumbers) && m_inNumber) &&
|
(o_noNumbers) && m_inNumber) &&
|
||||||
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
||||||
|
|
||||||
|
// Check for an acronym/abbreviation ie I.B.M.
|
||||||
|
if (spanerase && m_wordLen != m_span.length() && m_span.length() > 2
|
||||||
|
&& m_span.length() <= 20) {
|
||||||
|
bool acron = true;
|
||||||
|
for (unsigned int i = 1 ; i < m_span.length(); i += 2) {
|
||||||
|
if (m_span[i] != '.') {
|
||||||
|
acron = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (acron) {
|
||||||
|
string acronym;
|
||||||
|
for (unsigned int i = 0; i < m_span.length(); i += 2) {
|
||||||
|
acronym += m_span[i];
|
||||||
|
}
|
||||||
|
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(),
|
||||||
|
bp))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Maybe trim at end. These are chars that we would keep inside
|
// Maybe trim at end. These are chars that we would keep inside
|
||||||
// a span, but not at the end
|
// a span, but not at the end
|
||||||
while (m_span.length() > 0) {
|
while (m_span.length() > 0) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user