From 94eb3119cec1c6200376740a160ed6a011ac1e01 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 13 Aug 2015 18:18:49 +0200 Subject: [PATCH] Generate an additional unhyphenated term for singly hyphenated words: co-worker will index as [co worker], [co-worker] and [coworker]. Only produce terms for alphanumeric hashtags (discard #,xyz) --- src/common/textsplit.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 72a1272a..728edb90 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -290,8 +290,8 @@ bool TextSplit::span_is_acronym(string *acronym) } - // Generate terms from span. Have to take into account the - // flags: ONLYSPANS, NOSPANS, noNumbers +// Generate terms from span. Have to take into account the +// flags: ONLYSPANS, NOSPANS, noNumbers bool TextSplit::words_from_span(int bp) { #if 0 @@ -309,6 +309,17 @@ bool TextSplit::words_from_span(int bp) // Byte position of the span start int spboffs = bp - m_span.size(); + if (spanwords == 2 && m_span[m_words_in_span[0].second] == '-') { + unsigned int s0 = m_words_in_span[0].first; + unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first; + unsigned int s1 = m_words_in_span[1].first; + unsigned int l1 = m_words_in_span[1].second - m_words_in_span[1].first; + string word = m_span.substr(s0, l0) + m_span.substr(s1, l1); + if (l0 && l1) + emitterm(false, word, + m_spanpos, spboffs, spboffs + m_words_in_span[1].second); + } + for (unsigned int i = 0; i < ((m_flags&TXTS_ONLYSPANS) ? 1 : spanwords); i++, pos++) { @@ -655,16 +666,16 @@ bool TextSplit::text_to_words(const string &in) } break; - case '#': + case '#': { + int w = whatcc(it[it.getCpos()+1]); // Keep it only at the beginning of a word (hashtag), - if (m_wordLen == 0) { + if (m_wordLen == 0 && isalphanum(w, m_flags)) { m_wordLen += it.appendchartostring(m_span); STATS_INC_WORDCHARS; break; } // or at the end (special case for c# ...) if (m_wordLen > 0) { - int w = whatcc(it[it.getCpos()+1]); if (w == SPACE || w == '\n' || w == '\r') { m_wordLen += it.appendchartostring(m_span); STATS_INC_WORDCHARS; @@ -672,6 +683,7 @@ bool TextSplit::text_to_words(const string &in) } } goto SPACE; + } break; case '\n':