From 94eb3119cec1c6200376740a160ed6a011ac1e01 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jfd@recoll.org>
Date: Thu, 13 Aug 2015 18:18:49 +0200
Subject: [PATCH] Generate an additional unhyphenated term for
 singly-hyphenated words: "co-worker" is indexed as [co worker],
 [co-worker] and [coworker]. Only keep a leading '#' (hashtag) when it is
 followed by an alphanumeric character (e.g. the '#' in "#,xyz" is discarded)

---
 src/common/textsplit.cpp | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index 72a1272a..728edb90 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -290,8 +290,8 @@ bool TextSplit::span_is_acronym(string *acronym)
 }
 
 
-        // Generate terms from span. Have to take into account the
-        // flags: ONLYSPANS, NOSPANS, noNumbers
+// Generate terms from span. Have to take into account the
+// flags: ONLYSPANS, NOSPANS, noNumbers
 bool TextSplit::words_from_span(int bp)
 {
 #if 0
@@ -309,6 +309,17 @@ bool TextSplit::words_from_span(int bp)
     // Byte position of the span start
     int spboffs = bp - m_span.size();
 
+    if (spanwords == 2 && m_span[m_words_in_span[0].second] == '-') {
+	unsigned int s0 = m_words_in_span[0].first;
+	unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first;
+	unsigned int s1 = m_words_in_span[1].first;
+	unsigned int l1 = m_words_in_span[1].second - m_words_in_span[1].first;
+	string word = m_span.substr(s0, l0) + m_span.substr(s1, l1);
+	if (l0 && l1) 
+	    emitterm(false, word,
+		     m_spanpos, spboffs, spboffs + m_words_in_span[1].second);
+    }
+
     for (unsigned int i = 0; 
          i < ((m_flags&TXTS_ONLYSPANS) ? 1 : spanwords); 
          i++, pos++) {
@@ -655,16 +666,16 @@ bool TextSplit::text_to_words(const string &in)
 	    }
 	    break;
 
-	case '#': 
+	case '#':  {
+	    int w = whatcc(it[it.getCpos()+1]);
 	    // Keep it only at the beginning of a word (hashtag), 
-            if (m_wordLen == 0) {
+            if (m_wordLen == 0 && isalphanum(w, m_flags)) {
                 m_wordLen += it.appendchartostring(m_span);
                 STATS_INC_WORDCHARS;
                 break;
             }
             // or at the end (special case for c# ...)
 	    if (m_wordLen > 0) {
-		int w = whatcc(it[it.getCpos()+1]);
 		if (w == SPACE || w == '\n' || w == '\r') {
 		    m_wordLen += it.appendchartostring(m_span);
 		    STATS_INC_WORDCHARS;
@@ -672,6 +683,7 @@ bool TextSplit::text_to_words(const string &in)
 		}
 	    }
 	    goto SPACE;
+	}
 	    break;
 
 	case '\n':