diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 070b4b45..2fe43a91 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -92,6 +92,16 @@ public: for (i = 0; i < strlen(wild); i++) charclasses[int(wild[i])] = WILD; + // Characters with special treatment: + // + // The first ones are mostly span-constructing "glue" + // characters, for example those typically allowing us to + // search for an email address as a whole (bob@isp.org instead + // of as a phrase "bob isp org") + // + // The case of the minus sign is a complicated one. It went + // from glue to non-glue to glue along Recoll versions. + // See minus-hyphen-dash.txt in doc/notes char special[] = ".@+-#'_\n\r\f"; for (i = 0; i < strlen(special); i++) charclasses[int(special[i])] = special[i]; @@ -121,7 +131,11 @@ static inline int whatcc(unsigned int c) if (c <= 127) { return charclasses[c]; } else { - if (sskip.find(c) != sskip.end()) { + if (c == 0x2010) { + // Special treatment for hyphen (U+2010): handle as ascii minus. See + // doc/notes/minus-hyphen-dash.txt + return 0x2010; + } else if (sskip.find(c) != sskip.end()) { return SKIP; } else if (spunc.find(c) != spunc.end()) { return SPACE; @@ -574,6 +588,19 @@ bool TextSplit::text_to_words(const string &in) goto SPACE; break; + case 0x2010: + // Hyphen (U+2010) is replaced with ascii minus + if (m_wordLen != 0) { + // Treat '-' inside span as glue char + if (!doemit(false, it.getBpos())) + return false; + m_inNumber = false; + m_span += '-'; + m_wordStart++; + break; + } + goto SPACE; + case '.': { // Need a little lookahead here. At worse this gets the end null @@ -1036,7 +1063,9 @@ static const char *teststrings[] = { "soft\xc2\xadhyphen", "soft\xc2\xad\nhyphen", "soft\xc2\xad\n\rhyphen", - "hard-\nhyphen", + "real\xe2\x80\x90hyphen", + "real\xe2\x80\x90\nhyphen", + "hyphen-\nminus", }; const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);