diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index 755ef5ce..c6975065 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -14,7 +14,7 @@ * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#ifndef TEST_TEXTSPLIT + #include "autoconfig.h" #include @@ -80,29 +80,28 @@ static std::unordered_set sskip; class CharClassInit { public: - CharClassInit() - { - unsigned int i; + CharClassInit() { + unsigned int i; - // Set default value for all: SPACE - for (i = 0 ; i < 256 ; i ++) - charclasses[i] = SPACE; + // Set default value for all: SPACE + for (i = 0 ; i < 256 ; i ++) + charclasses[i] = SPACE; - char digits[] = "0123456789"; - for (i = 0; i < strlen(digits); i++) - charclasses[int(digits[i])] = DIGIT; + char digits[] = "0123456789"; + for (i = 0; i < strlen(digits); i++) + charclasses[int(digits[i])] = DIGIT; - char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - for (i = 0; i < strlen(upper); i++) - charclasses[int(upper[i])] = A_ULETTER; + char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + for (i = 0; i < strlen(upper); i++) + charclasses[int(upper[i])] = A_ULETTER; - char lower[] = "abcdefghijklmnopqrstuvwxyz"; - for (i = 0; i < strlen(lower); i++) - charclasses[int(lower[i])] = A_LLETTER; + char lower[] = "abcdefghijklmnopqrstuvwxyz"; + for (i = 0; i < strlen(lower); i++) + charclasses[int(lower[i])] = A_LLETTER; - char wild[] = "*?[]"; - for (i = 0; i < strlen(wild); i++) - charclasses[int(wild[i])] = WILD; + char wild[] = "*?[]"; + for (i = 0; i < strlen(wild); i++) + charclasses[int(wild[i])] = WILD; // Characters with special treatment: // @@ -114,70 +113,78 @@ public: // The case of the minus sign is a complicated one. It went // from glue to non-glue to glue along Recoll versions. // See minus-hyphen-dash.txt in doc/notes - char special[] = ".@+-#'_\n\r\f"; - for (i = 0; i < strlen(special); i++) - charclasses[int(special[i])] = special[i]; + char special[] = ".@+-#'_\n\r\f"; + for (i = 0; i < strlen(special); i++) + charclasses[int(special[i])] = special[i]; - for (i = 0; i < sizeof(unipunc) / sizeof(int); i++) { - spunc.insert(unipunc[i]); - } - spunc.insert((unsigned int)-1); + for (i = 0; i < sizeof(unipunc) / sizeof(int); i++) { + spunc.insert(unipunc[i]); + } + spunc.insert((unsigned int)-1); - for (i = 0; i < sizeof(unipuncblocks) / sizeof(int); i++) { - vpuncblocks.push_back(unipuncblocks[i]); - } - assert((vpuncblocks.size() % 2) == 0); + for (i = 0; i < sizeof(unipuncblocks) / sizeof(int); i++) { + vpuncblocks.push_back(unipuncblocks[i]); + } + assert((vpuncblocks.size() % 2) == 0); - for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) { - visiblewhite.insert(avsbwht[i]); - } - for (i = 0; i < sizeof(uniskip) / sizeof(int); i++) { - sskip.insert(uniskip[i]); - } + for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) { + visiblewhite.insert(avsbwht[i]); + } + for (i = 0; i < sizeof(uniskip) / sizeof(int); i++) { + sskip.insert(uniskip[i]); + } } }; static const CharClassInit charClassInitInstance; -static inline int whatcc(unsigned int c) +static inline int whatcc(unsigned int c, char *asciirep = nullptr) { if (c <= 127) { - return charclasses[c]; + return charclasses[c]; } else { if (c == 0x2010) { // Special treatment for hyphen: handle as ascii minus. See // doc/notes/minus-hyphen-dash.txt - return 0x2010; + if (asciirep) + *asciirep = '-'; + return c; + } else if (c == 0x2019 || c == 0x275c || c == 0x02bc) { + // Things sometimes replacing a single quote. Use single + // quote so that span processing works ok + if (asciirep) + *asciirep = '\''; + return c; } else if (sskip.find(c) != sskip.end()) { - return SKIP; - } else if (spunc.find(c) != spunc.end()) { - return SPACE; - } else { - vector::iterator it = - lower_bound(vpuncblocks.begin(), vpuncblocks.end(), c); - if (it == vpuncblocks.end()) - return LETTER; - if (c == *it) - return SPACE; - if ((it - vpuncblocks.begin()) % 2 == 1) { - return SPACE; - } else { - return LETTER; - } - } + return SKIP; + } else if (spunc.find(c) != spunc.end()) { + return SPACE; + } else { + vector::iterator it = + lower_bound(vpuncblocks.begin(), vpuncblocks.end(), c); + if (it == vpuncblocks.end()) + return LETTER; + if (c == *it) + return SPACE; + if ((it - vpuncblocks.begin()) % 2 == 1) { + return SPACE; + } else { + return LETTER; + } + } } } // testing whatcc... #if 0 unsigned int testvalues[] = {'a', '0', 0x80, 0xbf, 0xc0, 0x05c3, 0x1000, - 0x2000, 0x2001, 0x206e, 0x206f, 0x20d0, 0x2399, - 0x2400, 0x2401, 0x243f, 0x2440, 0xff65}; + 0x2000, 0x2001, 0x206e, 0x206f, 0x20d0, 0x2399, + 0x2400, 0x2401, 0x243f, 0x2440, 0xff65}; int ntest = sizeof(testvalues) / sizeof(int); for (int i = 0; i < ntest; i++) { int ret = whatcc(testvalues[i]); printf("Tested value 0x%x, returned value %d %s\n", - testvalues[i], ret, ret == LETTER ? "LETTER" : - ret == SPACE ? "SPACE" : "OTHER"); + testvalues[i], ret, ret == LETTER ? "LETTER" : + ret == SPACE ? "SPACE" : "OTHER"); } #endif @@ -205,15 +212,15 @@ static inline int whatcc(unsigned int c) // FF00..FFEF; Halfwidth and Fullwidth Forms // 20000..2A6DF; CJK Unified Ideographs Extension B // 2F800..2FA1F; CJK Compatibility Ideographs Supplement -#define UNICODE_IS_CJK(p) \ - (((p) >= 0x2E80 && (p) <= 0x2EFF) || \ - ((p) >= 0x3000 && (p) <= 0x9FFF) || \ - ((p) >= 0xA700 && (p) <= 0xA71F) || \ - ((p) >= 0xAC00 && (p) <= 0xD7AF) || \ - ((p) >= 0xF900 && (p) <= 0xFAFF) || \ - ((p) >= 0xFE30 && (p) <= 0xFE4F) || \ - ((p) >= 0xFF00 && (p) <= 0xFFEF) || \ - ((p) >= 0x20000 && (p) <= 0x2A6DF) || \ +#define UNICODE_IS_CJK(p) \ + (((p) >= 0x2E80 && (p) <= 0x2EFF) || \ + ((p) >= 0x3000 && (p) <= 0x9FFF) || \ + ((p) >= 0xA700 && (p) <= 0xA71F) || \ + ((p) >= 0xAC00 && (p) <= 0xD7AF) || \ + ((p) >= 0xF900 && (p) <= 0xFAFF) || \ + ((p) >= 0xFE30 && (p) <= 0xFE4F) || \ + ((p) >= 0xFF00 && (p) <= 0xFFEF) || \ + ((p) >= 0x20000 && (p) <= 0x2A6DF) || \ ((p) >= 0x2F800 && (p) <= 0x2FA1F)) // We should probably map 'fullwidth ascii variants' and 'halfwidth @@ -257,9 +264,9 @@ void TextSplit::staticConfInit(RclConfig *config) bool bvalue{false}; if (config->getConfParam("nocjk", &bvalue) && bvalue == true) { - o_processCJK = false; + o_processCJK = false; } else { - o_processCJK = true; + o_processCJK = true; int ngramlen; if (config->getConfParam("cjkngramlen", &ngramlen)) { o_CJKNgramLen = (unsigned int)(ngramlen <= o_CJKMaxNgramLen ? @@ -269,12 +276,12 @@ void TextSplit::staticConfInit(RclConfig *config) bvalue = false; if (config->getConfParam("nonumbers", &bvalue)) { - o_noNumbers = bvalue; + o_noNumbers = bvalue; } bvalue = false; if (config->getConfParam("dehyphenate", &bvalue)) { - o_deHyphenate = bvalue; + o_deHyphenate = bvalue; } bvalue = false; @@ -289,7 +296,7 @@ void TextSplit::staticConfInit(RclConfig *config) // Final term checkpoint: do some checking (the kind which is simpler // to do here than in the main loop), then send term to our client. inline bool TextSplit::emitterm(bool isspan, string &w, int pos, - size_t btstart, size_t btend) + size_t btstart, size_t btend) { LOGDEB2("TextSplit::emitterm: [" << w << "] pos " << pos << "\n"); @@ -299,30 +306,30 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos, // Update word length statistics. Do this before we filter out // long words because stats are used to detect bad text if (!isspan || m_wordLen == m_span.length()) - m_stats.newsamp(m_wordChars); + m_stats.newsamp(m_wordChars); #endif if (l > 0 && l <= o_maxWordLength) { - // 1 byte word: we index single ascii letters and digits, but - // nothing else. We might want to turn this into a test for a - // single utf8 character instead ? - if (l == 1) { - unsigned int c = ((unsigned int)w[0]) & 0xff; - if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER && + // 1 byte word: we index single ascii letters and digits, but + // nothing else. We might want to turn this into a test for a + // single utf8 character instead ? + if (l == 1) { + unsigned int c = ((unsigned int)w[0]) & 0xff; + if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER && charclasses[c] != DIGIT && - (!(m_flags & TXTS_KEEPWILD) || charclasses[c] != WILD) - ) { - //cerr << "ERASING single letter term " << c << endl; - return true; - } - } - if (pos != m_prevpos || l != m_prevlen) { - bool ret = takeword(w, pos, int(btstart), int(btend)); - m_prevpos = pos; - m_prevlen = int(w.length()); - return ret; - } - LOGDEB2("TextSplit::emitterm:dup: [" << w << "] pos " << pos << "\n"); + (!(m_flags & TXTS_KEEPWILD) || charclasses[c] != WILD) + ) { + //cerr << "ERASING single letter term " << c << endl; + return true; + } + } + if (pos != m_prevpos || l != m_prevlen) { + bool ret = takeword(w, pos, int(btstart), int(btend)); + m_prevpos = pos; + m_prevlen = int(w.length()); + return ret; + } + LOGDEB2("TextSplit::emitterm:dup: [" << w << "] pos " << pos << "\n"); } return true; } @@ -392,15 +399,15 @@ bool TextSplit::words_from_span(size_t bp) size_t spboffs = bp - m_span.size(); if (o_deHyphenate && spanwords == 2 && - m_span[m_words_in_span[0].second] == '-') { - unsigned int s0 = m_words_in_span[0].first; - unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first; - unsigned int s1 = m_words_in_span[1].first; - unsigned int l1 = m_words_in_span[1].second - m_words_in_span[1].first; - string word = m_span.substr(s0, l0) + m_span.substr(s1, l1); - if (l0 && l1) - emitterm(false, word, - m_spanpos, spboffs, spboffs + m_words_in_span[1].second); + m_span[m_words_in_span[0].second] == '-') { + unsigned int s0 = m_words_in_span[0].first; + unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first; + unsigned int s1 = m_words_in_span[1].first; + unsigned int l1 = m_words_in_span[1].second - m_words_in_span[1].first; + string word = m_span.substr(s0, l0) + m_span.substr(s1, l1); + if (l0 && l1) + emitterm(false, word, + m_spanpos, spboffs, spboffs + m_words_in_span[1].second); } for (int i = 0; @@ -463,8 +470,8 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp) m_words_in_span.push_back(pair(m_wordStart, m_wordStart + m_wordLen)); - m_wordpos++; - m_wordLen = m_wordChars = 0; + m_wordpos++; + m_wordLen = m_wordChars = 0; } if (spanerase) { @@ -476,37 +483,37 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp) return false; } - // Maybe trim at end. These are chars that we might keep - // inside a span, but not at the end. - while (m_span.length() > 0) { - switch (*(m_span.rbegin())) { - case '.': - case '-': - case ',': - case '@': - case '_': - case '\'': - m_span.resize(m_span.length()-1); + // Maybe trim at end. These are chars that we might keep + // inside a span, but not at the end. + while (m_span.length() > 0) { + switch (*(m_span.rbegin())) { + case '.': + case '-': + case ',': + case '@': + case '_': + case '\'': + m_span.resize(m_span.length()-1); if (m_words_in_span.size() && m_words_in_span.back().second > int(m_span.size())) m_words_in_span.back().second = int(m_span.size()); - if (--bp < 0) - bp = 0; - break; - default: - goto breaktrimloop; - } - } + if (--bp < 0) + bp = 0; + break; + default: + goto breaktrimloop; + } + } breaktrimloop: if (!words_from_span(bp)) { return false; } - discardspan(); + discardspan(); } else { - m_wordStart = int(m_span.length()); + m_wordStart = int(m_span.length()); } @@ -525,8 +532,8 @@ void TextSplit::discardspan() static inline bool isalphanum(int what, unsigned int flgs) { return what == A_LLETTER || what == A_ULETTER || - what == DIGIT || what == LETTER || - ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD); + what == DIGIT || what == LETTER || + ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD); } static inline bool isdigit(int what, unsigned int flgs) { @@ -558,7 +565,7 @@ bool TextSplit::text_to_words(const string &in) " [" << in.substr(0,50) << "]\n"); if (in.empty()) - return true; + return true; // Reset the data members relative to splitting state clearsplitstate(); @@ -573,13 +580,13 @@ bool TextSplit::text_to_words(const string &in) int prev_csc = -1; for (; !it.eof(); it++) { - unsigned int c = *it; - nonalnumcnt++; + unsigned int c = *it; + nonalnumcnt++; - if (c == (unsigned int)-1) { - LOGERR("Textsplit: error occurred while scanning UTF-8 string\n"); - return false; - } + if (c == (unsigned int)-1) { + LOGERR("Textsplit: error occurred while scanning UTF-8 string\n"); + return false; + } CharSpanClass csc; if (UNICODE_IS_KATAKANA(c)) { @@ -590,25 +597,25 @@ bool TextSplit::text_to_words(const string &in) csc = CSC_OTHER; } - if (o_processCJK && csc == CSC_CJK) { - // CJK excluding Katakana character hit. - // Do like at EOF with the current non-cjk data. - if (m_wordLen || m_span.length()) { - if (!doemit(true, it.getBpos())) - return false; - } + if (o_processCJK && csc == CSC_CJK) { + // CJK excluding Katakana character hit. + // Do like at EOF with the current non-cjk data. + if (m_wordLen || m_span.length()) { + if (!doemit(true, it.getBpos())) + return false; + } - // Hand off situation to the cjk routine. - if (!cjk_to_words(&it, &c)) { - LOGERR("Textsplit: scan error in cjk handler\n"); - return false; - } + // Hand off situation to the cjk routine. + if (!cjk_to_words(&it, &c)) { + LOGERR("Textsplit: scan error in cjk handler\n"); + return false; + } - // Check for eof, else c contains the first non-cjk - // character after the cjk sequence, just go on. - if (it.eof()) - break; - } + // Check for eof, else c contains the first non-cjk + // character after the cjk sequence, just go on. + if (it.eof()) + break; + } #ifdef KATAKANA_AS_WORDS // Only needed if we have script transitions inside this @@ -623,73 +630,74 @@ bool TextSplit::text_to_words(const string &in) #endif prev_csc = csc; - int cc = whatcc(c); + char asciirep = 0; + int cc = whatcc(c, &asciirep); - switch (cc) { - case SKIP: - // Special-case soft-hyphen. To work, this depends on the - // fact that only SKIP calls "continue" inside the - // switch. All the others will do the softhyphenpending - // reset after the switch - if (c == 0xad) { - softhyphenpending = true; - } else { - softhyphenpending = false; - } - // Skips the softhyphenpending reset - continue; + switch (cc) { + case SKIP: + // Special-case soft-hyphen. To work, this depends on the + // fact that only SKIP calls "continue" inside the + // switch. All the others will do the softhyphenpending + // reset after the switch + if (c == 0xad) { + softhyphenpending = true; + } else { + softhyphenpending = false; + } + // Skips the softhyphenpending reset + continue; - case DIGIT: - nonalnumcnt = 0; - if (m_wordLen == 0) - m_inNumber = true; - m_wordLen += it.appendchartostring(m_span); - STATS_INC_WORDCHARS; - break; + case DIGIT: + nonalnumcnt = 0; + if (m_wordLen == 0) + m_inNumber = true; + m_wordLen += it.appendchartostring(m_span); + STATS_INC_WORDCHARS; + break; - case SPACE: - nonalnumcnt = 0; - SPACE: - if (m_wordLen || m_span.length()) { - if (!doemit(true, it.getBpos())) - return false; - m_inNumber = false; - } - if (pagepending) { - pagepending = false; - newpage(m_wordpos); - } - break; + case SPACE: + nonalnumcnt = 0; + SPACE: + if (m_wordLen || m_span.length()) { + if (!doemit(true, it.getBpos())) + return false; + m_inNumber = false; + } + if (pagepending) { + pagepending = false; + newpage(m_wordpos); + } + break; - case WILD: - if (m_flags & TXTS_KEEPWILD) - goto NORMALCHAR; - else - goto SPACE; - break; + case WILD: + if (m_flags & TXTS_KEEPWILD) + goto NORMALCHAR; + else + goto SPACE; + break; - case '-': - case '+': - if (m_wordLen == 0) { - // + or - don't start a term except if this looks like - // it's going to be to be a number - if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) { - // -10 - m_inNumber = true; - m_wordLen += it.appendchartostring(m_span); - STATS_INC_WORDCHARS; + case '-': + case '+': + if (m_wordLen == 0) { + // + or - don't start a term except if this looks like + // it's going to be to be a number + if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) { + // -10 + m_inNumber = true; + m_wordLen += it.appendchartostring(m_span); + STATS_INC_WORDCHARS; break; - } - } else if (m_inNumber) { + } + } else if (m_inNumber) { if ((m_span[m_span.length() - 1] == 'e' || - m_span[m_span.length() - 1] == 'E')) { + m_span[m_span.length() - 1] == 'E')) { if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) { m_wordLen += it.appendchartostring(m_span); STATS_INC_WORDCHARS; break; } } - } else { + } else { if (cc == '+') { int nextc = it[it.getCpos()+1]; if (nextc == '+' || nextc == -1 || visiblewhite.find(nextc) @@ -707,43 +715,29 @@ bool TextSplit::text_to_words(const string &in) m_wordStart += it.appendchartostring(m_span); break; } - } - goto SPACE; - break; - - case 0x2010: - // Hyphen is replaced with ascii minus - if (m_wordLen != 0) { - // Treat '-' inside span as glue char - if (!doemit(false, it.getBpos())) - return false; - m_inNumber = false; - m_span += '-'; - m_wordStart++; - break; } goto SPACE; - case '.': - { - // Need a little lookahead here. At worse this gets the end null - int nextc = it[it.getCpos()+1]; - int nextwhat = whatcc(nextc); - if (m_inNumber) { - if (!isdigit(nextwhat, m_flags)) - goto SPACE; + case '.': + { + // Need a little lookahead here. At worse this gets the end null + int nextc = it[it.getCpos()+1]; + int nextwhat = whatcc(nextc); + if (m_inNumber) { + if (!isdigit(nextwhat, m_flags)) + goto SPACE; m_wordLen += it.appendchartostring(m_span); STATS_INC_WORDCHARS; - break; - } else { - // Found '.' while not in number + break; + } else { + // Found '.' while not in number - // Only letters and digits make sense after - if (!isalphanum(nextwhat, m_flags)) - goto SPACE; + // Only letters and digits make sense after + if (!isalphanum(nextwhat, m_flags)) + goto SPACE; - // Keep an initial '.' for catching .net, and .34 (aka - // 0.34) but this adds quite a few spurious terms ! + // Keep an initial '.' for catching .net, and .34 (aka + // 0.34) but this adds quite a few spurious terms ! if (m_span.length() == 0) { // Check for number like .1 if (isdigit(nextwhat, m_flags)) { @@ -764,46 +758,66 @@ bool TextSplit::text_to_words(const string &in) return false; m_wordStart += it.appendchartostring(m_span); } - } - } + } + } break; - case '@': - case '_': - case '\'': - // If in word, potential span: o'brien, jf@dockes.org, - // else just ignore - if (m_wordLen) { - if (!doemit(false, it.getBpos())) - return false; - m_inNumber = false; - m_wordStart += it.appendchartostring(m_span); - } - break; + case 0x2010: + case 0x2019: + case 0x275c: + case 0x02bc: + // Unicode chars which we replace with ascii for + // processing (2010 -> -,others -> '). It happens that + // they all work as glue chars and use the same code, but + // there might be cases needing different processing. + // Hyphen is replaced with ascii minus + if (m_wordLen) { + // Inside span: glue char + if (!doemit(false, it.getBpos())) + return false; + m_inNumber = false; + m_span += asciirep; + m_wordStart++; + break; + } + goto SPACE; - case '#': { - int w = whatcc(it[it.getCpos()+1]); - // Keep it only at the beginning of a word (hashtag), + case '@': + case '_': + case '\'': + // If in word, potential span: o'brien, jf@dockes.org, + // else just ignore + if (m_wordLen) { + if (!doemit(false, it.getBpos())) + return false; + m_inNumber = false; + m_wordStart += it.appendchartostring(m_span); + } + break; + + case '#': { + int w = whatcc(it[it.getCpos()+1]); + // Keep it only at the beginning of a word (hashtag), if (m_wordLen == 0 && isalphanum(w, m_flags)) { m_wordLen += it.appendchartostring(m_span); STATS_INC_WORDCHARS; break; } // or at the end (special case for c# ...) - if (m_wordLen > 0) { - if (w == SPACE || w == '\n' || w == '\r') { - m_wordLen += it.appendchartostring(m_span); - STATS_INC_WORDCHARS; - break; - } - } - goto SPACE; - } - break; + if (m_wordLen > 0) { + if (w == SPACE || w == '\n' || w == '\r') { + m_wordLen += it.appendchartostring(m_span); + STATS_INC_WORDCHARS; + break; + } + } + goto SPACE; + } + break; - case '\n': - case '\r': - if (m_span.length() && *m_span.rbegin() == '-') { + case '\n': + case '\r': + if (m_span.length() && *m_span.rbegin() == '-') { // if '-' is the last char before end of line, we // strip it. We have no way to know if this is added // because of the line split or if it was part of an @@ -815,18 +829,18 @@ bool TextSplit::text_to_words(const string &in) // will strip the trailing '-'. goto SPACE; } else if (softhyphenpending) { - // Don't reset soft-hyphen - continue; - } else { - // Normal case: EOL is white space - goto SPACE; - } - break; + // Don't reset soft-hyphen + continue; + } else { + // Normal case: EOL is white space + goto SPACE; + } + break; - case '\f': - pagepending = true; - goto SPACE; - break; + case '\f': + pagepending = true; + goto SPACE; + break; #ifdef RCL_SPLIT_CAMELCASE // Camelcase handling. @@ -842,8 +856,8 @@ bool TextSplit::text_to_words(const string &in) // both "MySQL manual" and "mysql manual" phrases would // match too. "my sql manual" would not match, but this is // not an issue. - case A_ULETTER: - if (m_span.length() && + case A_ULETTER: + if (m_span.length() && charclasses[(unsigned char)m_span[m_span.length() - 1]] == A_LLETTER) { if (m_wordLen) { @@ -860,7 +874,7 @@ bool TextSplit::text_to_words(const string &in) // acronym (readHTML) or a single letter article (ALittleHelp). // Emit the uppercase word before proceeding case A_LLETTER: - if (m_span.length() && + if (m_span.length() && charclasses[(unsigned char)m_span[m_span.length() - 1]] == A_ULETTER && m_wordLen > 1) { // Multiple upper-case letters. Single letter word @@ -877,21 +891,21 @@ bool TextSplit::text_to_words(const string &in) goto NORMALCHAR; #endif /* CAMELCASE */ - default: - NORMALCHAR: - nonalnumcnt = 0; + default: + NORMALCHAR: + nonalnumcnt = 0; if (m_inNumber && c != 'e' && c != 'E') { m_inNumber = false; } - m_wordLen += it.appendchartostring(m_span); - STATS_INC_WORDCHARS; - break; - } - softhyphenpending = false; + m_wordLen += it.appendchartostring(m_span); + STATS_INC_WORDCHARS; + break; + } + softhyphenpending = false; } if (m_wordLen || m_span.length()) { - if (!doemit(true, it.getBpos())) - return false; + if (!doemit(true, it.getBpos())) + return false; } return true; } @@ -921,64 +935,64 @@ bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp) unsigned int nchars = 0; unsigned int c = 0; for (; !it.eof(); it++) { - c = *it; - if (!UNICODE_IS_CJK(c)) { - // Return to normal handler - break; - } - if (whatcc(c) == SPACE) { - // Flush the ngram buffer and go on - nchars = 0; - continue; - } - if (nchars == o_CJKNgramLen) { - // Offset buffer full, shift it. Might be more efficient - // to have a circular one, but things are complicated - // enough already... - for (unsigned int i = 0; i < nchars-1; i++) { - boffs[i] = boffs[i+1]; - } - } else { - nchars++; - } + c = *it; + if (!UNICODE_IS_CJK(c)) { + // Return to normal handler + break; + } + if (whatcc(c) == SPACE) { + // Flush the ngram buffer and go on + nchars = 0; + continue; + } + if (nchars == o_CJKNgramLen) { + // Offset buffer full, shift it. Might be more efficient + // to have a circular one, but things are complicated + // enough already... + for (unsigned int i = 0; i < nchars-1; i++) { + boffs[i] = boffs[i+1]; + } + } else { + nchars++; + } - // Take note of byte offset for this character. - boffs[nchars-1] = int(it.getBpos()); + // Take note of byte offset for this character. + boffs[nchars-1] = int(it.getBpos()); - // Output all new ngrams: they begin at each existing position - // and end after the new character. onlyspans->only output - // maximum words, nospans=> single chars - if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) { - int btend = int(it.getBpos() + it.getBlen()); - int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0; - int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars; - for (int i = loopbeg; i < loopend; i++) { - if (!takeword(it.buffer().substr(boffs[i], - btend-boffs[i]), - m_wordpos - (nchars-i-1), boffs[i], btend)) { - return false; - } - } + // Output all new ngrams: they begin at each existing position + // and end after the new character. onlyspans->only output + // maximum words, nospans=> single chars + if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) { + int btend = int(it.getBpos() + it.getBlen()); + int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0; + int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars; + for (int i = loopbeg; i < loopend; i++) { + if (!takeword(it.buffer().substr(boffs[i], + btend-boffs[i]), + m_wordpos - (nchars-i-1), boffs[i], btend)) { + return false; + } + } - if ((m_flags & TXTS_ONLYSPANS)) { - // Only spans: don't overlap: flush buffer - nchars = 0; - } - } - // Increase word position by one, other words are at an - // existing position. This could be subject to discussion... - m_wordpos++; + if ((m_flags & TXTS_ONLYSPANS)) { + // Only spans: don't overlap: flush buffer + nchars = 0; + } + } + // Increase word position by one, other words are at an + // existing position. This could be subject to discussion... + m_wordpos++; } // If onlyspans is set, there may be things to flush in the buffer // first if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) { - int btend = int(it.getBpos()); // Current char is out - if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]), - m_wordpos - nchars, - boffs[0], btend)) { - return false; - } + int btend = int(it.getBpos()); // Current char is out + if (!takeword(it.buffer().substr(boffs[0], btend-boffs[0]), + m_wordpos - nchars, + boffs[0], btend)) { + return false; + } } // Reset state, saving term position, and return the found non-cjk @@ -997,8 +1011,8 @@ class TextSplitCW : public TextSplit { int wcnt; TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {} bool takeword(const string &, int, int, int) { - wcnt++; - return true; + wcnt++; + return true; } }; @@ -1013,13 +1027,13 @@ bool TextSplit::hasVisibleWhite(const string &in) { Utf8Iter it(in); for (; !it.eof(); it++) { - unsigned int c = (unsigned char)*it; - if (c == (unsigned int)-1) { - LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n"); - return false; - } - if (visiblewhite.find(c) != visiblewhite.end()) - return true; + unsigned int c = (unsigned char)*it; + if (c == (unsigned int)-1) { + LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n"); + return false; + } + if (visiblewhite.find(c) != visiblewhite.end()) + return true; } return false; } @@ -1033,57 +1047,57 @@ template bool u8stringToStrings(const string &s, T &tokens) enum states {SPACE, TOKEN, INQUOTE, ESCAPE}; states state = SPACE; for (; !it.eof(); it++) { - unsigned int c = *it; - if (visiblewhite.find(c) != visiblewhite.end()) - c = ' '; - if (c == (unsigned int)-1) { - LOGERR("TextSplit::stringToStrings: error while scanning UTF-8 " + unsigned int c = *it; + if (visiblewhite.find(c) != visiblewhite.end()) + c = ' '; + if (c == (unsigned int)-1) { + LOGERR("TextSplit::stringToStrings: error while scanning UTF-8 " "string\n"); - return false; - } + return false; + } - switch (c) { - case '"': - switch(state) { - case SPACE: state = INQUOTE; continue; - case TOKEN: goto push_char; - case ESCAPE: state = INQUOTE; goto push_char; - case INQUOTE: tokens.push_back(current);current.clear(); - state = SPACE; continue; - } - break; - case '\\': - switch(state) { - case SPACE: - case TOKEN: state=TOKEN; goto push_char; - case INQUOTE: state = ESCAPE; continue; - case ESCAPE: state = INQUOTE; goto push_char; - } - break; + switch (c) { + case '"': + switch(state) { + case SPACE: state = INQUOTE; continue; + case TOKEN: goto push_char; + case ESCAPE: state = INQUOTE; goto push_char; + case INQUOTE: tokens.push_back(current);current.clear(); + state = SPACE; continue; + } + break; + case '\\': + switch(state) { + case SPACE: + case TOKEN: state=TOKEN; goto push_char; + case INQUOTE: state = ESCAPE; continue; + case ESCAPE: state = INQUOTE; goto push_char; + } + break; - case ' ': - case '\t': - case '\n': - case '\r': - switch(state) { - case SPACE: continue; - case TOKEN: tokens.push_back(current); current.clear(); - state = SPACE; continue; - case INQUOTE: - case ESCAPE: goto push_char; - } - break; + case ' ': + case '\t': + case '\n': + case '\r': + switch(state) { + case SPACE: continue; + case TOKEN: tokens.push_back(current); current.clear(); + state = SPACE; continue; + case INQUOTE: + case ESCAPE: goto push_char; + } + break; - default: - switch(state) { - case ESCAPE: state = INQUOTE; break; - case SPACE: state = TOKEN; break; - case TOKEN: - case INQUOTE: break; - } - push_char: - it.appendchartostring(current); - } + default: + switch(state) { + case ESCAPE: state = INQUOTE; break; + case SPACE: state = TOKEN; break; + case TOKEN: + case INQUOTE: break; + } + push_char: + it.appendchartostring(current); + } } // End of string. Process residue, and possible error (unfinished quote) @@ -1101,250 +1115,3 @@ bool TextSplit::stringToStrings(const string &s, vector &tokens) return u8stringToStrings >(s, tokens); } -#else // TEST driver -> - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "textsplit.h" -#include "readfile.h" -#include "log.h" - -#include "transcode.h" -#include "unacpp.h" -#include "termproc.h" - -using namespace std; - -class myTermProc : public Rcl::TermProc { - int first; - bool nooutput; -public: - myTermProc() : TermProc(0), first(1), nooutput(false) {} - void setNoOut(bool val) {nooutput = val;} - virtual bool takeword(const string &term, int pos, int bs, int be) - { - if (nooutput) - return true; - FILE *fp = stdout; - if (first) { - fprintf(fp, "%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be"); - first = 0; - } - fprintf(fp, "%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be); - return true; - } -}; - -#define OPT_s 0x1 -#define OPT_w 0x2 -#define OPT_q 0x4 -#define OPT_c 0x8 -#define OPT_k 0x10 -#define OPT_C 0x20 -#define OPT_n 0x40 -#define OPT_S 0x80 -#define OPT_u 0x100 -#define OPT_p 0x200 - -bool dosplit(const string& data, TextSplit::Flags flags, int op_flags) -{ - myTermProc printproc; - - Rcl::TermProc *nxt = &printproc; - -// Rcl::TermProcCommongrams commonproc(nxt, stoplist); -// if (op_flags & OPT_S) -// nxt = &commonproc; - - Rcl::TermProcPrep preproc(nxt); - if (op_flags & OPT_u) - nxt = &preproc; - - Rcl::TextSplitP splitter(nxt, flags); - - if (op_flags & OPT_q) - printproc.setNoOut(true); - - splitter.text_to_words(data); - -#ifdef TEXTSPLIT_STATS - TextSplit::Stats::Values v = splitter.getStats(); - cout << "Average length: " - << v.avglen - << " Standard deviation: " - << v.sigma - << " Coef of variation " - << v.sigma / v.avglen - << endl; -#endif - return true; -} - -static const char *teststrings[] = { - "Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n", - "\"Jean-Francois Dockes\" \n", - "n@d @net .net net@ t@v@c c# c++ o'brien 'o'brien'", - "_network_ some_span", - "data123\n", - "134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n", - "@^#$(#$(*)\n", - "192.168.4.1 one\n\rtwo\r", - "[olala][ululu] (valeur) (23)\n", - "utf-8 ucs-4© \\nodef\n", - "A b C 2 . +", - "','this\n", - " ,able,test-domain", - " -wl,--export-dynamic", - " ~/.xsession-errors", - "this_very_long_span_this_very_long_span_this_very_long_span", - "soft\xc2\xadhyphen", - "soft\xc2\xad\nhyphen", - "soft\xc2\xad\n\rhyphen", - "real\xe2\x80\x90hyphen", - "real\xe2\x80\x90\nhyphen", - "hyphen-\nminus", -}; -const int teststrings_cnt = sizeof(teststrings)/sizeof(char *); - -static string teststring1 = " nouvel-an "; - -static string thisprog; - -static string usage = - " textsplit [opts] [filename]\n" - " -q : no output\n" - " -s : only spans\n" - " -w : only words\n" - " -n : no numbers\n" - " -k : preserve wildcards (?*)\n" - " -c : just count words\n" - " -u : use unac\n" - " -C [charset] : input charset\n" - " -S [stopfile] : stopfile to use for commongrams\n" - " if filename is 'stdin', will read stdin for data (end with ^D)\n\n" - " textplit -p somephrase : display results from stringToStrings()\n" - " \n" - ; - -static void -Usage(void) -{ - cerr << thisprog << ": usage:\n" << usage; - exit(1); -} - -static int op_flags; - -int main(int argc, char **argv) -{ - string charset, stopfile; - - thisprog = argv[0]; - argc--; argv++; - - while (argc > 0 && **argv == '-') { - (*argv)++; - if (!(**argv)) - /* Cas du "adb - core" */ - Usage(); - while (**argv) - switch (*(*argv)++) { - case 'c': op_flags |= OPT_c; break; - case 'C': op_flags |= OPT_C; if (argc < 2) Usage(); - charset = *(++argv); argc--; - goto b1; - case 'k': op_flags |= OPT_k; break; - case 'n': op_flags |= OPT_n; break; - case 'p': op_flags |= OPT_p; break; - case 'q': op_flags |= OPT_q; break; - case 's': op_flags |= OPT_s; break; - case 'S': op_flags |= OPT_S; if (argc < 2) Usage(); - stopfile = *(++argv); argc--; - goto b1; - case 'u': op_flags |= OPT_u; break; - case 'w': op_flags |= OPT_w; break; - default: Usage(); break; - } - b1: argc--; argv++; - } - - TextSplit::Flags flags = TextSplit::TXTS_NONE; - - if (op_flags&OPT_s) - flags = TextSplit::TXTS_ONLYSPANS; - else if (op_flags&OPT_w) - flags = TextSplit::TXTS_NOSPANS; - if (op_flags & OPT_k) - flags = (TextSplit::Flags)(flags | TextSplit::TXTS_KEEPWILD); - if (op_flags & OPT_n) - TextSplit::noNumbers(); - - Rcl::StopList stoplist; - if (op_flags & OPT_S) { - if (!stoplist.setFile(stopfile)) { - cerr << "Can't read stopfile: " << stopfile << endl; - exit(1); - } - } - string odata, reason; - if (argc == 1) { - const char *filename = *argv++; argc--; - if (op_flags& OPT_p) { - vector tokens; - TextSplit::stringToStrings(filename, tokens); - for (vector::const_iterator it = tokens.begin(); - it != tokens.end(); it++) { - cout << "[" << *it << "] "; - } - cout << endl; - exit(0); - } - if (!strcmp(filename, "stdin")) { - char buf[1024]; - int nread; - while ((nread = read(0, buf, 1024)) > 0) { - odata.append(buf, nread); - } - } else if (!file_to_string(filename, odata, &reason)) { - cerr << "Failed: file_to_string(" << filename << ") failed: " - << reason << endl; - exit(1); - } - } else { - if (op_flags & OPT_p) - Usage(); - for (int i = 0; i < teststrings_cnt; i++) { - cout << endl << teststrings[i] << endl; - dosplit(teststrings[i], flags, op_flags); - } - exit(0); - } - - string& data = odata; - string ndata; - if ((op_flags & OPT_C)) { - if (!transcode(odata, ndata, charset, "UTF-8")) { - cerr << "Failed: transcode error" << endl; - exit(1); - } else { - data = ndata; - } - } - - if (op_flags & OPT_c) { - int n = TextSplit::countWords(data, flags); - cout << n << " words" << endl; - } else { - dosplit(data, flags, op_flags); - } -} -#endif // TEST - diff --git a/tests/html/html.sh b/tests/html/html.sh index 181b2630..0750360d 100755 --- a/tests/html/html.sh +++ b/tests/html/html.sh @@ -11,7 +11,13 @@ recollq '"This is the Mysql reference manual"' # Tests that the charset spec is correctly recognised inside badhtml.html recollq -a 'etonne badhtml' # Tests field extraction/storage and indexing -recollq -m -q "testfield:testfieldvalue" | egrep 'results|^text/html|^testfield =' +recollq -m -q "testfield:testfieldvalue" | \ + egrep 'results|^text/html|^testfield =' + +# Not specifically HTML. apos.html has text where an apostrophe-like +# Unicode character is used in place of ASCII ' . Checks that we +# replace the character for proper span processing +recollq -q '"'imperfections de l"'"oeil'"' # more unaccenting tests recollq -q 'effaranteUTF8HTML' diff --git a/tests/html/html.txt b/tests/html/html.txt index e0b8e113..36354d2e 100644 --- a/tests/html/html.txt +++ b/tests/html/html.txt @@ -11,6 +11,8 @@ text/html [file:///home/dockes/projets/fulltext/testrecoll/html/badhtml.html] [" text/html [file:///home/dockes/projets/fulltext/testrecoll/html/htmlfield.html] [htmlfield.html] 137 bytes testfield = testfieldvalue 1 results +text/html [file:///home/dockes/projets/fulltext/testrecoll/html/apos.html] [apos.html] 344 bytes +1 results text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes 1 results text/html [file:///home/dockes/projets/fulltext/testrecoll/html/utf8.html] [Some chars] 330 bytes