Text splitter: move apos and dash character conversions to unac_except_trans.

This was complicated and caused problems with highlight area position computations in
plaintorich. Also, simplify the code for processing some dangling characters.
This commit is contained in:
Jean-Francois Dockes 2021-11-02 14:32:38 +01:00
parent a24fc7bacc
commit 728129e5ce
3 changed files with 138 additions and 192 deletions

View File

@ -206,32 +206,24 @@ public:
};
static const CharClassInit charClassInitInstance;
static inline int whatcc(unsigned int c, char *asciirep = nullptr)
static inline bool isvisiblewhite(int c)
{
return visiblewhite.find(c) != visiblewhite.end();
}
static inline int whatcc(unsigned int c)
{
if (c <= 127) {
return charclasses[c];
} else {
if (c == 0x2010) {
// Special treatment for hyphen: handle as ascii minus. See
// doc/notes/minus-hyphen-dash.txt
if (asciirep)
*asciirep = '-';
return c;
} else if (c == 0x2019 || c == 0x275c || c == 0x02bc) {
// Things sometimes replacing a single quote. Use single
// quote so that span processing works ok
if (asciirep)
*asciirep = '\'';
return c;
} else if (sskip.find(c) != sskip.end()) {
if (sskip.find(c) != sskip.end()) {
return SKIP;
} else if (spunc.find(c) != spunc.end()) {
return SPACE;
} else {
vector<unsigned int>::iterator it =
lower_bound(vpuncblocks.begin(), vpuncblocks.end(), c);
if (it == vpuncblocks.end())
return LETTER;
auto it = lower_bound(vpuncblocks.begin(), vpuncblocks.end(), c);
if (it == vpuncblocks.end())
return LETTER;
if (c == *it)
return SPACE;
if ((it - vpuncblocks.begin()) % 2 == 1) {
@ -245,16 +237,16 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
// testing whatcc...
#if 0
unsigned int testvalues[] = {'a', '0', 0x80, 0xbf, 0xc0, 0x05c3, 0x1000,
0x2000, 0x2001, 0x206e, 0x206f, 0x20d0, 0x2399,
0x2400, 0x2401, 0x243f, 0x2440, 0xff65};
int ntest = sizeof(testvalues) / sizeof(int);
for (int i = 0; i < ntest; i++) {
int ret = whatcc(testvalues[i]);
printf("Tested value 0x%x, returned value %d %s\n",
testvalues[i], ret, ret == LETTER ? "LETTER" :
ret == SPACE ? "SPACE" : "OTHER");
}
unsigned int testvalues[] = {'a', '0', 0x80, 0xbf, 0xc0, 0x05c3, 0x1000,
0x2000, 0x2001, 0x206e, 0x206f, 0x20d0, 0x2399,
0x2400, 0x2401, 0x243f, 0x2440, 0xff65};
int ntest = sizeof(testvalues) / sizeof(int);
for (int i = 0; i < ntest; i++) {
int ret = whatcc(testvalues[i]);
printf("Tested value 0x%x, returned value %d %s\n",
testvalues[i], ret, ret == LETTER ? "LETTER" :
ret == SPACE ? "SPACE" : "OTHER");
}
#endif
// CJK Unicode character detection. CJK text is indexed using an n-gram
@ -287,16 +279,16 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
// FF00..FFEF; Halfwidth and Fullwidth Forms
// 20000..2A6DF; CJK Unified Ideographs Extension B
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
#define UNICODE_IS_CJK(p) \
(((p) >= 0x1100 && (p) <= 0x11FF) || \
((p) >= 0x2E80 && (p) <= 0x2EFF) || \
((p) >= 0x3000 && (p) <= 0x9FFF) || \
((p) >= 0xA700 && (p) <= 0xA71F) || \
((p) >= 0xAC00 && (p) <= 0xD7AF) || \
((p) >= 0xF900 && (p) <= 0xFAFF) || \
((p) >= 0xFE30 && (p) <= 0xFE4F) || \
((p) >= 0xFF00 && (p) <= 0xFFEF) || \
((p) >= 0x20000 && (p) <= 0x2A6DF) || \
#define UNICODE_IS_CJK(p) \
(((p) >= 0x1100 && (p) <= 0x11FF) || \
((p) >= 0x2E80 && (p) <= 0x2EFF) || \
((p) >= 0x3000 && (p) <= 0x9FFF) || \
((p) >= 0xA700 && (p) <= 0xA71F) || \
((p) >= 0xAC00 && (p) <= 0xD7AF) || \
((p) >= 0xF900 && (p) <= 0xFAFF) || \
((p) >= 0xFE30 && (p) <= 0xFE4F) || \
((p) >= 0xFF00 && (p) <= 0xFFEF) || \
((p) >= 0x20000 && (p) <= 0x2A6DF) || \
((p) >= 0x2F800 && (p) <= 0x2FA1F))
// We should probably map 'fullwidth ascii variants' and 'halfwidth
@ -304,9 +296,9 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
// filter, KuromojiNormalizeFilter.java
// 309F is Hiragana.
#ifdef KATAKANA_AS_WORDS
#define UNICODE_IS_KATAKANA(p) \
((p) != 0x309F && \
(((p) >= 0x3099 && (p) <= 0x30FF) || \
#define UNICODE_IS_KATAKANA(p) \
((p) != 0x309F && \
(((p) >= 0x3099 && (p) <= 0x30FF) || \
((p) >= 0x31F0 && (p) <= 0x31FF)))
#else
#define UNICODE_IS_KATAKANA(p) false
@ -315,14 +307,14 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
#ifdef HANGUL_AS_WORDS
// If no external tagger is configured, we process HANGUL as generic
// cjk (n-grams)
#define UNICODE_IS_HANGUL(p) ( \
o_exthangultagger && \
(((p) >= 0x1100 && (p) <= 0x11FF) || \
((p) >= 0x3130 && (p) <= 0x318F) || \
((p) >= 0x3200 && (p) <= 0x321e) || \
((p) >= 0x3248 && (p) <= 0x327F) || \
((p) >= 0x3281 && (p) <= 0x32BF) || \
((p) >= 0xAC00 && (p) <= 0xD7AF)) \
#define UNICODE_IS_HANGUL(p) ( \
o_exthangultagger && \
(((p) >= 0x1100 && (p) <= 0x11FF) || \
((p) >= 0x3130 && (p) <= 0x318F) || \
((p) >= 0x3200 && (p) <= 0x321e) || \
((p) >= 0x3248 && (p) <= 0x327F) || \
((p) >= 0x3281 && (p) <= 0x32BF) || \
((p) >= 0xAC00 && (p) <= 0xD7AF)) \
)
#else
#define UNICODE_IS_HANGUL(p) false
@ -351,19 +343,16 @@ bool TextSplit::isNGRAMMED(int c)
}
// This is used to detect katakana/other transitions, which must
// trigger a word split (there is not always a separator, and katakana
// is otherwise treated like other, in the same routine, unless cjk
// This is used to detect katakana/other transitions, which must trigger a word split (there is not
// always a separator, and katakana is otherwise treated like other, in the same routine, unless cjk
// which has its span reader causing a word break)
enum CharSpanClass {CSC_HANGUL, CSC_CJK, CSC_KATAKANA, CSC_OTHER};
std::vector<CharFlags> csc_names {CHARFLAGENTRY(CSC_HANGUL),
CHARFLAGENTRY(CSC_CJK), CHARFLAGENTRY(CSC_KATAKANA),
CHARFLAGENTRY(CSC_OTHER)};
std::vector<CharFlags> csc_names {CHARFLAGENTRY(CSC_HANGUL), CHARFLAGENTRY(CSC_CJK),
CHARFLAGENTRY(CSC_KATAKANA), CHARFLAGENTRY(CSC_OTHER)};
// Final term checkpoint: do some checking (the kind which is simpler
// to do here than in the main loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
size_t btstart, size_t btend)
// Final term checkpoint: do some checking (the kind which is simpler to do here than in the main
// loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, size_t btstart, size_t btend)
{
LOGDEB2("TextSplit::emitterm: [" << w << "] pos " << pos << "\n");
@ -378,39 +367,38 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
PRETEND_USE(isspan);
#endif
if (l > 0 && l <= o_maxWordLength) {
// 1 byte word: we index single ascii letters and digits, but
// nothing else. We might want to turn this into a test for a
// single utf8 character instead ?
if (l == 1) {
unsigned int c = ((unsigned int)w[0]) & 0xff;
if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER &&
charclasses[c] != DIGIT &&
(!(m_flags & TXTS_KEEPWILD) || charclasses[c] != WILD)
) {
//cerr << "ERASING single letter term " << c << endl;
return true;
}
}
if (pos != m_prevpos || l != m_prevlen) {
bool ret = takeword(w, pos, int(btstart), int(btend));
m_prevpos = pos;
m_prevlen = int(w.length());
return ret;
}
LOGDEB2("TextSplit::emitterm:dup: [" << w << "] pos " << pos << "\n");
if (l == 0 || l > o_maxWordLength) {
return true;
}
if (l == 1) {
// 1 byte word: we index single ascii letters and digits, but nothing else. We might want to
// turn this into a test for a single utf8 character instead ?
unsigned int c = ((unsigned int)w[0]) & 0xff;
if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER &&
charclasses[c] != DIGIT &&
(!(m_flags & TXTS_KEEPWILD) || charclasses[c] != WILD)
) {
//cerr << "ERASING single letter term " << c << endl;
return true;
}
}
if (pos != m_prevpos || l != m_prevlen) {
bool ret = takeword(w, pos, int(btstart), int(btend));
m_prevpos = pos;
m_prevlen = int(w.length());
return ret;
}
LOGDEB2("TextSplit::emitterm:dup: [" << w << "] pos " << pos << "\n");
return true;
}
// Check for an acronym/abbreviation ie I.B.M. This only works with
// ascii (no non-ascii utf-8 acronym are possible)
// Check for an acronym/abbreviation ie I.B.M. This only works with ascii (we do not detect
// non-ascii utf-8 acronyms)
bool TextSplit::span_is_acronym(string *acronym)
{
bool acron = false;
if (m_wordLen != m_span.length() &&
m_span.length() > 2 && m_span.length() <= 20) {
if (m_wordLen != m_span.length() && m_span.length() > 2 && m_span.length() <= 20) {
acron = true;
// Check odd chars are '.'
for (unsigned int i = 1 ; i < m_span.length(); i += 2) {
@ -439,27 +427,23 @@ bool TextSplit::span_is_acronym(string *acronym)
}
// Generate terms from span. Have to take into account the
// flags: ONLYSPANS, NOSPANS, noNumbers
// Generate terms from span. Have to take into account the flags: ONLYSPANS, NOSPANS, noNumbers
bool TextSplit::words_from_span(size_t bp)
{
#if 0
cerr << "Span: [" << m_span << "] " << " w_i_s size: " <<
m_words_in_span.size() << " : ";
cerr << "Span: [" << m_span << "] " << " bp " << bp <<
" w_i_s size: " << m_words_in_span.size() << " : ";
for (unsigned int i = 0; i < m_words_in_span.size(); i++) {
cerr << " [" << m_words_in_span[i].first << " " <<
m_words_in_span[i].second << "] ";
cerr << " [" << m_words_in_span[i].first << " " << m_words_in_span[i].second << "] ";
}
cerr << endl;
#endif
int spanwords = int(m_words_in_span.size());
// It seems that something like: tv_combo-sample_util.Po@am_quote
// can get the splitter to call doemit with a span of '@' and
// words_in_span==0, which then causes a crash when accessing
// words_in_span[0] if the stl assertions are active (e.g. Fedora
// RPM build). Not too sure what the right fix would be, but for
// now, just defend against it
// It seems that something like: tv_combo-sample_util.Po@am_quote can get the splitter to call
// doemit with a span of '@' and words_in_span==0, which then causes a crash when accessing
// words_in_span[0] if the stl assertions are active (e.g. Fedora RPM build). Not too sure what
// the right fix would be, but for now, just defend against it
if (spanwords == 0) {
return true;
}
@ -467,21 +451,17 @@ bool TextSplit::words_from_span(size_t bp)
// Byte position of the span start
size_t spboffs = bp - m_span.size();
if (o_deHyphenate && spanwords == 2 &&
m_span[m_words_in_span[0].second] == '-') {
if (o_deHyphenate && spanwords == 2 && m_span[m_words_in_span[0].second] == '-') {
unsigned int s0 = m_words_in_span[0].first;
unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first;
unsigned int s1 = m_words_in_span[1].first;
unsigned int l1 = m_words_in_span[1].second - m_words_in_span[1].first;
string word = m_span.substr(s0, l0) + m_span.substr(s1, l1);
if (l0 && l1)
emitterm(false, word,
m_spanpos, spboffs, spboffs + m_words_in_span[1].second);
emitterm(false, word, m_spanpos, spboffs, spboffs + m_words_in_span[1].second);
}
for (int i = 0;
i < ((m_flags&TXTS_ONLYSPANS) ? 1 : spanwords);
i++) {
for (int i = 0; i < ((m_flags&TXTS_ONLYSPANS) ? 1 : spanwords); i++) {
int deb = m_words_in_span[i].first;
bool noposinc = m_words_in_span[i].second == deb;
@ -490,8 +470,7 @@ bool TextSplit::words_from_span(size_t bp)
j++) {
int fin = m_words_in_span[j].second;
//cerr << "i " << i << " j " << j << " deb " << deb <<
//" fin " << fin << endl;
//cerr << "i " << i << " j " << j << " deb " << deb << " fin " << fin << endl;
if (fin - deb > int(m_span.size()))
break;
string word(m_span.substr(deb, fin-deb));
@ -519,7 +498,7 @@ bool TextSplit::words_from_span(size_t bp)
*
* @return true if ok, false for error. Splitting should stop in this case.
* @param spanerase Set if the current span is at its end. Process it.
* @param bp The current BYTE position in the stream
* @param bp The current BYTE position in the stream (it's beyond the current span data).
*/
inline bool TextSplit::doemit(bool spanerase, size_t _bp)
{
@ -550,38 +529,13 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
return true;
}
// Span is done (too long or span-terminating character). Produce
// terms and reset it.
// Span is done (too long or span-terminating character). Produce terms and reset it.
string acronym;
if (span_is_acronym(&acronym)) {
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(), bp))
return false;
}
// Maybe trim at end. These are chars that we might keep
// inside a span, but not at the end.
string::size_type trimsz{0};
while (trimsz < m_span.length()) {
auto c = m_span[m_span.length() - 1 - trimsz];
if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
c == underscoreatend) {
trimsz++;
if (m_words_in_span.size() &&
m_words_in_span.back().second > int(m_span.size())) {
m_words_in_span.back().second = int(m_span.size());
}
if (--bp < 0) {
bp = 0;
}
} else {
break;
}
}
if (trimsz > 0) {
m_span.resize(m_span.length() - trimsz);
}
if (!words_from_span(bp)) {
return false;
}
@ -713,8 +667,7 @@ bool TextSplit::text_to_words(const string &in)
prev_csc = csc;
#endif
char asciirep = 0;
int cc = whatcc(c, &asciirep);
int cc = whatcc(c);
switch (cc) {
case SKIP:
@ -773,7 +726,7 @@ bool TextSplit::text_to_words(const string &in)
}
} else if (m_inNumber) {
if ((m_span[m_span.length() - 1] == 'e' ||
m_span[m_span.length() - 1] == 'E')) {
m_span[m_span.length() - 1] == 'E')) {
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
m_wordLen += it.appendchartostring(m_span);
STATS_INC_WORDCHARS;
@ -781,17 +734,18 @@ bool TextSplit::text_to_words(const string &in)
}
}
} else {
int nextc = it[it.getCpos()+1];
if (cc == '+') {
int nextc = it[it.getCpos()+1];
if (nextc == '+' || nextc == -1 || visiblewhite.find(nextc)
!= visiblewhite.end()) {
if (nextc == '+' || nextc == -1 || isvisiblewhite(nextc)) {
// someword++[+...] !
m_wordLen += it.appendchartostring(m_span);
STATS_INC_WORDCHARS;
break;
}
} else {
// Treat '-' inside span as glue char
if (nextc == -1 || isvisiblewhite(nextc)) {
goto SPACE;
}
if (!doemit(false, it.getBpos()))
return false;
m_inNumber = false;
@ -827,8 +781,7 @@ bool TextSplit::text_to_words(const string &in)
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
} else {
m_words_in_span.
push_back(pair<int,int>(m_wordStart, m_wordStart));
m_words_in_span.push_back(pair<int,int>(m_wordStart, m_wordStart));
m_wordStart += it.appendchartostring(m_span);
}
STATS_INC_WORDCHARS;
@ -845,38 +798,28 @@ bool TextSplit::text_to_words(const string &in)
}
break;
case 0x2010:
case 0x2019:
case 0x2010: // hyphen
case 0x2019: // variations on single quote
case 0x275c:
case 0x02bc:
// Unicode chars which we replace with ascii for
// processing (2010 -> -,others -> '). It happens that
// they all work as glue chars and use the same code, but
// there might be cases needing different processing.
// Hyphen is replaced with ascii minus
if (m_wordLen) {
// Inside span: glue char
if (!doemit(false, it.getBpos()))
return false;
m_inNumber = false;
m_span += asciirep;
m_wordStart++;
break;
}
goto SPACE;
case '@':
case '_': // If underscoreasletter is set, we'll never get this
case '\'':
{
// If in word, potential span: o'brien, jf@dockes.org,
// else just ignore
int nextc = it[it.getCpos()+1];
if (nextc == -1 || isvisiblewhite(nextc)) {
goto SPACE;
}
if (m_wordLen) {
if (!doemit(false, it.getBpos()))
return false;
m_inNumber = false;
m_wordStart += it.appendchartostring(m_span);
}
break;
}
break;
case '#': {
int w = whatcc(it[it.getCpos()+1]);
@ -1107,7 +1050,7 @@ bool TextSplit::cjk_to_words(Utf8Iter& it, unsigned int *cp)
// Specialization for countWords
class TextSplitCW : public TextSplit {
public:
public:
int wcnt;
TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {}
bool takeword(const string &, int, int, int) {
@ -1132,7 +1075,7 @@ bool TextSplit::hasVisibleWhite(const string &in)
LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
return false;
}
if (visiblewhite.find(c) != visiblewhite.end())
if (isvisiblewhite(c))
return true;
}
return false;
@ -1157,7 +1100,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
}
switch (c) {
case '"':
case '"':
switch(state) {
case SPACE: state = INQUOTE; continue;
case TOKEN: goto push_char;
@ -1166,7 +1109,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
state = SPACE; continue;
}
break;
case '\\':
case '\\':
switch(state) {
case SPACE:
case TOKEN: state=TOKEN; goto push_char;
@ -1175,25 +1118,25 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
}
break;
case ' ':
case '\t':
case '\n':
case '\r':
case ' ':
case '\t':
case '\n':
case '\r':
switch(state) {
case SPACE: continue;
case TOKEN: tokens.push_back(current); current.clear();
case SPACE: continue;
case TOKEN: tokens.push_back(current); current.clear();
state = SPACE; continue;
case INQUOTE:
case ESCAPE: goto push_char;
}
break;
default:
default:
switch(state) {
case ESCAPE: state = INQUOTE; break;
case SPACE: state = TOKEN; break;
case TOKEN:
case INQUOTE: break;
case ESCAPE: state = INQUOTE; break;
case SPACE: state = TOKEN; break;
case TOKEN:
case INQUOTE: break;
}
push_char:
it.appendchartostring(current);
@ -1214,4 +1157,3 @@ bool TextSplit::stringToStrings(const string &s, vector<string> &tokens)
{
return u8stringToStrings<vector<string> >(s, tokens);
}

View File

@ -375,29 +375,33 @@ indexstemminglanguages = english
# sub-directory.</descr></var>
#defaultcharset = iso-8859-1
# <var name="unac_except_trans" type="string"><brief>A list of characters,
# encoded in UTF-8, which should be handled specially
# when converting text to unaccented lowercase.</brief><descr>For
# example, in Swedish, the letter a with diaeresis has full alphabet
# citizenship and should not be turned into an a.
# Each element in the space-separated list has the special character as
# first element and the translation following. The handling of both the
# lowercase and upper-case versions of a character should be specified, as
# membership in the list will turn off both standard accent and case
# processing. The value is global and affects both indexing and querying.
# <var name="unac_except_trans" type="string">
#
# <brief>A list of characters, encoded in UTF-8, which should be handled specially when converting
# text to unaccented lowercase.</brief>
# <descr>For example, in Swedish, the letter a with diaeresis has full alphabet citizenship and
# should not be turned into an a. Each element in the space-separated list has the special
# character as first element and the translation following. The handling of both the lowercase and
# upper-case versions of a character should be specified, as membership in the list will turn off
# both standard accent and case processing. The value is global and affects both indexing and
# querying. We also convert a few confusing Unicode characters (quotes, hyphen) to their ASCII
# equivalent to avoid "invisible" search failures.
#
# Examples:
# Swedish:
# unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl åå Åå
# unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl åå Åå ’' ❜' ʼ' ‐-
# . German:
# unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl
# unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl ’' ❜' ʼ' ‐-
# . French: you probably want to decompose oe and ae and nobody would type
# a German ß
# unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl
# unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl ’' ❜' ʼ' ‐-
# . The default for all until someone protests follows. These decompositions
# are not performed by unac, but it is unlikely that someone would type the
# composed forms in a search.
# unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl</descr></var>
unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl
# unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl ’' ❜' ʼ' ‐-
</descr></var>
unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl ’' ❜' ʼ' ‐-
# <var name="maildefcharset" type="string"><brief>Overrides the default
# character set for email messages which don't specify

View File

@ -41,7 +41,7 @@ daemSkippedPaths = \
@RECOLL_TESTDATA@/skipped/real* \
@RECOLL_TESTDATA@/config
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE ﬁfi ﬂfl
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE ﬁfi ﬂfl ’' ❜' ʼ' ‐-
pdfextrameta = pdf:Producer dc:identifier
pdfextrametafix = @RECOLL_TESTS@/config/pdfemf.py