New text splitter with word accumulator and full partial span generation. Search/Index seem ok. There is still a problem when it is used for highlighting (preview)
This commit is contained in:
parent
f1b132bb12
commit
ece15318ab
13
.hgignore
13
.hgignore
@ -108,12 +108,25 @@ src/recollinstall
|
|||||||
src/sampleconf/rclmon.sh
|
src/sampleconf/rclmon.sh
|
||||||
src/sampleconf/recoll.conf
|
src/sampleconf/recoll.conf
|
||||||
src/utils/alldeps
|
src/utils/alldeps
|
||||||
|
tests/casediac/aspdict.en.rws
|
||||||
|
tests/casediac/idxstatus.txt
|
||||||
|
tests/casediac/index.pid
|
||||||
|
tests/casediac/mimeview
|
||||||
|
tests/casediac/missing
|
||||||
|
tests/casediac/recoll.conf
|
||||||
|
tests/casediac/xapiandb
|
||||||
tests/config/aspdict.en.rws
|
tests/config/aspdict.en.rws
|
||||||
tests/config/history
|
tests/config/history
|
||||||
tests/config/idxstatus.txt
|
tests/config/idxstatus.txt
|
||||||
tests/config/index.pid
|
tests/config/index.pid
|
||||||
tests/config/missing
|
tests/config/missing
|
||||||
tests/config/xapiandb
|
tests/config/xapiandb
|
||||||
|
tests/indexedmimetypes/aspdict.en.rws
|
||||||
tests/indexedmimetypes/idxstatus.txt
|
tests/indexedmimetypes/idxstatus.txt
|
||||||
tests/indexedmimetypes/index.pid
|
tests/indexedmimetypes/index.pid
|
||||||
|
tests/indexedmimetypes/mimeview
|
||||||
|
tests/indexedmimetypes/missing
|
||||||
|
tests/indexedmimetypes/recoll.conf
|
||||||
|
tests/indexedmimetypes/xapiandb
|
||||||
|
tests/xattr/mimeview
|
||||||
website/usermanual/*
|
website/usermanual/*
|
||||||
|
|||||||
@ -36,10 +36,14 @@ using namespace std;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Splitting a text into words. The code in this file works with utf-8
|
* Splitting a text into words. The code in this file works with utf-8
|
||||||
* in a semi-clean way (see uproplist.h). Ascii still gets special treatment.
|
* in a semi-clean way (see uproplist.h). Ascii still gets special
|
||||||
|
* treatment in the sense that many special characters can only be
|
||||||
|
* ascii (e.g. @, _,...). However, this compromise works quite well
|
||||||
|
* while being much more light-weight than a full-blown Unicode
|
||||||
|
* approach (ICU...)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// Character classes: we have three main groups, and then some chars
|
// Ascii character classes: we have three main groups, and then some chars
|
||||||
// are their own class because they want special handling.
|
// are their own class because they want special handling.
|
||||||
//
|
//
|
||||||
// We have an array with 256 slots where we keep the character types.
|
// We have an array with 256 slots where we keep the character types.
|
||||||
@ -53,10 +57,10 @@ enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
|
|||||||
A_ULETTER=260, A_LLETTER=261, SKIP=262};
|
A_ULETTER=260, A_LLETTER=261, SKIP=262};
|
||||||
static int charclasses[charclasses_size];
|
static int charclasses[charclasses_size];
|
||||||
|
|
||||||
// Real UTF-8 characters are handled with sets holding all characters
|
// Non-ascii UTF-8 characters are handled with sets holding all
|
||||||
// with interesting properties. This is far from full-blown management
|
// characters with interesting properties. This is far from full-blown
|
||||||
// of Unicode properties, but seems to do the job well enough in most
|
// management of Unicode properties, but seems to do the job well
|
||||||
// common cases
|
// enough in most common cases
|
||||||
static vector<unsigned int> vpuncblocks;
|
static vector<unsigned int> vpuncblocks;
|
||||||
static STD_UNORDERED_SET<unsigned int> spunc;
|
static STD_UNORDERED_SET<unsigned int> spunc;
|
||||||
static STD_UNORDERED_SET<unsigned int> visiblewhite;
|
static STD_UNORDERED_SET<unsigned int> visiblewhite;
|
||||||
@ -195,12 +199,12 @@ bool TextSplit::o_processCJK = true;
|
|||||||
unsigned int TextSplit::o_CJKNgramLen = 2;
|
unsigned int TextSplit::o_CJKNgramLen = 2;
|
||||||
bool TextSplit::o_noNumbers = false;
|
bool TextSplit::o_noNumbers = false;
|
||||||
|
|
||||||
// Do some checking (the kind which is simpler to do here than in the
|
// Final term checkpoint: do some checking (the kind which is simpler
|
||||||
// main loop), then send term to our client.
|
// to do here than in the main loop), then send term to our client.
|
||||||
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
||||||
int btstart, int btend)
|
int btstart, int btend)
|
||||||
{
|
{
|
||||||
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
LOGDEB2(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
||||||
|
|
||||||
unsigned int l = w.length();
|
unsigned int l = w.length();
|
||||||
|
|
||||||
@ -236,60 +240,133 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check for an acronym/abbreviation ie I.B.M. This only works with
|
||||||
|
// ascii (no non-ascii utf-8 acronyms are possible)
|
||||||
|
bool TextSplit::span_is_acronym(string *acronym)
|
||||||
|
{
|
||||||
|
bool acron = false;
|
||||||
|
|
||||||
|
if (m_wordLen != m_span.length() &&
|
||||||
|
m_span.length() > 2 && m_span.length() <= 20) {
|
||||||
|
acron = true;
|
||||||
|
// Check odd chars are '.'
|
||||||
|
for (unsigned int i = 1 ; i < m_span.length(); i += 2) {
|
||||||
|
if (m_span[i] != '.') {
|
||||||
|
acron = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (acron) {
|
||||||
|
// Check that even chars are letters
|
||||||
|
for (unsigned int i = 0 ; i < m_span.length(); i += 2) {
|
||||||
|
int c = m_span[i];
|
||||||
|
if (!((c >= 'a' && c <= 'z')||(c >= 'A' && c <= 'Z'))) {
|
||||||
|
acron = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (acron) {
|
||||||
|
for (unsigned int i = 0; i < m_span.length(); i += 2) {
|
||||||
|
*acronym += m_span[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return acron;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Generate terms from span. Have to take into account the
|
||||||
|
// flags: ONLYSPANS, NOSPANS, noNumbers
|
||||||
|
bool TextSplit::words_from_span()
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
cerr << "Span: [" << m_span << "] " << " w_i_s size: " <<
|
||||||
|
m_words_in_span.size() << " : ";
|
||||||
|
for (unsigned int i = 0; i < m_words_in_span.size(); i++) {
|
||||||
|
cerr << " [" << m_words_in_span[i].first << " " <<
|
||||||
|
m_words_in_span[i].second << "] ";
|
||||||
|
|
||||||
|
}
|
||||||
|
cerr << endl;
|
||||||
|
#endif
|
||||||
|
unsigned int spanwords = m_words_in_span.size();
|
||||||
|
int pos = m_spanpos;
|
||||||
|
|
||||||
|
for (unsigned int i = 0;
|
||||||
|
i < ((m_flags&TXTS_ONLYSPANS) ? 1 : spanwords);
|
||||||
|
i++, pos++) {
|
||||||
|
|
||||||
|
int deb = m_words_in_span[i].first;
|
||||||
|
|
||||||
|
for (unsigned int j = ((m_flags&TXTS_ONLYSPANS) ? spanwords-1 : i);
|
||||||
|
j < ((m_flags&TXTS_NOSPANS) ? i+1 : spanwords);
|
||||||
|
j++) {
|
||||||
|
|
||||||
|
int fin = m_words_in_span[j].second;
|
||||||
|
//cerr << "i " << i << " j " << j << " deb " << deb <<
|
||||||
|
// " fin " << fin << endl;
|
||||||
|
if (fin - deb > int(m_span.size()))
|
||||||
|
break;
|
||||||
|
string word(m_span.substr(deb, fin-deb));
|
||||||
|
if (!emitterm(j != i+1, word, pos, deb, fin))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A routine called from different places in text_to_words(), to
|
* A method called at word boundaries (different places in
|
||||||
* adjust the current state of the parser, and call the word
|
* text_to_words()), to adjust the current state of the parser, and
|
||||||
* handler/emitter. Emit and reset the current word, possibly emit the current
|
* possibly generate term(s). While inside a span (words linked by
|
||||||
* span (if different). In query mode, words are not emitted, only final spans
|
* glue characters), we just keep track of the word boundaries. Once
|
||||||
|
* actual white-space is reached, we get called with spanerase set to
|
||||||
|
* true, and we process the span, calling the emitterm() routine for
|
||||||
|
* each generated term.
|
||||||
*
|
*
|
||||||
* This is purely for factoring common code from different places in
|
* The object flags can modify our behaviour, deciding if we only emit
|
||||||
* text_to_words().
|
* single words (bill, recoll, org), only spans (bill@recoll.org), or
|
||||||
|
* words and spans (bill@recoll.org, recoll.org, bill, recoll...)
|
||||||
*
|
*
|
||||||
* @return true if ok, false for error. Splitting should stop in this case.
|
* @return true if ok, false for error. Splitting should stop in this case.
|
||||||
* @param spanerase Set if the current span is at its end. Reset it.
|
* @param spanerase Set if the current span is at its end. Process it.
|
||||||
* @param bp The current BYTE position in the stream
|
* @param bp The current BYTE position in the stream
|
||||||
* @param spanemit This is set for intermediate spans: glue char changed.
|
|
||||||
*/
|
*/
|
||||||
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
inline bool TextSplit::doemit(bool spanerase, int bp)
|
||||||
{
|
{
|
||||||
LOGDEB2(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
|
LOGDEB2(("TextSplit::doemit: sper %d bp %d spp %d spanwords %u wS %d wL %d "
|
||||||
"inn %d span [%s]\n",
|
"inn %d span [%s]\n",
|
||||||
spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
|
spanerase, bp, m_spanpos, m_words_in_span.size(),
|
||||||
m_inNumber, m_span.c_str()));
|
m_wordStart, m_wordLen, m_inNumber, m_span.c_str()));
|
||||||
|
|
||||||
// Emit span? When splitting for query, we only emit final spans
|
if (m_wordLen) {
|
||||||
// (spanerase)
|
// We have a current word. Remember it
|
||||||
bool spanemitted = false;
|
|
||||||
if (!(m_flags & TXTS_NOSPANS) &&
|
|
||||||
!((m_wordLen == m_span.length()) &&
|
|
||||||
(o_noNumbers) && m_inNumber) &&
|
|
||||||
((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
|
|
||||||
|
|
||||||
// Check for an acronym/abbreviation ie I.B.M.
|
// Limit max span word count
|
||||||
if (spanerase && m_wordLen != m_span.length() && m_span.length() > 2
|
if (m_words_in_span.size() >= 6) {
|
||||||
&& m_span.length() <= 20) {
|
spanerase = true;
|
||||||
bool acron = true;
|
}
|
||||||
for (unsigned int i = 1 ; i < m_span.length(); i += 2) {
|
|
||||||
if (m_span[i] != '.') {
|
|
||||||
acron = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (acron) {
|
|
||||||
string acronym;
|
|
||||||
for (unsigned int i = 0; i < m_span.length(); i += 2) {
|
|
||||||
acronym += m_span[i];
|
|
||||||
}
|
|
||||||
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(),
|
|
||||||
bp))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Maybe trim at end. These are chars that we would keep inside
|
m_words_in_span.push_back(pair<int,int>(m_wordStart,
|
||||||
// a span, but not at the end
|
m_wordStart + m_wordLen));
|
||||||
|
m_wordpos++;
|
||||||
|
m_wordLen = m_wordChars = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (spanerase) {
|
||||||
|
// We encountered a span-terminating character. Produce terms.
|
||||||
|
|
||||||
|
string acronym;
|
||||||
|
if (span_is_acronym(&acronym)) {
|
||||||
|
if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(), bp))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Maybe trim at end. These are chars that we might keep
|
||||||
|
// inside a span, but not at the end.
|
||||||
while (m_span.length() > 0) {
|
while (m_span.length() > 0) {
|
||||||
switch (m_span[m_span.length()-1]) {
|
switch (*(m_span.rbegin())) {
|
||||||
case '.':
|
case '.':
|
||||||
case '-':
|
case '-':
|
||||||
case ',':
|
case ',':
|
||||||
@ -297,37 +374,26 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
|||||||
case '_':
|
case '_':
|
||||||
case '\'':
|
case '\'':
|
||||||
m_span.resize(m_span.length()-1);
|
m_span.resize(m_span.length()-1);
|
||||||
|
if (m_words_in_span.back().second > m_span.size())
|
||||||
|
m_words_in_span.back().second = m_span.size();
|
||||||
if (--bp < 0)
|
if (--bp < 0)
|
||||||
bp = 0;
|
bp = 0;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
goto breakloop1;
|
goto breaktrimloop;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
breakloop1:
|
breaktrimloop:
|
||||||
spanemitted = true;
|
|
||||||
if (!emitterm(true, m_span, m_spanpos, bp - m_span.length(), bp))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Emit word if different from span and not 'no words' mode
|
if (!words_from_span()) {
|
||||||
if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen &&
|
return false;
|
||||||
!(o_noNumbers && m_inNumber) &&
|
}
|
||||||
(!spanemitted || m_wordLen != m_span.length())) {
|
|
||||||
string s(m_span.substr(m_wordStart, m_wordLen));
|
|
||||||
if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Adjust state
|
|
||||||
if (m_wordLen) {
|
|
||||||
m_wordpos++;
|
|
||||||
m_wordLen = m_wordChars = 0;
|
|
||||||
}
|
|
||||||
if (spanerase) {
|
|
||||||
discardspan();
|
discardspan();
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
m_wordStart = m_span.length();
|
m_wordStart = m_span.length();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -335,6 +401,7 @@ inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
|
|||||||
|
|
||||||
void TextSplit::discardspan()
|
void TextSplit::discardspan()
|
||||||
{
|
{
|
||||||
|
m_words_in_span.clear();
|
||||||
m_span.erase();
|
m_span.erase();
|
||||||
m_spanpos = m_wordpos;
|
m_spanpos = m_wordpos;
|
||||||
m_wordStart = 0;
|
m_wordStart = 0;
|
||||||
@ -353,9 +420,9 @@ static inline bool isdigit(int what, unsigned int flgs)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef TEXTSPLIT_STATS
|
#ifdef TEXTSPLIT_STATS
|
||||||
#define INC_WORDCHARS ++m_wordChars
|
#define STATS_INC_WORDCHARS ++m_wordChars
|
||||||
#else
|
#else
|
||||||
#define INC_WORDCHARS
|
#define STATS_INC_WORDCHARS
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -380,7 +447,6 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos
|
m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos
|
||||||
= m_spanpos = 0;
|
= m_spanpos = 0;
|
||||||
int curspanglue = 0;
|
|
||||||
bool pagepending = false;
|
bool pagepending = false;
|
||||||
bool softhyphenpending = false;
|
bool softhyphenpending = false;
|
||||||
|
|
||||||
@ -419,6 +485,7 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
|
|
||||||
int cc = whatcc(c);
|
int cc = whatcc(c);
|
||||||
|
|
||||||
switch (cc) {
|
switch (cc) {
|
||||||
case SKIP:
|
case SKIP:
|
||||||
// Special-case soft-hyphen. To work, this depends on the
|
// Special-case soft-hyphen. To work, this depends on the
|
||||||
@ -432,18 +499,18 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
}
|
}
|
||||||
// Skips the softhyphenpending reset
|
// Skips the softhyphenpending reset
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
case DIGIT:
|
case DIGIT:
|
||||||
|
nonalnumcnt = 0;
|
||||||
if (m_wordLen == 0)
|
if (m_wordLen == 0)
|
||||||
m_inNumber = true;
|
m_inNumber = true;
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
INC_WORDCHARS;
|
STATS_INC_WORDCHARS;
|
||||||
nonalnumcnt = 0;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SPACE:
|
case SPACE:
|
||||||
SPACE:
|
|
||||||
curspanglue = 0;
|
|
||||||
nonalnumcnt = 0;
|
nonalnumcnt = 0;
|
||||||
|
SPACE:
|
||||||
if (m_wordLen || m_span.length()) {
|
if (m_wordLen || m_span.length()) {
|
||||||
if (!doemit(true, it.getBpos()))
|
if (!doemit(true, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
@ -464,7 +531,6 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
|
|
||||||
case '-':
|
case '-':
|
||||||
case '+':
|
case '+':
|
||||||
curspanglue = cc;
|
|
||||||
if (m_wordLen == 0) {
|
if (m_wordLen == 0) {
|
||||||
// + or - don't start a term except if this looks like
|
// + or - don't start a term except if this looks like
|
||||||
// it's going to be a number
|
// it's going to be a number
|
||||||
@ -472,21 +538,38 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
// -10
|
// -10
|
||||||
m_inNumber = true;
|
m_inNumber = true;
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
INC_WORDCHARS;
|
STATS_INC_WORDCHARS;
|
||||||
} else {
|
break;
|
||||||
goto SPACE;
|
|
||||||
}
|
}
|
||||||
} else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
|
} else if (m_inNumber) {
|
||||||
|
if ((m_span[m_span.length() - 1] == 'e' ||
|
||||||
m_span[m_span.length() - 1] == 'E')) {
|
m_span[m_span.length() - 1] == 'E')) {
|
||||||
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
|
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
INC_WORDCHARS;
|
STATS_INC_WORDCHARS;
|
||||||
} else {
|
break;
|
||||||
goto SPACE;
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
goto SPACE;
|
if (cc == '+') {
|
||||||
|
int nextc = it[it.getCpos()+1];
|
||||||
|
if (nextc == '+' || nextc == -1 || visiblewhite.find(nextc)
|
||||||
|
!= visiblewhite.end()) {
|
||||||
|
// someword++[+...] !
|
||||||
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
|
STATS_INC_WORDCHARS;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Treat '-' inside span as glue char
|
||||||
|
if (!doemit(false, it.getBpos()))
|
||||||
|
return false;
|
||||||
|
m_inNumber = false;
|
||||||
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
goto SPACE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '.':
|
case '.':
|
||||||
@ -497,120 +580,91 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
if (m_inNumber) {
|
if (m_inNumber) {
|
||||||
if (!isdigit(nextwhat, m_flags))
|
if (!isdigit(nextwhat, m_flags))
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
INC_WORDCHARS;
|
STATS_INC_WORDCHARS;
|
||||||
curspanglue = cc;
|
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// If . inside a word, it's spanglue, else, it's whitespace.
|
// Found '.' while not in number
|
||||||
// We also keep an initial '.' for catching .net, but this adds
|
|
||||||
// quite a few spurious terms !
|
|
||||||
// Another problem is that something like .x-errs
|
|
||||||
// will be split as .x-errs, x, errs but not x-errs
|
|
||||||
// A final comma in a word will be removed by doemit
|
|
||||||
|
|
||||||
// Only letters and digits make sense after
|
// Only letters and digits make sense after
|
||||||
if (!isalphanum(nextwhat, m_flags))
|
if (!isalphanum(nextwhat, m_flags))
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
|
|
||||||
if (cc == '.') {
|
// Keep an initial '.' for catching .net, and .34 (aka
|
||||||
|
// 0.34) but this adds quite a few spurious terms !
|
||||||
|
if (m_span.length() == 0) {
|
||||||
// Check for number like .1
|
// Check for number like .1
|
||||||
if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
|
if (isdigit(nextwhat, m_flags)) {
|
||||||
m_inNumber = true;
|
m_inNumber = true;
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
|
||||||
INC_WORDCHARS;
|
|
||||||
curspanglue = cc;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
if (m_wordLen) {
|
STATS_INC_WORDCHARS;
|
||||||
// Disputable special case: set spanemit to
|
break;
|
||||||
// true when encountering a '.' while spanglue
|
}
|
||||||
// is '_'. Think of a_b.c Done to
|
|
||||||
// avoid breaking stuff after changing '_'
|
// '.' between words: span glue
|
||||||
// from wordchar to spanglue
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos(), curspanglue == '_'))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
curspanglue = cc;
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
// span length could have been adjusted by trimming
|
}
|
||||||
// inside doemit
|
|
||||||
if (m_span.length())
|
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
|
||||||
curspanglue = cc;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
goto SPACE;
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '@':
|
case '@':
|
||||||
if (m_wordLen) {
|
|
||||||
if (!doemit(false, it.getBpos()))
|
|
||||||
return false;
|
|
||||||
curspanglue = cc;
|
|
||||||
m_inNumber = false;
|
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
|
||||||
} else {
|
|
||||||
goto SPACE;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case '_':
|
case '_':
|
||||||
if (m_wordLen) {
|
|
||||||
if (!doemit(false, it.getBpos()))
|
|
||||||
return false;
|
|
||||||
curspanglue = cc;
|
|
||||||
m_inNumber = false;
|
|
||||||
}
|
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
|
||||||
break;
|
|
||||||
case '\'':
|
case '\'':
|
||||||
// If in word, potential span: o'brien, else, this is more
|
// If in word, potential span: o'brien, jf@dockes.org,
|
||||||
// whitespace
|
// else just ignore
|
||||||
if (m_wordLen) {
|
if (m_wordLen) {
|
||||||
if (!doemit(false, it.getBpos()))
|
if (!doemit(false, it.getBpos()))
|
||||||
return false;
|
return false;
|
||||||
curspanglue = cc;
|
|
||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
m_wordStart += it.appendchartostring(m_span);
|
m_wordStart += it.appendchartostring(m_span);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '#':
|
case '#':
|
||||||
// Keep it only at end of word ... Special case for c# you see...
|
// Keep it only at end of word ... Special case for c# you see...
|
||||||
if (m_wordLen > 0) {
|
if (m_wordLen > 0) {
|
||||||
int w = whatcc(it[it.getCpos()+1]);
|
int w = whatcc(it[it.getCpos()+1]);
|
||||||
if (w == SPACE || w == '\n' || w == '\r') {
|
if (w == SPACE || w == '\n' || w == '\r') {
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
INC_WORDCHARS;
|
STATS_INC_WORDCHARS;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\n':
|
case '\n':
|
||||||
case '\r':
|
case '\r':
|
||||||
if ((m_span.length() && m_span[m_span.length() - 1] == '-') ||
|
if (m_span.length() && *m_span.rbegin() == '-') {
|
||||||
softhyphenpending) {
|
// if '-' is the last char before end of line, we
|
||||||
// if '-' is the last char before end of line, just
|
// strip it. We have no way to know if this is added
|
||||||
// ignore the line change. This is the right thing to
|
// because of the line split or if it was part of an
|
||||||
// do almost always. We'd then need a way to check if
|
// actual compound word (would need a dictionary to
|
||||||
// the - was added as part of the word hyphenation, or was
|
// check). As soft-hyphen *should* be used if the '-'
|
||||||
// there in the first place, but this would need a dictionary.
|
// is not part of the text, it is better to properly
|
||||||
|
// process a real compound word, and produce wrong
|
||||||
|
// output from wrong text. The word-emitting routine
|
||||||
|
// will strip the trailing '-'.
|
||||||
|
goto SPACE;
|
||||||
|
} else if (softhyphenpending) {
|
||||||
// Don't reset soft-hyphen
|
// Don't reset soft-hyphen
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
// Handle like a normal separator
|
// Normal case: EOL is white space
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\f':
|
case '\f':
|
||||||
pagepending = true;
|
pagepending = true;
|
||||||
goto SPACE;
|
goto SPACE;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
#ifdef RCL_SPLIT_CAMELCASE
|
#ifdef RCL_SPLIT_CAMELCASE
|
||||||
// Camelcase handling.
|
// Camelcase handling.
|
||||||
// If we get uppercase ascii after lowercase ascii, emit word.
|
// If we get uppercase ascii after lowercase ascii, emit word.
|
||||||
@ -651,15 +705,14 @@ bool TextSplit::text_to_words(const string &in)
|
|||||||
goto NORMALCHAR;
|
goto NORMALCHAR;
|
||||||
#endif /* CAMELCASE */
|
#endif /* CAMELCASE */
|
||||||
|
|
||||||
|
|
||||||
default:
|
default:
|
||||||
NORMALCHAR:
|
NORMALCHAR:
|
||||||
|
nonalnumcnt = 0;
|
||||||
if (m_inNumber && c != 'e' && c != 'E') {
|
if (m_inNumber && c != 'e' && c != 'E') {
|
||||||
m_inNumber = false;
|
m_inNumber = false;
|
||||||
}
|
}
|
||||||
m_wordLen += it.appendchartostring(m_span);
|
m_wordLen += it.appendchartostring(m_span);
|
||||||
INC_WORDCHARS;
|
STATS_INC_WORDCHARS;
|
||||||
nonalnumcnt = 0;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
softhyphenpending = false;
|
softhyphenpending = false;
|
||||||
@ -917,27 +970,73 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static string teststring =
|
#define OPT_s 0x1
|
||||||
"Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n"
|
#define OPT_w 0x2
|
||||||
"\"Jean-Francois Dockes\" <jfd@okyz.com>\n"
|
#define OPT_q 0x4
|
||||||
"n@d @net .net t@v@c c# c++ o'brien 'o'brien' l'ami\n"
|
#define OPT_c 0x8
|
||||||
"data123\n"
|
#define OPT_k 0x10
|
||||||
"134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n"
|
#define OPT_C 0x20
|
||||||
"@^#$(#$(*)\n"
|
#define OPT_n 0x40
|
||||||
"192.168.4.1 one\n\rtwo\r"
|
#define OPT_S 0x80
|
||||||
"Debut-\ncontinue\n"
|
#define OPT_u 0x100
|
||||||
"[olala][ululu] (valeur) (23)\n"
|
|
||||||
"utf-8 ucs-4© \\nodef\n"
|
bool dosplit(const string& data, TextSplit::Flags flags, int op_flags)
|
||||||
"A b C 2 . +"
|
{
|
||||||
"','this\n"
|
myTermProc printproc;
|
||||||
" ,able,test-domain "
|
|
||||||
" -wl,--export-dynamic "
|
Rcl::TermProc *nxt = &printproc;
|
||||||
" ~/.xsession-errors "
|
|
||||||
"soft\xc2\xadhyphen "
|
// Rcl::TermProcCommongrams commonproc(nxt, stoplist);
|
||||||
"soft\xc2\xad\nhyphen "
|
// if (op_flags & OPT_S)
|
||||||
"soft\xc2\xad\n\rhyphen "
|
// nxt = &commonproc;
|
||||||
"hard-\nhyphen "
|
|
||||||
;
|
Rcl::TermProcPrep preproc(nxt);
|
||||||
|
if (op_flags & OPT_u)
|
||||||
|
nxt = &preproc;
|
||||||
|
|
||||||
|
Rcl::TextSplitP splitter(nxt, flags);
|
||||||
|
|
||||||
|
if (op_flags & OPT_q)
|
||||||
|
printproc.setNoOut(true);
|
||||||
|
|
||||||
|
splitter.text_to_words(data);
|
||||||
|
|
||||||
|
#ifdef TEXTSPLIT_STATS
|
||||||
|
TextSplit::Stats::Values v = splitter.getStats();
|
||||||
|
cout << "Average length: "
|
||||||
|
<< v.avglen
|
||||||
|
<< " Standard deviation: "
|
||||||
|
<< v.sigma
|
||||||
|
<< " Coef of variation "
|
||||||
|
<< v.sigma / v.avglen
|
||||||
|
<< endl;
|
||||||
|
#endif
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char *teststrings[] = {
|
||||||
|
"Un bout de texte \nnormal. 2eme phrase.3eme;quatrieme.\n",
|
||||||
|
"\"Jean-Francois Dockes\" <jfd@okyz.com>\n",
|
||||||
|
"n@d @net .net net@ t@v@c c# c++ o'brien 'o'brien'",
|
||||||
|
"_network_ some_span",
|
||||||
|
"data123\n",
|
||||||
|
"134 +134 -14 0.1 .1 2. -1.5 +1.5 1,2 1.54e10 1,2e30 .1e10 1.e-8\n",
|
||||||
|
"@^#$(#$(*)\n",
|
||||||
|
"192.168.4.1 one\n\rtwo\r",
|
||||||
|
"[olala][ululu] (valeur) (23)\n",
|
||||||
|
"utf-8 ucs-4© \\nodef\n",
|
||||||
|
"A b C 2 . +",
|
||||||
|
"','this\n",
|
||||||
|
" ,able,test-domain",
|
||||||
|
" -wl,--export-dynamic",
|
||||||
|
" ~/.xsession-errors",
|
||||||
|
"this_very_long_span_this_very_long_span_this_very_long_span",
|
||||||
|
"soft\xc2\xadhyphen",
|
||||||
|
"soft\xc2\xad\nhyphen",
|
||||||
|
"soft\xc2\xad\n\rhyphen",
|
||||||
|
"hard-\nhyphen",
|
||||||
|
};
|
||||||
|
const int teststrings_cnt = sizeof(teststrings)/sizeof(char *);
|
||||||
|
|
||||||
static string teststring1 = " nouvel-an ";
|
static string teststring1 = " nouvel-an ";
|
||||||
|
|
||||||
@ -966,15 +1065,6 @@ Usage(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int op_flags;
|
static int op_flags;
|
||||||
#define OPT_s 0x1
|
|
||||||
#define OPT_w 0x2
|
|
||||||
#define OPT_q 0x4
|
|
||||||
#define OPT_c 0x8
|
|
||||||
#define OPT_k 0x10
|
|
||||||
#define OPT_C 0x20
|
|
||||||
#define OPT_n 0x40
|
|
||||||
#define OPT_S 0x80
|
|
||||||
#define OPT_u 0x100
|
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
@ -1043,9 +1133,13 @@ int main(int argc, char **argv)
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
cout << endl << teststring << endl << endl;
|
for (int i = 0; i < teststrings_cnt; i++) {
|
||||||
odata = teststring;
|
cout << endl << teststrings[i] << endl;
|
||||||
|
dosplit(teststrings[i], flags, op_flags);
|
||||||
|
}
|
||||||
|
exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
string& data = odata;
|
string& data = odata;
|
||||||
string ndata;
|
string ndata;
|
||||||
if ((op_flags & OPT_C)) {
|
if ((op_flags & OPT_C)) {
|
||||||
@ -1061,34 +1155,7 @@ int main(int argc, char **argv)
|
|||||||
int n = TextSplit::countWords(data, flags);
|
int n = TextSplit::countWords(data, flags);
|
||||||
cout << n << " words" << endl;
|
cout << n << " words" << endl;
|
||||||
} else {
|
} else {
|
||||||
myTermProc printproc;
|
dosplit(data, flags, op_flags);
|
||||||
|
|
||||||
Rcl::TermProc *nxt = &printproc;
|
|
||||||
|
|
||||||
Rcl::TermProcCommongrams commonproc(nxt, stoplist);
|
|
||||||
if (op_flags & OPT_S)
|
|
||||||
nxt = &commonproc;
|
|
||||||
|
|
||||||
Rcl::TermProcPrep preproc(nxt);
|
|
||||||
if (op_flags & OPT_u)
|
|
||||||
nxt = &preproc;
|
|
||||||
|
|
||||||
Rcl::TextSplitP splitter(nxt, flags);
|
|
||||||
|
|
||||||
if (op_flags & OPT_q)
|
|
||||||
printproc.setNoOut(true);
|
|
||||||
|
|
||||||
splitter.text_to_words(data);
|
|
||||||
#ifdef TEXTSPLIT_STATS
|
|
||||||
TextSplit::Stats::Values v = splitter.getStats();
|
|
||||||
cout << "Average length: "
|
|
||||||
<< v.avglen
|
|
||||||
<< " Standard deviation: "
|
|
||||||
<< v.sigma
|
|
||||||
<< " Coef of variation "
|
|
||||||
<< v.sigma / v.avglen
|
|
||||||
<< endl;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // TEST
|
#endif // TEST
|
||||||
|
|||||||
@ -24,6 +24,7 @@
|
|||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
using std::pair;
|
||||||
|
|
||||||
class Utf8Iter;
|
class Utf8Iter;
|
||||||
|
|
||||||
@ -55,12 +56,19 @@ public:
|
|||||||
o_noNumbers = true;
|
o_noNumbers = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
enum Flags {TXTS_NONE = 0,
|
enum Flags {
|
||||||
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
// Default: will return spans and words (a_b, a, b)
|
||||||
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
TXTS_NONE = 0,
|
||||||
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
// Only return maximum spans (a@b.com, not a, b, or com)
|
||||||
|
TXTS_ONLYSPANS = 1,
|
||||||
|
// Special: Only return atomic words (a, b, com). This is not
|
||||||
|
// used for indexing, but for position computation during
|
||||||
|
// abstract generation.
|
||||||
|
TXTS_NOSPANS = 2,
|
||||||
|
// Handle wildcards as letters. This is used with ONLYSPANS
|
||||||
|
// for parsing a user query (never alone).
|
||||||
|
TXTS_KEEPWILD = 4
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
TextSplit(Flags flags = Flags(TXTS_NONE))
|
TextSplit(Flags flags = Flags(TXTS_NONE))
|
||||||
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
||||||
@ -177,6 +185,8 @@ private:
|
|||||||
// Current span. Might be jf.dockes@wanadoo.f
|
// Current span. Might be jf.dockes@wanadoo.f
|
||||||
string m_span;
|
string m_span;
|
||||||
|
|
||||||
|
vector <pair<unsigned int, unsigned int> > m_words_in_span;
|
||||||
|
|
||||||
// Current word: no punctuation at all in there. Byte offset
|
// Current word: no punctuation at all in there. Byte offset
|
||||||
// relative to the current span and byte length
|
// relative to the current span and byte length
|
||||||
int m_wordStart;
|
int m_wordStart;
|
||||||
@ -207,8 +217,10 @@ private:
|
|||||||
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
||||||
|
|
||||||
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
||||||
bool doemit(bool spanerase, int bp, bool spanemit=false);
|
bool doemit(bool spanerase, int bp);
|
||||||
void discardspan();
|
void discardspan();
|
||||||
|
bool span_is_acronym(std::string *acronym);
|
||||||
|
bool words_from_span();
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user