more textsplit tweaking

This commit is contained in:
dockes 2006-01-28 10:23:55 +00:00
parent c50f023002
commit 8c9eb8c6d3
6 changed files with 118 additions and 114 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.16 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.17 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -79,7 +79,7 @@ static void setcharclasses()
for (i = 0; i < strlen(blankspace); i++) for (i = 0; i < strlen(blankspace); i++)
charclasses[int(blankspace[i])] = SPACE; charclasses[int(blankspace[i])] = SPACE;
char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*`?"; char seps[] = "!\"$%&()/<=>[\\]^{|}~:;*`?";
for (i = 0; i < strlen(seps); i++) for (i = 0; i < strlen(seps); i++)
charclasses[int(seps[i])] = SPACE; charclasses[int(seps[i])] = SPACE;
@ -91,6 +91,7 @@ static void setcharclasses()
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl; //for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
for (i = 0; i < sizeof(uniign); i++) for (i = 0; i < sizeof(uniign); i++)
unicign.insert(uniign[i]); unicign.insert(uniign[i]);
unicign.insert((unsigned int)-1);
} }
// Do some cleanup (the kind which is simpler to do here than in the // Do some cleanup (the kind which is simpler to do here than in the
@ -100,9 +101,8 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
{ {
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos)); LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
if (!cb) // Maybe trim end of word. These are chars that we would keep inside
return false; // a word or span, but not at the end
// Maybe trim end of word. These are chars that we would keep inside // Maybe trim end of word. These are chars that we would keep inside
// a word or span, but not at the end // a word or span, but not at the end
while (w.length() > 0) { while (w.length() > 0) {
@ -111,8 +111,9 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
case ',': case ',':
case '@': case '@':
case '\'': case '\'':
w.erase(w.length()-1); w.resize(w.length()-1);
btend--; if (btend < 0) btend=0; if (--btend < 0)
btend=0;
break; break;
default: default:
goto breakloop1; goto breakloop1;
@ -120,30 +121,21 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
} }
breakloop1: breakloop1:
// In addition, it doesn't make sense currently to keep ' at the beginning // Trimming chars at the beginning of string: used to have (buggy)
while (w.length() > 0) { // code to remove , and \ at start of term, didn't seem to be ever called
switch (w[0]) {
case ',':
case '\'':
w.erase(w.length()-1);
btstart++;
break;
default:
goto breakloop2;
}
}
breakloop2:
// 1 char word: we index single letters, but nothing else unsigned int l = w.length();
if (w.length() == 1) { if (l > 0 && l < (unsigned)maxWordLength) {
int c = (int)w[0]; if (l == 1) {
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) { // 1 char word: we index single letters and digits, but
//cerr << "ERASING single letter term " << c << endl; // nothing else
w.erase(); int c = (int)w[0];
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
//cerr << "ERASING single letter term " << c << endl;
return true;
}
} }
} if (pos != prevpos || l != prevterm.length() || w != prevterm) {
if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
if (w != prevterm || pos != prevpos) {
bool ret = cb->takeword(w, pos, btstart, btend); bool ret = cb->takeword(w, pos, btstart, btend);
prevterm = w; prevterm = w;
prevpos = pos; prevpos = pos;
@ -153,11 +145,26 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
return true; return true;
} }
// A routine called from different places in text_to_words(), to adjust /**
// the current state and call the word handler. This is purely for * A routine called from different places in text_to_words(), to
// factoring common code from different places text_to_words() * adjust the current state of the parser, and call the word
bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos, * handler/emitter. Emit and reset the current word, possibly emit the current
bool spanerase, int bp) * span (if different). In query mode, words are not emitted, only final spans
*
* This is purely for factoring common code from different places in
* text_to_words().
*
* @return true if ok, false for error. Splitting should stop in this case.
* @param word Word value. This will be empty on return in ALL non-error
* cases
* @param wordpos Term position for word. Always incremented by us.
* @param span Span value
* @param spanpos Term position for the current span
* @param spanerase Set if the current span is at its end. Reset it.
* @param bp The current BYTE position in the stream
*/
inline bool TextSplit::doemit(string &word, int &wordpos, string &span,
int &spanpos, bool spanerase, int bp)
{ {
#if 0 #if 0
cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" << cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<
@ -165,43 +172,36 @@ bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
<< endl; << endl;
#endif #endif
// When splitting for query, we only emit final spans // Emit span. When splitting for query, we only emit final spans
if (fq && !spanerase) { if (!fq || spanerase)
wordpos++; if (!emitterm(true, span, spanpos, bp-span.length(), bp))
word.erase(); return false;
return true; // Emit word if different from span and not query mode
}
// Emit span or both word and span if they are different
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
return false;
if (word.length() != span.length() && !fq) if (word.length() != span.length() && !fq)
if (!emitterm(false, word, wordpos, bp-word.length(), bp)) if (!emitterm(false, word, wordpos, bp-word.length(), bp))
return false; return false;
// Adjust state // Adjust state
wordpos++; wordpos++;
if (spanerase) word.clear();
span.erase(); if (spanerase) {
word.erase(); span.clear();
spanpos = wordpos;
}
return true; return true;
} }
static inline int whatcc(unsigned int c) static inline int whatcc(unsigned int c)
{ {
int cc;
if (c <= 127) { if (c <= 127) {
cc = charclasses[c]; return charclasses[c];
} else { } else {
if (c == (unsigned int)-1) if (unicign.find(c) != unicign.end())
cc = SPACE; return SPACE;
else if (unicign.find(c) != unicign.end())
cc = SPACE;
else else
cc = LETTER; return LETTER;
} }
return cc;
} }
/** /**
@ -240,8 +240,6 @@ bool TextSplit::text_to_words(const string &in)
return false; return false;
number = false; number = false;
} }
spanpos = wordpos;
span.erase();
break; break;
case '-': case '-':
case '+': case '+':
@ -259,6 +257,28 @@ bool TextSplit::text_to_words(const string &in)
span += it; span += it;
} }
break; break;
case '.':
case ',':
if (number) {
word += it;
span += it;
break;
} else {
// If . inside a word, keep it, else, this is whitespace.
// A final comma in a word will be removed by doemit
if (cc == '.' && word.length()) {
if (!doemit(word, wordpos, span, spanpos, false,
it.getBpos()))
return false;
// span length could have been adjusted by trimming
// inside doemit
if (span.length())
span += it;
break;
}
}
goto SPACE;
break;
case '@': case '@':
if (word.length()) { if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos())) if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
@ -269,6 +289,8 @@ bool TextSplit::text_to_words(const string &in)
span += it; span += it;
break; break;
case '\'': case '\'':
// If in word, potential span: o'brien, else, this is more
// whitespace
if (word.length()) { if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, false, it.getBpos())) if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
return false; return false;
@ -276,31 +298,17 @@ bool TextSplit::text_to_words(const string &in)
span += it; span += it;
} }
break; break;
case '.':
if (number) {
word += it;
} else {
//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
if (word.length()) {
if (!doemit(word, wordpos, span, spanpos, false,
it.getBpos()))
return false;
number = false;
} else
word += it;
}
span += it;
break;
case '#': case '#':
// Keep it only at end of word... // Keep it only at end of word... Special case for c# you see...
if (word.length() > 0 && if (word.length() > 0) {
(whatcc(it[charpos+1]) == SPACE || int w = whatcc(it[charpos+1]);
whatcc(it[charpos+1]) == '\n' || if (w == SPACE || w == '\n' || w == '\r') {
whatcc(it[charpos+1]) == '\r')) { word += it;
word += it; span += it;
span += it; break;
}
} }
goto SPACE;
break; break;
case '\n': case '\n':
case '\r': case '\r':
@ -310,20 +318,19 @@ bool TextSplit::text_to_words(const string &in)
// do almost always. We'd then need a way to check if // do almost always. We'd then need a way to check if
// the - was added as part of the word hyphenation, or was // the - was added as part of the word hyphenation, or was
// there in the first place, but this would need a dictionary. // there in the first place, but this would need a dictionary.
// Also we'd need to check for a soft-hyphen and remove it,
// but this would require more utf-8 magic
} else { } else {
// Handle like a normal separator // Handle like a normal separator
goto SPACE; goto SPACE;
} }
break; break;
case LETTER:
case DIGIT: case DIGIT:
if (word.length() == 0)
number = true;
/* FALLTHROUGH */
case LETTER:
default: default:
if (word.length() == 0) {
if (cc == DIGIT)
number = true;
else
number = false;
}
word += it; word += it;
span += it; span += it;
break; break;
@ -367,21 +374,18 @@ class mySplitterCB : public TextSplitCB {
} }
}; };
static string teststring = static string teststring1 =
"Un bout de texte \n" "Un bout de texte \nnormal. jfd@okyz.com \n"
"normal. " "Ceci. Est;Oui n@d @net .net t@v@c c# c++ -10 o'brien l'ami \n"
"jfd@okyz.com " "a 134 +134 -14 -1.5 +1.5 1.54e10 a @^#$(#$(*) 1,2 1,2e30\n"
"Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami " "192.168.4.1 one\n\rtwo\nthree-\nfour [olala][ululu] 'o'brien' \n"
"a 134 +134 -14 -1.5 +1.5 1.54e10 a " "utf-8 ucs-4© \\nodef\n"
"@^#$(#$(*) " "','this \n"
"192.168.4.1 " "M9R F($AA;F1L:6YG\"0D)\"0D@(\" @(#4P, T)0W)A=&4)\"0D)\"2 @,C4P#0E3"
"one\n\rtwo\nthree-\nfour " " ,able,test-domain "
"[olala][ululu] " " -wl,--export-dynamic "
"'o'brien' "
"utf-8 ucs-4©"
"\n"
; ;
static string teststring1 = "c++ "; static string teststring = " -wl,--export-dynamic ";
static string thisprog; static string thisprog;

View File

@ -1,6 +1,6 @@
#ifndef _TEXTSPLIT_H_INCLUDED_ #ifndef _TEXTSPLIT_H_INCLUDED_
#define _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_
/* @(#$Id: textsplit.h,v 1.8 2005-10-19 10:21:48 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: textsplit.h,v 1.9 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string> #include <string>
#ifndef NO_NAMESPACES #ifndef NO_NAMESPACES
@ -35,7 +35,7 @@ class TextSplit {
TextSplitCB *cb; TextSplitCB *cb;
int maxWordLength; int maxWordLength;
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be); bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
bool doemit(string &word, int &wordpos, string &span, int spanpos, bool doemit(string &word, int &wordpos, string &span, int &spanpos,
bool spanerase, int bp); bool spanerase, int bp);
public: public:
/** /**

2
src/configure vendored
View File

@ -2296,7 +2296,7 @@ cd ..
m_prefix=$prefix m_prefix=$prefix
test "X$m_prefix" = "XNONE" && m_prefix=/usr/local test "X$m_prefix" = "XNONE" && m_prefix=/usr/local
m_datadir=${m_prefix}/share m_datadir=${m_prefix}/share
QTRECOLL_DATADIR=${m_datadir} QTRECOLL_DATADIR=${m_datadir}/recoll

View File

@ -119,7 +119,7 @@ cd ..
m_prefix=$prefix m_prefix=$prefix
test "X$m_prefix" = "XNONE" && m_prefix=/usr/local test "X$m_prefix" = "XNONE" && m_prefix=/usr/local
m_datadir=${m_prefix}/share m_datadir=${m_prefix}/share
QTRECOLL_DATADIR=${m_datadir} QTRECOLL_DATADIR=${m_datadir}/recoll
AC_SUBST(LIBXAPIAN) AC_SUBST(LIBXAPIAN)
AC_SUBST(XAPIANCXXFLAGS) AC_SUBST(XAPIANCXXFLAGS)

View File

@ -1,5 +1,5 @@
#ifndef lint #ifndef lint
static char rcsid[] = "@(#$Id: xadump.cpp,v 1.9 2006-01-25 08:09:41 dockes Exp $ (C) 2004 J.F.Dockes"; static char rcsid[] = "@(#$Id: xadump.cpp,v 1.10 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes";
#endif #endif
/* /*
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
@ -147,16 +147,12 @@ int main(int argc, char **argv)
if (op_flags & OPT_i) { if (op_flags & OPT_i) {
for (term = db->termlist_begin(docid); for (term = db->termlist_begin(docid);
term != db->termlist_end(docid);term++) { term != db->termlist_end(docid);term++) {
transcode(*term, printable, "UTF-8", outencoding); cout << "[" << *term << "]" << endl;
cout << "[" << printable << "]" << endl;
} }
} else { } else {
for (term = db->allterms_begin(); for (term = db->allterms_begin();
term != db->allterms_end();term++) { term != db->allterms_end();term++) {
if (transcode(*term, printable, "UTF-8", outencoding)) cout << "utf8[" << *term << "]" << endl;
cout << "[" << printable << "]" << endl;
else
cout << "utf8[" << *term << "]" << endl;
} }
} }
} else if (op_flags & OPT_D) { } else if (op_flags & OPT_D) {

View File

@ -1,6 +1,6 @@
#ifndef _UTF8ITER_H_INCLUDED_ #ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_ #define _UTF8ITER_H_INCLUDED_
/* @(#$Id: utf8iter.h,v 1.4 2005-12-07 15:41:50 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: utf8iter.h,v 1.5 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes */
/** /**
* A small helper class to iterate over utf8 strings. This is not an * A small helper class to iterate over utf8 strings. This is not an
@ -44,6 +44,7 @@ class Utf8Iter {
cl = get_cl(pos); cl = get_cl(pos);
if (!poslok(pos, cl)) { if (!poslok(pos, cl)) {
bad = true; bad = true;
pos = s.length();
cl = 0; cl = 0;
return -1; return -1;
} }
@ -91,6 +92,7 @@ class Utf8Iter {
unsigned int val = getvalueat(pos, cl); unsigned int val = getvalueat(pos, cl);
if (val == (unsigned int)-1) { if (val == (unsigned int)-1) {
bad = true; bad = true;
pos = s.length();
cl = 0; cl = 0;
} }
return val; return val;
@ -142,7 +144,9 @@ class Utf8Iter {
return s.substr(pos, cl); return s.substr(pos, cl);
} }
bool eof() { bool eof() {
return bad || pos == s.length(); // Note: we always ensure that pos == s.length() when setting bad to
// true
return pos == s.length();
} }
bool error() { bool error() {
return bad; return bad;