more textsplit tweaking

2006-01-28 10:23:55 +00:00 · 2006-01-28 10:23:55 +00:00 · 8c9eb8c6d3
commit 8c9eb8c6d3
parent c50f023002
6 changed files with 118 additions and 114 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.16 2006-01-23 13:32:28 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.17 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -79,7 +79,7 @@ static void setcharclasses()
    for (i = 0; i < strlen(blankspace); i++)
 	charclasses[int(blankspace[i])] = SPACE;

-    char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*`?";
+    char seps[] = "!\"$%&()/<=>[\\]^{|}~:;*`?";
    for (i = 0; i  < strlen(seps); i++)
 	charclasses[int(seps[i])] = SPACE;

@ -91,6 +91,7 @@ static void setcharclasses()
    //for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
    for (i = 0; i < sizeof(uniign); i++) 
 	unicign.insert(uniign[i]);
+    unicign.insert((unsigned int)-1);
 }

 // Do some cleanup (the kind which is simpler to do here than in the
@ -100,9 +101,8 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
 {
    LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));

-    if (!cb)
-	return false;
-
+    // Maybe trim end of word. These are chars that we would keep inside 
+    // a word or span, but not at the end
    // Maybe trim end of word. These are chars that we would keep inside 
    // a word or span, but not at the end
    while (w.length() > 0) {
@ -111,8 +111,9 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
 	case ',':
 	case '@':
 	case '\'':
-	    w.erase(w.length()-1);
-	    btend--; if (btend < 0) btend=0;
+	    w.resize(w.length()-1);
+	    if (--btend < 0) 
+		btend=0;
 	    break;
 	default:
 	    goto breakloop1;
@ -120,30 +121,21 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
    }
 breakloop1:

-    // In addition, it doesn't make sense currently to keep ' at the beginning
-    while (w.length() > 0) {
-	switch (w[0]) {
-	case ',':
-	case '\'':
-	    w.erase(w.length()-1);
-	    btstart++;
-	    break;
-	default:
-	    goto breakloop2;
-	}
-    }
- breakloop2:
+    // Trimming chars at the beginning of string: used to have (buggy)
+    // code to remove , and \ at start of term, didn't seem to be ever called

-    // 1 char word: we index single letters, but nothing else
-    if (w.length() == 1) {
-	int c = (int)w[0];
-	if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
-	    //cerr << "ERASING single letter term " << c << endl;
-	    w.erase();
+    unsigned int l = w.length();
+    if (l > 0 && l < (unsigned)maxWordLength) {
+	if (l == 1) {
+	    // 1 char word: we index single letters and digits, but
+	    // nothing else
+	    int c = (int)w[0];
+	    if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
+		//cerr << "ERASING single letter term " << c << endl;
+		return true;
+	    }
 	}
-    }
-    if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
-	if (w != prevterm || pos != prevpos) {
+	if (pos != prevpos || l != prevterm.length() || w != prevterm) {
 	    bool ret = cb->takeword(w, pos, btstart, btend);
 	    prevterm = w;
 	    prevpos = pos;
@ -153,11 +145,26 @@ bool TextSplit::emitterm(bool isspan, string &w, int pos,
    return true;
 }

-// A routine called from different places in text_to_words(), to adjust
-// the current state and call the word handler. This is purely for
-// factoring common code from different places text_to_words()
-bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
-		       bool spanerase, int bp)
+/**
+ * A routine called from different places in text_to_words(), to
+ * adjust the current state of the parser, and call the word
+ * handler/emitter. Emit and reset the current word, possibly emit the current
+ * span (if different). In query mode, words are not emitted, only final spans
+ * 
+ * This is purely for factoring common code from different places
+ * text_to_words(). 
+ * 
+ * @return true if ok, false for error. Splitting should stop in this case.
+ * @param word      Word value. This will be empty on return in ALL non-error
+ *                   cases
+ * @param wordpos   Term position for word. Always ++ by us.
+ * @param span      Span value
+ * @param spanpos   Term position for the current span
+ * @param spanerase Set if the current span is at its end. Reset it.
+ * @param bp        The current BYTE position in the stream
+ */
+inline bool TextSplit::doemit(string &word, int &wordpos, string &span, 
+			      int &spanpos, bool spanerase, int bp)
 {
 #if 0
    cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<
@ -165,43 +172,36 @@ bool TextSplit::doemit(string &word, int &wordpos, string &span, int spanpos,
 	 << endl;
 #endif

-    // When splitting for query, we only emit final spans
-    if (fq && !spanerase) {
-	wordpos++;
-	word.erase();
-	return true;
-    }
-
-    // Emit span or both word and span if they are different
-    if (!emitterm(true, span, spanpos, bp-span.length(), bp))
-	return false;
+    // Emit span. When splitting for query, we only emit final spans
+    if (!fq || spanerase)
+	if (!emitterm(true, span, spanpos, bp-span.length(), bp))
+	    return false;
+    // Emit word if different from span and not query mode
    if (word.length() != span.length() && !fq)
 	if (!emitterm(false, word, wordpos, bp-word.length(), bp))
 	    return false;

    // Adjust state
    wordpos++;
-    if (spanerase)
-	span.erase();
-    word.erase();
+    word.clear();
+    if (spanerase) {
+	span.clear();
+	spanpos = wordpos;
+    }

    return true;
 }

 static inline int whatcc(unsigned int c)
 {
-    int cc;
    if (c <= 127) {
-	cc = charclasses[c]; 
+	return charclasses[c]; 
    } else {
-	if (c == (unsigned int)-1)
-	    cc = SPACE;
-	else if (unicign.find(c) != unicign.end())
-	    cc = SPACE;
+	if (unicign.find(c) != unicign.end())
+	    return SPACE;
 	else
-	    cc = LETTER;
+	    return LETTER;
    }
-    return cc;
 }

 /** 
@ -240,8 +240,6 @@ bool TextSplit::text_to_words(const string &in)
 		    return false;
 		number = false;
 	    }
-	    spanpos = wordpos;
-	    span.erase();
 	    break;
 	case '-':
 	case '+':
@ -259,6 +257,28 @@ bool TextSplit::text_to_words(const string &in)
 		span += it;
 	    }
 	    break;
+	case '.':
+	case ',':
+	    if (number) {
+		word += it;
+		span += it;
+		break;
+	    } else {
+		// If . inside a word, keep it, else, this is whitespace. 
+		// A final comma in a word will be removed by doemit
+		if (cc == '.' && word.length()) {
+		    if (!doemit(word, wordpos, span, spanpos, false, 
+				it.getBpos()))
+			return false;
+		    // span length could have been adjusted by trimming
+		    // inside doemit
+		    if (span.length())
+			span += it;
+		    break;
+		}
+	    }
+	    goto SPACE;
+	    break;
 	case '@':
 	    if (word.length()) {
 		if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
@ -269,6 +289,8 @@ bool TextSplit::text_to_words(const string &in)
 	    span += it;
 	    break;
 	case '\'':
+	    // If in word, potential span: o'brien, else, this is more 
+	    // whitespace
 	    if (word.length()) {
 		if (!doemit(word, wordpos, span, spanpos, false, it.getBpos()))
 		    return false;
@ -276,31 +298,17 @@ bool TextSplit::text_to_words(const string &in)
 		span += it;
 	    }
 	    break;
-	case '.':
-	    if (number) {
-		word += it;
-	    } else {
-		//cerr<<"Got . span: '"<<span<<"' word: '"<<word<<"'"<<endl;
-		if (word.length()) {
-		    if (!doemit(word, wordpos, span, spanpos, false, 
-				it.getBpos()))
-			return false;
-		    number = false;
-		} else 
-		    word += it;
-	    }
-	    span += it;
-	    break;
 	case '#': 
-	    // Keep it only at end of word...
-	    if (word.length() > 0 && 
-		(whatcc(it[charpos+1]) == SPACE ||
-		 whatcc(it[charpos+1]) == '\n' || 
-		 whatcc(it[charpos+1]) == '\r')) {
-		word += it;
-		span += it;
+	    // Keep it only at end of word... Special case for c# you see...
+	    if (word.length() > 0) {
+		int w = whatcc(it[charpos+1]);
+		if (w == SPACE || w == '\n' || w == '\r') {
+		    word += it;
+		    span += it;
+		    break;
+		}
 	    }
-		
+	    goto SPACE;
 	    break;
 	case '\n':
 	case '\r':
@ -310,20 +318,19 @@ bool TextSplit::text_to_words(const string &in)
 		// do almost always. We'd then need a way to check if
 		// the - was added as part of the word hyphenation, or was 
 		// there in the first place, but this would need a dictionary.
+		// Also we'd need to check for a soft-hyphen and remove it,
+		// but this would require more utf-8 magic
 	    } else {
 		// Handle like a normal separator
 		goto SPACE;
 	    }
 	    break;
-	case LETTER:
 	case DIGIT:
+	    if (word.length() == 0)
+		number = true;
+	    /* FALLTHROUGH */
+	case LETTER:
 	default:
-	    if (word.length() == 0) {
-		if (cc == DIGIT)
-		    number = true;
-		else
-		    number = false;
-	    }
 	    word += it;
 	    span += it;
 	    break;
@ -367,21 +374,18 @@ class mySplitterCB : public TextSplitCB {
    }
 };

-static string teststring = 
-    "Un bout de texte \n" 
-    "normal. "
-    "jfd@okyz.com "
-    "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
-    "a 134 +134 -14 -1.5 +1.5 1.54e10 a "
-    "@^#$(#$(*) "
-    "192.168.4.1 "
-    "one\n\rtwo\nthree-\nfour "
-    "[olala][ululu] "
-    "'o'brien' "
-    "utf-8 ucs-4©"
-    "\n"							      
+static string teststring1 = 
+    "Un bout de texte \nnormal. jfd@okyz.com \n"
+    "Ceci. Est;Oui n@d @net .net t@v@c c# c++ -10 o'brien l'ami \n"
+    "a 134 +134 -14 -1.5 +1.5 1.54e10 a @^#$(#$(*) 1,2 1,2e30\n"
+    "192.168.4.1 one\n\rtwo\nthree-\nfour [olala][ululu] 'o'brien' \n"
+    "utf-8 ucs-4© \\nodef\n"
+    "','this \n"
+    "M9R F($AA;F1L:6YG\"0D)\"0D@(\" @(#4P, T)0W)A=&4)\"0D)\"2 @,C4P#0E3"
+    " ,able,test-domain "
+    " -wl,--export-dynamic "
 ;
-static string teststring1 = "c++ ";
+static string teststring = " -wl,--export-dynamic ";

 static string thisprog;

--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -1,6 +1,6 @@
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
-/* @(#$Id: textsplit.h,v 1.8 2005-10-19 10:21:48 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: textsplit.h,v 1.9 2006-01-28 10:23:55 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>
 #ifndef NO_NAMESPACES
@ -35,7 +35,7 @@ class TextSplit {
    TextSplitCB *cb;
    int maxWordLength;
    bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
-    bool doemit(string &word, int &wordpos, string &span, int spanpos,
+    bool doemit(string &word, int &wordpos, string &span, int &spanpos,
 		bool spanerase, int bp);
 public:
    /**
--- a/src/configure
+++ b/src/configure
@ -2296,7 +2296,7 @@ cd ..
 m_prefix=$prefix
 test "X$m_prefix" = "XNONE" && m_prefix=/usr/local
 m_datadir=${m_prefix}/share
-QTRECOLL_DATADIR=${m_datadir}
+QTRECOLL_DATADIR=${m_datadir}/recoll



--- a/src/configure.ac
+++ b/src/configure.ac
@ -119,7 +119,7 @@ cd ..
 m_prefix=$prefix
 test "X$m_prefix" = "XNONE" && m_prefix=/usr/local
 m_datadir=${m_prefix}/share
-QTRECOLL_DATADIR=${m_datadir}
+QTRECOLL_DATADIR=${m_datadir}/recoll

 AC_SUBST(LIBXAPIAN)
 AC_SUBST(XAPIANCXXFLAGS)
--- a/src/query/xadump.cpp
+++ b/src/query/xadump.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: xadump.cpp,v 1.9 2006-01-25 08:09:41 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: xadump.cpp,v 1.10 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -147,16 +147,12 @@ int main(int argc, char **argv)
 	    if (op_flags & OPT_i) {
 		for (term = db->termlist_begin(docid);
 		     term != db->termlist_end(docid);term++) {
-		    transcode(*term, printable, "UTF-8", outencoding);
-		    cout << "[" << printable << "]" << endl;
+		    cout << "[" << *term << "]" << endl;
 		}
 	    } else {
 		for (term = db->allterms_begin(); 
 		     term != db->allterms_end();term++) {
-		    if (transcode(*term, printable, "UTF-8", outencoding))
-			cout << "[" << printable << "]" << endl;
-		    else
-			cout << "utf8[" << *term << "]" << endl;
+		    cout << "utf8[" << *term << "]" << endl;
 		}
 	    }
 	} else if (op_flags & OPT_D) {
--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@ -1,6 +1,6 @@
 #ifndef _UTF8ITER_H_INCLUDED_
 #define _UTF8ITER_H_INCLUDED_
-/* @(#$Id: utf8iter.h,v 1.4 2005-12-07 15:41:50 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: utf8iter.h,v 1.5 2006-01-28 10:23:55 dockes Exp $  (C) 2004 J.F.Dockes */

 /** 
 * A small helper class to iterate over utf8 strings. This is not an
@ -44,6 +44,7 @@ class Utf8Iter {
 	cl = get_cl(pos);
 	if (!poslok(pos, cl)) {
 	    bad = true;
+	    pos = s.length();
 	    cl = 0;
 	    return -1;
 	}
@ -91,6 +92,7 @@ class Utf8Iter {
 	unsigned int val = getvalueat(pos, cl);
 	if (val == (unsigned int)-1) {
 	    bad = true;
+	    pos = s.length();
 	    cl = 0;
 	}
 	return val;
@ -142,7 +144,9 @@ class Utf8Iter {
 	return s.substr(pos, cl);
    }
    bool eof() {
-	return bad || pos == s.length();
+	// Note: we always ensure that pos == s.length() when setting bad to 
+	// true
+	return pos == s.length();
    }
    bool error() {
 	return bad;