fixes in textsplit

2005-02-08 09:34:47 +00:00 · 2005-02-08 09:34:47 +00:00 · 4c54a8478f
commit 4c54a8478f
parent 2a020407da
7 changed files with 135 additions and 62 deletions
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.5 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.6 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_TEXTSPLIT

@ -26,7 +26,7 @@ using namespace std;

 // Character classes: we have three main groups, and then some chars
 // are their own class because they want special handling.
-// We have an array with 256 slots where we keep the character states. 
+// We have an array with 256 slots where we keep the character types. 
 // The array could be fully static, but we use a small function to fill it 
 // once.
 enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
@ -37,32 +37,40 @@ static void setcharclasses()
    if (init)
 	return;
    unsigned int i;
-    memset(charclasses, LETTER, sizeof(charclasses));
+    for (i = 0 ; i < 256 ; i ++)
+	charclasses[i] = LETTER;
+
+    for (i = 0; i < ' ';i++)
+	charclasses[i] = SPACE;

    char digits[] = "0123456789";
-    for (i = 0; i  < sizeof(digits); i++)
+    for (i = 0; i  < strlen(digits); i++)
 	charclasses[int(digits[i])] = DIGIT;

    char blankspace[] = "\t\v\f ";
-    for (i = 0; i < sizeof(blankspace); i++)
+    for (i = 0; i < strlen(blankspace); i++)
 	charclasses[int(blankspace[i])] = SPACE;

-    char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*";
-    for (i = 0; i  < sizeof(seps); i++)
+    char seps[] = "!\"$%&()/<=>[\\]^{|}~:;,*`?";
+    for (i = 0; i  < strlen(seps); i++)
 	charclasses[int(seps[i])] = SPACE;

    char special[] = ".@+-,#'\n\r";
-    for (i = 0; i  < sizeof(special); i++)
+    for (i = 0; i  < strlen(special); i++)
 	charclasses[int(special[i])] = special[i];

    init = 1;
+    //for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
 }

-bool TextSplit::emitterm(string &w, int pos, bool doerase,
+// Do some cleanup (the kind which is simpler to do here than in the main loop),
+// then send the term to our client.
+bool TextSplit::emitterm(bool isspan, string &w, int pos, bool doerase,
 			 int btstart, int btend)
 {
    LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
-    
+    if (fq && !isspan)
+	return true;
    if (!cb)
 	return false;

@ -73,13 +81,36 @@ bool TextSplit::emitterm(string &w, int pos, bool doerase,
 	case '.':
 	case ',':
 	case '@':
+	case '\'':
 	    w.erase(w.length()-1);
 	    break;
 	default:
-	    goto breakloop;
+	    goto breakloop1;
+	}
+    }
+ breakloop1:
+
+    // In addition, it doesn't make sense currently to keep ' or , at the beginning
+    while (w.length() > 0) {
+	switch (w[0]) {
+	case ',':
+	case '\'':
+	    w.erase(w.length()-1);
+	    break;
+	default:
+	    goto breakloop2;
+	}
+    }
+ breakloop2:
+
+    // 1 char word: we index single letters and digits, but nothing else
+    if (w.length() == 1) {
+	int c = (int)w[0];
+	if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
+	    //cerr << "ERASING single letter term " << c << endl;
+	    w.erase();
 	}
    }
- breakloop:
    if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
 	bool ret = cb->takeword(w, pos, btstart, btend);
 	if (doerase)
@ -113,10 +144,10 @@ bool TextSplit::text_to_words(const string &in)
 	SPACE:
 	    if (word.length()) {
 		if (span.length() != word.length()) {
-		    if (!emitterm(span, spanpos, true, i-span.length(), i)) 
+		    if (!emitterm(true, span, spanpos, true, i-span.length(), i)) 
 			return false;
 		}
-		if (!emitterm(word, wordpos++, true, i-word.length(), i))
+		if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
 		    return false;
 		number = false;
 	    }
@ -126,42 +157,53 @@ bool TextSplit::text_to_words(const string &in)
 	case '-':
 	case '+':
 	    if (word.length() == 0) {
-		if (i < in.length() || charclasses[int(in[i+1])] == DIGIT) {
+		if (i < in.length() && charclasses[int(in[i+1])] == DIGIT) {
 		    number = true;
 		    word += c;
 		    span += c;
 		}
 	    } else {
 		if (span.length() != word.length()) {
-		    if (!emitterm(span, spanpos, false, i-span.length(), i))
+		    if (!emitterm(true, span, spanpos, false, i-span.length(), i))
 			return false;
 		}
-		if (!emitterm(word, wordpos++, true, i-word.length(), i))
+		if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
 		    return false;
 		number = false;
 		span += c;
 	    }
 	    break;
-	case '\'':
 	case '@':
 	    if (word.length()) {
 		if (span.length() != word.length()) {
-		    if (!emitterm(span, spanpos, false, i-span.length(), i))
+		    if (!emitterm(true, span, spanpos, false, i-span.length(), i))
 			return false;
 		}
-		if (!emitterm(word, wordpos++, true, i-word.length(), i))
+		if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
 		    return false;
 		number = false;
 	    } else
 		word += c;
 	    span += c;
 	    break;
+	case '\'':
+	    if (word.length()) {
+		if (span.length() != word.length()) {
+		    if (!emitterm(true, span, spanpos, false, i-span.length(), i))
+			return false;
+		}
+		if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
+		    return false;
+		number = false;
+		span += c;
+	    }
+	    break;
 	case '.':
 	    if (number) {
 		word += c;
 	    } else {
 		if (word.length()) {
-		    if (!emitterm(word, wordpos++, true, i-word.length(), i))
+		    if (!emitterm(false, word, wordpos++, true, i-word.length(), i))
 			return false;
 		    number = false;
 		} else 
@ -208,9 +250,9 @@ bool TextSplit::text_to_words(const string &in)
    }
    if (word.length()) {
 	if (span.length() != word.length())
-	    if (!emitterm(span, spanpos, true, i-span.length(), i))
+	    if (!emitterm(true, span, spanpos, true, i-span.length(), i))
 		return false;
-	return emitterm(word, wordpos, true, i-word.length(), i);
+	return emitterm(false, word, wordpos, true, i-word.length(), i);
    }
    return true;
 }
@ -220,11 +262,13 @@ bool TextSplit::text_to_words(const string &in)
 #include <unistd.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <stdio.h>

 #include <iostream>

 #include "textsplit.h"
 #include "readfile.h"
+#include "debuglog.h"

 using namespace std;

@ -232,7 +276,7 @@ using namespace std;
 class mySplitterCB : public TextSplitCB {
 public:
    bool takeword(const std::string &term, int pos, int bs, int be) {
-	cout << pos << " " << term << " bs " << bs << " be " << be << endl;
+	printf("%3d %-20s %d %d\n", pos, term.c_str(), bs, be);
 	return true;
    }
 };
@ -240,15 +284,18 @@ class mySplitterCB : public TextSplitCB {
 static string teststring = 
    "jfd@okyz.com "
    "Ceci. Est;Oui 1.24 n@d @net .net t@v@c c# c++ -10 o'brien l'ami "
-    "a 134 +134 -14 -1.5 +1.5 1.54e10 a"
-    "@^#$(#$(*)"
-    "one\n\rtwo\nthree-\nfour"
-    "[olala][ululu]"
-
+    "a 134 +134 -14 -1.5 +1.5 1.54e10 a "
+    "@^#$(#$(*) "
+    "one\n\rtwo\nthree-\nfour "
+    "[olala][ululu] "
+    "'o'brien' "						
+    "\n"							      
 ;

 int main(int argc, char **argv)
 {
+    DebugLog::getdbl()->setloglevel(DEBDEB1);
+    DebugLog::setfilename("stderr");
    mySplitterCB cb;
    TextSplit splitter(&cb);
    if (argc == 2) {
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -1,6 +1,6 @@
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
-/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: textsplit.h,v 1.5 2005-02-08 09:34:46 dockes Exp $  (C) 2004 J.F.Dockes */

 #include <string>

@ -22,14 +22,17 @@ class TextSplitCB {
 * but it's much simpler this way...
 */
 class TextSplit {
+    bool fq;        // Are we splitting for query or index ?
    TextSplitCB *cb;
    int maxWordLength;
-    bool emitterm(std::string &term, int pos, bool doerase, int, int);
+    bool emitterm(bool isspan, std::string &term, int pos, bool doerase, 
+		  int bs, int be);
 public:
    /**
     * Constructor: just store callback and client data
     */
-    TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {}
+    TextSplit(TextSplitCB *t, bool forquery = false) 
+	: fq(forquery), cb(t), maxWordLength(40) {}
    /**
     * Split text, emit words and positions.
     */
--- a/src/qtgui/recoll.pro
+++ b/src/qtgui/recoll.pro
@ -23,11 +23,13 @@ unix {
  UI_DIR = .ui
  MOC_DIR = .moc
  OBJECTS_DIR = .obj
-  LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv
+  LIBS += ../lib/librcl.a -L/usr/local/lib -lxapian -liconv \
+       -lfontconfig -lfreetype -lexpat -lz
  INCLUDEPATH += ../common ../index ../query ../unac ../utils 
+  #QMAKE_LFLAGS_SHAPP += -static
 }

 UNAME = $$system(uname -s)
 contains( UNAME, [lL]inux ) {
 	  LIBS -= -liconv
-}
+}
--- a/src/qtgui/recollmain.ui.h
+++ b/src/qtgui/recollmain.ui.h
@ -80,15 +80,14 @@ class myTextSplitCB : public TextSplitCB {
 static string plaintorich(const string &in, const list<string>& terms,
 			  list<pair<int, int> >&termoffsets)
 {
-#if 0
    {string t;
-	for (list<string>::const_iterator it = terms.begin();it != terms.end();it++) 
-	    t += "'" + *it + "' ";
-	LOGDEB(("plaintorich: term: %s\n", t.c_str()));
+	for (list<string>::const_iterator it = terms.begin();
+	     it != terms.end();it++) t += "'" + *it + "' ";
+	LOGDEB(("plaintorich: terms: %s\n", t.c_str()));
    }
-#endif
+
    myTextSplitCB cb(terms);
-    TextSplit splitter(&cb);
+    TextSplit splitter(&cb, true);
    splitter.text_to_words(in);
    string out1;
    if (cb.tboffs.empty()) {
--- a/src/query/xadump.cpp
+++ b/src/query/xadump.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: xadump.cpp,v 1.3 2005-01-25 14:37:21 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: xadump.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif

 #include <strings.h>
@ -45,11 +45,23 @@ static int        op_flags;
 #define OPT_F     0x80
 #define OPT_E     0x100

-Xapian::Database db;
+Xapian::Database *db;
+
+static void cleanup()
+{
+    delete db;
+}
+
+static void sigcleanup(int sig)
+{
+    fprintf(stderr, "sigcleanup\n");
+    cleanup();
+    exit(1);
+}

 int main(int argc, char **argv)
 {
-    string dbdir = "/home/dockes/tmp/xapiandb";
+    string dbdir = "/home/dockes/.recoll/xapiandb";
    string outencoding = "ISO8859-1";
    int docid = 1;
    string aterm;
@ -92,46 +104,57 @@ int main(int argc, char **argv)

    if (argc != 0)
 	Usage();
+    atexit(cleanup);
+    if (signal(SIGHUP, SIG_IGN) != SIG_IGN)
+	signal(SIGHUP, sigcleanup);
+    if (signal(SIGINT, SIG_IGN) != SIG_IGN)
+	signal(SIGINT, sigcleanup);
+    if (signal(SIGQUIT, SIG_IGN) != SIG_IGN)
+	signal(SIGQUIT, sigcleanup);
+    if (signal(SIGTERM, SIG_IGN) != SIG_IGN)
+	signal(SIGTERM, sigcleanup);

    try {
-	db = Xapian::Auto::open(dbdir, Xapian::DB_OPEN);
+	db = new Xapian::Database(dbdir);

-	cout << "DB: ndocs " << db.get_doccount() << " lastdocid " <<
-	    db.get_lastdocid() << " avglength " << db.get_avlength() << endl;
+	cout << "DB: ndocs " << db->get_doccount() << " lastdocid " <<
+	    db->get_lastdocid() << " avglength " << db->get_avlength() << endl;
 	    
 	if (op_flags & OPT_T) {
 	    Xapian::TermIterator term;
 	    string printable;
 	    if (op_flags & OPT_i) {
-		for (term = db.termlist_begin(docid);
-		     term != db.termlist_end(docid);term++) {
+		for (term = db->termlist_begin(docid);
+		     term != db->termlist_end(docid);term++) {
 		    transcode(*term, printable, "UTF-8", outencoding);
-		    cout << printable << endl;
+		    cout << "[" << printable << "]" << endl;
 		}
 	    } else {
-		for (term = db.allterms_begin(); 
-		     term != db.allterms_end();term++) {
-		    transcode(*term, printable, "UTF-8", outencoding);
-		    cout << printable << endl;
+		for (term = db->allterms_begin(); 
+		     term != db->allterms_end();term++) {
+		    if (transcode(*term, printable, "UTF-8", outencoding))
+			cout << "[" << printable << "]" << endl;
+		    else
+			cout << "utf8[" << *term << "]" << endl;
 		}
 	    }
 	} else if (op_flags & OPT_D) {
-	    Xapian::Document doc = db.get_document(docid);
+	    Xapian::Document doc = db->get_document(docid);
 	    string data = doc.get_data();
 	    cout << data << endl;
 	} else if (op_flags & OPT_P) {
 	    Xapian::PostingIterator doc;
-	    for (doc = db.postlist_begin(aterm);
-		 doc != db.postlist_end(aterm);doc++) {
+	    for (doc = db->postlist_begin(aterm);
+		 doc != db->postlist_end(aterm);doc++) {
 		cout << *doc << endl;
 	    }
 		
 	} else if (op_flags & OPT_F) {
 	    cout << "FreqFor " << aterm << " : " <<
-		db.get_termfreq(aterm) << endl;
+		db->get_termfreq(aterm) << endl;
 	} else if (op_flags & OPT_E) {
 	    cout << "Exists " << aterm << " : " <<
-		db.term_exists(aterm) << endl;
+		db->term_exists(aterm) << endl;
 	} 


--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.19 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.20 2005-02-08 09:34:46 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@ -454,7 +454,7 @@ bool Rcl::Db::setQuery(const std::string &querystring)
 	return false;

    wsQData splitData;
-    TextSplit splitter(&splitData);
+    TextSplit splitter(&splitData, true);

    string noacc;
    if (!dumb_string(querystring, noacc)) {
--- a/src/utils/execmd.cpp
+++ b/src/utils/execmd.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: execmd.cpp,v 1.3 2005-02-01 17:20:06 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: execmd.cpp,v 1.4 2005-02-08 09:34:47 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_EXECMD
 #include <unistd.h>
@ -186,7 +186,6 @@ ExecCmd::doexec(const string &cmd, const list<string> args,
 	    while (argv[i]) cerr << argv[i++] << endl;}
 #endif

-	LOGDEB(("ExecCmd::doexec: execvp(%s)\n", cmd.c_str()));
 	execvp(cmd.c_str(), (char *const*)argv);
 	// Hu ho
 	LOGERR(("ExecCmd::doexec: execvp(%s) failed. errno %d\n", cmd.c_str(),