From 869b57eb8c2b0f265b9409486a69c868b0e3afc4 Mon Sep 17 00:00:00 2001
From: dockes <none@none>
Date: Fri, 17 Dec 2004 13:01:01 +0000
Subject: [PATCH] *** empty log message ***

---
 src/common/rclconfig.h    |   4 +-
 src/common/textsplit.cpp  |  54 +++++++++++-------
 src/common/textsplit.h    |  12 ++--
 src/index/recollindex.cpp |  23 +++++++-
 src/query/Makefile        |  17 ++++++
 src/query/xadump.cpp      | 117 ++++++++++++++++++++++++++++++++++++++
 src/rcldb/rcldb.cpp       | 112 +++++++++++++++++++++++++++++++++---
 src/rcldb/rcldb.h         |   8 ++-
 8 files changed, 307 insertions(+), 40 deletions(-)
 create mode 100644 src/query/Makefile
 create mode 100644 src/query/xadump.cpp

diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h
index 23453d5f..0dcadfa3 100644
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@@ -1,6 +1,6 @@
 #ifndef _RCLCONFIG_H_INCLUDED_
 #define _RCLCONFIG_H_INCLUDED_
-/* @(#$Id: rclconfig.h,v 1.2 2004-12-15 15:00:36 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rclconfig.h,v 1.3 2004-12-17 13:01:01 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include "conftree.h"
 
@@ -30,7 +30,7 @@ class RclConfig {
 	conf->get("defaultcharset", defcharset, keydir);
 	conf->get("defaultlanguage", deflang, keydir);
 	string str;
-	conf->get("guesscharset", deflang, str);
+	conf->get("guesscharset", str, keydir);
 	guesscharset = ConfTree::stringToBool(str);
     }
     bool getConfParam(const string &name, string &value) 
diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp
index d152341a..efa8bc37 100644
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.3 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #ifndef TEST_TEXTSPLIT
 
@@ -57,8 +57,11 @@ static void setcharclasses()
     init = 1;
 }
 
-void TextSplit::emitterm(string &w, int pos, bool doerase = true)
+bool TextSplit::emitterm(string &w, int pos, bool doerase = true)
 {
+    if (!termsink)
+	return false;
+
     // Maybe trim end of word. These are chars that we would keep inside 
     // a word or span, but not at the end
     while (w.length() > 0) {
@@ -73,12 +76,13 @@ void TextSplit::emitterm(string &w, int pos, bool doerase = true)
 	}
     }
  breakloop:
-    if (w.length()) {
-	if (termsink)
-	    termsink(cdata, w, pos);
+    if (w.length() > 0 && w.length() < (unsigned)maxWordLength) {
+	bool ret = termsink(cdata, w, pos);
+	if (doerase)
+	    w.erase();
+	return ret;
     }
-    if (doerase)
-	w.erase();
+    return true;
 }
 
 /* 
@@ -86,7 +90,7 @@ void TextSplit::emitterm(string &w, int pos, bool doerase = true)
  * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, 
  * are handled properly,
  */
-void TextSplit::text_to_words(const string &in)
+bool TextSplit::text_to_words(const string &in)
 {
     setcharclasses();
     string span;
@@ -103,9 +107,11 @@ void TextSplit::text_to_words(const string &in)
 	SPACE:
 	    if (word.length()) {
 		if (span.length() != word.length()) {
-		    emitterm(span, spanpos);
+		    if (!emitterm(span, spanpos)) 
+			return false;
 		}
-		emitterm(word, wordpos++);
+		if (!emitterm(word, wordpos++))
+		    return false;
 		number = false;
 	    }
 	    spanpos = wordpos;
@@ -121,9 +127,11 @@ void TextSplit::text_to_words(const string &in)
 		}
 	    } else {
 		if (span.length() != word.length()) {
-		    emitterm(span, spanpos, false);
+		    if (!emitterm(span, spanpos, false))
+			return false;
 		}
-		emitterm(word, wordpos++);
+		if (!emitterm(word, wordpos++))
+		    return false;
 		number = false;
 		span += c;
 	    }
@@ -132,9 +140,11 @@ void TextSplit::text_to_words(const string &in)
 	case '@':
 	    if (word.length()) {
 		if (span.length() != word.length()) {
-		    emitterm(span, spanpos, false);
+		    if (!emitterm(span, spanpos, false))
+			return false;
 		}
-		emitterm(word, wordpos++);
+		if (!emitterm(word, wordpos++))
+		    return false;
 		number = false;
 	    } else
 		word += c;
@@ -145,7 +155,8 @@ void TextSplit::text_to_words(const string &in)
 		word += c;
 	    } else {
 		if (word.length()) {
-		    emitterm(word, wordpos++);
+		    if (!emitterm(word, wordpos++))
+			return false;
 		    number = false;
 		} else 
 		    word += c;
@@ -155,7 +166,8 @@ void TextSplit::text_to_words(const string &in)
 	case '#': 
 	    // Keep it only at end of word...
 	    if (word.length() > 0 && 
-		(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE)) {
+		(i == in.length() -1 || charclasses[int(in[i+1])] == SPACE ||
+		 in[i+1] == '\n' || in[i+1] == '\r')) {
 		word += c;
 		span += c;
 	    }
@@ -190,9 +202,11 @@ void TextSplit::text_to_words(const string &in)
     }
     if (word.length()) {
 	if (span.length() != word.length())
-	    emitterm(span, spanpos);
-	emitterm(word, wordpos);
+	    if (!emitterm(span, spanpos))
+		return false;
+	return emitterm(word, wordpos);
     }
+    return true;
 }
 
 #else  // TEST driver ->
@@ -208,10 +222,10 @@ void TextSplit::text_to_words(const string &in)
 
 using namespace std;
 
-int termsink(void *, const string &term, int pos)
+bool termsink(void *, const string &term, int pos)
 {
     cout << pos << " " << term << endl;
-    return 0;
+    return true;
 }
 
 
diff --git a/src/common/textsplit.h b/src/common/textsplit.h
index edd9d79b..0f09a24d 100644
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@@ -1,6 +1,6 @@
 #ifndef _TEXTSPLIT_H_INCLUDED_
 #define _TEXTSPLIT_H_INCLUDED_
-/* @(#$Id: textsplit.h,v 1.1 2004-12-14 17:49:11 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: textsplit.h,v 1.2 2004-12-17 13:01:01 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include <string>
 
@@ -12,20 +12,22 @@
  */
 class TextSplit {
  public:
-    typedef int (*TermSink)(void *cdata, const std::string & term, int pos);
+    typedef bool (*TermSink)(void *cdata, const std::string & term, int pos);
  private:
     TermSink termsink;
     void *cdata;
-    void emitterm(std::string &term, int pos, bool doerase);
+    int maxWordLength;
+    bool emitterm(std::string &term, int pos, bool doerase);
  public:
     /**
      * Constructor: just store callback and client data
      */
-    TextSplit(TermSink t, void *c) : termsink(t), cdata(c) {}
+    TextSplit(TermSink t, void *c) : termsink(t), cdata(c), maxWordLength(40)
+    {}
     /**
      * Split text, emit words and positions.
      */
-    void text_to_words(const std::string &in);
+    bool text_to_words(const std::string &in);
 };
 
 #endif /* _TEXTSPLIT_H_INCLUDED_ */
diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp
index dd8f5979..53d843dd 100644
--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.3 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 
 #include <sys/stat.h>
@@ -29,20 +29,31 @@ bool textPlainToDoc(RclConfig *conf, const string &fn,
     if (!file_to_string(fn, otext))
 	return false;
 	
-    // Try to guess charset, then convert to utf-8, and fill document fields
+    // Try to guess charset, then convert to utf-8, and fill document
+    // fields. The charset guesser really doesn't work well in general
+    // and should be avoided (especially for short documents)
     string charset;
     if (conf->guesscharset) {
 	charset = csguess(otext, conf->defcharset);
     } else
 	charset = conf->defcharset;
     string utf8;
-    if (transcode(otext, charset, utf8, "UTF-8"))
+    cerr << "textPlainToDoc: transcod from " << charset << " to  UTF-8" 
+	 << endl;
+
+    if (!transcode(otext, utf8, charset, "UTF-8")) {
+	cerr << "textPlainToDoc: transcode failed: charset '" << charset
+	     << "' to UTF-8: "<< utf8 << endl;
+	otext.erase();
 	return 0;
+    }
 
     Rcl::Doc out;
     out.origcharset = charset;
     out.text = utf8;
+    //out.text = otext;
     docout = out;
+    cerr << utf8 << endl;
     return true;
 }
 
@@ -183,6 +194,12 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp,
     if (!fun(me->config, fn,  mime, doc))
 	return FsTreeWalker::FtwOk;
 
+    // Set up common fields:
+    doc.mimetype = mime;
+    char ascdate[20];
+    sprintf(ascdate, "%ld", long(stp->st_mtime));
+    doc.mtime = ascdate;
+
     // Set up xapian document, add postings and misc fields, 
     // add to or update database.
     if (!me->db.add(fn, doc))
diff --git a/src/query/Makefile b/src/query/Makefile
new file mode 100644
index 00000000..530cc3d1
--- /dev/null
+++ b/src/query/Makefile
@@ -0,0 +1,17 @@
+# Build the programs of the query directory (currently only xadump)
+CXXFLAGS = -Wall -g -I. -I../index -I../utils -I../common -I/usr/local/include
+
+PROGS = xadump
+.PHONY: all clean
+all: $(PROGS)
+
+XADUMP_OBJS= xadump.o transcode.o
+xadump : $(XADUMP_OBJS)
+	$(CXX) $(CXXFLAGS) -o xadump $(XADUMP_OBJS) \
+	       -L/usr/local/lib -lxapian -liconv
+
+transcode.o : ../index/transcode.cpp ../index/transcode.h
+	$(CXX) $(CXXFLAGS) -c -o transcode.o ../index/transcode.cpp
+
+clean: 
+	rm -f *.o $(PROGS)
diff --git a/src/query/xadump.cpp b/src/query/xadump.cpp
new file mode 100644
index 00000000..ab149447
--- /dev/null
+++ b/src/query/xadump.cpp
@@ -0,0 +1,117 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: xadump.cpp,v 1.1 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "transcode.h"
+
+using namespace std;
+
+#include "xapian.h"
+
+static string thisprog;
+
+static string usage =
+    " [-d <dbdir>] [-e <output encoding>] [-i <docid>] -T | -D"
+    "  \n\n";
+
+// Print the usage message on stderr and exit with an error status
+static void
+Usage(void)
+{
+    cerr << thisprog  << ": usage:\n" << usage;
+    exit(1);
+}
+
+// Bits set in op_flags for the options seen on the command line
+static int        op_flags;
+#define OPT_d	  0x1 
+#define OPT_e     0x2
+#define OPT_i     0x4
+#define OPT_T     0x8
+#define OPT_D     0x10
+
+// Dump the terms (-T: of document -i if given, else of the whole index)
+// or the data record of document -i (-D) from the Xapian index in -d
+int main(int argc, char **argv)
+{
+    string dbdir = "/home/dockes/tmp/xapiandb";
+    string outencoding = "ISO8859-1";
+    int docid = 1;
+    int status = 1; // Exit status, cleared once the dump has completed
+
+    thisprog = argv[0];
+    argc--; argv++;
+
+    while (argc > 0 && **argv == '-') {
+	(*argv)++;
+	if (!(**argv))
+	    /* Case of a lone "-", as in "adb - core" */
+	    Usage();
+	while (**argv)
+	    switch (*(*argv)++) {
+	    case 'T':	op_flags |= OPT_T; break;
+	    case 'D':	op_flags |= OPT_D; break;
+	    case 'd':	op_flags |= OPT_d; if (argc < 2)  Usage();
+		dbdir = *(++argv);
+		argc--; 
+		goto b1;
+	    case 'e':	op_flags |= OPT_e; if (argc < 2)  Usage();
+		outencoding = *(++argv);
+		argc--; 
+		goto b1;
+	    case 'i':	op_flags |= OPT_i; if (argc < 2)  Usage();
+		if (sscanf(*(++argv), "%d", &docid) != 1) Usage();
+		argc--; 
+		goto b1;
+	    default: Usage();	break;
+	    }
+    b1: argc--; argv++;
+    }
+
+    if (argc != 0)
+	Usage();
+
+    Xapian::Database db;
+    try {
+	db = Xapian::Auto::open(dbdir, Xapian::DB_OPEN);
+	if (op_flags & OPT_T) {
+	    Xapian::TermIterator term;
+	    string printable;
+	    if (op_flags & OPT_i) {
+		for (term = db.termlist_begin(docid);
+		     term != db.termlist_end(docid);term++) {
+		    transcode(*term, printable, "UTF-8", outencoding);
+		    cout << printable << endl;
+		}
+	    } else {
+		for (term = db.allterms_begin(); 
+		     term != db.allterms_end();term++) {
+		    transcode(*term, printable, "UTF-8", outencoding);
+		    cout << printable << endl;
+		}
+	    }
+	} else if (op_flags & OPT_D) {
+	    Xapian::Document doc = db.get_document(docid);
+	    string data = doc.get_data();
+	    cout << data << endl;
+	}
+	status = 0;
+    } catch (const Xapian::Error &e) {
+	cout << "Exception: " << e.get_msg() << endl;
+    } catch (const string &s) {
+	cout << "Exception: " << s << endl;
+    } catch (const char *s) {
+	cout << "Exception: " << s << endl;
+    } catch (...) {
+	cout << "Caught unknown exception" << endl;
+    }
+    exit(status);
+}
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index ea6dcf6d..6b706e95 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.3 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 
 #include <sys/stat.h>
@@ -11,6 +11,8 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $
 using namespace std;
 
 #include "rcldb.h"
+#include "textsplit.h"
+#include "transcode.h"
 
 #include "xapian.h"
 
@@ -29,7 +31,7 @@ class Native {
 
 Rcl::Db::Db() 
 {
-    //    pdata = new Native;
+    pdata = new Native;
 }
 
 Rcl::Db::~Db()
@@ -37,6 +39,8 @@ Rcl::Db::~Db()
     if (pdata == 0)
 	return;
     Native *ndb = (Native *)pdata;
+    cerr << "Db::~Db: isopen " << ndb->isopen << " iswritable " <<
+	ndb->iswritable << endl;
     try {
 	// There is nothing to do for an ro db.
 	if (ndb->isopen == false || ndb->iswritable == false) {
@@ -58,10 +62,11 @@ Rcl::Db::~Db()
 
 bool Rcl::Db::open(const string& dir, OpenMode mode)
 {
-    return true;
     if (pdata == 0)
 	return false;
     Native *ndb = (Native *)pdata;
+    cerr << "Db::open: isopen " << ndb->isopen << " iswritable " <<
+	ndb->iswritable << endl;
     try {
 	switch (mode) {
 	case DbUpd:
@@ -95,10 +100,11 @@ bool Rcl::Db::open(const string& dir, OpenMode mode)
 
 bool Rcl::Db::close()
 {
-    return true;
     if (pdata == 0)
 	return false;
     Native *ndb = (Native *)pdata;
+    cerr << "Db::close: isopen " << ndb->isopen << " iswritable " <<
+	ndb->iswritable << endl;
     if (ndb->isopen == false)
 	return true;
     try {
@@ -125,9 +131,103 @@ bool Rcl::Db::close()
     return false;
 }
 
+// A small class to hold state while splitting text: handed to splitCb() as its cdata
+class wsData {
+ public:
+    Xapian::Document &doc;   // Document which receives the postings
+    Xapian::termpos basepos; // Base for document section
+    Xapian::termpos curpos;  // Last position sent to callback
+    wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
+    {}
+};
+// TextSplit callback: post term at position basepos + pos in the wsData document
+bool splitCb(void *cdata, const std::string &term, int pos)
+{
+    wsData *data = (wsData*)cdata; // The wsData given to the TextSplit constructor
+    cerr << "splitCb: term " << term << endl;
+    try {
+	// 1 is the value for wdfinc in index_text when called from omindex
+	// TOBEDONE: check what this is used for
+	data->curpos = pos; // Kept so that the caller can offset the next section
+	data->doc.add_posting(term, data->basepos + data->curpos, 1);
+	string printable; // Term converted to ISO8859-1 for the trace below
+	transcode(term, printable, "UTF-8", "ISO8859-1");
+	cerr << "Adding " << printable << endl;
+    } catch (...) {
+	cerr << "Error occurred during add_posting" << endl;
+	return false; // Reported to the splitter as a failure
+    }
+    return true;
+}
+
 bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
 {
-    return true;
+    // Build a Xapian document from doc (data record, positional postings
+    // for title, text, keywords and abstract, T and P terms) and add it to
+    // the index. Returns false if the db object is missing, if a term
+    // could not be posted, or if Xapian raised an exception.
+    if (pdata == 0)
+	return false;
+    Native *ndb = (Native *)pdata;
+
+    Xapian::Document newdocument;
+
+    // Document data record. omindex has the following nl separated fields:
+    // - url
+    // - sample
+    // - caption (title limited to 100 chars)
+    // - mime type 
+    // The path goes after "file://" so that an absolute fn gives file:///...
+    string record = "url=file://" + fn;
+    record += "\nmtime=" + doc.mtime;
+    record += "\nsample=";
+    record += "\ncaption=" + doc.title;
+    record += "\nmtype=" + doc.mimetype;
+    record += "\n";
+    newdocument.set_data(record);
+
+    // TOBEDONE:
+    // Need to add stuff here to unaccent and lowercase the data: use unac 
+    // for accents, and do it by hand for upper / lower. Note lowercasing is
+    // only for ascii letters anyway, so it's just A-Z -> a-z
+
+    // Post the terms of each section, leaving a gap of 100 positions
+    // between sections. Stop as soon as the callback reports an error.
+    wsData splitData(newdocument);
+    TextSplit splitter(splitCb, &splitData);
+
+    if (!splitter.text_to_words(doc.title))
+	return false;
+
+    splitData.basepos += splitData.curpos + 100;
+    if (!splitter.text_to_words(doc.text))
+	return false;
+
+    splitData.basepos += splitData.curpos + 100;
+    if (!splitter.text_to_words(doc.keywords))
+	return false;
+
+    splitData.basepos += splitData.curpos + 100;
+    if (!splitter.text_to_words(doc.abstract))
+	return false;
+
+    // TOBEDONE: if this file has already been indexed, update the existing
+    // entry (replace_document() keyed on a unique term) instead of always
+    // adding a new document.
+    try {
+	// add_term() and add_document() may throw: report it to the caller
+	// T and P prefixed terms hold the mime type and the file path
+	newdocument.add_term("T" + doc.mimetype);
+	newdocument.add_term("P" + fn);
+	ndb->wdb.add_document(newdocument);
+    } catch (const Xapian::Error &e) {
+	cerr << "Db::add: xapian error: " << e.get_msg() << endl;
+	return false;
+    } catch (...) {
+	cerr << "Db::add: unknown error while adding " << fn << endl;
+	return false;
+    }
+    return true;
 }
 
 
@@ -140,5 +240,3 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
     // - fetch doc (get_document(docid)
     // - check date field, maybe skip
 }
-
-
diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h
index 71b60a7c..d60ee6eb 100644
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@@ -1,6 +1,6 @@
 #ifndef _DB_H_INCLUDED_
 #define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.2 2004-12-15 15:00:36 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.3 2004-12-17 13:01:01 dockes Exp $  (C) 2004 J.F.Dockes */
 
 #include <string>
 
@@ -13,11 +13,13 @@ namespace Rcl {
  */
 class Doc {
  public:
+    string mimetype;
+    string mtime;       // Modification time as decimal ascii
     string origcharset;
     string title;
-    string abstract;
-    string keywords;
     string text;
+    string keywords;
+    string abstract;
 };
 
 /**