From 869b57eb8c2b0f265b9409486a69c868b0e3afc4 Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 17 Dec 2004 13:01:01 +0000 Subject: [PATCH] *** empty log message *** --- src/common/rclconfig.h | 4 +- src/common/textsplit.cpp | 54 +++++++++++------- src/common/textsplit.h | 12 ++-- src/index/recollindex.cpp | 23 +++++++- src/query/Makefile | 17 ++++++ src/query/xadump.cpp | 117 ++++++++++++++++++++++++++++++++++++++ src/rcldb/rcldb.cpp | 112 +++++++++++++++++++++++++++++++++--- src/rcldb/rcldb.h | 8 ++- 8 files changed, 307 insertions(+), 40 deletions(-) create mode 100644 src/query/Makefile create mode 100644 src/query/xadump.cpp diff --git a/src/common/rclconfig.h b/src/common/rclconfig.h index 23453d5f..0dcadfa3 100644 --- a/src/common/rclconfig.h +++ b/src/common/rclconfig.h @@ -1,6 +1,6 @@ #ifndef _RCLCONFIG_H_INCLUDED_ #define _RCLCONFIG_H_INCLUDED_ -/* @(#$Id: rclconfig.h,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rclconfig.h,v 1.3 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes */ #include "conftree.h" @@ -30,7 +30,7 @@ class RclConfig { conf->get("defaultcharset", defcharset, keydir); conf->get("defaultlanguage", deflang, keydir); string str; - conf->get("guesscharset", deflang, str); + conf->get("guesscharset", str, keydir); guesscharset = ConfTree::stringToBool(str); } bool getConfParam(const string &name, string &value) diff --git a/src/common/textsplit.cpp b/src/common/textsplit.cpp index d152341a..efa8bc37 100644 --- a/src/common/textsplit.cpp +++ b/src/common/textsplit.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.3 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #ifndef TEST_TEXTSPLIT @@ -57,8 +57,11 @@ static void setcharclasses() init = 1; } -void TextSplit::emitterm(string &w, int pos, bool doerase = true) +bool TextSplit::emitterm(string &w, int pos, bool doerase = true) { + if (!termsink) + return false; + // Maybe trim end of word. These are chars that we would keep inside // a word or span, but not at the end while (w.length() > 0) { @@ -73,12 +76,13 @@ void TextSplit::emitterm(string &w, int pos, bool doerase = true) } } breakloop: - if (w.length()) { - if (termsink) - termsink(cdata, w, pos); + if (w.length() > 0 && w.length() < (unsigned)maxWordLength) { + bool ret = termsink(cdata, w, pos); + if (doerase) + w.erase(); + return ret; } - if (doerase) - w.erase(); + return true; } /* @@ -86,7 +90,7 @@ void TextSplit::emitterm(string &w, int pos, bool doerase = true) * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, * are handled properly, */ -void TextSplit::text_to_words(const string &in) +bool TextSplit::text_to_words(const string &in) { setcharclasses(); string span; @@ -103,9 +107,11 @@ void TextSplit::text_to_words(const string &in) SPACE: if (word.length()) { if (span.length() != word.length()) { - emitterm(span, spanpos); + if (!emitterm(span, spanpos)) + return false; } - emitterm(word, wordpos++); + if (!emitterm(word, wordpos++)) + return false; number = false; } spanpos = wordpos; @@ -121,9 +127,11 @@ void TextSplit::text_to_words(const string &in) } } else { if (span.length() != word.length()) { - emitterm(span, spanpos, false); + if (!emitterm(span, spanpos, false)) + return false; } - emitterm(word, wordpos++); + if (!emitterm(word, wordpos++)) + return false; number = false; span += c; } @@ -132,9 +140,11 @@ void TextSplit::text_to_words(const string &in) case '@': if (word.length()) { if (span.length() != word.length()) { - emitterm(span, spanpos, false); + if (!emitterm(span, spanpos, false)) + return false; } - emitterm(word, wordpos++); + if (!emitterm(word, wordpos++)) + return false; number = false; } else word += c; @@ -145,7 +155,8 @@ void TextSplit::text_to_words(const string &in) word += c; } else { if (word.length()) { - emitterm(word, wordpos++); + if (!emitterm(word, wordpos++)) + return false; number = false; } else word += c; @@ -155,7 +166,8 @@ void TextSplit::text_to_words(const string &in) case '#': // Keep it only at end of word... if (word.length() > 0 && - (i == in.length() -1 || charclasses[int(in[i+1])] == SPACE)) { + (i == in.length() -1 || charclasses[int(in[i+1])] == SPACE || + in[i+1] == '\n' || in[i+1] == '\r')) { word += c; span += c; } @@ -190,9 +202,11 @@ void TextSplit::text_to_words(const string &in) } if (word.length()) { if (span.length() != word.length()) - emitterm(span, spanpos); - emitterm(word, wordpos); + if (!emitterm(span, spanpos)) + return false; + return emitterm(word, wordpos); } + return true; } #else // TEST driver -> @@ -208,10 +222,10 @@ void TextSplit::text_to_words(const string &in) using namespace std; -int termsink(void *, const string &term, int pos) +bool termsink(void *, const string &term, int pos) { cout << pos << " " << term << endl; - return 0; + return true; } diff --git a/src/common/textsplit.h b/src/common/textsplit.h index edd9d79b..0f09a24d 100644 --- a/src/common/textsplit.h +++ b/src/common/textsplit.h @@ -1,6 +1,6 @@ #ifndef _TEXTSPLIT_H_INCLUDED_ #define _TEXTSPLIT_H_INCLUDED_ -/* @(#$Id: textsplit.h,v 1.1 2004-12-14 17:49:11 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: textsplit.h,v 1.2 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -12,20 +12,22 @@ */ class TextSplit { public: - typedef int (*TermSink)(void *cdata, const std::string & term, int pos); + typedef bool (*TermSink)(void *cdata, const std::string & term, int pos); private: TermSink termsink; void *cdata; - void emitterm(std::string &term, int pos, bool doerase); + int maxWordLength; + bool emitterm(std::string &term, int pos, bool doerase); public: /** * Constructor: just store callback and client data */ - TextSplit(TermSink t, void *c) : termsink(t), cdata(c) {} + TextSplit(TermSink t, void *c) : termsink(t), cdata(c), maxWordLength(40) + {} /** * Split text, emit words and positions. */ - void text_to_words(const std::string &in); + bool text_to_words(const std::string &in); }; #endif /* _TEXTSPLIT_H_INCLUDED_ */ diff --git a/src/index/recollindex.cpp b/src/index/recollindex.cpp index dd8f5979..53d843dd 100644 --- a/src/index/recollindex.cpp +++ b/src/index/recollindex.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.3 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.4 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -29,20 +29,31 @@ bool textPlainToDoc(RclConfig *conf, const string &fn, if (!file_to_string(fn, otext)) return false; - // Try to guess charset, then convert to utf-8, and fill document fields + // Try to guess charset, then convert to utf-8, and fill document + // fields The charset guesser really doesnt work well in general + // and should be avoided (especially for short documents) string charset; if (conf->guesscharset) { charset = csguess(otext, conf->defcharset); } else charset = conf->defcharset; string utf8; - if (transcode(otext, charset, utf8, "UTF-8")) + cerr << "textPlainToDoc: transcod from " << charset << " to UTF-8" + << endl; + + if (!transcode(otext, utf8, charset, "UTF-8")) { + cerr << "textPlainToDoc: transcode failed: charset '" << charset + << "' to UTF-8: "<< utf8 << endl; + otext.erase(); return 0; + } Rcl::Doc out; out.origcharset = charset; out.text = utf8; + //out.text = otext; docout = out; + cerr << utf8 << endl; return true; } @@ -183,6 +194,12 @@ indexfile(void *cdata, const std::string &fn, const struct stat *stp, if (!fun(me->config, fn, mime, doc)) return FsTreeWalker::FtwOk; + // Set up common fields: + doc.mimetype = mime; + char ascdate[20]; + sprintf(ascdate, "%ld", long(stp->st_mtime)); + doc.mtime = ascdate; + // Set up xapian document, add postings and misc fields, // add to or update database. if (!me->db.add(fn, doc)) diff --git a/src/query/Makefile b/src/query/Makefile new file mode 100644 index 00000000..530cc3d1 --- /dev/null +++ b/src/query/Makefile @@ -0,0 +1,17 @@ + +CXXFLAGS = -Wall -g -I. -I../index -I../utils -I../common -I/usr/local/include + + +PROGS = xadump +all: $(PROGS) + +XADUMP_OBJS= xadump.o transcode.o +xadump : $(XADUMP_OBJS) + $(CXX) $(CXXFLAGS) -o xadump $(XADUMP_OBJS) \ + -L/usr/local/lib -lxapian -liconv + +transcode.o : ../index/transcode.cpp ../index/transcode.h + $(CXX) $(CXXFLAGS) -c -o transcode.o ../index/transcode.cpp + +clean: + rm -f *.o $(PROGS) diff --git a/src/query/xadump.cpp b/src/query/xadump.cpp new file mode 100644 index 00000000..ab149447 --- /dev/null +++ b/src/query/xadump.cpp @@ -0,0 +1,117 @@ +#ifndef lint +static char rcsid[] = "@(#$Id: xadump.cpp,v 1.1 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes"; +#endif + +#include + +#include +#include +#include + +#include "transcode.h" + +using namespace std; + +#include "xapian.h" + +static string thisprog; + +static string usage = + " -d -e " + " \n\n" + ; + +static void +Usage(void) +{ + cerr << thisprog << ": usage:\n" << usage; + exit(1); +} + +static int op_flags; +#define OPT_d 0x1 +#define OPT_e 0x2 +#define OPT_i 0x4 +#define OPT_T 0x8 +#define OPT_D 0x10 + +int main(int argc, char **argv) +{ + string dbdir = "/home/dockes/tmp/xapiandb"; + string outencoding = "ISO8859-1"; + int docid = 1; + + thisprog = argv[0]; + argc--; argv++; + + while (argc > 0 && **argv == '-') { + (*argv)++; + if (!(**argv)) + /* Cas du "adb - core" */ + Usage(); + while (**argv) + switch (*(*argv)++) { + case 'T': op_flags |= OPT_T; break; + case 'D': op_flags |= OPT_D; break; + case 'd': op_flags |= OPT_d; if (argc < 2) Usage(); + dbdir = *(++argv); + argc--; + goto b1; + case 'e': op_flags |= OPT_d; if (argc < 2) Usage(); + outencoding = *(++argv); + argc--; + goto b1; + case 'i': op_flags |= OPT_i; if (argc < 2) Usage(); + if (sscanf(*(++argv), "%d", &docid) != 1) Usage(); + argc--; + goto b1; + default: Usage(); break; + } + b1: argc--; argv++; + } + + if (argc != 0) + Usage(); + + Xapian::Database db; + + try { + db = Xapian::Auto::open(dbdir, Xapian::DB_OPEN); + + if (op_flags & OPT_T) { + Xapian::TermIterator term; + string printable; + if (op_flags & OPT_i) { + for (term = db.termlist_begin(docid); + term != db.termlist_end(docid);term++) { + transcode(*term, printable, "UTF-8", outencoding); + cout << printable << endl; + } + } else { + for (term = db.allterms_begin(); + term != db.allterms_end();term++) { + transcode(*term, printable, "UTF-8", outencoding); + cout << printable << endl; + } + } + } else if (op_flags & OPT_D) { + Xapian::Document doc = db.get_document(docid); + string data = doc.get_data(); + cout << data << endl; + } + + + + + + } catch (const Xapian::Error &e) { + cout << "Exception: " << e.get_msg() << endl; + } catch (const string &s) { + cout << "Exception: " << s << endl; + } catch (const char *s) { + cout << "Exception: " << s << endl; + } catch (...) { + cout << "Caught unknown exception" << endl; + } + exit(0); +} diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index ea6dcf6d..6b706e95 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.3 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -11,6 +11,8 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.2 2004-12-15 15:00:36 dockes Exp $ using namespace std; #include "rcldb.h" +#include "textsplit.h" +#include "transcode.h" #include "xapian.h" @@ -29,7 +31,7 @@ class Native { Rcl::Db::Db() { - // pdata = new Native; + pdata = new Native; } Rcl::Db::~Db() @@ -37,6 +39,8 @@ Rcl::Db::~Db() if (pdata == 0) return; Native *ndb = (Native *)pdata; + cerr << "Db::~Db: isopen " << ndb->isopen << " iswritable " << + ndb->iswritable << endl; try { // There is nothing to do for an ro db. if (ndb->isopen == false || ndb->iswritable == false) { @@ -58,10 +62,11 @@ Rcl::Db::~Db() bool Rcl::Db::open(const string& dir, OpenMode mode) { - return true; if (pdata == 0) return false; Native *ndb = (Native *)pdata; + cerr << "Db::open: isopen " << ndb->isopen << " iswritable " << + ndb->iswritable << endl; try { switch (mode) { case DbUpd: @@ -95,10 +100,11 @@ bool Rcl::Db::open(const string& dir, OpenMode mode) bool Rcl::Db::close() { - return true; if (pdata == 0) return false; Native *ndb = (Native *)pdata; + cerr << "Db::open: isopen " << ndb->isopen << " iswritable " << + ndb->iswritable << endl; if (ndb->isopen == false) return true; try { @@ -125,9 +131,103 @@ bool Rcl::Db::close() return false; } +// A small class to hold state while splitting text +class wsData { + public: + Xapian::Document &doc; + Xapian::termpos basepos; // Base for document section + Xapian::termpos curpos; // Last position sent to callback + wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0) + {} +}; + +bool splitCb(void *cdata, const std::string &term, int pos) +{ + wsData *data = (wsData*)cdata; + cerr << "splitCb: term " << term << endl; + try { + // 1 is the value for wdfinc in index_text when called from omindex + // TOBEDONE: check what this is used for + data->curpos = pos; + data->doc.add_posting(term, data->basepos + data->curpos, 1); + string printable; + transcode(term, printable, "UTF-8", "ISO8859-1"); + cerr << "Adding " << printable << endl; + } catch (...) { + cerr << "Error occurred during add_posting" << endl; + return false; + } + return true; +} + bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc) { - return true; + if (pdata == 0) + return false; + Native *ndb = (Native *)pdata; + + Xapian::Document newdocument; + + // Document data record. omindex has the following nl separated fields: + // - url + // - sample + // - caption (title limited to 100 chars) + // - mime type + string record = "url=file:/" + fn; + record += "\nmtime=" + doc.mtime; + record += "\nsample="; + record += "\ncaption=" + doc.title; + record += "\nmtype=" + doc.mimetype; + record += "\n"; + newdocument.set_data(record); + + // TOBEDONE: + // Need to add stuff here to unaccent and lowercase the data: use unac + // for accents, and do it by hand for upper / lower. Note lowercasing is + // only for ascii letters anyway, so it's just A-Z -> a-z + + wsData splitData(newdocument); + + TextSplit splitter(splitCb, &splitData); + + splitter.text_to_words(doc.title); + + splitData.basepos += splitData.curpos + 100; + splitter.text_to_words(doc.text); + + splitData.basepos += splitData.curpos + 100; + splitter.text_to_words(doc.keywords); + + splitData.basepos += splitData.curpos + 100; + splitter.text_to_words(doc.abstract); + + newdocument.add_term("T" + doc.mimetype); + newdocument.add_term("P" + fn); + +#if 0 + if (dupes == DUPE_replace) { + // If this document has already been indexed, update the existing + // entry. + try { + Xapian::docid did = db.replace_document(urlterm, newdocument); + if (did < updated.size()) { + updated[did] = true; + cout << "updated." << endl; + } else { + cout << "added." << endl; + } + } catch (...) { + // FIXME: is this ever actually needed? + db.add_document(newdocument); + cout << "added (failed re-seek for duplicate)." << endl; + } + } else +#endif + { + ndb->wdb.add_document(newdocument); + // cout << "added." << endl; + } + return true; } @@ -140,5 +240,3 @@ bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp) // - fetch doc (get_document(docid) // - check date field, maybe skip } - - diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 71b60a7c..d60ee6eb 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -1,6 +1,6 @@ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.2 2004-12-15 15:00:36 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.3 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes */ #include @@ -13,11 +13,13 @@ namespace Rcl { */ class Doc { public: + string mimetype; + string mtime; // Modification time as decimal ascii string origcharset; string title; - string abstract; - string keywords; string text; + string keywords; + string abstract; }; /**