From 0dde2934d6129d211eb7dd843f8978d0c18b6a58 Mon Sep 17 00:00:00 2001 From: dockes Date: Fri, 17 Dec 2004 15:50:48 +0000 Subject: [PATCH] very basic indexing working --- src/query/xadump.cpp | 39 +++++++++++++++++++++++++++++------ src/rcldb/rcldb.cpp | 48 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 71 insertions(+), 16 deletions(-) diff --git a/src/query/xadump.cpp b/src/query/xadump.cpp index ab149447..89154c44 100644 --- a/src/query/xadump.cpp +++ b/src/query/xadump.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: xadump.cpp,v 1.1 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: xadump.cpp,v 1.2 2004-12-17 15:50:48 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -34,12 +34,19 @@ static int op_flags; #define OPT_i 0x4 #define OPT_T 0x8 #define OPT_D 0x10 +#define OPT_t 0x20 +#define OPT_P 0x40 +#define OPT_F 0x80 +#define OPT_E 0x100 + +Xapian::Database db; int main(int argc, char **argv) { string dbdir = "/home/dockes/tmp/xapiandb"; string outencoding = "ISO8859-1"; int docid = 1; + string aterm; thisprog = argv[0]; argc--; argv++; @@ -51,8 +58,11 @@ int main(int argc, char **argv) Usage(); while (**argv) switch (*(*argv)++) { - case 'T': op_flags |= OPT_T; break; case 'D': op_flags |= OPT_D; break; + case 'E': op_flags |= OPT_E; break; + case 'F': op_flags |= OPT_F; break; + case 'P': op_flags |= OPT_P; break; + case 'T': op_flags |= OPT_T; break; case 'd': op_flags |= OPT_d; if (argc < 2) Usage(); dbdir = *(++argv); argc--; @@ -65,6 +75,10 @@ int main(int argc, char **argv) if (sscanf(*(++argv), "%d", &docid) != 1) Usage(); argc--; goto b1; + case 't': op_flags |= OPT_t; if (argc < 2) Usage(); + aterm = *(++argv); + argc--; + goto b1; default: Usage(); break; } b1: argc--; argv++; @@ -73,11 +87,12 @@ int main(int argc, char **argv) if (argc != 0) Usage(); - Xapian::Database db; - try { db = Xapian::Auto::open(dbdir, Xapian::DB_OPEN); + cout << "DB: ndocs " << db.get_doccount() << " 
lastdocid " << + db.get_lastdocid() << " avglength " << db.get_avlength() << endl; + if (op_flags & OPT_T) { Xapian::TermIterator term; string printable; @@ -98,8 +113,20 @@ int main(int argc, char **argv) Xapian::Document doc = db.get_document(docid); string data = doc.get_data(); cout << data << endl; - } - + } else if (op_flags & OPT_P) { + Xapian::PostingIterator doc; + for (doc = db.postlist_begin(aterm); + doc != db.postlist_end(aterm);doc++) { + cout << *doc << endl; + } + + } else if (op_flags & OPT_F) { + cout << "FreqFor " << aterm << " : " << + db.get_termfreq(aterm) << endl; + } else if (op_flags & OPT_E) { + cout << "Exists " << aterm << " : " << + db.term_exists(aterm) << endl; + } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index bea32639..dac4f55b 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.4 2004-12-17 15:36:13 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.5 2004-12-17 15:50:48 dockes Exp $ (C) 2004 J.F.Dockes"; #endif #include @@ -13,6 +13,7 @@ using namespace std; #include "rcldb.h" #include "textsplit.h" #include "transcode.h" +#include "unacpp.h" #include "xapian.h" @@ -162,6 +163,25 @@ bool splitCb(void *cdata, const std::string &term, int pos) return true; } +// Unaccent and lowercase data: use unac +// for accents, and do it by hand for upper / lower. 
Note lowercasing is
+// only for ascii letters anyway, so it's just A-Z -> a-z
+bool dumb_string(const string &in, string &out) // returns false (out left empty) if unac_cpp fails
+{
+    string inter;
+    out.erase();
+    if (!unac_cpp(in, inter))
+        return false;
+    out.reserve(inter.length()); // reserve, not resize: resize() would put inter.length() NUL bytes before the appended text
+    for (unsigned int i = 0; i < inter.length(); i++) {
+        if (inter[i] >= 'A' && inter[i] <= 'Z')
+            out += inter[i] + 'a' - 'A';
+        else
+            out += inter[i];
+    }
+    return true;
+}
+
 bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
 {
     if (pdata == 0)
@@ -183,25 +203,33 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
     record += "\n";
     newdocument.set_data(record);
-    // TOBEDONE:
-    // Need to add stuff here to unaccent and lowercase the data: use unac
-    // for accents, and do it by hand for upper / lower. Note lowercasing is
-    // only for ascii letters anyway, so it's just A-Z -> a-z
-
     wsData splitData(newdocument);
     TextSplit splitter(splitCb, &splitData);
-    splitter.text_to_words(doc.title);
+    string noacc;
+    if (!unac_cpp(doc.title, noacc)) {
+        return false;
+    }
+    splitter.text_to_words(noacc);
     splitData.basepos += splitData.curpos + 100;
-    splitter.text_to_words(doc.text);
+    if (!dumb_string(doc.text, noacc)) {
+        return false;
+    }
+    splitter.text_to_words(noacc);
     splitData.basepos += splitData.curpos + 100;
-    splitter.text_to_words(doc.keywords);
+    if (!dumb_string(doc.keywords, noacc)) {
+        return false;
+    }
+    splitter.text_to_words(noacc);
     splitData.basepos += splitData.curpos + 100;
-    splitter.text_to_words(doc.abstract);
+    if (!dumb_string(doc.abstract, noacc)) {
+        return false;
+    }
+    splitter.text_to_words(noacc);
     newdocument.add_term("T" + doc.mimetype);
     string pathterm = "P" + fn;