From 3c66f8e964d5ae187192ffec41a715e69b6c40f0 Mon Sep 17 00:00:00 2001
From: dockes
Date: Fri, 17 Dec 2004 15:36:13 +0000
Subject: [PATCH] *** empty log message ***

---
Review notes (revised patch):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Context-line whitespace and the position of blank
  context lines in src/rcldb/rcldb.cpp could not be checked against the
  target file; applying may need "git apply --ignore-whitespace".
- The "#include" targets of the two new files were missing in that copy and
  have been filled in from the names each file uses. The first rcldb.cpp
  hunk was shortened by one context line for the same reason.
- unac_cpp(): release the buffer allocated by unac_string() (memory leak).
- Rcl::Db::needUpdate(): check the strstr() result before using it (a data
  record without "mtime=" led to undefined behaviour), and drop an unused
  local that was shadowed inside the try block.
- unacpp test driver: the usage message now names its three arguments.
- Hunk headers and the diffstat were recomputed; the "index" blob ids are
  still those of the original patch.

 src/common/unacpp.cpp | 88 ++++++++++++++++++++++++++++++++++++++++++++
 src/common/unacpp.h   | 11 +++++
 src/rcldb/rcldb.cpp   | 75 ++++++++++++++++++++++++++++----------
 3 files changed, 153 insertions(+), 21 deletions(-)
 create mode 100644 src/common/unacpp.cpp
 create mode 100644 src/common/unacpp.h

diff --git a/src/common/unacpp.cpp b/src/common/unacpp.cpp
new file mode 100644
index 00000000..501aaebe
--- /dev/null
+++ b/src/common/unacpp.cpp
@@ -0,0 +1,88 @@
+#ifndef lint
+static char rcsid[] = "@(#$Id: unacpp.cpp,v 1.1 2004-12-17 15:36:13 dockes Exp $ (C) 2004 J.F.Dockes";
+#endif
+
+#ifndef TEST_UNACPP
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+
+using std::string;
+
+#include "unacpp.h"
+#include "unac.h"
+
+
+// Run unac_string() on 'in' (text encoded in 'encoding') and store the
+// result in 'out'. On failure, return false with an error message in 'out'.
+bool unac_cpp(const std::string &in, std::string &out, const char *encoding)
+{
+    char *cout = 0;
+    size_t out_len;
+
+    if (unac_string(encoding, in.c_str(), in.length(), &cout, &out_len) < 0) {
+        char cerrno[20];
+        sprintf(cerrno, "%d", errno);
+        out = string("unac_string failed, errno : ") + cerrno;
+        return false;
+    }
+    out.assign(cout, out_len);
+    // unac_string() allocated the buffer with malloc(): the caller frees it.
+    free(cout);
+    return true;
+}
+
+#else // not testing
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <iostream>
+
+using namespace std;
+
+#include "unacpp.h"
+#include "readfile.h"
+
+int main(int argc, char **argv)
+{
+    if (argc != 4) {
+        cerr << "Usage: unacpp <encoding> <infile> <outfile>" << endl;
+        exit(1);
+    }
+    const char *encoding = argv[1];
+    string ifn = argv[2];
+    const char *ofn = argv[3];
+
+    string odata;
+    if (!file_to_string(ifn, odata)) {
+        cerr << "file_to_string: " << odata << endl;
+        exit(1);
+    }
+    string ndata;
+    if (!unac_cpp(odata, ndata, encoding)) {
+        cerr << "unac: " << ndata << endl;
+        exit(1);
+    }
+
+    int fd = open(ofn, O_CREAT|O_EXCL|O_WRONLY, 0666);
+    if (fd < 0) {
+        cerr << "Open/Create " << ofn << " failed: " << strerror(errno)
+             << endl;
+        exit(1);
+    }
+    if (write(fd, ndata.c_str(), ndata.length()) != (int)ndata.length()) {
+        cerr << "Write(2) failed: " << strerror(errno) << endl;
+        exit(1);
+    }
+    close(fd);
+    exit(0);
+}
+
+#endif
diff --git a/src/common/unacpp.h b/src/common/unacpp.h
new file mode 100644
index 00000000..90bad9c9
--- /dev/null
+++ b/src/common/unacpp.h
@@ -0,0 +1,11 @@
+#ifndef _UNACPP_H_INCLUDED_
+#define _UNACPP_H_INCLUDED_
+/* @(#$Id: unacpp.h,v 1.1 2004-12-17 15:36:13 dockes Exp $ (C) 2004 J.F.Dockes */
+
+#include <string>
+
+// A small wrapper for unac.c
+extern bool unac_cpp(const std::string &in, std::string &out,
+                     const char *encoding = "UTF-8");
+
+#endif /* _UNACPP_H_INCLUDED_ */
diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
index 6b706e95..bea32639 100644
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,4 +1,4 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.3 2004-12-17 13:01:01 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.4 2004-12-17 15:36:13 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 
@@ -144,15 +144,17 @@ class wsData {
 bool splitCb(void *cdata, const std::string &term, int pos)
 {
     wsData *data = (wsData*)cdata;
-    cerr << "splitCb: term " << term << endl;
+
+    // cerr << "splitCb: term " << term << endl;
+    //string printable;
+    //transcode(term, printable, "UTF-8", "ISO8859-1");
+    //cerr << "Adding " << printable << endl;
+
     try {
         // 1 is the value for wdfinc in index_text when called from omindex
         // TOBEDONE: check what this is used for
         data->curpos = pos;
         data->doc.add_posting(term, data->basepos + data->curpos, 1);
-        string printable;
-        transcode(term, printable, "UTF-8", "ISO8859-1");
-        cerr << "Adding " << printable << endl;
     } catch (...) {
         cerr << "Error occurred during add_posting" << endl;
         return false;
@@ -202,40 +204,71 @@ bool Rcl::Db::add(const string &fn, const Rcl::Doc &doc)
     splitter.text_to_words(doc.abstract);
 
     newdocument.add_term("T" + doc.mimetype);
-    newdocument.add_term("P" + fn);
+    string pathterm = "P" + fn;
+    newdocument.add_term(pathterm);
 
-#if 0
-    if (dupes == DUPE_replace) {
+    if (1 /*dupes == DUPE_replace*/) {
         // If this document has already been indexed, update the existing
         // entry.
         try {
-            Xapian::docid did = db.replace_document(urlterm, newdocument);
+            Xapian::docid did = ndb->wdb.replace_document(pathterm,
+                                                          newdocument);
+#if 0
             if (did < updated.size()) {
                 updated[did] = true;
-                cout << "updated." << endl;
+                //cout << "updated." << endl;
             } else {
-                cout << "added." << endl;
+                //cout << "added." << endl;
             }
+#endif
         } catch (...) {
             // FIXME: is this ever actually needed?
-            db.add_document(newdocument);
-            cout << "added (failed re-seek for duplicate)." << endl;
+            ndb->wdb.add_document(newdocument);
+            //cout << "added (failed re-seek for duplicate)." << endl;
         }
-    } else
-#endif
-    {
+    } else {
+        try {
         ndb->wdb.add_document(newdocument);
         // cout << "added." << endl;
+        } catch (...) {
+            cerr << "Got exception while adding doc" << endl;
+            return false;
     }
+    }
     return true;
 }
 
+// Decide whether 'filename' needs (re)indexing. Returns false if pdata is
+// unset or the stored "mtime=" value is >= stp->st_mtime; returns true if
+// no usable index entry is found or the database lookup throws.
 bool Rcl::Db::needUpdate(const string &filename, const struct stat *stp)
 {
+    if (pdata == 0)
+        return false;
+    Native *ndb = (Native *)pdata;
+
+    string pathterm = "P" + filename;
+    if (!ndb->wdb.term_exists(pathterm))
+        return true;
+    try {
+        Xapian::PostingIterator did = ndb->wdb.postlist_begin(pathterm);
+        if (did == ndb->wdb.postlist_end(pathterm))
+            return true;
+        Xapian::Document doc = ndb->wdb.get_document(*did);
+        string data = doc.get_data();
+        //cout << "DOCUMENT EXISTS " << data << endl;
+        const char *cp = strstr(data.c_str(), "mtime=");
+        // No "mtime=" field in the record: nothing to compare, so reindex.
+        if (cp == 0)
+            return true;
+        long mtime = atol(cp + strlen("mtime="));
+        if (mtime >= stp->st_mtime) {
+            // cerr << "DOCUMENT UP TO DATE" << endl;
+            return false;
+        }
+    } catch (...) {
+        return true;
+    }
+
     return true;
-    // TOBEDONE: Check if file has already been indexed, and has changed since
-    // - Make path term,
-    // - query db: postlist_begin->docid
-    // - fetch doc (get_document(docid)
-    // - check date field, maybe skip
 }