use explicit parent udi term instead of Qterm structure to express parent-child relationship

2008-07-29 06:25:29 +00:00 · 2008-07-29 06:25:29 +00:00 · 24ac62eb86
commit 24ac62eb86
parent 3109a33f4a
5 changed files with 91 additions and 61 deletions
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: indexer.cpp,v 1.67 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: indexer.cpp,v 1.68 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -390,7 +390,10 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
    // without mime type will not be purged from the db, resulting
    // in possible 'cannot intern file' messages at query time...
    char cbuf[100]; 
-    // Document signature
+    // Document signature. This is based on mtime and size and used
    // for the uptodate check (the value computed here is checked
    // against the stored one). Changing the computation forces a full
    // reindex of course.
    sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
    string sig = cbuf;
    string udi;
@ -398,6 +401,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
    if (!m_db.needUpdate(udi, sig)) {
 	LOGDEB(("processone: up to date: %s\n", fn.c_str()));
 	if (m_updater) {
 	    // Status bar update, abort request etc.
 	    m_updater->status.fn = fn;
 	    if (!m_updater->update()) {
 		return FsTreeWalker::FtwStop;
@ -422,14 +426,18 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
 		ercnt, charset.c_str(), path_getsimple(fn).c_str()));
    }
    LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
-	    path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), "UTF-8"));
+	     path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), 
 	     "UTF-8"));
    string parent_udi;
    make_udi(fn, "", parent_udi);
    Rcl::Doc doc;
    const string plus("+");
    char ascdate[20];
    sprintf(ascdate, "%ld", long(stp->st_mtime));
    FileInterner::Status fis = FileInterner::FIAgain;
    bool hadNullIpath = false;
    Rcl::Doc doc;
    const string plus = "+";
    char ascdate[20];
    sprintf(ascdate, "%ld", long(stp->st_mtime));
    while (fis == FileInterner::FIAgain) {
 	doc.erase();
 	string ipath;
@ -468,6 +476,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
 	    hadNullIpath = true;
 	else
 	    doc.ipath = ipath;
 	doc.url = string("file://") + fn;
 	// Note that the filter may have its own idea of the file name 
@ -484,10 +493,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
 	sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
 	doc.sig = cbuf;
-	// Add document to database
+	// Add document to database. If there is an ipath, add it as a child
 	// of the file document.
 	string udi;
 	make_udi(fn, ipath, udi);
-	if (!m_db.add(udi, doc)) 
+	if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc)) 
 	    return FsTreeWalker::FtwError;
 	// Tell what we are doing and check for interrupt request
@ -520,9 +530,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
 	// Document signature for up to date checks.
 	sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
 	fileDoc.sig = cbuf;
-	string udi;
+	if (!m_db.addOrUpdate(parent_udi, "", fileDoc)) 
 	make_udi(fn, "", udi);
 	if (!m_db.add(udi, fileDoc)) 
 	    return FsTreeWalker::FtwError;
    }
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.136 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.137 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -80,39 +80,41 @@ namespace Rcl {
 // found in document)
 const static string rclSyntAbs("?!#@");
-// Compute the unique term used to link documents to their file-system source:
+// Compute the unique term used to link documents to their origin. 
-// Hashed path + possible internal path
+// "Q" + external udi
 static inline string make_uniterm(const string& udi)
 {
    // The unique term is the fixed "Q" prefix immediately followed by
    // the udi supplied by the caller.
    return string("Q") + udi;
 }
 // Compute the parent term used to link a document to its parent document
 // (if any): "F" + parent external udi
 static inline string make_parentterm(const string& udi)
 {
    // The "F" prefix is chosen to risk a conflict with omega rather than
    // with the user-defined fields (Xxxx) that we also allow. "F" is
    // currently not used by omega (2008-07)
    const string prefix("F");
    return prefix + udi;
 }
 /* See comment in class declaration: return all subdocuments of a
- * document given by its unique path id */
+ * document given by its unique id. 
-bool Db::Native::subDocs(const string &uniterm, vector<Xapian::docid>& docids) 
+*/
 bool Db::Native::subDocs(const string &udi, vector<Xapian::docid>& docids) 
 {
    LOGDEB2(("subDocs: [%s]\n", uniterm.c_str()));
    docids.clear();
    string ermsg;
    string pterm = make_parentterm(udi);
    for (int tries = 0; tries < 2; tries++) {
 	try {
-	    Xapian::TermIterator it = db.allterms_begin(); 
+	    Xapian::PostingIterator it = db.postlist_begin(pterm);
-	    it.skip_to(uniterm);
+	    for (; it != db.postlist_end(pterm); it++) {
-	    // Don't return the doc itself:
+		docids.push_back(*it);
 	    it++;
 	    for (; it != db.allterms_end(); it++) {
 		LOGDEB2(("subDocs: testing [%s]\n", (*it).c_str()));
 		// If current term does not begin with uniterm or has
 		// another |, not the same file
 		if ((*it).find(uniterm) != 0 || 
 		    (*it).find_last_of("|") != uniterm.length()-1)
 		    break;
 		docids.push_back(*(db.postlist_begin(*it)));
 	    }
-	    LOGDEB2(("Db::Native::subDocs: returning %d ids\n", docids.size()));
+	    LOGDEB(("Db::Native::subDocs: returning %d ids\n", docids.size()));
 	    return true;
 	} catch (const Xapian::DatabaseModifiedError &e) {
 	    LOGDEB(("Db::subDocs: got modified error. reopen/retry\n"));
@ -800,9 +802,11 @@ static const int MB = 1024 * 1024;
 // the title abstract and body and add special terms for file name,
 // date, mime type ... , create the document data record (more
 // metadata), and update database
-bool Db::add(const string &udi, const Doc &idoc)
+bool Db::addOrUpdate(const string &udi, const string &parent_udi,
 		     const Doc &idoc)
 {
-    LOGDEB1(("Db::add: udi %s\n", udi.c_str()));
+    LOGDEB(("Db::add: udi [%s] parent [%s]\n", 
 	     udi.c_str(), parent_udi.c_str()));
    if (m_ndb == 0)
 	return false;
    static int first = 1;
@ -927,7 +931,11 @@ bool Db::add(const string &udi, const Doc &idoc)
    // checks, and unique id for the replace_document() call.
    string uniterm = make_uniterm(udi);
    newdocument.add_term(uniterm);
-
+    // Parent term. This is used to find all descendents, mostly to delete them 
    // when the parent goes away
    if (!parent_udi.empty()) {
 	newdocument.add_term(make_parentterm(parent_udi));
    }
    // Dates etc...
    time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : 
 			doc.dmtime.c_str());
@ -1091,7 +1099,7 @@ bool Db::needUpdate(const string &udi, const string& sig)
 	    // Set the existence flag for all the subdocs (if any)
 	    vector<Xapian::docid> docids;
-	    if (!m_ndb->subDocs(uniterm, docids)) {
+	    if (!m_ndb->subDocs(udi, docids)) {
 		LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n"));
 		return true;
 	    }
@ -1193,9 +1201,9 @@ bool Db::purge()
 	    } catch (const Xapian::DocNotFoundError &) {
 		LOGDEB(("Db::purge: document #%d not found\n", docid));
 	    } catch (const Xapian::Error &e) {
-		LOGERR(("Db::purge: document #%d: %s\n", e.get_msg().c_str()));
+		LOGERR(("Db::purge: document #%d: %s\n", docid, e.get_msg().c_str()));
 	    } catch (...) {
-		LOGERR(("Db::purge: document #%d: unknown error\n"));
+		LOGERR(("Db::purge: document #%d: unknown error\n", docid));
 	    }
 	}
    }
@ -1224,7 +1232,7 @@ bool Db::purgeFile(const string &udi)
 	LOGDEB(("purgeFile: delete docid %d\n", *docid));
 	db.delete_document(*docid);
 	vector<Xapian::docid> docids;
-	m_ndb->subDocs(uniterm, docids);
+	m_ndb->subDocs(udi, docids);
 	LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size()));
 	for (vector<Xapian::docid>::iterator it = docids.begin();
 	     it != docids.end(); it++) {
--- a/src/rcldb/rcldb.h
+++ b/src/rcldb/rcldb.h
@ -16,7 +16,7 @@
 */
 #ifndef _DB_H_INCLUDED_
 #define _DB_H_INCLUDED_
-/* @(#$Id: rcldb.h,v 1.58 2008-07-28 12:24:15 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: rcldb.h,v 1.59 2008-07-29 06:25:29 dockes Exp $  (C) 2004 J.F.Dockes */
 #include <string>
 #include <list>
@ -43,9 +43,12 @@ using std::vector;
 // The main goal is simplicity and good matching to usage inside the recoll
 // user interface. In other words, this is not exhaustive or well-designed or 
 // reusable.
-
+//
-
+// Unique Document Identifier: uniquely identifies a document in its
-struct stat;
+// source storage (file system or other). Used for up to date checks
 // etc. "udi". Our user is responsible for making sure it's not too
 // big, cause it's stored as a Xapian term (< 150 bytes would be
 // reasonable)
 #ifndef NO_NAMESPACES
 namespace Rcl {
@ -103,14 +106,17 @@ class Db {
    /* Update-related methods ******************************************/
-    /** Test if the db entry for the given filename/stat is up to date. This
+    /** Test if the db entry for the given udi is up to date. This
     * has the side-effect of setting the existence flag for the file document
-     * and all subdocs if any (for later use by 'purge()') */
+     * and all subdocs if any (for later use by 'purge()') 
     */
    bool needUpdate(const string &udi, const string& sig);
-    /** Add document. The Doc class should have been filled as much as
+    /** Add or update document. The Doc class should have been filled as much as
-      * possible depending on the document type */
+      * possible depending on the document type. parent_udi is only
-    bool add(const string &udi, const Doc &doc);
+      * used for subdocs, else set it to empty */
    bool addOrUpdate(const string &udi, const string &parent_udi, 
 		     const Doc &doc);
    /** Delete document(s) for given UDI, including subdocs */
    bool purgeFile(const string &udi);
--- a/src/rcldb/rcldb_p.h
+++ b/src/rcldb/rcldb_p.h
@ -4,7 +4,7 @@
 #include "xapian.h"
 namespace Rcl {
-/* @(#$Id: rcldb_p.h,v 1.2 2008-07-28 08:42:52 dockes Exp $  (C) 2007 J.F.Dockes */
+/* @(#$Id: rcldb_p.h,v 1.3 2008-07-29 06:25:29 dockes Exp $  (C) 2007 J.F.Dockes */
 // Generic Xapian exception catching code. We do this quite often,
 // and I have no idea how to do this except for a macro
@ -51,16 +51,22 @@ class Db::Native {
    bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
-    /** Compute list of subdocuments for a given path (given by hash) 
+    /** Compute list of subdocuments for a given udi. We look for documents 
-     *  We look for all Q terms beginning with the path/hash
+     * indexed by a parent term matching the udi, the posting list for the 
-     *  As suggested by James Aylett, a better method would be to add 
+     * parentterm(udi)  (As suggested by James Aylett)
-     *  a single term (ie: XP/path/to/file) to all subdocs, then finding
+     *
-     *  them would be a simple matter of retrieving the posting list for the
+     * Note that this is not currently recursive: all subdocs are supposed 
-     *  term. There would still be a need for the current Qterm though, as a
+     * to be children of the file doc.
-     *  unique term for replace_document, and for retrieving by
+     * Ie: in a mail folder, all messages, attachments, attachments of
-     *  path/ipath (history)
+     * attached messages etc. must have the folder file document as
     * parent. 
     * Parent-child relationships are defined by the indexer (rcldb user)
     * 
     * The file-system indexer currently works this way (flatly), 
     * subDocs() could be relatively easily changed to support full recursivity
     * if needed.
     */
-    bool subDocs(const string &uniterm, vector<Xapian::docid>& docids);
+    bool subDocs(const string &udi, vector<Xapian::docid>& docids);
 };
 }
--- a/src/rcldb/rcldoc.h
+++ b/src/rcldb/rcldoc.h
@ -16,7 +16,7 @@
 */
 #ifndef _RCLDOC_H_INCLUDED_
 #define _RCLDOC_H_INCLUDED_
-/* @(#$Id: rcldoc.h,v 1.5 2008-07-28 12:24:15 dockes Exp $  (C) 2006 J.F.Dockes */
+/* @(#$Id: rcldoc.h,v 1.6 2008-07-29 06:25:29 dockes Exp $  (C) 2006 J.F.Dockes */
 #include <string>
 #include <map>
@ -75,9 +75,11 @@ class Doc {
    // Doc text size. Index: from text.length(). Query: set by rcldb from
    // index doc data.
    string dbytes;
-    // Doc signature. Used for up to date checks. This is opaque, and
+
-    // could just as well be ctime, size, ctime+size, md5, whatever.
+    // Doc signature. Used for up to date checks. 
    // Index: set by Db::Add caller. Query: set from doc data.
    // This is opaque to rcldb, and could just as well be ctime, size,
    // ctime+size, md5, whatever.
    string sig;
    // The following fields don't go to the db record