use explicit parent udi term instead of Qterm structure to express parent-child relationship

This commit is contained in:
dockes 2008-07-29 06:25:29 +00:00
parent 3109a33f4a
commit 24ac62eb86
5 changed files with 91 additions and 61 deletions

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.67 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.68 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -390,7 +390,10 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// without mime type will not be purged from the db, resulting
// in possible 'cannot intern file' messages at query time...
char cbuf[100];
// Document signature
// Document signature. This is based on mtime and size and used
// for the uptodate check (the value computed here is checked
// against the stored one). Changing the computation forces a full
// reindex of course.
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
string sig = cbuf;
string udi;
@ -398,6 +401,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
if (!m_db.needUpdate(udi, sig)) {
LOGDEB(("processone: up to date: %s\n", fn.c_str()));
if (m_updater) {
// Status bar update, abort request etc.
m_updater->status.fn = fn;
if (!m_updater->update()) {
return FsTreeWalker::FtwStop;
@ -422,14 +426,18 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
ercnt, charset.c_str(), path_getsimple(fn).c_str()));
}
LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), "UTF-8"));
path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(),
"UTF-8"));
string parent_udi;
make_udi(fn, "", parent_udi);
Rcl::Doc doc;
const string plus("+");
char ascdate[20];
sprintf(ascdate, "%ld", long(stp->st_mtime));
FileInterner::Status fis = FileInterner::FIAgain;
bool hadNullIpath = false;
Rcl::Doc doc;
const string plus = "+";
char ascdate[20];
sprintf(ascdate, "%ld", long(stp->st_mtime));
while (fis == FileInterner::FIAgain) {
doc.erase();
string ipath;
@ -468,6 +476,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
hadNullIpath = true;
else
doc.ipath = ipath;
doc.url = string("file://") + fn;
// Note that the filter may have its own idea of the file name
@ -484,10 +493,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
doc.sig = cbuf;
// Add document to database
// Add document to database. If there is an ipath, add it as a child
// of the file document.
string udi;
make_udi(fn, ipath, udi);
if (!m_db.add(udi, doc))
if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc))
return FsTreeWalker::FtwError;
// Tell what we are doing and check for interrupt request
@ -520,9 +530,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// Document signature for up to date checks.
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
fileDoc.sig = cbuf;
string udi;
make_udi(fn, "", udi);
if (!m_db.add(udi, fileDoc))
if (!m_db.addOrUpdate(parent_udi, "", fileDoc))
return FsTreeWalker::FtwError;
}

View File

@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.136 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.137 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@ -80,39 +80,41 @@ namespace Rcl {
// found in document)
const static string rclSyntAbs("?!#@");
// Compute the unique term used to link documents to their file-system source:
// Hashed path + possible internal path
// Compute the unique term used to link documents to their origin.
// "Q" + external udi
static inline string make_uniterm(const string& udi)
{
    // The unique term is the external udi carrying a "Q" prefix.
    return string("Q") + udi;
}
// Compute the parent term used to link documents to their parent document
// (if any): "F" + parent external udi
static inline string make_parentterm(const string& udi)
{
    // Prefix choice: a possible conflict with omega is preferred over one
    // with the user-defined fields (Xxxx) that we also allow. "F" is
    // currently not used by omega (2008-07)
    return string("F") + udi;
}
/* See comment in class declaration: return all subdocuments of a
* document given by its unique path id */
bool Db::Native::subDocs(const string &uniterm, vector<Xapian::docid>& docids)
* document given by its unique id.
*/
bool Db::Native::subDocs(const string &udi, vector<Xapian::docid>& docids)
{
LOGDEB2(("subDocs: [%s]\n", uniterm.c_str()));
docids.clear();
string ermsg;
string pterm = make_parentterm(udi);
for (int tries = 0; tries < 2; tries++) {
try {
Xapian::TermIterator it = db.allterms_begin();
it.skip_to(uniterm);
// Don't return the doc itself:
it++;
for (; it != db.allterms_end(); it++) {
LOGDEB2(("subDocs: testing [%s]\n", (*it).c_str()));
// If current term does not begin with uniterm or has
// another |, not the same file
if ((*it).find(uniterm) != 0 ||
(*it).find_last_of("|") != uniterm.length()-1)
break;
docids.push_back(*(db.postlist_begin(*it)));
Xapian::PostingIterator it = db.postlist_begin(pterm);
for (; it != db.postlist_end(pterm); it++) {
docids.push_back(*it);
}
LOGDEB2(("Db::Native::subDocs: returning %d ids\n", docids.size()));
LOGDEB(("Db::Native::subDocs: returning %d ids\n", docids.size()));
return true;
} catch (const Xapian::DatabaseModifiedError &e) {
LOGDEB(("Db::subDocs: got modified error. reopen/retry\n"));
@ -800,9 +802,11 @@ static const int MB = 1024 * 1024;
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
// metadata), and update database
bool Db::add(const string &udi, const Doc &idoc)
bool Db::addOrUpdate(const string &udi, const string &parent_udi,
const Doc &idoc)
{
LOGDEB1(("Db::add: udi %s\n", udi.c_str()));
LOGDEB(("Db::add: udi [%s] parent [%s]\n",
udi.c_str(), parent_udi.c_str()));
if (m_ndb == 0)
return false;
static int first = 1;
@ -927,7 +931,11 @@ bool Db::add(const string &udi, const Doc &idoc)
// checks, and unique id for the replace_document() call.
string uniterm = make_uniterm(udi);
newdocument.add_term(uniterm);
// Parent term. This is used to find all descendents, mostly to delete them
// when the parent goes away
if (!parent_udi.empty()) {
newdocument.add_term(make_parentterm(parent_udi));
}
// Dates etc...
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
doc.dmtime.c_str());
@ -1091,7 +1099,7 @@ bool Db::needUpdate(const string &udi, const string& sig)
// Set the existence flag for all the subdocs (if any)
vector<Xapian::docid> docids;
if (!m_ndb->subDocs(uniterm, docids)) {
if (!m_ndb->subDocs(udi, docids)) {
LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n"));
return true;
}
@ -1193,9 +1201,9 @@ bool Db::purge()
} catch (const Xapian::DocNotFoundError &) {
LOGDEB(("Db::purge: document #%d not found\n", docid));
} catch (const Xapian::Error &e) {
LOGERR(("Db::purge: document #%d: %s\n", e.get_msg().c_str()));
LOGERR(("Db::purge: document #%d: %s\n", docid, e.get_msg().c_str()));
} catch (...) {
LOGERR(("Db::purge: document #%d: unknown error\n"));
LOGERR(("Db::purge: document #%d: unknown error\n", docid));
}
}
}
@ -1224,7 +1232,7 @@ bool Db::purgeFile(const string &udi)
LOGDEB(("purgeFile: delete docid %d\n", *docid));
db.delete_document(*docid);
vector<Xapian::docid> docids;
m_ndb->subDocs(uniterm, docids);
m_ndb->subDocs(udi, docids);
LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size()));
for (vector<Xapian::docid>::iterator it = docids.begin();
it != docids.end(); it++) {

View File

@ -16,7 +16,7 @@
*/
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.58 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.59 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
@ -43,9 +43,12 @@ using std::vector;
// The main goal is simplicity and good matching to usage inside the recoll
// user interface. In other words, this is not exhaustive or well-designed or
// reusable.
struct stat;
//
// Unique Document Identifier: uniquely identifies a document in its
// source storage (file system or other). Used for up to date checks
// etc. "udi". Our user is responsible for making sure it's not too
// big, because it's stored as a Xapian term (< 150 bytes would be
// reasonable)
#ifndef NO_NAMESPACES
namespace Rcl {
@ -103,14 +106,17 @@ class Db {
/* Update-related methods ******************************************/
/** Test if the db entry for the given filename/stat is up to date. This
/** Test if the db entry for the given udi is up to date. This
* has the side-effect of setting the existence flag for the file document
* and all subdocs if any (for later use by 'purge()') */
* and all subdocs if any (for later use by 'purge()')
*/
bool needUpdate(const string &udi, const string& sig);
/** Add document. The Doc class should have been filled as much as
* possible depending on the document type */
bool add(const string &udi, const Doc &doc);
/** Add or update document. The Doc class should have been filled as much as
* possible depending on the document type. parent_udi is only
* used for subdocs, else set it to empty */
bool addOrUpdate(const string &udi, const string &parent_udi,
const Doc &doc);
/** Delete document(s) for given UDI, including subdocs */
bool purgeFile(const string &udi);

View File

@ -4,7 +4,7 @@
#include "xapian.h"
namespace Rcl {
/* @(#$Id: rcldb_p.h,v 1.2 2008-07-28 08:42:52 dockes Exp $ (C) 2007 J.F.Dockes */
/* @(#$Id: rcldb_p.h,v 1.3 2008-07-29 06:25:29 dockes Exp $ (C) 2007 J.F.Dockes */
// Generic Xapian exception catching code. We do this quite often,
// and I have no idea how to do this except for a macro
@ -51,16 +51,22 @@ class Db::Native {
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
/** Compute list of subdocuments for a given path (given by hash)
* We look for all Q terms beginning with the path/hash
* As suggested by James Aylett, a better method would be to add
* a single term (ie: XP/path/to/file) to all subdocs, then finding
* them would be a simple matter of retrieving the posting list for the
* term. There would still be a need for the current Qterm though, as a
* unique term for replace_document, and for retrieving by
* path/ipath (history)
/** Compute list of subdocuments for a given udi. We look for documents
* indexed by a parent term matching the udi, i.e. we retrieve the posting
* list for parentterm(udi) (as suggested by James Aylett)
*
* Note that this is not currently recursive: all subdocs are supposed
* to be children of the file doc.
* Ie: in a mail folder, all messages, attachments, attachments of
* attached messages etc. must have the folder file document as
* parent.
* Parent-child relationships are defined by the indexer (rcldb user)
*
* The file-system indexer currently works this way (flatly),
* subDocs() could be relatively easily changed to support full recursivity
* if needed.
*/
bool subDocs(const string &uniterm, vector<Xapian::docid>& docids);
bool subDocs(const string &udi, vector<Xapian::docid>& docids);
};
}

View File

@ -16,7 +16,7 @@
*/
#ifndef _RCLDOC_H_INCLUDED_
#define _RCLDOC_H_INCLUDED_
/* @(#$Id: rcldoc.h,v 1.5 2008-07-28 12:24:15 dockes Exp $ (C) 2006 J.F.Dockes */
/* @(#$Id: rcldoc.h,v 1.6 2008-07-29 06:25:29 dockes Exp $ (C) 2006 J.F.Dockes */
#include <string>
#include <map>
@ -75,9 +75,11 @@ class Doc {
// Doc text size. Index: from text.length(). Query: set by rcldb from
// index doc data.
string dbytes;
// Doc signature. Used for up to date checks. This is opaque, and
// could just as well be ctime, size, ctime+size, md5, whatever.
// Doc signature. Used for up to date checks.
// Index: set by Db::addOrUpdate caller. Query: set from doc data.
// This is opaque to rcldb, and could just as well be ctime, size,
// ctime+size, md5, whatever.
string sig;
// The following fields don't go to the db record