use explicit parent udi term instead of Qterm structure to express parent-child relationship
This commit is contained in:
parent
3109a33f4a
commit
24ac62eb86
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.67 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.68 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -390,7 +390,10 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
// without mime type will not be purged from the db, resulting
|
||||
// in possible 'cannot intern file' messages at query time...
|
||||
char cbuf[100];
|
||||
// Document signature
|
||||
// Document signature. This is based on mtime and size and used
|
||||
// for the uptodate check (the value computed here is checked
|
||||
// against the stored one). Changing the computation forces a full
|
||||
// reindex of course.
|
||||
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
|
||||
string sig = cbuf;
|
||||
string udi;
|
||||
@ -398,6 +401,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
if (!m_db.needUpdate(udi, sig)) {
|
||||
LOGDEB(("processone: up to date: %s\n", fn.c_str()));
|
||||
if (m_updater) {
|
||||
// Status bar update, abort request etc.
|
||||
m_updater->status.fn = fn;
|
||||
if (!m_updater->update()) {
|
||||
return FsTreeWalker::FtwStop;
|
||||
@ -422,14 +426,18 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
ercnt, charset.c_str(), path_getsimple(fn).c_str()));
|
||||
}
|
||||
LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
|
||||
path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), "UTF-8"));
|
||||
path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(),
|
||||
"UTF-8"));
|
||||
|
||||
string parent_udi;
|
||||
make_udi(fn, "", parent_udi);
|
||||
Rcl::Doc doc;
|
||||
const string plus("+");
|
||||
char ascdate[20];
|
||||
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
||||
|
||||
FileInterner::Status fis = FileInterner::FIAgain;
|
||||
bool hadNullIpath = false;
|
||||
Rcl::Doc doc;
|
||||
const string plus = "+";
|
||||
char ascdate[20];
|
||||
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
||||
while (fis == FileInterner::FIAgain) {
|
||||
doc.erase();
|
||||
string ipath;
|
||||
@ -468,6 +476,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
hadNullIpath = true;
|
||||
else
|
||||
doc.ipath = ipath;
|
||||
|
||||
doc.url = string("file://") + fn;
|
||||
|
||||
// Note that the filter may have its own idea of the file name
|
||||
@ -484,10 +493,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
|
||||
doc.sig = cbuf;
|
||||
|
||||
// Add document to database
|
||||
// Add document to database. If there is an ipath, add it as a child
|
||||
// of the file document.
|
||||
string udi;
|
||||
make_udi(fn, ipath, udi);
|
||||
if (!m_db.add(udi, doc))
|
||||
if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc))
|
||||
return FsTreeWalker::FtwError;
|
||||
|
||||
// Tell what we are doing and check for interrupt request
|
||||
@ -520,9 +530,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
// Document signature for up to date checks.
|
||||
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
|
||||
fileDoc.sig = cbuf;
|
||||
string udi;
|
||||
make_udi(fn, "", udi);
|
||||
if (!m_db.add(udi, fileDoc))
|
||||
if (!m_db.addOrUpdate(parent_udi, "", fileDoc))
|
||||
return FsTreeWalker::FtwError;
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.136 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.137 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -80,39 +80,41 @@ namespace Rcl {
|
||||
// found in document)
|
||||
const static string rclSyntAbs("?!#@");
|
||||
|
||||
// Compute the unique term used to link documents to their file-system source:
|
||||
// Hashed path + possible internal path
|
||||
// Compute the unique term used to link a document to its origin.
|
||||
// The returned term is "Q" followed by the external udi.
|
||||
static inline string make_uniterm(const string& udi)
|
||||
{
|
||||
string uniterm("Q");
|
||||
uniterm.append(udi);
|
||||
return uniterm;
|
||||
}
|
||||
// Compute parent term used to link documents to their parent document (if any)
|
||||
// The returned term is "F" followed by the parent's external udi.
|
||||
static inline string make_parentterm(const string& udi)
|
||||
{
|
||||
// Prefix choice: a possible conflict with omega is preferred over one
|
||||
// with the user-defined fields (Xxxx) that we also allow. "F" is
|
||||
// currently not used by omega (2008-07)
|
||||
string pterm("F");
|
||||
pterm.append(udi);
|
||||
return pterm;
|
||||
}
|
||||
|
||||
/* See comment in class declaration: return all subdocuments of a
|
||||
* document given by its unique path id */
|
||||
bool Db::Native::subDocs(const string &uniterm, vector<Xapian::docid>& docids)
|
||||
* document given by its unique id.
|
||||
*/
|
||||
bool Db::Native::subDocs(const string &udi, vector<Xapian::docid>& docids)
|
||||
{
|
||||
LOGDEB2(("subDocs: [%s]\n", uniterm.c_str()));
|
||||
docids.clear();
|
||||
|
||||
string ermsg;
|
||||
string pterm = make_parentterm(udi);
|
||||
for (int tries = 0; tries < 2; tries++) {
|
||||
try {
|
||||
Xapian::TermIterator it = db.allterms_begin();
|
||||
it.skip_to(uniterm);
|
||||
// Don't return the doc itself:
|
||||
it++;
|
||||
for (; it != db.allterms_end(); it++) {
|
||||
LOGDEB2(("subDocs: testing [%s]\n", (*it).c_str()));
|
||||
// If current term does not begin with uniterm or has
|
||||
// another |, not the same file
|
||||
if ((*it).find(uniterm) != 0 ||
|
||||
(*it).find_last_of("|") != uniterm.length()-1)
|
||||
break;
|
||||
docids.push_back(*(db.postlist_begin(*it)));
|
||||
Xapian::PostingIterator it = db.postlist_begin(pterm);
|
||||
for (; it != db.postlist_end(pterm); it++) {
|
||||
docids.push_back(*it);
|
||||
}
|
||||
LOGDEB2(("Db::Native::subDocs: returning %d ids\n", docids.size()));
|
||||
LOGDEB(("Db::Native::subDocs: returning %d ids\n", docids.size()));
|
||||
return true;
|
||||
} catch (const Xapian::DatabaseModifiedError &e) {
|
||||
LOGDEB(("Db::subDocs: got modified error. reopen/retry\n"));
|
||||
@ -800,9 +802,11 @@ static const int MB = 1024 * 1024;
|
||||
// the title abstract and body and add special terms for file name,
|
||||
// date, mime type ... , create the document data record (more
|
||||
// metadata), and update database
|
||||
bool Db::add(const string &udi, const Doc &idoc)
|
||||
bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||
const Doc &idoc)
|
||||
{
|
||||
LOGDEB1(("Db::add: udi %s\n", udi.c_str()));
|
||||
LOGDEB(("Db::add: udi [%s] parent [%s]\n",
|
||||
udi.c_str(), parent_udi.c_str()));
|
||||
if (m_ndb == 0)
|
||||
return false;
|
||||
static int first = 1;
|
||||
@ -927,7 +931,11 @@ bool Db::add(const string &udi, const Doc &idoc)
|
||||
// checks, and unique id for the replace_document() call.
|
||||
string uniterm = make_uniterm(udi);
|
||||
newdocument.add_term(uniterm);
|
||||
|
||||
// Parent term. This is used to find all descendants, mostly to delete them
|
||||
// when the parent goes away
|
||||
if (!parent_udi.empty()) {
|
||||
newdocument.add_term(make_parentterm(parent_udi));
|
||||
}
|
||||
// Dates etc...
|
||||
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
||||
doc.dmtime.c_str());
|
||||
@ -1091,7 +1099,7 @@ bool Db::needUpdate(const string &udi, const string& sig)
|
||||
|
||||
// Set the existence flag for all the subdocs (if any)
|
||||
vector<Xapian::docid> docids;
|
||||
if (!m_ndb->subDocs(uniterm, docids)) {
|
||||
if (!m_ndb->subDocs(udi, docids)) {
|
||||
LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n"));
|
||||
return true;
|
||||
}
|
||||
@ -1193,9 +1201,9 @@ bool Db::purge()
|
||||
} catch (const Xapian::DocNotFoundError &) {
|
||||
LOGDEB(("Db::purge: document #%d not found\n", docid));
|
||||
} catch (const Xapian::Error &e) {
|
||||
LOGERR(("Db::purge: document #%d: %s\n", e.get_msg().c_str()));
|
||||
LOGERR(("Db::purge: document #%d: %s\n", docid, e.get_msg().c_str()));
|
||||
} catch (...) {
|
||||
LOGERR(("Db::purge: document #%d: unknown error\n"));
|
||||
LOGERR(("Db::purge: document #%d: unknown error\n", docid));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1224,7 +1232,7 @@ bool Db::purgeFile(const string &udi)
|
||||
LOGDEB(("purgeFile: delete docid %d\n", *docid));
|
||||
db.delete_document(*docid);
|
||||
vector<Xapian::docid> docids;
|
||||
m_ndb->subDocs(uniterm, docids);
|
||||
m_ndb->subDocs(udi, docids);
|
||||
LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size()));
|
||||
for (vector<Xapian::docid>::iterator it = docids.begin();
|
||||
it != docids.end(); it++) {
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _DB_H_INCLUDED_
|
||||
#define _DB_H_INCLUDED_
|
||||
/* @(#$Id: rcldb.h,v 1.58 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
/* @(#$Id: rcldb.h,v 1.59 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
@ -43,9 +43,12 @@ using std::vector;
|
||||
// The main goal is simplicity and good matching to usage inside the recoll
|
||||
// user interface. In other words, this is not exhaustive or well-designed or
|
||||
// reusable.
|
||||
|
||||
|
||||
struct stat;
|
||||
//
|
||||
// Unique Document Identifier: uniquely identifies a document in its
|
||||
// source storage (file system or other). Used for up to date checks
|
||||
// etc. "udi". Our user is responsible for making sure it's not too
|
||||
// big, cause it's stored as a Xapian term (< 150 bytes would be
|
||||
// reasonable)
|
||||
|
||||
#ifndef NO_NAMESPACES
|
||||
namespace Rcl {
|
||||
@ -103,14 +106,17 @@ class Db {
|
||||
|
||||
/* Update-related methods ******************************************/
|
||||
|
||||
/** Test if the db entry for the given filename/stat is up to date. This
|
||||
/** Test if the db entry for the given udi is up to date. This
|
||||
* has the side-effect of setting the existence flag for the file document
|
||||
* and all subdocs if any (for later use by 'purge()') */
|
||||
* and all subdocs if any (for later use by 'purge()')
|
||||
*/
|
||||
bool needUpdate(const string &udi, const string& sig);
|
||||
|
||||
/** Add document. The Doc class should have been filled as much as
|
||||
* possible depending on the document type */
|
||||
bool add(const string &udi, const Doc &doc);
|
||||
/** Add or update document. The Doc class should have been filled as much as
|
||||
* possible depending on the document type. parent_udi is only
|
||||
* used for subdocs, else set it to empty */
|
||||
bool addOrUpdate(const string &udi, const string &parent_udi,
|
||||
const Doc &doc);
|
||||
|
||||
/** Delete document(s) for given UDI, including subdocs */
|
||||
bool purgeFile(const string &udi);
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#include "xapian.h"
|
||||
|
||||
namespace Rcl {
|
||||
/* @(#$Id: rcldb_p.h,v 1.2 2008-07-28 08:42:52 dockes Exp $ (C) 2007 J.F.Dockes */
|
||||
/* @(#$Id: rcldb_p.h,v 1.3 2008-07-29 06:25:29 dockes Exp $ (C) 2007 J.F.Dockes */
|
||||
|
||||
// Generic Xapian exception catching code. We do this quite often,
|
||||
// and I have no idea how to do this except for a macro
|
||||
@ -51,16 +51,22 @@ class Db::Native {
|
||||
|
||||
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
|
||||
|
||||
/** Compute list of subdocuments for a given path (given by hash)
|
||||
* We look for all Q terms beginning with the path/hash
|
||||
* As suggested by James Aylett, a better method would be to add
|
||||
* a single term (ie: XP/path/to/file) to all subdocs, then finding
|
||||
* them would be a simple matter of retrieving the posting list for the
|
||||
* term. There would still be a need for the current Qterm though, as a
|
||||
* unique term for replace_document, and for retrieving by
|
||||
* path/ipath (history)
|
||||
/** Compute list of subdocuments for a given udi. We look for documents
|
||||
* indexed by a parent term matching the udi, i.e. the posting list for the
|
||||
* parentterm(udi) (As suggested by James Aylett)
|
||||
*
|
||||
* Note that this is not currently recursive: all subdocs are supposed
|
||||
* to be children of the file doc.
|
||||
* Ie: in a mail folder, all messages, attachments, attachments of
|
||||
* attached messages etc. must have the folder file document as
|
||||
* parent.
|
||||
* Parent-child relationships are defined by the indexer (rcldb user)
|
||||
*
|
||||
* The file-system indexer currently works this way (flatly),
|
||||
* subDocs() could be relatively easily changed to support full recursivity
|
||||
* if needed.
|
||||
*/
|
||||
bool subDocs(const string &uniterm, vector<Xapian::docid>& docids);
|
||||
bool subDocs(const string &udi, vector<Xapian::docid>& docids);
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
*/
|
||||
#ifndef _RCLDOC_H_INCLUDED_
|
||||
#define _RCLDOC_H_INCLUDED_
|
||||
/* @(#$Id: rcldoc.h,v 1.5 2008-07-28 12:24:15 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||
/* @(#$Id: rcldoc.h,v 1.6 2008-07-29 06:25:29 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
@ -75,9 +75,11 @@ class Doc {
|
||||
// Doc text size. Index: from text.length(). Query: set by rcldb from
|
||||
// index doc data.
|
||||
string dbytes;
|
||||
// Doc signature. Used for up to date checks. This is opaque, and
|
||||
// could just as well be ctime, size, ctime+size, md5, whatever.
|
||||
|
||||
// Doc signature. Used for up to date checks.
|
||||
// Index: set by Db::addOrUpdate caller. Query: set from doc data.
|
||||
// This is opaque to rcldb, and could just as well be ctime, size,
|
||||
// ctime+size, md5, whatever.
|
||||
string sig;
|
||||
|
||||
// The following fields don't go to the db record
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user