use explicit parent udi term instead of Qterm structure to express parent-child relationship
This commit is contained in:
parent
3109a33f4a
commit
24ac62eb86
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.67 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.68 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -390,7 +390,10 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
// without mime type will not be purged from the db, resulting
|
// without mime type will not be purged from the db, resulting
|
||||||
// in possible 'cannot intern file' messages at query time...
|
// in possible 'cannot intern file' messages at query time...
|
||||||
char cbuf[100];
|
char cbuf[100];
|
||||||
// Document signature
|
// Document signature. This is based on mtime and size and used
|
||||||
|
// for the uptodate check (the value computed here is checked
|
||||||
|
// against the stored one). Changing the computation forces a full
|
||||||
|
// reindex of course.
|
||||||
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
|
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
|
||||||
string sig = cbuf;
|
string sig = cbuf;
|
||||||
string udi;
|
string udi;
|
||||||
@ -398,6 +401,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
if (!m_db.needUpdate(udi, sig)) {
|
if (!m_db.needUpdate(udi, sig)) {
|
||||||
LOGDEB(("processone: up to date: %s\n", fn.c_str()));
|
LOGDEB(("processone: up to date: %s\n", fn.c_str()));
|
||||||
if (m_updater) {
|
if (m_updater) {
|
||||||
|
// Status bar update, abort request etc.
|
||||||
m_updater->status.fn = fn;
|
m_updater->status.fn = fn;
|
||||||
if (!m_updater->update()) {
|
if (!m_updater->update()) {
|
||||||
return FsTreeWalker::FtwStop;
|
return FsTreeWalker::FtwStop;
|
||||||
@ -422,14 +426,18 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
ercnt, charset.c_str(), path_getsimple(fn).c_str()));
|
ercnt, charset.c_str(), path_getsimple(fn).c_str()));
|
||||||
}
|
}
|
||||||
LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
|
LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
|
||||||
path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), "UTF-8"));
|
path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(),
|
||||||
|
"UTF-8"));
|
||||||
|
|
||||||
|
string parent_udi;
|
||||||
|
make_udi(fn, "", parent_udi);
|
||||||
|
Rcl::Doc doc;
|
||||||
|
const string plus("+");
|
||||||
|
char ascdate[20];
|
||||||
|
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
||||||
|
|
||||||
FileInterner::Status fis = FileInterner::FIAgain;
|
FileInterner::Status fis = FileInterner::FIAgain;
|
||||||
bool hadNullIpath = false;
|
bool hadNullIpath = false;
|
||||||
Rcl::Doc doc;
|
|
||||||
const string plus = "+";
|
|
||||||
char ascdate[20];
|
|
||||||
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
|
||||||
while (fis == FileInterner::FIAgain) {
|
while (fis == FileInterner::FIAgain) {
|
||||||
doc.erase();
|
doc.erase();
|
||||||
string ipath;
|
string ipath;
|
||||||
@ -468,6 +476,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
hadNullIpath = true;
|
hadNullIpath = true;
|
||||||
else
|
else
|
||||||
doc.ipath = ipath;
|
doc.ipath = ipath;
|
||||||
|
|
||||||
doc.url = string("file://") + fn;
|
doc.url = string("file://") + fn;
|
||||||
|
|
||||||
// Note that the filter may have its own idea of the file name
|
// Note that the filter may have its own idea of the file name
|
||||||
@ -484,10 +493,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
|
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
|
||||||
doc.sig = cbuf;
|
doc.sig = cbuf;
|
||||||
|
|
||||||
// Add document to database
|
// Add document to database. If there is an ipath, add it as a child
|
||||||
|
// of the file document.
|
||||||
string udi;
|
string udi;
|
||||||
make_udi(fn, ipath, udi);
|
make_udi(fn, ipath, udi);
|
||||||
if (!m_db.add(udi, doc))
|
if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc))
|
||||||
return FsTreeWalker::FtwError;
|
return FsTreeWalker::FtwError;
|
||||||
|
|
||||||
// Tell what we are doing and check for interrupt request
|
// Tell what we are doing and check for interrupt request
|
||||||
@ -520,9 +530,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
// Document signature for up to date checks.
|
// Document signature for up to date checks.
|
||||||
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
|
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
|
||||||
fileDoc.sig = cbuf;
|
fileDoc.sig = cbuf;
|
||||||
string udi;
|
if (!m_db.addOrUpdate(parent_udi, "", fileDoc))
|
||||||
make_udi(fn, "", udi);
|
|
||||||
if (!m_db.add(udi, fileDoc))
|
|
||||||
return FsTreeWalker::FtwError;
|
return FsTreeWalker::FtwError;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.136 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.137 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -80,39 +80,41 @@ namespace Rcl {
|
|||||||
// found in document)
|
// found in document)
|
||||||
const static string rclSyntAbs("?!#@");
|
const static string rclSyntAbs("?!#@");
|
||||||
|
|
||||||
// Compute the unique term used to link documents to their file-system source:
|
// Compute the unique term used to link documents to their origin.
|
||||||
// Hashed path + possible internal path
|
// "Q" + external udi
|
||||||
static inline string make_uniterm(const string& udi)
|
static inline string make_uniterm(const string& udi)
|
||||||
{
|
{
|
||||||
string uniterm("Q");
|
string uniterm("Q");
|
||||||
uniterm.append(udi);
|
uniterm.append(udi);
|
||||||
return uniterm;
|
return uniterm;
|
||||||
}
|
}
|
||||||
|
// Compute parent term used to link documents to their parent document (if any)
|
||||||
|
// "F" + parent external udi
|
||||||
|
static inline string make_parentterm(const string& udi)
|
||||||
|
{
|
||||||
|
// I prefer to be in possible conflict with omega than with
|
||||||
|
// user-defined fields (Xxxx) that we also allow. "F" is currently
|
||||||
|
// not used by omega (2008-07)
|
||||||
|
string pterm("F");
|
||||||
|
pterm.append(udi);
|
||||||
|
return pterm;
|
||||||
|
}
|
||||||
|
|
||||||
/* See comment in class declaration: return all subdocuments of a
|
/* See comment in class declaration: return all subdocuments of a
|
||||||
* document given by its unique path id */
|
* document given by its unique id.
|
||||||
bool Db::Native::subDocs(const string &uniterm, vector<Xapian::docid>& docids)
|
*/
|
||||||
|
bool Db::Native::subDocs(const string &udi, vector<Xapian::docid>& docids)
|
||||||
{
|
{
|
||||||
LOGDEB2(("subDocs: [%s]\n", uniterm.c_str()));
|
LOGDEB2(("subDocs: [%s]\n", uniterm.c_str()));
|
||||||
docids.clear();
|
|
||||||
|
|
||||||
string ermsg;
|
string ermsg;
|
||||||
|
string pterm = make_parentterm(udi);
|
||||||
for (int tries = 0; tries < 2; tries++) {
|
for (int tries = 0; tries < 2; tries++) {
|
||||||
try {
|
try {
|
||||||
Xapian::TermIterator it = db.allterms_begin();
|
Xapian::PostingIterator it = db.postlist_begin(pterm);
|
||||||
it.skip_to(uniterm);
|
for (; it != db.postlist_end(pterm); it++) {
|
||||||
// Don't return the doc itself:
|
docids.push_back(*it);
|
||||||
it++;
|
|
||||||
for (; it != db.allterms_end(); it++) {
|
|
||||||
LOGDEB2(("subDocs: testing [%s]\n", (*it).c_str()));
|
|
||||||
// If current term does not begin with uniterm or has
|
|
||||||
// another |, not the same file
|
|
||||||
if ((*it).find(uniterm) != 0 ||
|
|
||||||
(*it).find_last_of("|") != uniterm.length()-1)
|
|
||||||
break;
|
|
||||||
docids.push_back(*(db.postlist_begin(*it)));
|
|
||||||
}
|
}
|
||||||
LOGDEB2(("Db::Native::subDocs: returning %d ids\n", docids.size()));
|
LOGDEB(("Db::Native::subDocs: returning %d ids\n", docids.size()));
|
||||||
return true;
|
return true;
|
||||||
} catch (const Xapian::DatabaseModifiedError &e) {
|
} catch (const Xapian::DatabaseModifiedError &e) {
|
||||||
LOGDEB(("Db::subDocs: got modified error. reopen/retry\n"));
|
LOGDEB(("Db::subDocs: got modified error. reopen/retry\n"));
|
||||||
@ -800,9 +802,11 @@ static const int MB = 1024 * 1024;
|
|||||||
// the title abstract and body and add special terms for file name,
|
// the title abstract and body and add special terms for file name,
|
||||||
// date, mime type ... , create the document data record (more
|
// date, mime type ... , create the document data record (more
|
||||||
// metadata), and update database
|
// metadata), and update database
|
||||||
bool Db::add(const string &udi, const Doc &idoc)
|
bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||||
|
const Doc &idoc)
|
||||||
{
|
{
|
||||||
LOGDEB1(("Db::add: udi %s\n", udi.c_str()));
|
LOGDEB(("Db::add: udi [%s] parent [%s]\n",
|
||||||
|
udi.c_str(), parent_udi.c_str()));
|
||||||
if (m_ndb == 0)
|
if (m_ndb == 0)
|
||||||
return false;
|
return false;
|
||||||
static int first = 1;
|
static int first = 1;
|
||||||
@ -927,7 +931,11 @@ bool Db::add(const string &udi, const Doc &idoc)
|
|||||||
// checks, and unique id for the replace_document() call.
|
// checks, and unique id for the replace_document() call.
|
||||||
string uniterm = make_uniterm(udi);
|
string uniterm = make_uniterm(udi);
|
||||||
newdocument.add_term(uniterm);
|
newdocument.add_term(uniterm);
|
||||||
|
// Parent term. This is used to find all descendants, mostly to delete them
|
||||||
|
// when the parent goes away
|
||||||
|
if (!parent_udi.empty()) {
|
||||||
|
newdocument.add_term(make_parentterm(parent_udi));
|
||||||
|
}
|
||||||
// Dates etc...
|
// Dates etc...
|
||||||
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
||||||
doc.dmtime.c_str());
|
doc.dmtime.c_str());
|
||||||
@ -1091,7 +1099,7 @@ bool Db::needUpdate(const string &udi, const string& sig)
|
|||||||
|
|
||||||
// Set the existence flag for all the subdocs (if any)
|
// Set the existence flag for all the subdocs (if any)
|
||||||
vector<Xapian::docid> docids;
|
vector<Xapian::docid> docids;
|
||||||
if (!m_ndb->subDocs(uniterm, docids)) {
|
if (!m_ndb->subDocs(udi, docids)) {
|
||||||
LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n"));
|
LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n"));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -1193,9 +1201,9 @@ bool Db::purge()
|
|||||||
} catch (const Xapian::DocNotFoundError &) {
|
} catch (const Xapian::DocNotFoundError &) {
|
||||||
LOGDEB(("Db::purge: document #%d not found\n", docid));
|
LOGDEB(("Db::purge: document #%d not found\n", docid));
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
LOGERR(("Db::purge: document #%d: %s\n", e.get_msg().c_str()));
|
LOGERR(("Db::purge: document #%d: %s\n", docid, e.get_msg().c_str()));
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
LOGERR(("Db::purge: document #%d: unknown error\n"));
|
LOGERR(("Db::purge: document #%d: unknown error\n", docid));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1224,7 +1232,7 @@ bool Db::purgeFile(const string &udi)
|
|||||||
LOGDEB(("purgeFile: delete docid %d\n", *docid));
|
LOGDEB(("purgeFile: delete docid %d\n", *docid));
|
||||||
db.delete_document(*docid);
|
db.delete_document(*docid);
|
||||||
vector<Xapian::docid> docids;
|
vector<Xapian::docid> docids;
|
||||||
m_ndb->subDocs(uniterm, docids);
|
m_ndb->subDocs(udi, docids);
|
||||||
LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size()));
|
LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size()));
|
||||||
for (vector<Xapian::docid>::iterator it = docids.begin();
|
for (vector<Xapian::docid>::iterator it = docids.begin();
|
||||||
it != docids.end(); it++) {
|
it != docids.end(); it++) {
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _DB_H_INCLUDED_
|
#ifndef _DB_H_INCLUDED_
|
||||||
#define _DB_H_INCLUDED_
|
#define _DB_H_INCLUDED_
|
||||||
/* @(#$Id: rcldb.h,v 1.58 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes */
|
/* @(#$Id: rcldb.h,v 1.59 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <list>
|
#include <list>
|
||||||
@ -43,9 +43,12 @@ using std::vector;
|
|||||||
// The main goal is simplicity and good matching to usage inside the recoll
|
// The main goal is simplicity and good matching to usage inside the recoll
|
||||||
// user interface. In other words, this is not exhaustive or well-designed or
|
// user interface. In other words, this is not exhaustive or well-designed or
|
||||||
// reusable.
|
// reusable.
|
||||||
|
//
|
||||||
|
// Unique Document Identifier: uniquely identifies a document in its
|
||||||
struct stat;
|
// source storage (file system or other). Used for up to date checks
|
||||||
|
// etc. "udi". Our user is responsible for making sure it's not too
|
||||||
|
// big, cause it's stored as a Xapian term (< 150 bytes would be
|
||||||
|
// reasonable)
|
||||||
|
|
||||||
#ifndef NO_NAMESPACES
|
#ifndef NO_NAMESPACES
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
@ -103,14 +106,17 @@ class Db {
|
|||||||
|
|
||||||
/* Update-related methods ******************************************/
|
/* Update-related methods ******************************************/
|
||||||
|
|
||||||
/** Test if the db entry for the given filename/stat is up to date. This
|
/** Test if the db entry for the given udi is up to date. This
|
||||||
* has the side-effect of setting the existence flag for the file document
|
* has the side-effect of setting the existence flag for the file document
|
||||||
* and all subdocs if any (for later use by 'purge()') */
|
* and all subdocs if any (for later use by 'purge()')
|
||||||
|
*/
|
||||||
bool needUpdate(const string &udi, const string& sig);
|
bool needUpdate(const string &udi, const string& sig);
|
||||||
|
|
||||||
/** Add document. The Doc class should have been filled as much as
|
/** Add or update document. The Doc class should have been filled as much as
|
||||||
* possible depending on the document type */
|
* possible depending on the document type. parent_udi is only
|
||||||
bool add(const string &udi, const Doc &doc);
|
* used for subdocs, else set it to empty */
|
||||||
|
bool addOrUpdate(const string &udi, const string &parent_udi,
|
||||||
|
const Doc &doc);
|
||||||
|
|
||||||
/** Delete document(s) for given UDI, including subdocs */
|
/** Delete document(s) for given UDI, including subdocs */
|
||||||
bool purgeFile(const string &udi);
|
bool purgeFile(const string &udi);
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
#include "xapian.h"
|
#include "xapian.h"
|
||||||
|
|
||||||
namespace Rcl {
|
namespace Rcl {
|
||||||
/* @(#$Id: rcldb_p.h,v 1.2 2008-07-28 08:42:52 dockes Exp $ (C) 2007 J.F.Dockes */
|
/* @(#$Id: rcldb_p.h,v 1.3 2008-07-29 06:25:29 dockes Exp $ (C) 2007 J.F.Dockes */
|
||||||
|
|
||||||
// Generic Xapian exception catching code. We do this quite often,
|
// Generic Xapian exception catching code. We do this quite often,
|
||||||
// and I have no idea how to do this except for a macro
|
// and I have no idea how to do this except for a macro
|
||||||
@ -51,16 +51,22 @@ class Db::Native {
|
|||||||
|
|
||||||
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
|
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
|
||||||
|
|
||||||
/** Compute list of subdocuments for a given path (given by hash)
|
/** Compute list of subdocuments for a given udi. We look for documents
|
||||||
* We look for all Q terms beginning with the path/hash
|
* indexed by a parent term matching the udi, i.e. the posting list for the
|
||||||
* As suggested by James Aylett, a better method would be to add
|
* parentterm(udi) (As suggested by James Aylett)
|
||||||
* a single term (ie: XP/path/to/file) to all subdocs, then finding
|
*
|
||||||
* them would be a simple matter of retrieving the posting list for the
|
* Note that this is not currently recursive: all subdocs are supposed
|
||||||
* term. There would still be a need for the current Qterm though, as a
|
* to be children of the file doc.
|
||||||
* unique term for replace_document, and for retrieving by
|
* Ie: in a mail folder, all messages, attachments, attachments of
|
||||||
* path/ipath (history)
|
* attached messages etc. must have the folder file document as
|
||||||
|
* parent.
|
||||||
|
* Parent-child relationships are defined by the indexer (rcldb user)
|
||||||
|
*
|
||||||
|
* The file-system indexer currently works this way (flatly),
|
||||||
|
* subDocs() could be relatively easily changed to support full recursivity
|
||||||
|
* if needed.
|
||||||
*/
|
*/
|
||||||
bool subDocs(const string &uniterm, vector<Xapian::docid>& docids);
|
bool subDocs(const string &udi, vector<Xapian::docid>& docids);
|
||||||
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
*/
|
*/
|
||||||
#ifndef _RCLDOC_H_INCLUDED_
|
#ifndef _RCLDOC_H_INCLUDED_
|
||||||
#define _RCLDOC_H_INCLUDED_
|
#define _RCLDOC_H_INCLUDED_
|
||||||
/* @(#$Id: rcldoc.h,v 1.5 2008-07-28 12:24:15 dockes Exp $ (C) 2006 J.F.Dockes */
|
/* @(#$Id: rcldoc.h,v 1.6 2008-07-29 06:25:29 dockes Exp $ (C) 2006 J.F.Dockes */
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <map>
|
#include <map>
|
||||||
@ -75,9 +75,11 @@ class Doc {
|
|||||||
// Doc text size. Index: from text.length(). Query: set by rcldb from
|
// Doc text size. Index: from text.length(). Query: set by rcldb from
|
||||||
// index doc data.
|
// index doc data.
|
||||||
string dbytes;
|
string dbytes;
|
||||||
// Doc signature. Used for up to date checks. This is opaque, and
|
|
||||||
// could just as well be ctime, size, ctime+size, md5, whatever.
|
// Doc signature. Used for up to date checks.
|
||||||
// Index: set by Db::Add caller. Query: set from doc data.
|
// Index: set by Db::addOrUpdate caller. Query: set from doc data.
|
||||||
|
// This is opaque to rcldb, and could just as well be ctime, size,
|
||||||
|
// ctime+size, md5, whatever.
|
||||||
string sig;
|
string sig;
|
||||||
|
|
||||||
// The following fields don't go to the db record
|
// The following fields don't go to the db record
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user