From a1b7018cfdf374b17c80377c1c10a7fb213af6da Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 25 May 2013 11:26:57 +0200 Subject: [PATCH] Fix problems which occurred when using functions like open-parents with multiple indexes containing identical paths (udis) --- src/internfile/internfile.cpp | 16 +++---- src/internfile/internfile.h | 6 +-- src/qtgui/rclmain_w.cpp | 3 +- src/query/docseq.cpp | 14 ++++--- src/query/docseqhist.cpp | 5 ++- src/rcldb/rcldb.cpp | 78 +++++++++++++++++++++-------------- src/rcldb/rcldb.h | 16 +++++-- src/rcldb/rcldb_p.h | 8 ++-- src/rcldb/rcldoc.cpp | 1 + src/rcldb/rcldoc.h | 11 ++++- 10 files changed, 99 insertions(+), 59 deletions(-) diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index 09d0b126..45bc293e 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -120,14 +120,12 @@ void FileInterner::reapXAttrs(const string& path) // This is used when the user wants to retrieve a search result doc's parent // (ie message having a given attachment) -bool FileInterner::getEnclosing(const string &url, const string &ipath, - string &eurl, string &eipath, string& udi) +bool FileInterner::getEnclosingUDI(const Rcl::Doc &doc, string& udi) { - eurl = url; - eipath = ipath; + LOGDEB(("FileInterner::getEnclosingUDI(): url [%s] ipath [%s]\n", + doc.url.c_str(), doc.ipath.c_str())); + string eipath = doc.ipath; string::size_type colon; - LOGDEB(("FileInterner::getEnclosing(): url [%s] ipath [%s]\n", - url.c_str(), eipath.c_str())); if (eipath.empty()) return false; if ((colon = eipath.find_last_of(cstr_isep)) != string::npos) { @@ -135,10 +133,8 @@ bool FileInterner::getEnclosing(const string &url, const string &ipath, } else { eipath.erase(); } - make_udi(url_gpath(eurl), eipath, udi); - - LOGDEB(("FileInterner::getEnclosing() after: [%s]\n", eipath.c_str())); - return true; + + make_udi(url_gpath(doc.idxurl.empty() ? doc.url : doc.idxurl), eipath, udi); } string FileInterner::getLastIpathElt(const string& ipath) diff --git a/src/internfile/internfile.h b/src/internfile/internfile.h index d11ead80..b7c9489c 100644 --- a/src/internfile/internfile.h +++ b/src/internfile/internfile.h @@ -195,14 +195,14 @@ class FileInterner { } /** - * Get immediate parent for document. + * Get UDI for immediate parent for document. * * This is not in general the same as the "parent" document used * with Rcl::Db::addOrUpdate(). The latter is the enclosing file, * this would be for exemple the email containing the attachment. + * This is in internfile because of the ipath computation. */ - static bool getEnclosing(const string &url, const string &ipath, - string &eurl, string &eipath, string& udi); + static bool getEnclosingUDI(const Rcl::Doc &doc, string& udi); /** Return last element in ipath, like basename */ static std::string getLastIpathElt(const std::string& ipath); diff --git a/src/qtgui/rclmain_w.cpp b/src/qtgui/rclmain_w.cpp index 177df634..9b22e63d 100644 --- a/src/qtgui/rclmain_w.cpp +++ b/src/qtgui/rclmain_w.cpp @@ -430,7 +430,8 @@ void RclMain::viewUrl() (const char *)qurl.fragment().toLocal8Bit(), udi); Rcl::Doc doc; - if (!rcldb->getDoc(udi, doc) || doc.pc == -1) + Rcl::Doc idxdoc; // idxdoc.idxi == 0 -> works with base index only + if (!rcldb->getDoc(udi, idxdoc, doc) || doc.pc == -1) return; // StartNativeViewer needs a db source to call getEnclosing() on. diff --git a/src/query/docseq.cpp b/src/query/docseq.cpp index 327bf0ad..f2fb7468 100644 --- a/src/query/docseq.cpp +++ b/src/query/docseq.cpp @@ -38,14 +38,16 @@ int DocSequence::getSeqSlice(int offs, int cnt, vector& result) bool DocSequence::getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc) { - // Note: no need for setQuery here, we're just passing through a - // query-independant request - + Rcl::Db *db = getDb(); + if (db == 0) { + LOGERR(("DocSequence::getEnclosing: no db\n")); + return false; + } string udi; - if (!FileInterner::getEnclosing(doc.url, doc.ipath, pdoc.url, pdoc.ipath, - udi)) + if (!FileInterner::getEnclosingUDI(doc, udi)) return false; - bool dbret = getDb()->getDoc(udi, pdoc); + + bool dbret = db->getDoc(udi, doc, pdoc); return dbret && pdoc.pc != -1; } diff --git a/src/query/docseqhist.cpp b/src/query/docseqhist.cpp index 0025360e..b32f369f 100644 --- a/src/query/docseqhist.cpp +++ b/src/query/docseqhist.cpp @@ -136,7 +136,10 @@ bool DocSequenceHistory::getDoc(int num, Rcl::Doc &doc, string *sh) } else sh->erase(); } - bool ret = m_db->getDoc(m_it->udi, doc); + + // For now history does not store an index id. Use empty doc as ref. + Rcl::Doc idxdoc; + bool ret = m_db->getDoc(m_it->udi, idxdoc, doc); if (!ret || doc.pc == -1) { doc.url = "UNKNOWN"; doc.ipath = ""; diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 3ccb0c3e..02a88e4c 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -219,20 +219,25 @@ void Db::Native::maybeStartThreads() /* See comment in class declaration: return all subdocuments of a * document given by its unique id. */ -bool Db::Native::subDocs(const string &udi, vector& docids) +bool Db::Native::subDocs(const string &udi, int idxi, + vector& docids) { LOGDEB2(("subDocs: [%s]\n", uniterm.c_str())); string pterm = make_parentterm(udi); - + vector candidates; XAPTRY(docids.clear(); - docids.insert(docids.begin(), xrdb.postlist_begin(pterm), - xrdb.postlist_end(pterm)), + candidates.insert(candidates.begin(), xrdb.postlist_begin(pterm), + xrdb.postlist_end(pterm)), xrdb, m_rcldb->m_reason); - if (!m_rcldb->m_reason.empty()) { LOGERR(("Rcl::Db::subDocs: %s\n", m_rcldb->m_reason.c_str())); return false; } else { + for (unsigned int i = 0; i < candidates.size(); i++) { + if (whatDbIdx(candidates[i]) == (size_t)idxi) { + docids.push_back(candidates[i]); + } + } LOGDEB0(("Db::Native::subDocs: returning %d ids\n", docids.size())); return true; } @@ -259,11 +264,11 @@ bool Db::Native::xdocToUdi(Xapian::Document& xdoc, string &udi) } // Check if doc given by udi is indexed by term -bool Db::Native::hasTerm(const string& udi, const string& term) +bool Db::Native::hasTerm(const string& udi, int idxi, const string& term) { LOGDEB2(("Native::hasTerm: udi [%s] term [%s]\n",udi.c_str(),term.c_str())); Xapian::Document xdoc; - if (getDoc(udi, xdoc)) { + if (getDoc(udi, idxi, xdoc)) { Xapian::TermIterator xit; XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);, @@ -279,20 +284,23 @@ bool Db::Native::hasTerm(const string& udi, const string& term) return false; } -// Retrieve Xapian document, given udi -Xapian::docid Db::Native::getDoc(const string& udi, Xapian::Document& xdoc) +// Retrieve Xapian document, given udi. There may be several identical udis +// if we are using multiple indexes. +Xapian::docid Db::Native::getDoc(const string& udi, int idxi, + Xapian::Document& xdoc) { string uniterm = make_uniterm(udi); for (int tries = 0; tries < 2; tries++) { try { - Xapian::PostingIterator docid = xrdb.postlist_begin(uniterm); - if (docid == xrdb.postlist_end(uniterm)) { - // Udi not in Db. - return 0; - } else { + Xapian::PostingIterator docid; + for (docid = xrdb.postlist_begin(uniterm); + docid != xrdb.postlist_end(uniterm); docid++) { xdoc = xrdb.get_document(*docid); - return *docid; + if (whatDbIdx(*docid) == (size_t)idxi) + return *docid; } + // Udi not in Db. + return 0; } catch (const Xapian::DatabaseModifiedError &e) { m_rcldb->m_reason = e.get_msg(); xrdb.reopen(); @@ -314,23 +322,27 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, if (!parms.ok()) return false; - // Set xdocid at once so that we can call whatDbIdx() doc.xdocid = docid; doc.haspages = hasPages(docid); // Compute what index this comes from, and check for path translations string dbdir = m_rcldb->m_basedir; + doc.idxi = 0; if (!m_rcldb->m_extraDbs.empty()) { - unsigned int idxi = m_rcldb->whatDbIdx(doc); + unsigned int idxi = whatDbIdx(docid); // idxi is in [0, extraDbs.size()]. 0 is for the main index, // idxi-1 indexes into the additional dbs array. if (idxi) { dbdir = m_rcldb->m_extraDbs[idxi - 1]; + doc.idxi = idxi; } } - parms.get(Doc::keyurl, doc.url); + parms.get(Doc::keyurl, doc.idxurl); + doc.url = doc.idxurl; m_rcldb->m_config->urlrewrite(dbdir, doc.url); + if (!doc.url.compare(doc.idxurl)) + doc.idxurl.clear(); // Special cases: parms.get(Doc::keytp, doc.mimetype); @@ -549,7 +561,7 @@ bool Db::Native::purgeFileWrite(bool orphansOnly, const string& udi, xwdb.delete_document(*docid); } vector docids; - subDocs(udi, docids); + subDocs(udi, 0, docids); LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size())); for (vector::iterator it = docids.begin(); it != docids.end(); it++) { @@ -864,14 +876,19 @@ bool Db::rmQueryDb(const string &dir) // modulo of the docid against the db count. Ref: // http://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID size_t Db::whatDbIdx(const Doc& doc) +{ + return m_ndb->whatDbIdx(doc.xdocid); +} + +size_t Db::Native::whatDbIdx(Xapian::docid id) { LOGDEB1(("Db::whatDbIdx: xdocid %lu, %u extraDbs\n", - (unsigned long)doc.xdocid, m_extraDbs.size())); - if (doc.xdocid == 0) + (unsigned long)id, m_extraDbs.size())); + if (id == 0) return (size_t)-1; - if (m_extraDbs.size() == 0) + if (m_rcldb->m_extraDbs.size() == 0) return 0; - return (doc.xdocid - 1) % (m_extraDbs.size() + 1); + return (id - 1) % (m_rcldb->m_extraDbs.size() + 1); } bool Db::testDbDir(const string &dir, bool *stripped_p) @@ -1556,7 +1573,7 @@ bool Db::needUpdate(const string &udi, const string& sig, bool *existed) // Set the existence flag for all the subdocs (if any) vector docids; - if (!m_ndb->subDocs(udi, docids)) { + if (!m_ndb->subDocs(udi, 0, docids)) { LOGERR(("Rcl::Db::needUpdate: can't get subdocs\n")); return true; } @@ -1808,7 +1825,7 @@ bool Db::dbStats(DbStats& res) // by the GUI history feature and by open parent/getenclosing // ! The return value is always true except for fatal errors. Document // existence should be tested by looking at doc.pc -bool Db::getDoc(const string &udi, Doc &doc) +bool Db::getDoc(const string &udi, const Doc& idxdoc, Doc &doc) { LOGDEB(("Db:getDoc: [%s]\n", udi.c_str())); if (m_ndb == 0) @@ -1820,7 +1837,8 @@ bool Db::getDoc(const string &udi, Doc &doc) doc.pc = 100; Xapian::Document xdoc; Xapian::docid docid; - if ((docid = m_ndb->getDoc(udi, xdoc))) { + int idxi = idxdoc.idxi; + if ((docid = m_ndb->getDoc(udi, idxi, xdoc))) { string data = xdoc.get_data(); doc.meta[Rcl::Doc::keyudi] = udi; return m_ndb->dbDataToRclDoc(docid, data, doc); @@ -1845,7 +1863,7 @@ bool Db::hasSubDocs(const Doc &idoc) return false; } vector docids; - if (!m_ndb->subDocs(inudi, docids)) { + if (!m_ndb->subDocs(inudi, idoc.idxi, docids)) { LOGDEB(("Db:getSubDocs: lower level subdocs failed\n")); return false; } @@ -1853,7 +1871,7 @@ bool Db::hasSubDocs(const Doc &idoc) return true; // Check if doc has an has_children term - if (m_ndb->hasTerm(inudi, has_children_term)) + if (m_ndb->hasTerm(inudi, idoc.idxi, has_children_term)) return true; return false; } @@ -1879,7 +1897,7 @@ bool Db::getSubDocs(const Doc &idoc, vector& subdocs) } else { // See if we have a parent term Xapian::Document xdoc; - if (!m_ndb->getDoc(inudi, xdoc)) { + if (!m_ndb->getDoc(inudi, idoc.idxi, xdoc)) { LOGERR(("Db::getSubDocs: can't get Xapian document\n")); return false; } @@ -1902,7 +1920,7 @@ bool Db::getSubDocs(const Doc &idoc, vector& subdocs) // Retrieve all subdoc xapian ids for the root vector docids; - if (!m_ndb->subDocs(rootudi, docids)) { + if (!m_ndb->subDocs(rootudi, idoc.idxi, docids)) { LOGDEB(("Db:getSubDocs: lower level subdocs failed\n")); return false; } diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 5a2b9312..00f26101 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -228,9 +228,12 @@ class Db { /* Update-related methods ******************************************/ /** Test if the db entry for the given udi is up to date (by - * comparing the input and stored sigs). - * Side-effect: set the existence flag for the file document - * and all subdocs if any (for later use by 'purge()') + * comparing the input and stored sigs). This is used both when + * indexing and querying (before opening a document using stale info), + * **This assumes that the udi pertains to the main index (idxi==0).** + * Side-effect when the db is writeable: set the existence flag + * for the file document and all subdocs if any (for later use by + * 'purge()') */ bool needUpdate(const string &udi, const string& sig, bool *existed=0); @@ -355,8 +358,13 @@ class Db { /** Get document for given udi * * Used by the 'history' feature, and to retrieve ancestor documents. + * @param udi the unique document identifier + * @param idxdoc used when there are several index as an opaque way to pass + * the index id. Use a doc from the same index + * (e.g.: when looking for parent), + * @param doc the output doc */ - bool getDoc(const string &udi, Doc &doc); + bool getDoc(const string &udi, const Doc& idxdoc, Doc &doc); /** Test if documents has sub-documents. * diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h index da9859f9..4ba7b693 100644 --- a/src/rcldb/rcldb_p.h +++ b/src/rcldb/rcldb_p.h @@ -120,12 +120,14 @@ class Db::Native { bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc); + size_t whatDbIdx(Xapian::docid id); + /** Retrieve Xapian::docid, given unique document identifier, * using the posting list for the derived term. * * @return 0 if not found */ - Xapian::docid getDoc(const string& udi, Xapian::Document& xdoc); + Xapian::docid getDoc(const string& udi, int idxi, Xapian::Document& xdoc); /** Retrieve unique document identifier for given Xapian document, * using the document termlist @@ -133,7 +135,7 @@ class Db::Native { bool xdocToUdi(Xapian::Document& xdoc, string &udi); /** Check if doc is indexed by term */ - bool hasTerm(const string& udi, const string& term); + bool hasTerm(const string& udi, int idxi, const string& term); /** Compute list of subdocuments for a given udi. We look for documents * indexed by a parent term matching the udi, the posting list for the @@ -149,7 +151,7 @@ class Db::Native { * indexer (rcldb user), using the ipath. * */ - bool subDocs(const string &udi, vector& docids); + bool subDocs(const string &udi, int idxi, vector& docids); /** Check if a page position list is defined */ bool hasPages(Xapian::docid id); diff --git a/src/rcldb/rcldoc.cpp b/src/rcldb/rcldoc.cpp index bdcb3102..f550c9f4 100644 --- a/src/rcldb/rcldoc.cpp +++ b/src/rcldb/rcldoc.cpp @@ -48,6 +48,7 @@ namespace Rcl { void Doc::dump(bool dotext) const { LOGDEB(("Rcl::Doc::dump: url: [%s]\n", url.c_str())); + LOGDEB(("Rcl::Doc::dump: idxurl: [%s]\n", idxurl.c_str())); LOGDEB(("Rcl::Doc::dump: ipath: [%s]\n", ipath.c_str())); LOGDEB(("Rcl::Doc::dump: mimetype: [%s]\n", mimetype.c_str())); LOGDEB(("Rcl::Doc::dump: fmtime: [%s]\n", fmtime.c_str())); diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index a7b62479..e37f0045 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -48,6 +48,12 @@ class Doc { // Query: from doc data. string url; + // When we do path translation for documents from external indexes, we + // save the original path: + string idxurl; + // And the originating db. 0 is base, 1 first external etc. + int idxi; + // Internal path for multi-doc files. Ascii // Set by FsIndexer::processone string ipath; @@ -142,13 +148,16 @@ class Doc { sig.erase(); text.erase(); + pc = 0; xdocid = 0; + idxi = 0; haspages = false; haschildren = false; } Doc() - : syntabs(false), pc(0), xdocid(0), haspages(false), haschildren(false) + : idxi(0), syntabs(false), pc(0), xdocid(0), + haspages(false), haschildren(false) { } /** Get value for named field. If value pointer is 0, just test existence */