diff --git a/src/index/indexer.cpp b/src/index/indexer.cpp index 4475c122..7bbb9e67 100644 --- a/src/index/indexer.cpp +++ b/src/index/indexer.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: indexer.cpp,v 1.67 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: indexer.cpp,v 1.68 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -390,7 +390,10 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, // without mime type will not be purged from the db, resulting // in possible 'cannot intern file' messages at query time... char cbuf[100]; - // Document signature + // Document signature. This is based on mtime and size and used + // for the uptodate check (the value computed here is checked + // against the stored one). Changing the computation forces a full + // reindex of course. sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime); string sig = cbuf; string udi; @@ -398,6 +401,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, if (!m_db.needUpdate(udi, sig)) { LOGDEB(("processone: up to date: %s\n", fn.c_str())); if (m_updater) { + // Status bar update, abort request etc. 
m_updater->status.fn = fn; if (!m_updater->update()) { return FsTreeWalker::FtwStop; @@ -422,14 +426,18 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, ercnt, charset.c_str(), path_getsimple(fn).c_str())); } LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n", - path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), "UTF-8")); + path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), + "UTF-8")); + + string parent_udi; + make_udi(fn, "", parent_udi); + Rcl::Doc doc; + const string plus("+"); + char ascdate[20]; + sprintf(ascdate, "%ld", long(stp->st_mtime)); FileInterner::Status fis = FileInterner::FIAgain; bool hadNullIpath = false; - Rcl::Doc doc; - const string plus = "+"; - char ascdate[20]; - sprintf(ascdate, "%ld", long(stp->st_mtime)); while (fis == FileInterner::FIAgain) { doc.erase(); string ipath; @@ -468,6 +476,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, hadNullIpath = true; else doc.ipath = ipath; + doc.url = string("file://") + fn; // Note that the filter may have its own idea of the file name @@ -484,10 +493,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime); doc.sig = cbuf; - // Add document to database + // Add document to database. If there is an ipath, add it as a child + // of the file document. string udi; make_udi(fn, ipath, udi); - if (!m_db.add(udi, doc)) + if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc)) return FsTreeWalker::FtwError; // Tell what we are doing and check for interrupt request @@ -520,9 +530,7 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp, // Document signature for up to date checks. 
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime); fileDoc.sig = cbuf; - string udi; - make_udi(fn, "", udi); - if (!m_db.add(udi, fileDoc)) + if (!m_db.addOrUpdate(parent_udi, "", fileDoc)) return FsTreeWalker::FtwError; } diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 9307c667..11b5030b 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -1,5 +1,5 @@ #ifndef lint -static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.136 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes"; +static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.137 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes"; #endif /* * This program is free software; you can redistribute it and/or modify @@ -80,39 +80,41 @@ namespace Rcl { // found in document) const static string rclSyntAbs("?!#@"); -// Compute the unique term used to link documents to their file-system source: -// Hashed path + possible internal path +// Compute the unique term used to link documents to their origin. +// "Q" + external udi static inline string make_uniterm(const string& udi) { string uniterm("Q"); uniterm.append(udi); return uniterm; } +// Compute parent term used to link documents to their parent document (if any) +// "F" + parent external udi +static inline string make_parentterm(const string& udi) +{ + // I prefer to be in possible conflict with omega than with + // user-defined fields (Xxxx) that we also allow. "F" is currently + // not used by omega (2008-07) + string pterm("F"); + pterm.append(udi); + return pterm; +} /* See comment in class declaration: return all subdocuments of a - * document given by its unique path id */ -bool Db::Native::subDocs(const string &uniterm, vector& docids) + * document given by its unique id. 
+*/ +bool Db::Native::subDocs(const string &udi, vector& docids) { LOGDEB2(("subDocs: [%s]\n", uniterm.c_str())); - docids.clear(); - string ermsg; + string pterm = make_parentterm(udi); for (int tries = 0; tries < 2; tries++) { try { - Xapian::TermIterator it = db.allterms_begin(); - it.skip_to(uniterm); - // Don't return the doc itself: - it++; - for (; it != db.allterms_end(); it++) { - LOGDEB2(("subDocs: testing [%s]\n", (*it).c_str())); - // If current term does not begin with uniterm or has - // another |, not the same file - if ((*it).find(uniterm) != 0 || - (*it).find_last_of("|") != uniterm.length()-1) - break; - docids.push_back(*(db.postlist_begin(*it))); + Xapian::PostingIterator it = db.postlist_begin(pterm); + for (; it != db.postlist_end(pterm); it++) { + docids.push_back(*it); } - LOGDEB2(("Db::Native::subDocs: returning %d ids\n", docids.size())); + LOGDEB(("Db::Native::subDocs: returning %d ids\n", docids.size())); return true; } catch (const Xapian::DatabaseModifiedError &e) { LOGDEB(("Db::subDocs: got modified error. reopen/retry\n")); @@ -800,9 +802,11 @@ static const int MB = 1024 * 1024; // the title abstract and body and add special terms for file name, // date, mime type ... , create the document data record (more // metadata), and update database -bool Db::add(const string &udi, const Doc &idoc) +bool Db::addOrUpdate(const string &udi, const string &parent_udi, + const Doc &idoc) { - LOGDEB1(("Db::add: udi %s\n", udi.c_str())); + LOGDEB(("Db::add: udi [%s] parent [%s]\n", + udi.c_str(), parent_udi.c_str())); if (m_ndb == 0) return false; static int first = 1; @@ -927,7 +931,11 @@ bool Db::add(const string &udi, const Doc &idoc) // checks, and unique id for the replace_document() call. string uniterm = make_uniterm(udi); newdocument.add_term(uniterm); - + // Parent term. 
This is used to find all descendents, mostly to delete them + // when the parent goes away + if (!parent_udi.empty()) { + newdocument.add_term(make_parentterm(parent_udi)); + } // Dates etc... time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : doc.dmtime.c_str()); @@ -1091,7 +1099,7 @@ bool Db::needUpdate(const string &udi, const string& sig) // Set the existence flag for all the subdocs (if any) vector docids; - if (!m_ndb->subDocs(uniterm, docids)) { + if (!m_ndb->subDocs(udi, docids)) { LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n")); return true; } @@ -1193,9 +1201,9 @@ bool Db::purge() } catch (const Xapian::DocNotFoundError &) { LOGDEB(("Db::purge: document #%d not found\n", docid)); } catch (const Xapian::Error &e) { - LOGERR(("Db::purge: document #%d: %s\n", e.get_msg().c_str())); + LOGERR(("Db::purge: document #%d: %s\n", docid, e.get_msg().c_str())); } catch (...) { - LOGERR(("Db::purge: document #%d: unknown error\n")); + LOGERR(("Db::purge: document #%d: unknown error\n", docid)); } } } @@ -1224,7 +1232,7 @@ bool Db::purgeFile(const string &udi) LOGDEB(("purgeFile: delete docid %d\n", *docid)); db.delete_document(*docid); vector docids; - m_ndb->subDocs(uniterm, docids); + m_ndb->subDocs(udi, docids); LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size())); for (vector::iterator it = docids.begin(); it != docids.end(); it++) { diff --git a/src/rcldb/rcldb.h b/src/rcldb/rcldb.h index 679bb47c..d362979f 100644 --- a/src/rcldb/rcldb.h +++ b/src/rcldb/rcldb.h @@ -16,7 +16,7 @@ */ #ifndef _DB_H_INCLUDED_ #define _DB_H_INCLUDED_ -/* @(#$Id: rcldb.h,v 1.58 2008-07-28 12:24:15 dockes Exp $ (C) 2004 J.F.Dockes */ +/* @(#$Id: rcldb.h,v 1.59 2008-07-29 06:25:29 dockes Exp $ (C) 2004 J.F.Dockes */ #include #include @@ -43,9 +43,12 @@ using std::vector; // The main goal is simplicity and good matching to usage inside the recoll // user interface. In other words, this is not exhaustive or well-designed or // reusable. 
- - -struct stat; +// +// Unique Document Identifier: uniquely identifies a document in its +// source storage (file system or other). Used for up to date checks +// etc. "udi". Our user is responsible for making sure it's not too +// big, because it's stored as a Xapian term (< 150 bytes would be +// reasonable) #ifndef NO_NAMESPACES namespace Rcl { @@ -103,14 +106,17 @@ class Db { /* Update-related methods ******************************************/ - /** Test if the db entry for the given filename/stat is up to date. This + /** Test if the db entry for the given udi is up to date. This * has the side-effect of setting the existence flag for the file document - * and all subdocs if any (for later use by 'purge()') */ + * and all subdocs if any (for later use by 'purge()') + */ bool needUpdate(const string &udi, const string& sig); - /** Add document. The Doc class should have been filled as much as - * possible depending on the document type */ - bool add(const string &udi, const Doc &doc); + /** Add or update document. The Doc class should have been filled as much as + * possible depending on the document type. parent_udi is only + * used for subdocs, else set it to empty */ + bool addOrUpdate(const string &udi, const string &parent_udi, + const Doc &doc); /** Delete document(s) for given UDI, including subdocs */ bool purgeFile(const string &udi); diff --git a/src/rcldb/rcldb_p.h b/src/rcldb/rcldb_p.h index 2d26be87..e61516a7 100644 --- a/src/rcldb/rcldb_p.h +++ b/src/rcldb/rcldb_p.h @@ -4,7 +4,7 @@ #include "xapian.h" namespace Rcl { -/* @(#$Id: rcldb_p.h,v 1.2 2008-07-28 08:42:52 dockes Exp $ (C) 2007 J.F.Dockes */ +/* @(#$Id: rcldb_p.h,v 1.3 2008-07-29 06:25:29 dockes Exp $ (C) 2007 J.F.Dockes */ // Generic Xapian exception catching code. 
We do this quite often, // and I have no idea how to do this except for a macro @@ -51,16 +51,22 @@ class Db::Native { bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc); - /** Compute list of subdocuments for a given path (given by hash) - * We look for all Q terms beginning with the path/hash - * As suggested by James Aylett, a better method would be to add - * a single term (ie: XP/path/to/file) to all subdocs, then finding - * them would be a simple matter of retrieving the posting list for the - * term. There would still be a need for the current Qterm though, as a - * unique term for replace_document, and for retrieving by - * path/ipath (history) + /** Compute list of subdocuments for a given udi. We look for documents + * indexed by a parent term matching the udi, the posting list for the + * parentterm(udi) (As suggested by James Aylett) + * + * Note that this is not currently recursive: all subdocs are supposed + * to be children of the file doc. + * Ie: in a mail folder, all messages, attachments, attachments of + * attached messages etc. must have the folder file document as + * parent. + * Parent-child relationships are defined by the indexer (rcldb user) + * + * The file-system indexer currently works this way (flatly), + * subDocs() could be relatively easily changed to support full recursivity + * if needed. */ - bool subDocs(const string &uniterm, vector& docids); + bool subDocs(const string &udi, vector& docids); }; } diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index a38b0a54..fcf8bd9f 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -16,7 +16,7 @@ */ #ifndef _RCLDOC_H_INCLUDED_ #define _RCLDOC_H_INCLUDED_ -/* @(#$Id: rcldoc.h,v 1.5 2008-07-28 12:24:15 dockes Exp $ (C) 2006 J.F.Dockes */ +/* @(#$Id: rcldoc.h,v 1.6 2008-07-29 06:25:29 dockes Exp $ (C) 2006 J.F.Dockes */ #include #include @@ -75,9 +75,11 @@ class Doc { // Doc text size. Index: from text.length(). Query: set by rcldb from // index doc data. 
string dbytes; - // Doc signature. Used for up to date checks. This is opaque, and - // could just as well be ctime, size, ctime+size, md5, whatever. + + // Doc signature. Used for up to date checks. // Index: set by Db::Add caller. Query: set from doc data. + // This is opaque to rcldb, and could just as well be ctime, size, + // ctime+size, md5, whatever. string sig; // The following fields don't go to the db record