new way for doc unique terms: only path for monodoc, only path+ipath for doc inside multidoc, add pseudo-doc for file itself

2006-04-25 09:59:12 +00:00 · 2006-04-25 09:59:12 +00:00 · 4646f62d6b
commit 4646f62d6b
parent 4928503f60
2 changed files with 107 additions and 75 deletions
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: indexer.cpp,v 1.31 2006-04-12 10:41:39 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: indexer.cpp,v 1.32 2006-04-25 09:59:12 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -248,33 +248,48 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
    }

    FileInterner interner(fn, m_config, m_tmpdir);
+
+    // File name transcoded to utf8 for indexation. 
+    string charset = m_config->getDefCharset(true);
+    // If this fails, the file name won't be indexed, no big deal
+    // Note that we used to do the full path here, but I ended up believing
+    // that it made more sense to use only the file name
+    string utf8fn;
+    transcode(path_getsimple(fn), utf8fn, charset, "UTF-8");
+
    FileInterner::Status fis = FileInterner::FIAgain;
+    bool hadNullIpath = false;
+    Rcl::Doc doc;
+    char ascdate[20];
+    sprintf(ascdate, "%ld", long(stp->st_ctime));
    while (fis == FileInterner::FIAgain) {
-	Rcl::Doc doc;
+	doc.erase();
+
 	string ipath;
 	fis = interner.internfile(doc, ipath);
-	if (fis == FileInterner::FIError)
-	    break;
+	if (fis == FileInterner::FIError) {
+	    // We don't stop indexing for one bad doc
+	    return FsTreeWalker::FtwOk;
+	}

 	// Set the date if this was not done in the document handler
 	if (doc.fmtime.empty()) {
-	    char ascdate[20];
-	    sprintf(ascdate, "%ld", long(stp->st_ctime));
 	    doc.fmtime = ascdate;
 	}
-	// Internal access path for multi-document files
-	doc.ipath = ipath;

-	// File name transcoded to utf8 for indexation. 
-	string charset = m_config->getDefCharset(true);
-	// If this fails, the file name won't be indexed, no big deal
-	// Note that we used to do the full path here, but I ended up believing
-	// that it made more sense to use only the file name
-	transcode(path_getsimple(fn), doc.utf8fn, charset, "UTF-8");
-	// Do database-specific work to update document data
+	// Internal access path for multi-document files
+	if (ipath.empty())
+	    hadNullIpath = true;
+	else
+	    doc.ipath = ipath;
+	
+	doc.utf8fn = utf8fn;
+
+	// Add document to database
 	if (!m_db.add(fn, doc, stp)) 
 	    return FsTreeWalker::FtwError;

+	// Tell what we are doing and check for interrupt request
 	if (m_updater) {
 	    if ((++(m_updater->status.docsdone) % 10) == 0) {
 		m_updater->status.fn = fn;
@ -287,6 +302,19 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
 	}
    }

+    // If we had no instance with a null ipath, we create an empty
+    // document to stand for the file itself, to be used mainly for up
+    // to date checks. Typically this happens for an mbox file.
+    if (hadNullIpath == false) {
+	LOGDEB1(("Creating empty doc for file\n"));
+	Rcl::Doc fileDoc;
+	fileDoc.fmtime = doc.fmtime;
+	fileDoc.utf8fn = doc.utf8fn;
+	fileDoc.mimetype = doc.mimetype;
+	if (!m_db.add(fn, fileDoc, stp)) 
+	    return FsTreeWalker::FtwError;
+    }
+
    return FsTreeWalker::FtwOk;
 }

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.71 2006-04-25 08:17:36 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.72 2006-04-25 09:59:12 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -522,18 +522,20 @@ bool Db::add(const string &fn, const Doc &idoc,
    string hash;
    pathHash(fn, hash, PATHHASHLEN);
    LOGDEB2(("Db::add: pathhash [%s]\n", hash.c_str()));
-    string pathterm = "P" + hash;
-    newdocument.add_term(pathterm);
-    
-    // Unique term: with path, makes unique identifier for documents
-    // inside multidocument files.
+
+    // Unique term: makes a unique identifier for documents: either the
+    // path, or path+ipath for documents inside multidocument files.
+    // We only add a path term if ipath is empty. Else there will be a qterm
+    // (path+ipath), and a pseudo-doc will be created to stand for the file 
+    // itself (for up to date checks). This is handled by 
+    // DbIndexer::processone() 
    string uniterm;
    if (doc.ipath.empty()) {
-	uniterm = pathterm;
+	uniterm = "P" + hash;
    } else {
-	uniterm  = "Q" + hash + "|" + doc.ipath;
-	newdocument.add_term(uniterm);
+	uniterm = "Q" + hash + "|" + doc.ipath;
    }
+    newdocument.add_term(uniterm);

    // Dates etc...
    time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : 
@ -613,11 +615,11 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
    if (m_ndb == 0)
 	return false;

-    // If no document exist with this path, we do need update
    string hash;
    pathHash(filename, hash, PATHHASHLEN);
-    string pathterm  = "P" + hash;
+    string pterm  = "P" + hash;
    const char *ermsg;
+    string qterm = "Q"+ hash + "|";

    // Look for all documents with this path. We need to look at all
    // to set their existence flag.  We check the update time on the
@ -626,42 +628,54 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
    // file changed)
    Xapian::PostingIterator doc;
    try {
-	if (!m_ndb->wdb.term_exists(pathterm)) {
-	    LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
+	if (!m_ndb->wdb.term_exists(pterm)) {
+	    // If no document exists with this path, we do need to update
+	    LOGDEB2(("Db::needUpdate: no such path: [%s]\n", pterm.c_str()));
 	    return true;
 	}
+	// Check the date using the Pterm doc or pseudo-doc
+	Xapian::PostingIterator docid = m_ndb->wdb.postlist_begin(pterm);
+	Xapian::Document doc = m_ndb->wdb.get_document(*docid);
+	string data = doc.get_data();
+	const char *cp = strstr(data.c_str(), "fmtime=");
+	if (cp) {
+	    cp += 7;
+	} else {
+	    cp = strstr(data.c_str(), "mtime=");
+	    if (cp)
+		cp+= 6;
+	}
+	long mtime = cp ? atol(cp) : 0;
+	if (mtime < stp->st_mtime) {
+	    LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n", 
+		     (long)mtime, (long)stp->st_mtime));
+	    // Db is not up to date. Let's index the file
+	    return true;
+	} 

-	Xapian::PostingIterator docid0 = m_ndb->wdb.postlist_begin(pathterm);
-	for (Xapian::PostingIterator docid = docid0;
-	     docid != m_ndb->wdb.postlist_end(pathterm); docid++) {
+	LOGDEB2(("Db::needUpdate: uptodate: [%s]\n", pterm.c_str()));

-	    Xapian::Document doc = m_ndb->wdb.get_document(*docid);
+	// Up to date. 

-	    // Check the date once. no need to look at the others if
-	    // the db needs updating. Note that the fmtime used to be
-	    // called mtime, and we're keeping compat
-	    if (docid == docid0) {
-		string data = doc.get_data();
-		const char *cp = strstr(data.c_str(), "fmtime=");
-		if (cp) {
-		    cp += 7;
-		} else {
-		    cp = strstr(data.c_str(), "mtime=");
-		    if (cp)
-			cp+= 6;
-		}
-		long mtime = cp ? atol(cp) : 0;
-		if (mtime < stp->st_mtime) {
-		    LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n", 
-			    (long)mtime, (long)stp->st_mtime));
-		    // Db is not up to date. Let's index the file
-		    return true;
-		} 
-	    }
+	// Set the uptodate flag for doc / pseudo doc
+	m_ndb->updated[*docid] = true;

-	    // Db is up to date. Make a note that this document exists.
-	    if (*docid < m_ndb->updated.size())
+	// Set the existence flag for all the subdocs (if any)
+	Xapian::TermIterator it = m_ndb->wdb.allterms_begin(); 
+	it.skip_to(qterm);
+	LOGDEB2(("First qterm: [%s]\n", (*it).c_str()));
+	for (;it != m_ndb->wdb.allterms_end(); it++) {
+	    // If current term does not begin with qterm or has another |, not
+	    // the same file
+	    if ((*it).find(qterm) != 0 || 
+		(*it).find_last_of("|") != qterm.length() -1)
+		break;
+	    docid = m_ndb->wdb.postlist_begin(*it);
+	    if (*docid < m_ndb->updated.size()) {
+		LOGDEB2(("Db::needUpdate: set exist flag for docid %d [%s]\n", 
+			*docid, (*it).c_str()));
 		m_ndb->updated[*docid] = true;
+	    }
 	}
 	return false;
    } catch (const Xapian::Error &e) {
@ -1246,9 +1260,7 @@ bool Db::getDoc(int exti, Doc &doc, int *percent)
    return m_ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms);
 }

-// Retrieve document defined by file name and internal path. Very inefficient,
-// used only for history display. We'd need to enter path+ipath terms in the
-// db if we wanted to make this more efficient.
+// Retrieve document defined by file name and internal path. 
 bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
 {
    LOGDEB(("Db:getDoc: [%s] (%d) [%s]\n", fn.c_str(), fn.length(),
@ -1265,32 +1277,24 @@ bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)

    string hash;
    pathHash(fn, hash, PATHHASHLEN);
-    string pathterm  = "P" + hash;
-    // Look for all documents with this path, searching for the one
-    // with the appropriate ipath. This is very inefficient.
+    string pqterm  = ipath.empty() ? "P" + hash : "Q" + hash + "|" + ipath;
    const char *ermsg = "";
    try {
-	if (!m_ndb->db.term_exists(pathterm)) {
+	if (!m_ndb->db.term_exists(pqterm)) {
 	    // Document found in history no longer in the database.
 	    // We return true (because their might be other ok docs further)
 	    // but indicate the error with pc = -1
 	    if (*pc) 
 		*pc = -1;
-	    LOGINFO(("Db:getDoc: no such path in index: [%s] (len %d)\n",
-		     pathterm.c_str(), pathterm.length()));
+	    LOGINFO(("Db:getDoc: no such doc in index: [%s] (len %d)\n",
+		     pqterm.c_str(), pqterm.length()));
 	    return true;
 	}
-	for (Xapian::PostingIterator docid = 
-		 m_ndb->db.postlist_begin(pathterm);
-	     docid != m_ndb->db.postlist_end(pathterm); docid++) {
-
-	    Xapian::Document xdoc = m_ndb->db.get_document(*docid);
-	    string data = xdoc.get_data();
-	    list<string> terms;
-	    if (m_ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms) 
-		&& doc.ipath == ipath)
-		return true;
-	}
+	Xapian::PostingIterator docid = m_ndb->db.postlist_begin(pqterm);
+	Xapian::Document xdoc = m_ndb->db.get_document(*docid);
+	string data = xdoc.get_data();
+	list<string> terms;
+	return m_ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms);
    } catch (const Xapian::Error &e) {
 	ermsg = e.get_msg().c_str();
    } catch (const string &s) {