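New scheme for document unique terms: a standalone (single-document) file is identified by its path term only; a document inside a multi-document file is identified by its path+ipath term only; a pseudo-document is added to stand for the container file itself (used for up-to-date checks)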

2006-04-25 09:59:12 +00:00 · 2006-04-25 09:59:12 +00:00 · 4646f62d6b
commit 4646f62d6b
parent 4928503f60
2 changed files with 107 additions and 75 deletions
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: indexer.cpp,v 1.31 2006-04-12 10:41:39 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: indexer.cpp,v 1.32 2006-04-25 09:59:12 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -248,33 +248,48 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
    }
    FileInterner interner(fn, m_config, m_tmpdir);
    // File name transcoded to utf8 for indexation. 
    string charset = m_config->getDefCharset(true);
    // If this fails, the file name won't be indexed, no big deal
    // Note that we used to do the full path here, but I ended up believing
    // that it made more sense to use only the file name
    string utf8fn;
    transcode(path_getsimple(fn), utf8fn, charset, "UTF-8");
    FileInterner::Status fis = FileInterner::FIAgain;
    bool hadNullIpath = false;
    Rcl::Doc doc;
    char ascdate[20];
    sprintf(ascdate, "%ld", long(stp->st_ctime));
    while (fis == FileInterner::FIAgain) {
-	Rcl::Doc doc;
+	doc.erase();
 	string ipath;
 	fis = interner.internfile(doc, ipath);
-	if (fis == FileInterner::FIError)
+	if (fis == FileInterner::FIError) {
-	    break;
+	    // We don't stop indexing for one bad doc
 	    return FsTreeWalker::FtwOk;
 	}
 	// Set the date if this was not done in the document handler
 	if (doc.fmtime.empty()) {
 	    char ascdate[20];
 	    sprintf(ascdate, "%ld", long(stp->st_ctime));
 	    doc.fmtime = ascdate;
 	}
 	// Internal access path for multi-document files
 	doc.ipath = ipath;
-	// File name transcoded to utf8 for indexation. 
+	// Internal access path for multi-document files
-	string charset = m_config->getDefCharset(true);
+	if (ipath.empty())
-	// If this fails, the file name won't be indexed, no big deal
+	    hadNullIpath = true;
-	// Note that we used to do the full path here, but I ended up believing
+	else
-	// that it made more sense to use only the file name
+	    doc.ipath = ipath;
-	transcode(path_getsimple(fn), doc.utf8fn, charset, "UTF-8");
+	
-	// Do database-specific work to update document data
+	doc.utf8fn = utf8fn;
 	// Add document to database
 	if (!m_db.add(fn, doc, stp)) 
 	    return FsTreeWalker::FtwError;
 	// Tell what we are doing and check for interrupt request
 	if (m_updater) {
 	    if ((++(m_updater->status.docsdone) % 10) == 0) {
 		m_updater->status.fn = fn;
@ -287,6 +302,19 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
 	}
    }
    // If we had no instance with a null ipath, we create an empty
    // document to stand for the file itself, to be used mainly for up
    // to date checks. Typically this happens for an mbox file.
    if (hadNullIpath == false) {
 	LOGDEB1(("Creating empty doc for file\n"));
 	Rcl::Doc fileDoc;
 	fileDoc.fmtime = doc.fmtime;
 	fileDoc.utf8fn = doc.utf8fn;
 	fileDoc.mimetype = doc.mimetype;
 	if (!m_db.add(fn, fileDoc, stp)) 
 	    return FsTreeWalker::FtwError;
    }
    return FsTreeWalker::FtwOk;
 }
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.71 2006-04-25 08:17:36 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.72 2006-04-25 09:59:12 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
 *   This program is free software; you can redistribute it and/or modify
@ -522,18 +522,20 @@ bool Db::add(const string &fn, const Doc &idoc,
    string hash;
    pathHash(fn, hash, PATHHASHLEN);
    LOGDEB2(("Db::add: pathhash [%s]\n", hash.c_str()));
-    string pathterm = "P" + hash;
+
-    newdocument.add_term(pathterm);
+    // Unique term: makes unique identifier for documents
-    
+    // either path or path+ipath inside multidocument files.
-    // Unique term: with path, makes unique identifier for documents
+    // We only add a path term if ipath is empty. Else there will be a qterm
-    // inside multidocument files.
+    // (path+ipath), and a pseudo-doc will be created to stand for the file 
    // itself (for up to date checks). This is handled by 
    // DbIndexer::processone() 
    string uniterm;
    if (doc.ipath.empty()) {
-	uniterm = pathterm;
+	uniterm = "P" + hash;
    } else {
-	uniterm  = "Q" + hash + "|" + doc.ipath;
+	uniterm = "Q" + hash + "|" + doc.ipath;
 	newdocument.add_term(uniterm);
    }
    newdocument.add_term(uniterm);
    // Dates etc...
    time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : 
@ -613,11 +615,11 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
    if (m_ndb == 0)
 	return false;
    // If no document exist with this path, we do need update
    string hash;
    pathHash(filename, hash, PATHHASHLEN);
-    string pathterm  = "P" + hash;
+    string pterm  = "P" + hash;
    const char *ermsg;
    string qterm = "Q"+ hash + "|";
    // Look for all documents with this path. We need to look at all
    // to set their existence flag.  We check the update time on the
@ -626,42 +628,54 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
    // file changed)
    Xapian::PostingIterator doc;
    try {
-	if (!m_ndb->wdb.term_exists(pathterm)) {
+	if (!m_ndb->wdb.term_exists(pterm)) {
-	    LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
+	    // If no document exists with this path, we do need update
 	    LOGDEB2(("Db::needUpdate: no such path: [%s]\n", pterm.c_str()));
 	    return true;
 	}
 	// Check the date using the Pterm doc or pseudo-doc
 	Xapian::PostingIterator docid = m_ndb->wdb.postlist_begin(pterm);
 	Xapian::Document doc = m_ndb->wdb.get_document(*docid);
 	string data = doc.get_data();
 	const char *cp = strstr(data.c_str(), "fmtime=");
 	if (cp) {
 	    cp += 7;
 	} else {
 	    cp = strstr(data.c_str(), "mtime=");
 	    if (cp)
 		cp+= 6;
 	}
 	long mtime = cp ? atol(cp) : 0;
 	if (mtime < stp->st_mtime) {
 	    LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n", 
 		     (long)mtime, (long)stp->st_mtime));
 	    // Db is not up to date. Let's index the file
 	    return true;
 	} 
-	Xapian::PostingIterator docid0 = m_ndb->wdb.postlist_begin(pathterm);
+	LOGDEB2(("Db::needUpdate: uptodate: [%s]\n", pterm.c_str()));
 	for (Xapian::PostingIterator docid = docid0;
 	     docid != m_ndb->wdb.postlist_end(pathterm); docid++) {
-	    Xapian::Document doc = m_ndb->wdb.get_document(*docid);
+	// Up to date. 
-	    // Check the date once. no need to look at the others if
+	// Set the uptodate flag for doc / pseudo doc
-	    // the db needs updating. Note that the fmtime used to be
+	m_ndb->updated[*docid] = true;
 	    // called mtime, and we're keeping compat
 	    if (docid == docid0) {
 		string data = doc.get_data();
 		const char *cp = strstr(data.c_str(), "fmtime=");
 		if (cp) {
 		    cp += 7;
 		} else {
 		    cp = strstr(data.c_str(), "mtime=");
 		    if (cp)
 			cp+= 6;
 		}
 		long mtime = cp ? atol(cp) : 0;
 		if (mtime < stp->st_mtime) {
 		    LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n", 
 			    (long)mtime, (long)stp->st_mtime));
 		    // Db is not up to date. Let's index the file
 		    return true;
 		} 
 	    }
-	    // Db is up to date. Make a note that this document exists.
+	// Set the existence flag for all the subdocs (if any)
-	    if (*docid < m_ndb->updated.size())
+	Xapian::TermIterator it = m_ndb->wdb.allterms_begin(); 
 	it.skip_to(qterm);
 	LOGDEB2(("First qterm: [%s]\n", (*it).c_str()));
 	for (;it != m_ndb->wdb.allterms_end(); it++) {
 	    // If current term does not begin with qterm or has another |, not
 	    // the same file
 	    if ((*it).find(qterm) != 0 || 
 		(*it).find_last_of("|") != qterm.length() -1)
 		break;
 	    docid = m_ndb->wdb.postlist_begin(*it);
 	    if (*docid < m_ndb->updated.size()) {
 		LOGDEB2(("Db::needUpdate: set exist flag for docid %d [%s]\n", 
 			*docid, (*it).c_str()));
 		m_ndb->updated[*docid] = true;
 	    }
 	}
 	return false;
    } catch (const Xapian::Error &e) {
@ -1246,9 +1260,7 @@ bool Db::getDoc(int exti, Doc &doc, int *percent)
    return m_ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms);
 }
-// Retrieve document defined by file name and internal path. Very inefficient,
+// Retrieve document defined by file name and internal path. 
 // used only for history display. We'd need to enter path+ipath terms in the
 // db if we wanted to make this more efficient.
 bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
 {
    LOGDEB(("Db:getDoc: [%s] (%d) [%s]\n", fn.c_str(), fn.length(),
@ -1265,32 +1277,24 @@ bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
    string hash;
    pathHash(fn, hash, PATHHASHLEN);
-    string pathterm  = "P" + hash;
+    string pqterm  = ipath.empty() ? "P" + hash : "Q" + hash + "|" + ipath;
    // Look for all documents with this path, searching for the one
    // with the appropriate ipath. This is very inefficient.
    const char *ermsg = "";
    try {
-	if (!m_ndb->db.term_exists(pathterm)) {
+	if (!m_ndb->db.term_exists(pqterm)) {
 	    // Document found in history no longer in the database.
 	    // We return true (because their might be other ok docs further)
 	    // but indicate the error with pc = -1
 	    if (*pc) 
 		*pc = -1;
-	    LOGINFO(("Db:getDoc: no such path in index: [%s] (len %d)\n",
+	    LOGINFO(("Db:getDoc: no such doc in index: [%s] (len %d)\n",
-		     pathterm.c_str(), pathterm.length()));
+		     pqterm.c_str(), pqterm.length()));
 	    return true;
 	}
-	for (Xapian::PostingIterator docid = 
+	Xapian::PostingIterator docid = m_ndb->db.postlist_begin(pqterm);
-		 m_ndb->db.postlist_begin(pathterm);
+	Xapian::Document xdoc = m_ndb->db.get_document(*docid);
-	     docid != m_ndb->db.postlist_end(pathterm); docid++) {
+	string data = xdoc.get_data();
-
+	list<string> terms;
-	    Xapian::Document xdoc = m_ndb->db.get_document(*docid);
+	return m_ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms);
 	    string data = xdoc.get_data();
 	    list<string> terms;
 	    if (m_ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms) 
 		&& doc.ipath == ipath)
 		return true;
 	}
    } catch (const Xapian::Error &e) {
 	ermsg = e.get_msg().c_str();
    } catch (const string &s) {