New way of building unique document terms: path only for single-document files, path+ipath only for documents inside multi-document files; add a pseudo-document for the file itself
This commit is contained in:
parent
4928503f60
commit
4646f62d6b
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.31 2006-04-12 10:41:39 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.32 2006-04-25 09:59:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -248,33 +248,48 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
}
|
||||
|
||||
FileInterner interner(fn, m_config, m_tmpdir);
|
||||
|
||||
// File name transcoded to utf8 for indexation.
|
||||
string charset = m_config->getDefCharset(true);
|
||||
// If this fails, the file name won't be indexed, no big deal
|
||||
// Note that we used to do the full path here, but I ended up believing
|
||||
// that it made more sense to use only the file name
|
||||
string utf8fn;
|
||||
transcode(path_getsimple(fn), utf8fn, charset, "UTF-8");
|
||||
|
||||
FileInterner::Status fis = FileInterner::FIAgain;
|
||||
bool hadNullIpath = false;
|
||||
Rcl::Doc doc;
|
||||
char ascdate[20];
|
||||
sprintf(ascdate, "%ld", long(stp->st_ctime));
|
||||
while (fis == FileInterner::FIAgain) {
|
||||
Rcl::Doc doc;
|
||||
doc.erase();
|
||||
|
||||
string ipath;
|
||||
fis = interner.internfile(doc, ipath);
|
||||
if (fis == FileInterner::FIError)
|
||||
break;
|
||||
if (fis == FileInterner::FIError) {
|
||||
// We don't stop indexing for one bad doc
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
|
||||
// Set the date if this was not done in the document handler
|
||||
if (doc.fmtime.empty()) {
|
||||
char ascdate[20];
|
||||
sprintf(ascdate, "%ld", long(stp->st_ctime));
|
||||
doc.fmtime = ascdate;
|
||||
}
|
||||
// Internal access path for multi-document files
|
||||
doc.ipath = ipath;
|
||||
|
||||
// File name transcoded to utf8 for indexation.
|
||||
string charset = m_config->getDefCharset(true);
|
||||
// If this fails, the file name won't be indexed, no big deal
|
||||
// Note that we used to do the full path here, but I ended up believing
|
||||
// that it made more sense to use only the file name
|
||||
transcode(path_getsimple(fn), doc.utf8fn, charset, "UTF-8");
|
||||
// Do database-specific work to update document data
|
||||
// Internal access path for multi-document files
|
||||
if (ipath.empty())
|
||||
hadNullIpath = true;
|
||||
else
|
||||
doc.ipath = ipath;
|
||||
|
||||
doc.utf8fn = utf8fn;
|
||||
|
||||
// Add document to database
|
||||
if (!m_db.add(fn, doc, stp))
|
||||
return FsTreeWalker::FtwError;
|
||||
|
||||
// Tell what we are doing and check for interrupt request
|
||||
if (m_updater) {
|
||||
if ((++(m_updater->status.docsdone) % 10) == 0) {
|
||||
m_updater->status.fn = fn;
|
||||
@ -287,6 +302,19 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
}
|
||||
}
|
||||
|
||||
// If we had no instance with a null ipath, we create an empty
|
||||
// document to stand for the file itself, to be used mainly for up
|
||||
// to date checks. Typically this happens for an mbox file.
|
||||
if (hadNullIpath == false) {
|
||||
LOGDEB1(("Creating empty doc for file\n"));
|
||||
Rcl::Doc fileDoc;
|
||||
fileDoc.fmtime = doc.fmtime;
|
||||
fileDoc.utf8fn = doc.utf8fn;
|
||||
fileDoc.mimetype = doc.mimetype;
|
||||
if (!m_db.add(fn, fileDoc, stp))
|
||||
return FsTreeWalker::FtwError;
|
||||
}
|
||||
|
||||
return FsTreeWalker::FtwOk;
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#ifndef lint
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.71 2006-04-25 08:17:36 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.72 2006-04-25 09:59:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||
#endif
|
||||
/*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@ -522,18 +522,20 @@ bool Db::add(const string &fn, const Doc &idoc,
|
||||
string hash;
|
||||
pathHash(fn, hash, PATHHASHLEN);
|
||||
LOGDEB2(("Db::add: pathhash [%s]\n", hash.c_str()));
|
||||
string pathterm = "P" + hash;
|
||||
newdocument.add_term(pathterm);
|
||||
|
||||
// Unique term: with path, makes unique identifier for documents
|
||||
// inside multidocument files.
|
||||
|
||||
// Unique term: makes unique identifier for documents
|
||||
// either path or path+ipath inside multidocument files.
|
||||
// We only add a path term if ipath is empty. Else there will be a qterm
|
||||
// (path+ipath), and a pseudo-doc will be created to stand for the file
|
||||
// itself (for up to date checks). This is handled by
|
||||
// DbIndexer::processone()
|
||||
string uniterm;
|
||||
if (doc.ipath.empty()) {
|
||||
uniterm = pathterm;
|
||||
uniterm = "P" + hash;
|
||||
} else {
|
||||
uniterm = "Q" + hash + "|" + doc.ipath;
|
||||
newdocument.add_term(uniterm);
|
||||
uniterm = "Q" + hash + "|" + doc.ipath;
|
||||
}
|
||||
newdocument.add_term(uniterm);
|
||||
|
||||
// Dates etc...
|
||||
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
||||
@ -613,11 +615,11 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
|
||||
if (m_ndb == 0)
|
||||
return false;
|
||||
|
||||
// If no document exist with this path, we do need update
|
||||
string hash;
|
||||
pathHash(filename, hash, PATHHASHLEN);
|
||||
string pathterm = "P" + hash;
|
||||
string pterm = "P" + hash;
|
||||
const char *ermsg;
|
||||
string qterm = "Q"+ hash + "|";
|
||||
|
||||
// Look for all documents with this path. We need to look at all
|
||||
// to set their existence flag. We check the update time on the
|
||||
@ -626,42 +628,54 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
|
||||
// file changed)
|
||||
Xapian::PostingIterator doc;
|
||||
try {
|
||||
if (!m_ndb->wdb.term_exists(pathterm)) {
|
||||
LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
|
||||
if (!m_ndb->wdb.term_exists(pterm)) {
|
||||
// If no document exists with this path, we do need update
|
||||
LOGDEB2(("Db::needUpdate: no such path: [%s]\n", pterm.c_str()));
|
||||
return true;
|
||||
}
|
||||
// Check the date using the Pterm doc or pseudo-doc
|
||||
Xapian::PostingIterator docid = m_ndb->wdb.postlist_begin(pterm);
|
||||
Xapian::Document doc = m_ndb->wdb.get_document(*docid);
|
||||
string data = doc.get_data();
|
||||
const char *cp = strstr(data.c_str(), "fmtime=");
|
||||
if (cp) {
|
||||
cp += 7;
|
||||
} else {
|
||||
cp = strstr(data.c_str(), "mtime=");
|
||||
if (cp)
|
||||
cp+= 6;
|
||||
}
|
||||
long mtime = cp ? atol(cp) : 0;
|
||||
if (mtime < stp->st_mtime) {
|
||||
LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n",
|
||||
(long)mtime, (long)stp->st_mtime));
|
||||
// Db is not up to date. Let's index the file
|
||||
return true;
|
||||
}
|
||||
|
||||
Xapian::PostingIterator docid0 = m_ndb->wdb.postlist_begin(pathterm);
|
||||
for (Xapian::PostingIterator docid = docid0;
|
||||
docid != m_ndb->wdb.postlist_end(pathterm); docid++) {
|
||||
LOGDEB2(("Db::needUpdate: uptodate: [%s]\n", pterm.c_str()));
|
||||
|
||||
Xapian::Document doc = m_ndb->wdb.get_document(*docid);
|
||||
// Up to date.
|
||||
|
||||
// Check the date once. no need to look at the others if
|
||||
// the db needs updating. Note that the fmtime used to be
|
||||
// called mtime, and we're keeping compat
|
||||
if (docid == docid0) {
|
||||
string data = doc.get_data();
|
||||
const char *cp = strstr(data.c_str(), "fmtime=");
|
||||
if (cp) {
|
||||
cp += 7;
|
||||
} else {
|
||||
cp = strstr(data.c_str(), "mtime=");
|
||||
if (cp)
|
||||
cp+= 6;
|
||||
}
|
||||
long mtime = cp ? atol(cp) : 0;
|
||||
if (mtime < stp->st_mtime) {
|
||||
LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n",
|
||||
(long)mtime, (long)stp->st_mtime));
|
||||
// Db is not up to date. Let's index the file
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Set the uptodate flag for doc / pseudo doc
|
||||
m_ndb->updated[*docid] = true;
|
||||
|
||||
// Db is up to date. Make a note that this document exists.
|
||||
if (*docid < m_ndb->updated.size())
|
||||
// Set the existence flag for all the subdocs (if any)
|
||||
Xapian::TermIterator it = m_ndb->wdb.allterms_begin();
|
||||
it.skip_to(qterm);
|
||||
LOGDEB2(("First qterm: [%s]\n", (*it).c_str()));
|
||||
for (;it != m_ndb->wdb.allterms_end(); it++) {
|
||||
// If current term does not begin with qterm or has another |, not
|
||||
// the same file
|
||||
if ((*it).find(qterm) != 0 ||
|
||||
(*it).find_last_of("|") != qterm.length() -1)
|
||||
break;
|
||||
docid = m_ndb->wdb.postlist_begin(*it);
|
||||
if (*docid < m_ndb->updated.size()) {
|
||||
LOGDEB2(("Db::needUpdate: set exist flag for docid %d [%s]\n",
|
||||
*docid, (*it).c_str()));
|
||||
m_ndb->updated[*docid] = true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
} catch (const Xapian::Error &e) {
|
||||
@ -1246,9 +1260,7 @@ bool Db::getDoc(int exti, Doc &doc, int *percent)
|
||||
return m_ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms);
|
||||
}
|
||||
|
||||
// Retrieve document defined by file name and internal path. Very inefficient,
|
||||
// used only for history display. We'd need to enter path+ipath terms in the
|
||||
// db if we wanted to make this more efficient.
|
||||
// Retrieve document defined by file name and internal path.
|
||||
bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
|
||||
{
|
||||
LOGDEB(("Db:getDoc: [%s] (%d) [%s]\n", fn.c_str(), fn.length(),
|
||||
@ -1265,32 +1277,24 @@ bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
|
||||
|
||||
string hash;
|
||||
pathHash(fn, hash, PATHHASHLEN);
|
||||
string pathterm = "P" + hash;
|
||||
// Look for all documents with this path, searching for the one
|
||||
// with the appropriate ipath. This is very inefficient.
|
||||
string pqterm = ipath.empty() ? "P" + hash : "Q" + hash + "|" + ipath;
|
||||
const char *ermsg = "";
|
||||
try {
|
||||
if (!m_ndb->db.term_exists(pathterm)) {
|
||||
if (!m_ndb->db.term_exists(pqterm)) {
|
||||
// Document found in history no longer in the database.
|
||||
// We return true (because there might be other ok docs further)
|
||||
// but indicate the error with pc = -1
|
||||
if (*pc)
|
||||
*pc = -1;
|
||||
LOGINFO(("Db:getDoc: no such path in index: [%s] (len %d)\n",
|
||||
pathterm.c_str(), pathterm.length()));
|
||||
LOGINFO(("Db:getDoc: no such doc in index: [%s] (len %d)\n",
|
||||
pqterm.c_str(), pqterm.length()));
|
||||
return true;
|
||||
}
|
||||
for (Xapian::PostingIterator docid =
|
||||
m_ndb->db.postlist_begin(pathterm);
|
||||
docid != m_ndb->db.postlist_end(pathterm); docid++) {
|
||||
|
||||
Xapian::Document xdoc = m_ndb->db.get_document(*docid);
|
||||
string data = xdoc.get_data();
|
||||
list<string> terms;
|
||||
if (m_ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms)
|
||||
&& doc.ipath == ipath)
|
||||
return true;
|
||||
}
|
||||
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(pqterm);
|
||||
Xapian::Document xdoc = m_ndb->db.get_document(*docid);
|
||||
string data = xdoc.get_data();
|
||||
list<string> terms;
|
||||
return m_ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms);
|
||||
} catch (const Xapian::Error &e) {
|
||||
ermsg = e.get_msg().c_str();
|
||||
} catch (const string &s) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user