New scheme for document unique terms: path only for single-document files, path+ipath only for documents inside multi-document files; add a pseudo-document standing for the file itself
This commit is contained in:
parent
4928503f60
commit
4646f62d6b
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.31 2006-04-12 10:41:39 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.32 2006-04-25 09:59:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -248,33 +248,48 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
}
|
}
|
||||||
|
|
||||||
FileInterner interner(fn, m_config, m_tmpdir);
|
FileInterner interner(fn, m_config, m_tmpdir);
|
||||||
|
|
||||||
|
// File name transcoded to utf8 for indexation.
|
||||||
|
string charset = m_config->getDefCharset(true);
|
||||||
|
// If this fails, the file name won't be indexed, no big deal
|
||||||
|
// Note that we used to do the full path here, but I ended up believing
|
||||||
|
// that it made more sense to use only the file name
|
||||||
|
string utf8fn;
|
||||||
|
transcode(path_getsimple(fn), utf8fn, charset, "UTF-8");
|
||||||
|
|
||||||
FileInterner::Status fis = FileInterner::FIAgain;
|
FileInterner::Status fis = FileInterner::FIAgain;
|
||||||
|
bool hadNullIpath = false;
|
||||||
|
Rcl::Doc doc;
|
||||||
|
char ascdate[20];
|
||||||
|
sprintf(ascdate, "%ld", long(stp->st_ctime));
|
||||||
while (fis == FileInterner::FIAgain) {
|
while (fis == FileInterner::FIAgain) {
|
||||||
Rcl::Doc doc;
|
doc.erase();
|
||||||
|
|
||||||
string ipath;
|
string ipath;
|
||||||
fis = interner.internfile(doc, ipath);
|
fis = interner.internfile(doc, ipath);
|
||||||
if (fis == FileInterner::FIError)
|
if (fis == FileInterner::FIError) {
|
||||||
break;
|
// We don't stop indexing for one bad doc
|
||||||
|
return FsTreeWalker::FtwOk;
|
||||||
|
}
|
||||||
|
|
||||||
// Set the date if this was not done in the document handler
|
// Set the date if this was not done in the document handler
|
||||||
if (doc.fmtime.empty()) {
|
if (doc.fmtime.empty()) {
|
||||||
char ascdate[20];
|
|
||||||
sprintf(ascdate, "%ld", long(stp->st_ctime));
|
|
||||||
doc.fmtime = ascdate;
|
doc.fmtime = ascdate;
|
||||||
}
|
}
|
||||||
// Internal access path for multi-document files
|
|
||||||
doc.ipath = ipath;
|
|
||||||
|
|
||||||
// File name transcoded to utf8 for indexation.
|
// Internal access path for multi-document files
|
||||||
string charset = m_config->getDefCharset(true);
|
if (ipath.empty())
|
||||||
// If this fails, the file name won't be indexed, no big deal
|
hadNullIpath = true;
|
||||||
// Note that we used to do the full path here, but I ended up believing
|
else
|
||||||
// that it made more sense to use only the file name
|
doc.ipath = ipath;
|
||||||
transcode(path_getsimple(fn), doc.utf8fn, charset, "UTF-8");
|
|
||||||
// Do database-specific work to update document data
|
doc.utf8fn = utf8fn;
|
||||||
|
|
||||||
|
// Add document to database
|
||||||
if (!m_db.add(fn, doc, stp))
|
if (!m_db.add(fn, doc, stp))
|
||||||
return FsTreeWalker::FtwError;
|
return FsTreeWalker::FtwError;
|
||||||
|
|
||||||
|
// Tell what we are doing and check for interrupt request
|
||||||
if (m_updater) {
|
if (m_updater) {
|
||||||
if ((++(m_updater->status.docsdone) % 10) == 0) {
|
if ((++(m_updater->status.docsdone) % 10) == 0) {
|
||||||
m_updater->status.fn = fn;
|
m_updater->status.fn = fn;
|
||||||
@ -287,6 +302,19 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we had no instance with a null ipath, we create an empty
|
||||||
|
// document to stand for the file itself, to be used mainly for up
|
||||||
|
// to date checks. Typically this happens for an mbox file.
|
||||||
|
if (hadNullIpath == false) {
|
||||||
|
LOGDEB1(("Creating empty doc for file\n"));
|
||||||
|
Rcl::Doc fileDoc;
|
||||||
|
fileDoc.fmtime = doc.fmtime;
|
||||||
|
fileDoc.utf8fn = doc.utf8fn;
|
||||||
|
fileDoc.mimetype = doc.mimetype;
|
||||||
|
if (!m_db.add(fn, fileDoc, stp))
|
||||||
|
return FsTreeWalker::FtwError;
|
||||||
|
}
|
||||||
|
|
||||||
return FsTreeWalker::FtwOk;
|
return FsTreeWalker::FtwOk;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#ifndef lint
|
#ifndef lint
|
||||||
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.71 2006-04-25 08:17:36 dockes Exp $ (C) 2004 J.F.Dockes";
|
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.72 2006-04-25 09:59:12 dockes Exp $ (C) 2004 J.F.Dockes";
|
||||||
#endif
|
#endif
|
||||||
/*
|
/*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
@ -522,18 +522,20 @@ bool Db::add(const string &fn, const Doc &idoc,
|
|||||||
string hash;
|
string hash;
|
||||||
pathHash(fn, hash, PATHHASHLEN);
|
pathHash(fn, hash, PATHHASHLEN);
|
||||||
LOGDEB2(("Db::add: pathhash [%s]\n", hash.c_str()));
|
LOGDEB2(("Db::add: pathhash [%s]\n", hash.c_str()));
|
||||||
string pathterm = "P" + hash;
|
|
||||||
newdocument.add_term(pathterm);
|
// Unique term: makes unique identifier for documents
|
||||||
|
// either path or path+ipath inside multidocument files.
|
||||||
// Unique term: with path, makes unique identifier for documents
|
// We only add a path term if ipath is empty. Else there will be a qterm
|
||||||
// inside multidocument files.
|
// (path+ipath), and a pseudo-doc will be created to stand for the file
|
||||||
|
// itself (for up to date checks). This is handled by
|
||||||
|
// DbIndexer::processone()
|
||||||
string uniterm;
|
string uniterm;
|
||||||
if (doc.ipath.empty()) {
|
if (doc.ipath.empty()) {
|
||||||
uniterm = pathterm;
|
uniterm = "P" + hash;
|
||||||
} else {
|
} else {
|
||||||
uniterm = "Q" + hash + "|" + doc.ipath;
|
uniterm = "Q" + hash + "|" + doc.ipath;
|
||||||
newdocument.add_term(uniterm);
|
|
||||||
}
|
}
|
||||||
|
newdocument.add_term(uniterm);
|
||||||
|
|
||||||
// Dates etc...
|
// Dates etc...
|
||||||
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
||||||
@ -613,11 +615,11 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
|
|||||||
if (m_ndb == 0)
|
if (m_ndb == 0)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// If no document exists with this path, we do need an update
|
|
||||||
string hash;
|
string hash;
|
||||||
pathHash(filename, hash, PATHHASHLEN);
|
pathHash(filename, hash, PATHHASHLEN);
|
||||||
string pathterm = "P" + hash;
|
string pterm = "P" + hash;
|
||||||
const char *ermsg;
|
const char *ermsg;
|
||||||
|
string qterm = "Q"+ hash + "|";
|
||||||
|
|
||||||
// Look for all documents with this path. We need to look at all
|
// Look for all documents with this path. We need to look at all
|
||||||
// to set their existence flag. We check the update time on the
|
// to set their existence flag. We check the update time on the
|
||||||
@ -626,42 +628,54 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
|
|||||||
// file changed)
|
// file changed)
|
||||||
Xapian::PostingIterator doc;
|
Xapian::PostingIterator doc;
|
||||||
try {
|
try {
|
||||||
if (!m_ndb->wdb.term_exists(pathterm)) {
|
if (!m_ndb->wdb.term_exists(pterm)) {
|
||||||
LOGDEB1(("Db::needUpdate: no such path: %s\n", pathterm.c_str()));
|
// If no document exists with this path, we do need an update
|
||||||
|
LOGDEB2(("Db::needUpdate: no such path: [%s]\n", pterm.c_str()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
// Check the date using the Pterm doc or pseudo-doc
|
||||||
|
Xapian::PostingIterator docid = m_ndb->wdb.postlist_begin(pterm);
|
||||||
|
Xapian::Document doc = m_ndb->wdb.get_document(*docid);
|
||||||
|
string data = doc.get_data();
|
||||||
|
const char *cp = strstr(data.c_str(), "fmtime=");
|
||||||
|
if (cp) {
|
||||||
|
cp += 7;
|
||||||
|
} else {
|
||||||
|
cp = strstr(data.c_str(), "mtime=");
|
||||||
|
if (cp)
|
||||||
|
cp+= 6;
|
||||||
|
}
|
||||||
|
long mtime = cp ? atol(cp) : 0;
|
||||||
|
if (mtime < stp->st_mtime) {
|
||||||
|
LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n",
|
||||||
|
(long)mtime, (long)stp->st_mtime));
|
||||||
|
// Db is not up to date. Let's index the file
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
Xapian::PostingIterator docid0 = m_ndb->wdb.postlist_begin(pathterm);
|
LOGDEB2(("Db::needUpdate: uptodate: [%s]\n", pterm.c_str()));
|
||||||
for (Xapian::PostingIterator docid = docid0;
|
|
||||||
docid != m_ndb->wdb.postlist_end(pathterm); docid++) {
|
|
||||||
|
|
||||||
Xapian::Document doc = m_ndb->wdb.get_document(*docid);
|
// Up to date.
|
||||||
|
|
||||||
// Check the date once. no need to look at the others if
|
// Set the uptodate flag for doc / pseudo doc
|
||||||
// the db needs updating. Note that the fmtime used to be
|
m_ndb->updated[*docid] = true;
|
||||||
// called mtime, and we're keeping compat
|
|
||||||
if (docid == docid0) {
|
|
||||||
string data = doc.get_data();
|
|
||||||
const char *cp = strstr(data.c_str(), "fmtime=");
|
|
||||||
if (cp) {
|
|
||||||
cp += 7;
|
|
||||||
} else {
|
|
||||||
cp = strstr(data.c_str(), "mtime=");
|
|
||||||
if (cp)
|
|
||||||
cp+= 6;
|
|
||||||
}
|
|
||||||
long mtime = cp ? atol(cp) : 0;
|
|
||||||
if (mtime < stp->st_mtime) {
|
|
||||||
LOGDEB2(("Db::needUpdate: yes: mtime: Db %ld file %ld\n",
|
|
||||||
(long)mtime, (long)stp->st_mtime));
|
|
||||||
// Db is not up to date. Let's index the file
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Db is up to date. Make a note that this document exists.
|
// Set the existence flag for all the subdocs (if any)
|
||||||
if (*docid < m_ndb->updated.size())
|
Xapian::TermIterator it = m_ndb->wdb.allterms_begin();
|
||||||
|
it.skip_to(qterm);
|
||||||
|
LOGDEB2(("First qterm: [%s]\n", (*it).c_str()));
|
||||||
|
for (;it != m_ndb->wdb.allterms_end(); it++) {
|
||||||
|
// If current term does not begin with qterm or has another |, not
|
||||||
|
// the same file
|
||||||
|
if ((*it).find(qterm) != 0 ||
|
||||||
|
(*it).find_last_of("|") != qterm.length() -1)
|
||||||
|
break;
|
||||||
|
docid = m_ndb->wdb.postlist_begin(*it);
|
||||||
|
if (*docid < m_ndb->updated.size()) {
|
||||||
|
LOGDEB2(("Db::needUpdate: set exist flag for docid %d [%s]\n",
|
||||||
|
*docid, (*it).c_str()));
|
||||||
m_ndb->updated[*docid] = true;
|
m_ndb->updated[*docid] = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
@ -1246,9 +1260,7 @@ bool Db::getDoc(int exti, Doc &doc, int *percent)
|
|||||||
return m_ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms);
|
return m_ndb->dbDataToRclDoc(data, doc, m_qOpts, docid, terms);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Retrieve document defined by file name and internal path. Very inefficient,
|
// Retrieve document defined by file name and internal path.
|
||||||
// used only for history display. We'd need to enter path+ipath terms in the
|
|
||||||
// db if we wanted to make this more efficient.
|
|
||||||
bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
|
bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
|
||||||
{
|
{
|
||||||
LOGDEB(("Db:getDoc: [%s] (%d) [%s]\n", fn.c_str(), fn.length(),
|
LOGDEB(("Db:getDoc: [%s] (%d) [%s]\n", fn.c_str(), fn.length(),
|
||||||
@ -1265,32 +1277,24 @@ bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
|
|||||||
|
|
||||||
string hash;
|
string hash;
|
||||||
pathHash(fn, hash, PATHHASHLEN);
|
pathHash(fn, hash, PATHHASHLEN);
|
||||||
string pathterm = "P" + hash;
|
string pqterm = ipath.empty() ? "P" + hash : "Q" + hash + "|" + ipath;
|
||||||
// Look for all documents with this path, searching for the one
|
|
||||||
// with the appropriate ipath. This is very inefficient.
|
|
||||||
const char *ermsg = "";
|
const char *ermsg = "";
|
||||||
try {
|
try {
|
||||||
if (!m_ndb->db.term_exists(pathterm)) {
|
if (!m_ndb->db.term_exists(pqterm)) {
|
||||||
// Document found in history no longer in the database.
|
// Document found in history no longer in the database.
|
||||||
// We return true (because there might be other ok docs further)
|
// We return true (because there might be other ok docs further)
|
||||||
// but indicate the error with pc = -1
|
// but indicate the error with pc = -1
|
||||||
if (*pc)
|
if (*pc)
|
||||||
*pc = -1;
|
*pc = -1;
|
||||||
LOGINFO(("Db:getDoc: no such path in index: [%s] (len %d)\n",
|
LOGINFO(("Db:getDoc: no such doc in index: [%s] (len %d)\n",
|
||||||
pathterm.c_str(), pathterm.length()));
|
pqterm.c_str(), pqterm.length()));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
for (Xapian::PostingIterator docid =
|
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(pqterm);
|
||||||
m_ndb->db.postlist_begin(pathterm);
|
Xapian::Document xdoc = m_ndb->db.get_document(*docid);
|
||||||
docid != m_ndb->db.postlist_end(pathterm); docid++) {
|
string data = xdoc.get_data();
|
||||||
|
list<string> terms;
|
||||||
Xapian::Document xdoc = m_ndb->db.get_document(*docid);
|
return m_ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms);
|
||||||
string data = xdoc.get_data();
|
|
||||||
list<string> terms;
|
|
||||||
if (m_ndb->dbDataToRclDoc(data, doc, QO_NONE, *docid, terms)
|
|
||||||
&& doc.ipath == ipath)
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} catch (const Xapian::Error &e) {
|
} catch (const Xapian::Error &e) {
|
||||||
ermsg = e.get_msg().c_str();
|
ermsg = e.get_msg().c_str();
|
||||||
} catch (const string &s) {
|
} catch (const string &s) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user