begin i/f cleanup: opacify doc uptodate sig (size+mtime)
parent 23163f1b4f
commit ca4a4e65b0
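
The change makes the up-to-date test opaque to the index layer: the indexer now builds a signature string from the file's size and mtime, and Db::needUpdate() simply compares it with the signature stored at indexing time, instead of receiving a struct stat and interpreting dates and sizes itself. The following is a minimal standalone sketch of that idea under the same size+mtime scheme; the helper names makeSig and isUpToDate are illustrative, not Recoll APIs.

// Standalone sketch (not Recoll code) of the opaque signature scheme:
// size and mtime are printed back to back; only equality matters, so the
// format does not need to be parseable back into its parts.
#include <stdio.h>
#include <sys/stat.h>
#include <string>

static std::string makeSig(const struct stat &st)
{
    char cbuf[100];
    snprintf(cbuf, sizeof(cbuf), "%ld%ld",
             (long)st.st_size, (long)st.st_mtime);
    return std::string(cbuf);
}

// The index-side check reduces to comparing the caller-provided signature
// with whatever string was stored with the document.
static bool isUpToDate(const std::string &storedSig, const struct stat &st)
{
    return !storedSig.empty() && storedSig == makeSig(st);
}

int main(int argc, char **argv)
{
    if (argc < 2)
        return 1;
    struct stat st;
    if (stat(argv[1], &st) != 0)
        return 1;
    std::string stored;            // nothing stored yet: needs update
    printf("needs update: %s\n", isUpToDate(stored, st) ? "no" : "yes");
    stored = makeSig(st);          // what indexing would store
    printf("needs update: %s\n", isUpToDate(stored, st) ? "no" : "yes");
    return 0;
}
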
indexer.cpp
@@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.65 2007-12-20 09:08:04 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: indexer.cpp,v 1.66 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 * This program is free software; you can redistribute it and/or modify
@@ -386,7 +386,11 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
// from on to off it may happen that some files which are now
// without mime type will not be purged from the db, resulting
// in possible 'cannot intern file' messages at query time...
if (!m_db.needUpdate(fn, stp)) {
char cbuf[100];
// Document signature
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
string sig = cbuf;
if (!m_db.needUpdate(fn, sig)) {
LOGDEB(("processone: up to date: %s\n", fn.c_str()));
if (m_updater) {
m_updater->status.fn = fn;
@@ -465,8 +469,17 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
if (doc.utf8fn.empty())
doc.utf8fn = utf8fn;

char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size);
doc.fbytes = cbuf;
// Document signature for up to date checks: concatenate mtime and
// size. Note: looking for changes only, no need to parseback so no
// need for reversible formatting
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
doc.sig = cbuf;

// Add document to database
if (!m_db.add(fn, doc, stp))
if (!m_db.add(fn, doc))
return FsTreeWalker::FtwError;

// Tell what we are doing and check for interrupt request
@@ -491,7 +504,15 @@ DbIndexer::processone(const std::string &fn, const struct stat *stp,
fileDoc.fmtime = ascdate;
fileDoc.utf8fn = utf8fn;
fileDoc.mimetype = interner.getMimetype();
if (!m_db.add(fn, fileDoc, stp))

char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size);
fileDoc.fbytes = cbuf;
// Document signature for up to date checks.
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->st_mtime);
fileDoc.sig = cbuf;
if (!m_db.add(fn, fileDoc))
return FsTreeWalker::FtwError;
}

uiprefs_w.cpp
@@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.24 2008-05-05 20:24:55 dockes Exp $ (C) 2005 J.F.Dockes";
static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.25 2008-07-28 08:42:52 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
 * This program is free software; you can redistribute it and/or modify
@@ -56,6 +56,7 @@ static char rcsid[] = "@(#$Id: uiprefs_w.cpp,v 1.24 2008-05-05 20:24:55 dockes E
#include "recoll.h"
#include "guiutils.h"
#include "rcldb.h"
#include "rclconfig.h"
#include "pathut.h"
#include "uiprefs_w.h"
#include "viewaction_w.h"
@@ -363,9 +364,7 @@ void UIPrefsDialog::addExtraDbPB_clicked()
}
struct stat st1, st2;
stat(dbdir.c_str(), &st1);
string rcldbdir;
if (rcldb)
rcldbdir = rcldb->getDbDir();
string rcldbdir = RclConfig::getMainConfig()->getDbDir();
stat(rcldbdir.c_str(), &st2);
path_catslash(rcldbdir);
fprintf(stderr, "rcldbdir: [%s]\n", rcldbdir.c_str());

rcldb.cpp
@@ -1,5 +1,5 @@
#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.134 2008-07-01 11:51:51 dockes Exp $ (C) 2004 J.F.Dockes";
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.135 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 * This program is free software; you can redistribute it and/or modify
@@ -20,7 +20,6 @@ static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.134 2008-07-01 11:51:51 dockes Exp
#include <stdio.h>
#include <cstring>
#include <unistd.h>
#include <sys/stat.h>
#include <fnmatch.h>
#include <regex.h>
#include <math.h>
@@ -59,76 +58,82 @@ using namespace std;
#define MIN(A,B) (A<B?A:B)
#endif

// Omega compatible values. We leave a hole for future omega values. Not sure
// it makes any sense to keep any level of omega compat given that the index
// is incompatible anyway.
enum value_slot {
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
VALUE_MD5 = 1, // 16 byte MD5 checksum of original document.
VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size
};

// This is the word position offset at which we index the body text
// (abstract, keywords, etc.. are stored before this)
static const unsigned int baseTextPosition = 100000;

#undef MTIME_IN_VALUE
#ifdef MTIME_IN_VALUE
// Omega compatible values
#define enum value_slot {
VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
VALUE_MD5 = 1 // 16 byte MD5 checksum of original document.
};
#endif

#ifndef NO_NAMESPACES
namespace Rcl {
#endif

// Max length for path terms stored for each document. Truncate
// longer path and uniquize with hash. The goal for this is to avoid
// xapian max term length limitations, not to gain space (we gain very
// little even with very short maxlens like 30)
// Note that Q terms add the ipath to this, and that the xapian max
// key length seems to be around 250

// Synthetic abstract marker (to discriminate from abstract actually
// found in document)
const static string rclSyntAbs("?!#@");

// Maximum length for path terms stored for each document. We truncate
// longer paths and uniquize them by appending a hashed value. This
// is done to avoid xapian max term length limitations, not
// to gain space (we gain very little even with very short maxlens
// like 30) Note that Q terms add the ipath to this, and that the
// xapian max key length seems to be around 250.
// The value for PATHHASHLEN includes the length of the hash part.
#define PATHHASHLEN 150

// Synthetic abstract marker (to discriminate from abstract actually
// found in doc)
const static string rclSyntAbs = "?!#@";
const static string emptystring;
// Compute the unique term used to link documents to their file-system source:
// Hashed path + possible internal path
static inline string make_uniterm(const string& fn, const string& ipath)
{
string hash;
pathHash(fn, hash, PATHHASHLEN);
string s("Q");
s.append(hash);
s.append("|");
s.append(ipath);
return s;
}

/* See comment in class declaration */
bool Db::Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
/* See comment in class declaration: return all subdocuments of a
 * document given by its unique path id */
bool Db::Native::subDocs(const string &uniterm, vector<Xapian::docid>& docids)
{
docids.clear();
string qterm = "Q"+ hash + "|";
string ermsg;

string ermsg;
for (int tries = 0; tries < 2; tries++) {
try {
Xapian::TermIterator it = db.allterms_begin();
it.skip_to(qterm);
for (;it != db.allterms_end(); it++) {
// If current term does not begin with qterm or has
it.skip_to(uniterm);
// Don't return the doc itself:
it++;
for (; it != db.allterms_end(); it++) {
LOGDEB2(("Testing [%s]\n", (*it).c_str()));
// If current term does not begin with uniterm or has
// another |, not the same file
if ((*it).find(qterm) != 0 ||
(*it).find_last_of("|") != qterm.length() -1)
if ((*it).find(uniterm) != 0 ||
(*it).find_last_of("|") != uniterm.length() - 1)
break;
docids.push_back(*(db.postlist_begin(*it)));
}
LOGDEB2(("Db::Native::subDocs: returning %d ids\n", docids.size()));
return true;
} catch (const Xapian::DatabaseModifiedError &e) {
LOGDEB(("Db::subDocs: got modified error. reopen/retry\n"));
// Can't use reOpen here, it would delete *me*
// Can't use reOpen() here, I'm a Native:: method, this
// would delete my own object
db = Xapian::Database(m_db->m_basedir);
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} XCATCHERROR(ermsg);
if (!ermsg.empty())
break;
} catch (const string &s) {
ermsg = s;
if (ermsg.empty())
ermsg = "Empty error message";
} catch (const char *s) {
ermsg = s ? s : string();
if (ermsg.empty())
ermsg = "Empty error message";
} catch (...) {
ermsg= "Unknown xapian error (not Xapian::Error or string)";
break;
}
}
LOGERR(("Rcl::Db::subDocs: %s\n", ermsg.c_str()));
return false;
@@ -159,6 +164,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc
parms.get(string("ipath"), doc.ipath);
parms.get(string("fbytes"), doc.fbytes);
parms.get(string("dbytes"), doc.dbytes);
parms.get(string("sig"), doc.sig);
doc.xdocid = docid;
return true;
}
@@ -544,11 +550,6 @@ bool Db::open(const string& dir, const string &stops, OpenMode mode,
return false;
}

string Db::getDbDir()
{
return m_basedir;
}

// Note: xapian has no close call, we delete and recreate the db
bool Db::close()
{
@@ -811,7 +812,7 @@ static const int MB = 1024 * 1024;
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
// metadata), and update database
bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
bool Db::add(const string &fn, const Doc &idoc)
{
LOGDEB1(("Db::add: fn %s\n", fn.c_str()));
if (m_ndb == 0)
@@ -899,7 +900,7 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
}
splitData.setprefix(pfx); // Subject
splitter.text_to_words(noacc);
splitData.setprefix(emptystring);
splitData.setprefix(string());
splitData.basepos += splitData.curpos + 100;
}
}
@@ -934,31 +935,9 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
newdocument.add_term(noacc);
}

// Pathname/ipath terms. This is used for file existence/uptodate
// checks, and unique id for the replace_document() call

// Truncate the filepath part to a reasonable length and
// replace the truncated part with a hopefully unique hash
string hash;
pathHash(fn, hash, PATHHASHLEN);
LOGDEB2(("Db::add: pathhash [%s]\n", hash.c_str()));

// Unique term: makes unique identifier for documents
// either path or path+ipath inside multidocument files.
// We only add a path term if ipath is empty. Else there will be a qterm
// (path+ipath), and a pseudo-doc will be created to stand for the file
// itself (for up to date checks). This is handled by
// DbIndexer::processone()
string uniterm;
if (doc.ipath.empty()) {
uniterm = "P" + hash;
#ifdef MTIME_IN_VALUE
#error need to fix fmtime to be stored as omega does it (bin net order str)
newdocument.add_value(VALUE_LASTMOD, doc.fmtime);
#endif
} else {
uniterm = "Q" + hash + "|" + doc.ipath;
}
// Pathname/ipath unique term: this is used for file existence/uptodate
// checks, and unique id for the replace_document() call.
string uniterm = make_uniterm(fn, doc.ipath);
newdocument.add_term(uniterm);

// Dates etc...
@@ -985,14 +964,18 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
record += "\ndmtime=" + doc.dmtime;
}
record += "\norigcharset=" + doc.origcharset;
char sizebuf[20];
sizebuf[0] = 0;
if (stp)
sprintf(sizebuf, "%ld", (long)stp->st_size);
if (sizebuf[0])
record += string("\nfbytes=") + sizebuf;

if (!doc.fbytes.empty())
record += string("\nfbytes=") + doc.fbytes;
// Note that we add the signature both as a value and in the data record
if (!doc.sig.empty())
record += string("\nsig=") + doc.sig;
newdocument.add_value(VALUE_SIG, doc.sig);

char sizebuf[30];
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
record += string("\ndbytes=") + sizebuf;

if (!doc.ipath.empty()) {
record += "\nipath=" + doc.ipath;
}
@@ -1062,71 +1045,58 @@ bool Db::add(const string &fn, const Doc &idoc, const struct stat *stp)
}

// Test if given filename has changed since last indexed:
bool Db::needUpdate(const string &filename, const struct stat *stp)
bool Db::needUpdate(const string &filename, const string& sig)
{
// Chrono chron;
if (m_ndb == 0)
return false;

string hash;
pathHash(filename, hash, PATHHASHLEN);
string pterm = "P" + hash;
string uniterm = make_uniterm(filename, string());
string ermsg;

// We look up the document indexed by the Pterm. This is either
// We look up the document indexed by the uniterm. This is either
// the actual document file, or, for a multi-document file, the
// pseudo-doc we create to stand for the file itself.

// We try twice in case database needs to be reopened.
for (int tries = 0; tries < 2; tries++) {
try {
// Get the Pterm doc or pseudo-doc
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(pterm);
if (docid == m_ndb->db.postlist_end(pterm)) {
// Get the doc or pseudo-doc
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(uniterm);
if (docid == m_ndb->db.postlist_end(uniterm)) {
// If no document exist with this path, we do need update
LOGDEB2(("Db::needUpdate: no path: [%s]\n", pterm.c_str()));
LOGDEB(("Db::needUpdate: no path: [%s]\n", uniterm.c_str()));
return true;
}
Xapian::Document doc = m_ndb->db.get_document(*docid);

// Retrieve file modification time from db stored value
#ifdef MTIME_IN_VALUE
// This is slightly faster, but we'd need to setup a conversion
// for old dbs, and it's not really worth it
string value = doc.get_value(VALUE_LASTMOD);
#error fixme make storage format compatible with omega
const char *cp = value.c_str();
#else
// Retrieve old file/doc signature from value
string osig = doc.get_value(VALUE_SIG);
#if 0
// Get old sig from data record
string data = doc.get_data();
const char *cp = strstr(data.c_str(), "fmtime=");
if (cp) {
cp += 7;
} else {
cp = strstr(data.c_str(), "mtime=");
if (cp)
cp+= 6;
}
string::size_type i1, i2;
i1 = data.find("sig=");
if (i1 == string::npos)
return true;
i1 += 4;
if (i1 >= data.length())
return true;
i2 = data.find_first_of("\n\r", i1);
if (i2 == string::npos)
return true;
string osig = data.substr(i1, i2-i1);
#endif
// If the time string begins with a "+", force an update. Happens
// after a filter error, see indexer.cpp, processone()
time_t mtime = (!cp || *cp == '+') ? 0 : atoll(cp);

// Retrieve file size as stored in db data
cp = strstr(data.c_str(), "fbytes=");
if (cp)
cp += 7;
off_t fbytes = cp ? atoll(cp) : 0;

// Compare db time and size data to filesystem's
if (mtime != stp->st_mtime || fbytes != stp->st_size) {
LOGDEB2(("Db::needUpdate:yes: mtime: D %ld F %ld."
"sz D %ld F %ld\n", long(mtime), long(stp->st_mtime),
long(fbytes), long(stp->st_size)));
LOGDEB(("Db::needUpdate: oldsig [%s] new [%s]\n",
osig.c_str(), sig.c_str()));
// Compare new/old sig
if (sig != osig) {
LOGDEB(("Db::needUpdate:yes: olsig [%s] new [%s]\n",
osig.c_str(), sig.c_str()));
// Db is not up to date. Let's index the file
return true;
}

LOGDEB2(("Db::needUpdate: uptodate: [%s]\n", pterm.c_str()));
LOGDEB(("Db::needUpdate: uptodate: [%s]\n", uniterm.c_str()));

// Up to date.

@@ -1135,7 +1105,7 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)

// Set the existence flag for all the subdocs (if any)
vector<Xapian::docid> docids;
if (!m_ndb->subDocs(hash, docids)) {
if (!m_ndb->subDocs(uniterm, docids)) {
LOGERR(("Rcl::Db::needUpdate: can't get subdocs list\n"));
return true;
}
@@ -1146,12 +1116,13 @@ bool Db::needUpdate(const string &filename, const struct stat *stp)
updated[*it] = true;
}
}
// LOGDEB(("Db::needUpdate: used %d mS\n", chron.millis()));
return false;
} catch (const Xapian::DatabaseModifiedError &e) {
LOGDEB(("Db::needUpdate: got modified error. reopen/retry\n"));
reOpen();
} XCATCHERROR(ermsg);
if (!ermsg.empty())
break;
}
LOGERR(("Db::needUpdate: error while checking existence: %s\n",
ermsg.c_str()));
@@ -1258,22 +1229,20 @@ bool Db::purgeFile(const string &fn)
if (m_ndb == 0)
return false;
Xapian::WritableDatabase db = m_ndb->wdb;
string hash;
pathHash(fn, hash, PATHHASHLEN);
string pterm = "P" + hash;
string uniterm = make_uniterm(fn, string());
string ermsg;
try {
Xapian::PostingIterator docid = db.postlist_begin(pterm);
if (docid == db.postlist_end(pterm))
Xapian::PostingIterator docid = db.postlist_begin(uniterm);
if (docid == db.postlist_end(uniterm))
return true;
LOGDEB(("purgeFile: delete docid %d\n", *docid));
db.delete_document(*docid);
vector<Xapian::docid> docids;
m_ndb->subDocs(hash, docids);
m_ndb->subDocs(uniterm, docids);
LOGDEB(("purgeFile: subdocs cnt %d\n", docids.size()));
for (vector<Xapian::docid>::iterator it = docids.begin();
it != docids.end(); it++) {
LOGDEB2(("Db::purgeFile: delete subdoc %d\n", *it));
LOGDEB(("Db::purgeFile: delete subdoc %d\n", *it));
db.delete_document(*it);
}
return true;
@@ -1573,22 +1542,20 @@ bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
if (*pc)
*pc = 100;

string hash;
pathHash(fn, hash, PATHHASHLEN);
string pqterm = ipath.empty() ? "P" + hash : "Q" + hash + "|" + ipath;
string uniterm = make_uniterm(fn, ipath);
string ermsg;
try {
if (!m_ndb->db.term_exists(pqterm)) {
if (!m_ndb->db.term_exists(uniterm)) {
// Document found in history no longer in the database.
// We return true (because their might be other ok docs further)
// but indicate the error with pc = -1
if (*pc)
*pc = -1;
LOGINFO(("Db:getDoc: no such doc in index: [%s] (len %d)\n",
pqterm.c_str(), pqterm.length()));
uniterm.c_str(), uniterm.length()));
return true;
}
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(pqterm);
Xapian::PostingIterator docid = m_ndb->db.postlist_begin(uniterm);
Xapian::Document xdoc = m_ndb->db.get_document(*docid);
string data = xdoc.get_data();
list<string> terms;

rcldb.h
@@ -16,7 +16,7 @@
 */
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.56 2008-07-01 08:28:45 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: rcldb.h,v 1.57 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes */

#include <string>
#include <list>
@@ -86,42 +86,45 @@ class Db {
bool close();
bool isopen();

/** Retrieve main database directory */
string getDbDir();

/** Get explanation about last error */
string getReason() const {return m_reason;}

/** Return list of configured stop words */
const StopList& getStopList() const {return m_stops;}

/** Field name to prefix translation (ie: author -> 'A') */
bool fieldToPrefix(const string& fldname, string &pfx);

/** List possible stemmer names */
static list<string> getStemmerNames();

/* Update-related methods ******************************************/
/** List existing stemming databases */
std::list<std::string> getStemLangs();

/** Add document. The Doc class should have been filled as much as
possible depending on the document type */
bool add(const string &filename, const Doc &doc, const struct stat *stp);
/* The next two, only for searchdata, should be somehow hidden */
/* Return list of configured stop words */
const StopList& getStopList() const {return m_stops;}
/* Field name to prefix translation (ie: author -> 'A') */
bool fieldToPrefix(const string& fldname, string &pfx);

/* Update-related methods ******************************************/

/** Test if the db entry for the given filename/stat is up to date. This
 * has the side-effect of setting the existence flag for the file document
 * and all subdocs if any (for later use by 'purge()') */
bool needUpdate(const string &filename, const struct stat *stp);
bool needUpdate(const string &udi, const string& sig);

/** Add document. The Doc class should have been filled as much as
 * possible depending on the document type */
bool add(const string &udi, const Doc &doc);

/** Delete document(s) for given UDI, including subdocs */
bool purgeFile(const string &fn);

/** Remove documents that no longer exist in the file system. This
depends on the update map, which is built during
indexation. This should only be called after a full walk of
the file system, else the update map will not be complete, and
many documents will be deleted that shouldn't */
 * depends on the update map, which is built during
 * indexation. This should only be called after a full walk of
 * the file system, else the update map will not be complete, and
 * many documents will be deleted that shouldn't, which is why this
 * has to be called externally, we can't know if the indexing
 * pass was complete or partial.
 */
bool purge();

/** Delete document(s) for given filename */
bool purgeFile(const string &filename);

/** Create stem expansion database for given language. */
bool createStemDb(const string &lang);
/** Delete stem expansion database for given language. */
@@ -146,6 +149,9 @@ class Db {
bool termMatch(MatchType typ, const string &lang, const string &s,
list<TermMatchEntry>& result, int max = -1);

/** Specific filename wildcard expansion */
bool filenameWildExp(const string& exp, list<string>& names);

/** Set parameters for synthetic abstract generation */
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);

@@ -153,12 +159,11 @@ class Db {
 * the input query. This uses index data only (no access to the file) */
bool makeDocAbstract(Doc &doc, Query *query, string& abstract);

/** Get document for given filename and ipath */
/** Get document for given filename and ipath. Used by the 'history'
 * feature (and nothing else?) */
bool getDoc(const string &fn, const string &ipath, Doc &doc, int *percent);

/** Get a list of existing stemming databases */
std::list<std::string> getStemLangs();

/* The following are mainly for the aspell module */
/** Whole term list walking. */
TermIter *termWalkOpen();
bool termWalkNext(TermIter *, string &term);
@@ -169,9 +174,6 @@ class Db {
bool stemDiffers(const string& lang, const string& term,
const string& base);

/** Filename wildcard expansion */
bool filenameWildExp(const string& exp, list<string>& names);

/* This has to be public for access by embedded Query::Native */
Native *m_ndb;

rcldb_p.h
@@ -4,7 +4,7 @@
#include "xapian.h"

namespace Rcl {
/* @(#$Id: rcldb_p.h,v 1.1 2008-06-13 18:22:46 dockes Exp $ (C) 2007 J.F.Dockes */
/* @(#$Id: rcldb_p.h,v 1.2 2008-07-28 08:42:52 dockes Exp $ (C) 2007 J.F.Dockes */

// Generic Xapian exception catching code. We do this quite often,
// and I have no idea how to do this except for a macro
@@ -60,7 +60,7 @@ class Db::Native {
 * unique term for replace_document, and for retrieving by
 * path/ipath (history)
 */
bool subDocs(const string &hash, vector<Xapian::docid>& docids);
bool subDocs(const string &uniterm, vector<Xapian::docid>& docids);

};
}

rcldoc.h
@@ -16,7 +16,7 @@
 */
#ifndef _RCLDOC_H_INCLUDED_
#define _RCLDOC_H_INCLUDED_
/* @(#$Id: rcldoc.h,v 1.3 2007-06-19 08:36:24 dockes Exp $ (C) 2006 J.F.Dockes */
/* @(#$Id: rcldoc.h,v 1.4 2008-07-28 08:42:52 dockes Exp $ (C) 2006 J.F.Dockes */

#include <string>
#include <map>
@@ -58,9 +58,20 @@ class Doc {
// Attribute for the "abstract" entry. true if it is just the top
// of doc, not a native document attribute
bool syntabs;

string fbytes; // File size. Set by Db::Add
string dbytes; // Doc size. Set by Db::Add from text length

// File size. Index: Set by caller prior to Db::Add. Query: set by
// rcldb from index doc data. Historically this always has
// represented the whole file size (as from stat()), but there
// would be a need for a 3rd value for multidoc files (file
// size/doc size/ doc text size)
string fbytes;
// Doc text size. Index: from text.length(). Query: set by rcldb from
// index doc data.
string dbytes;
// Doc signature. Used for up to date checks. This is opaque, and
// could just as well be ctime, size, ctime+size, md5, whatever.
// Index: set by Db::Add caller. Query: set from doc data.
string sig;

// The following fields don't go to the db record

@@ -82,6 +93,7 @@ class Doc {
syntabs = false;
fbytes.erase();
dbytes.erase();
sig.erase();

text.erase();
pc = 0;

base64.h
@@ -16,10 +16,10 @@
 */
#ifndef _BASE64_H_INCLUDED_
#define _BASE64_H_INCLUDED_
/* @(#$Id: base64.h,v 1.2 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
/* @(#$Id: base64.h,v 1.3 2008-07-28 08:42:52 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>

void base64_encode(const std::string &in, std::string &out);
void base64_encode(const std::string& in, std::string& out);
bool base64_decode(const std::string& in, std::string& out);

#endif /* _BASE64_H_INCLUDED_ */