Changed the way we handle document sizes. The fbytes field should now, in most cases, hold the most "natural" document size. pcbytes holds the size of the top external container, and dbytes holds the text size.

This commit is contained in:
Jean-Francois Dockes 2012-03-07 15:39:30 +01:00
parent 638d468796
commit 85166c93b2
10 changed files with 108 additions and 32 deletions

View File

@ -68,7 +68,7 @@ bool BeagleQueueCache::getFromCache(const string& udi, Rcl::Doc &dotdoc,
cf.get(cstr_url, dotdoc.url, cstr_null); cf.get(cstr_url, dotdoc.url, cstr_null);
cf.get(cstr_bgc_mimetype, dotdoc.mimetype, cstr_null); cf.get(cstr_bgc_mimetype, dotdoc.mimetype, cstr_null);
cf.get(cstr_fmtime, dotdoc.fmtime, cstr_null); cf.get(cstr_fmtime, dotdoc.fmtime, cstr_null);
cf.get(cstr_fbytes, dotdoc.fbytes, cstr_null); cf.get(cstr_fbytes, dotdoc.pcbytes, cstr_null);
dotdoc.sig.clear(); dotdoc.sig.clear();
list<string> names = cf.getNames(cstr_null); list<string> names = cf.getNames(cstr_null);
for (list<string>::const_iterator it = names.begin(); for (list<string>::const_iterator it = names.begin();

View File

@ -242,7 +242,7 @@ bool BeagleQueueIndexer::indexFromCache(const string& udi)
doc.mimetype = dotdoc.mimetype; doc.mimetype = dotdoc.mimetype;
doc.fmtime = dotdoc.fmtime; doc.fmtime = dotdoc.fmtime;
doc.url = dotdoc.url; doc.url = dotdoc.url;
doc.fbytes = dotdoc.fbytes; doc.pcbytes = dotdoc.pcbytes;
doc.sig.clear(); doc.sig.clear();
doc.meta[Rcl::Doc::keybcknd] = "BGL"; doc.meta[Rcl::Doc::keybcknd] = "BGL";
return m_db->addOrUpdate(udi, cstr_null, doc); return m_db->addOrUpdate(udi, cstr_null, doc);
@ -405,7 +405,7 @@ BeagleQueueIndexer::processone(const string &path,
char cbuf[100]; char cbuf[100];
sprintf(cbuf, OFFTPC, stp->st_size); sprintf(cbuf, OFFTPC, stp->st_size);
dotdoc.fbytes = cbuf; dotdoc.pcbytes = cbuf;
// Document signature for up to date checks: none. // Document signature for up to date checks: none.
dotdoc.sig.clear(); dotdoc.sig.clear();
@ -452,7 +452,7 @@ BeagleQueueIndexer::processone(const string &path,
char cbuf[100]; char cbuf[100];
sprintf(cbuf, OFFTPC, stp->st_size); sprintf(cbuf, OFFTPC, stp->st_size);
doc.fbytes = cbuf; doc.pcbytes = cbuf;
// Document signature for up to date checks: none. // Document signature for up to date checks: none.
doc.sig.clear(); doc.sig.clear();
doc.url = dotdoc.url; doc.url = dotdoc.url;
@ -466,7 +466,9 @@ BeagleQueueIndexer::processone(const string &path,
{ {
// doc fields not in meta, needing saving to the cache // doc fields not in meta, needing saving to the cache
dotfile.m_fields.set("fmtime", dotdoc.fmtime, cstr_null); dotfile.m_fields.set("fmtime", dotdoc.fmtime, cstr_null);
dotfile.m_fields.set("fbytes", dotdoc.fbytes, cstr_null); // fbytes is used for historical reasons, should be pcbytes, but makes
// no sense to change.
dotfile.m_fields.set(cstr_fbytes, dotdoc.pcbytes, cstr_null);
dotfile.m_fields.set("udi", udi, cstr_null); dotfile.m_fields.set("udi", udi, cstr_null);
string fdata; string fdata;
file_to_string(path, fdata); file_to_string(path, fdata);

View File

@ -458,7 +458,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
char cbuf[100]; char cbuf[100];
sprintf(cbuf, OFFTPC, stp->st_size); sprintf(cbuf, OFFTPC, stp->st_size);
doc.fbytes = cbuf; doc.pcbytes = cbuf;
// Document signature for up to date checks: concatenate // Document signature for up to date checks: concatenate
// m/ctime and size. Looking for changes only, no need to // m/ctime and size. Looking for changes only, no need to
// parseback so no need for reversible formatting. Also set, // parseback so no need for reversible formatting. Also set,
@ -519,7 +519,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
char cbuf[100]; char cbuf[100];
sprintf(cbuf, OFFTPC, stp->st_size); sprintf(cbuf, OFFTPC, stp->st_size);
fileDoc.fbytes = cbuf; fileDoc.pcbytes = cbuf;
// Document signature for up to date checks. // Document signature for up to date checks.
makesig(stp, fileDoc.sig); makesig(stp, fileDoc.sig);
#ifdef IDX_THREADS #ifdef IDX_THREADS

View File

@ -121,6 +121,16 @@ namespace Dijon
*/ */
virtual bool set_document_uri(const std::string &uri) = 0; virtual bool set_document_uri(const std::string &uri) = 0;
/** Set the document size meta_data element. This is the size
of the immediate containing file (i.e. a .doc, a .odt), not
the size of, e.g., a containing archive or .gz, nor the size
of the extracted text. This is set externally, because the
surrounding code quite often has a better idea about it
(having created a temp file, etc.), and this saves more
stat() calls. The value is stored inside metaData, under the
docsize key.
@param size the document size, as determined by the caller
*/
virtual void set_docsize(size_t size) = 0;
// Going from one nested document to the next. // Going from one nested document to the next.

View File

@ -239,6 +239,8 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
l_mime = *imime; l_mime = *imime;
} }
size_t docsize = stp->st_size;
if (!l_mime.empty()) { if (!l_mime.empty()) {
// Has mime: check for a compressed file. If so, create a // Has mime: check for a compressed file. If so, create a
// temporary uncompressed file, and rerun the mime type // temporary uncompressed file, and rerun the mime type
@ -255,8 +257,16 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
LOGDEB1(("FileInterner:: after ucomp: m_tdir %s, tfile %s\n", LOGDEB1(("FileInterner:: after ucomp: m_tdir %s, tfile %s\n",
m_tdir.dirname(), m_tfile.c_str())); m_tdir.dirname(), m_tfile.c_str()));
m_fn = m_tfile; m_fn = m_tfile;
// Note: still using the original file's stat. right ? // Stat the uncompressed file, mainly to get the size
l_mime = mimetype(m_fn, stp, m_cfg, usfci); struct stat ucstat;
if (stat(m_fn.c_str(), &ucstat) != 0) {
LOGERR(("FileInterner: can't stat the uncompressed file"
"[%s] errno %d\n", m_fn.c_str(), errno));
return;
} else {
docsize = ucstat.st_size;
}
l_mime = mimetype(m_fn, &ucstat, m_cfg, usfci);
if (l_mime.empty() && imime) if (l_mime.empty() && imime)
l_mime = *imime; l_mime = *imime;
} else { } else {
@ -294,6 +304,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
reapXAttrs(f); reapXAttrs(f);
#endif //RCL_USE_XATTR #endif //RCL_USE_XATTR
df->set_docsize(docsize);
if (!df->set_document_file(m_fn)) { if (!df->set_document_file(m_fn)) {
LOGERR(("FileInterner:: error converting %s\n", m_fn.c_str())); LOGERR(("FileInterner:: error converting %s\n", m_fn.c_str()));
return; return;
@ -335,6 +346,7 @@ void FileInterner::init(const string &data, RclConfig *cnf,
m_forPreview ? "view" : "index"); m_forPreview ? "view" : "index");
bool setres = false; bool setres = false;
df->set_docsize(data.length());
if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
setres = df->set_document_string(data); setres = df->set_document_string(data);
} else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) { } else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {
@ -652,17 +664,30 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
it != docdata.end(); it++) { it != docdata.end(); it++) {
if (it->first == cstr_dj_keycontent) { if (it->first == cstr_dj_keycontent) {
doc.text = it->second; doc.text = it->second;
if (doc.fbytes.empty()) {
// It's normally set by walking the filter stack, in
// collectIpathAndMT, which was called before us. It
// can happen that the doc size is still empty at this
// point if the last container filter is directly
// returning text/plain content, so that there is no
// ipath-less filter at the top.
char cbuf[30];
sprintf(cbuf, "%d", int(doc.text.length()));
doc.fbytes = cbuf;
}
} else if (it->first == cstr_dj_keymd) { } else if (it->first == cstr_dj_keymd) {
doc.dmtime = it->second; doc.dmtime = it->second;
} else if (it->first == cstr_dj_keyorigcharset) { } else if (it->first == cstr_dj_keyorigcharset) {
doc.origcharset = it->second; doc.origcharset = it->second;
} else if (it->first == cstr_dj_keymt || it->first == cstr_dj_keycharset) { } else if (it->first == cstr_dj_keymt ||
it->first == cstr_dj_keycharset) {
// don't need/want these. // don't need/want these.
} else { } else {
doc.meta[it->first] = it->second; doc.meta[it->first] = it->second;
} }
} }
if (doc.meta[Rcl::Doc::keyabs].empty() && !doc.meta[cstr_dj_keyds].empty()) { if (doc.meta[Rcl::Doc::keyabs].empty() &&
!doc.meta[cstr_dj_keyds].empty()) {
doc.meta[Rcl::Doc::keyabs] = doc.meta[cstr_dj_keyds]; doc.meta[Rcl::Doc::keyabs] = doc.meta[cstr_dj_keyds];
doc.meta.erase(cstr_dj_keyds); doc.meta.erase(cstr_dj_keyds);
} }
@ -670,11 +695,20 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
} }
// Collect the ipath from the current path in the document tree. // Collect the ipath from the current path in the document tree.
// While we're at it, we also set the mimetype and filename, which are special // While we're at it, we also set the mimetype and filename,
// properties: we want to get them from the topmost doc // which are special properties: we want to get them from the topmost
// with an ipath, not the last one which is usually text/plain // doc with an ipath, not the last one which is usually text/plain We
// We also set the author and modification time from the last doc // also set the author and modification time from the last doc which
// which has them. // has them.
//
// The docsize is fetched from the first element without an ipath
// (first non container). If the last element directly returns
// text/plain so that there is no ipath-less element, the value will
// be set in dijontorcl().
//
// The whole thing is a bit messy but it's not obvious how it should
// be cleaned up as the "inheritance" rules inside the stack are
// actually complicated.
void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
{ {
LOGDEB2(("FileInterner::collectIpathAndMT\n")); LOGDEB2(("FileInterner::collectIpathAndMT\n"));
@ -702,9 +736,14 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
hasipath = true; hasipath = true;
getKeyValue(docdata, cstr_dj_keymt, doc.mimetype); getKeyValue(docdata, cstr_dj_keymt, doc.mimetype);
getKeyValue(docdata, cstr_dj_keyfn, doc.utf8fn); getKeyValue(docdata, cstr_dj_keyfn, doc.utf8fn);
} else {
if (doc.fbytes.empty())
getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes);
} }
doc.ipath += colon_hide(ipathel) + cstr_isep; doc.ipath += colon_hide(ipathel) + cstr_isep;
} else { } else {
if (doc.fbytes.empty())
getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes);
doc.ipath += cstr_isep; doc.ipath += cstr_isep;
} }
getKeyValue(docdata, cstr_dj_keyauthor, doc.meta[Rcl::Doc::keyau]); getKeyValue(docdata, cstr_dj_keyauthor, doc.meta[Rcl::Doc::keyau]);
@ -793,6 +832,7 @@ int FileInterner::addHandler()
txt = &it->second; txt = &it->second;
} }
bool setres = false; bool setres = false;
newflt->set_docsize(txt->length());
if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
setres = newflt->set_document_string(*txt); setres = newflt->set_document_string(*txt);
} else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) { } else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {

View File

@ -18,6 +18,8 @@
#define _MIMEHANDLER_H_INCLUDED_ #define _MIMEHANDLER_H_INCLUDED_
#include "autoconfig.h" #include "autoconfig.h"
#include <stdio.h>
#include <string> #include <string>
#include <list> #include <list>
using std::string; using std::string;
@ -66,6 +68,13 @@ public:
return set_document_string(string(cp, sz)); return set_document_string(string(cp, sz));
} }
/** Record the document size in the filter metadata.
 *
 * The value is formatted as a decimal string and stored in m_metaData
 * under the cstr_dj_keydocsize key.
 * @param size the size value supplied by the caller
 */
virtual void set_docsize(size_t size)
{
    // size_t is unsigned, so format it as unsigned rather than going
    // through a signed cast, and bound the write to the buffer size
    // instead of relying on the buffer being large enough.
    char csize[30];
    snprintf(csize, sizeof(csize), "%llu",
             static_cast<unsigned long long>(size));
    m_metaData[cstr_dj_keydocsize] = csize;
}
virtual bool has_documents() const {return m_havedoc;} virtual bool has_documents() const {return m_havedoc;}
// Most doc types are single-doc // Most doc types are single-doc

View File

@ -210,6 +210,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
doc.syntabs = true; doc.syntabs = true;
} }
parms.get(Doc::keyipt, doc.ipath); parms.get(Doc::keyipt, doc.ipath);
parms.get(Doc::keypcs, doc.pcbytes);
parms.get(Doc::keyfs, doc.fbytes); parms.get(Doc::keyfs, doc.fbytes);
parms.get(Doc::keyds, doc.dbytes); parms.get(Doc::keyds, doc.dbytes);
parms.get(Doc::keysig, doc.sig); parms.get(Doc::keysig, doc.sig);
@ -1254,17 +1255,21 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
} }
RECORD_APPEND(record, Doc::keyoc, doc.origcharset); RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
if (doc.fbytes.empty())
doc.fbytes = doc.pcbytes;
if (!doc.fbytes.empty()) if (!doc.fbytes.empty())
RECORD_APPEND(record, Doc::keyfs, doc.fbytes); RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
if (!doc.pcbytes.empty())
RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
char sizebuf[30];
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
RECORD_APPEND(record, Doc::keyds, sizebuf);
// Note that we add the signature both as a value and in the data record // Note that we add the signature both as a value and in the data record
if (!doc.sig.empty()) if (!doc.sig.empty())
RECORD_APPEND(record, Doc::keysig, doc.sig); RECORD_APPEND(record, Doc::keysig, doc.sig);
newdocument.add_value(VALUE_SIG, doc.sig); newdocument.add_value(VALUE_SIG, doc.sig);
char sizebuf[30];
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
RECORD_APPEND(record, Doc::keyds, sizebuf);
if (!doc.ipath.empty()) if (!doc.ipath.empty())
RECORD_APPEND(record, Doc::keyipt, doc.ipath); RECORD_APPEND(record, Doc::keyipt, doc.ipath);

View File

@ -27,6 +27,7 @@ namespace Rcl {
const string Doc::keydmt("dmtime"); const string Doc::keydmt("dmtime");
const string Doc::keymt("mtime"); const string Doc::keymt("mtime");
const string Doc::keyoc("origcharset"); const string Doc::keyoc("origcharset");
const string Doc::keypcs("pcbytes");
const string Doc::keyfs("fbytes"); const string Doc::keyfs("fbytes");
const string Doc::keyds("dbytes"); const string Doc::keyds("dbytes");
const string Doc::keysz("size"); const string Doc::keysz("size");
@ -53,6 +54,7 @@ namespace Rcl {
LOGDEB(("Rcl::Doc::dump: dmtime: [%s]\n", dmtime.c_str())); LOGDEB(("Rcl::Doc::dump: dmtime: [%s]\n", dmtime.c_str()));
LOGDEB(("Rcl::Doc::dump: origcharset: [%s]\n", origcharset.c_str())); LOGDEB(("Rcl::Doc::dump: origcharset: [%s]\n", origcharset.c_str()));
LOGDEB(("Rcl::Doc::dump: syntabs: [%d]\n", syntabs)); LOGDEB(("Rcl::Doc::dump: syntabs: [%d]\n", syntabs));
LOGDEB(("Rcl::Doc::dump: pcbytes: [%s]\n", pcbytes.c_str()));
LOGDEB(("Rcl::Doc::dump: fbytes: [%s]\n", fbytes.c_str())); LOGDEB(("Rcl::Doc::dump: fbytes: [%s]\n", fbytes.c_str()));
LOGDEB(("Rcl::Doc::dump: dbytes: [%s]\n", dbytes.c_str())); LOGDEB(("Rcl::Doc::dump: dbytes: [%s]\n", dbytes.c_str()));
LOGDEB(("Rcl::Doc::dump: sig: [%s]\n", sig.c_str())); LOGDEB(("Rcl::Doc::dump: sig: [%s]\n", sig.c_str()));

View File

@ -87,15 +87,20 @@ class Doc {
// as an indicative prefix at the beginning of the abstract (ugly hack) // as an indicative prefix at the beginning of the abstract (ugly hack)
bool syntabs; bool syntabs;
// File size. Index: Set by caller prior to Db::Add. Query: set by // File size. This is the size of the compressed file or of the
// rcldb from index doc data. Historically this always has // external containing archive.
// represented the whole file size (as from stat()), but there // Index: Set by caller prior to Db::Add.
// would be a need for a 3rd value for multidoc files (file // Query: not set currently (not stored)
// size/doc size/ doc text size) string pcbytes;
// Document size, ie, size of the .odt or .xls.
// Index: Set in internfile from the filter stack
// Query: set from data record
string fbytes; string fbytes;
// Doc text size. Index: from text.length(). Query: set by rcldb from // Doc text size.
// index doc data. // Index: from text.length().
// Query: set by rcldb from index data record
string dbytes; string dbytes;
// Doc signature. Used for up to date checks. // Doc signature. Used for up to date checks.
@ -126,6 +131,7 @@ class Doc {
origcharset.erase(); origcharset.erase();
meta.clear(); meta.clear();
syntabs = false; syntabs = false;
pcbytes.erase();
fbytes.erase(); fbytes.erase();
dbytes.erase(); dbytes.erase();
sig.erase(); sig.erase();
@ -163,9 +169,10 @@ class Doc {
static const string keydmt; // document mtime static const string keydmt; // document mtime
static const string keymt; // mtime dmtime if set else fmtime static const string keymt; // mtime dmtime if set else fmtime
static const string keyoc; // original charset static const string keyoc; // original charset
static const string keyfs; // file size static const string keypcs; // document outer container size
static const string keyds; // document size static const string keyfs; // document size
static const string keysz; // dbytes if set else fbytes static const string keyds; // document text size
static const string keysz; // dbytes if set else fbytes else pcbytes
static const string keysig; // sig static const string keysig; // sig
static const string keyrr; // relevancy rating static const string keyrr; // relevancy rating
static const string keycc; // Collapse count static const string keycc; // Collapse count

View File

@ -64,7 +64,8 @@ public:
: m_fld(docfToDatf(f) + "=") : m_fld(docfToDatf(f) + "=")
{ {
m_ismtime = !m_fld.compare("dmtime="); m_ismtime = !m_fld.compare("dmtime=");
m_issize = !m_fld.compare("fbytes=") || !m_fld.compare("dbytes="); m_issize = !m_fld.compare("fbytes=") || !m_fld.compare("dbytes=") ||
!m_fld.compare("pcbytes=");
} }
virtual std::string operator()(const Xapian::Document& xdoc) const virtual std::string operator()(const Xapian::Document& xdoc) const