From 85166c93b24797cdbd6de668f114a8f0db2a4fcb Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Wed, 7 Mar 2012 15:39:30 +0100 Subject: [PATCH] Changed the way we handle document sizes. The fbytes field should now be in most cases the most "natural" document size. pcbytes holds the top external container size and dbytes the text size --- src/common/beaglequeuecache.cpp | 2 +- src/index/beaglequeue.cpp | 10 +++--- src/index/fsindexer.cpp | 4 +-- src/internfile/Filter.h | 10 ++++++ src/internfile/internfile.cpp | 58 ++++++++++++++++++++++++++++----- src/internfile/mimehandler.h | 9 +++++ src/rcldb/rcldb.cpp | 13 +++++--- src/rcldb/rcldoc.cpp | 2 ++ src/rcldb/rcldoc.h | 29 ++++++++++------- src/rcldb/rclquery.cpp | 3 +- 10 files changed, 108 insertions(+), 32 deletions(-) diff --git a/src/common/beaglequeuecache.cpp b/src/common/beaglequeuecache.cpp index 675e6afe..5478bc60 100644 --- a/src/common/beaglequeuecache.cpp +++ b/src/common/beaglequeuecache.cpp @@ -68,7 +68,7 @@ bool BeagleQueueCache::getFromCache(const string& udi, Rcl::Doc &dotdoc, cf.get(cstr_url, dotdoc.url, cstr_null); cf.get(cstr_bgc_mimetype, dotdoc.mimetype, cstr_null); cf.get(cstr_fmtime, dotdoc.fmtime, cstr_null); - cf.get(cstr_fbytes, dotdoc.fbytes, cstr_null); + cf.get(cstr_fbytes, dotdoc.pcbytes, cstr_null); dotdoc.sig.clear(); list names = cf.getNames(cstr_null); for (list::const_iterator it = names.begin(); diff --git a/src/index/beaglequeue.cpp b/src/index/beaglequeue.cpp index c7797318..8542cb09 100644 --- a/src/index/beaglequeue.cpp +++ b/src/index/beaglequeue.cpp @@ -242,7 +242,7 @@ bool BeagleQueueIndexer::indexFromCache(const string& udi) doc.mimetype = dotdoc.mimetype; doc.fmtime = dotdoc.fmtime; doc.url = dotdoc.url; - doc.fbytes = dotdoc.fbytes; + doc.pcbytes = dotdoc.pcbytes; doc.sig.clear(); doc.meta[Rcl::Doc::keybcknd] = "BGL"; return m_db->addOrUpdate(udi, cstr_null, doc); @@ -405,7 +405,7 @@ BeagleQueueIndexer::processone(const string &path, char cbuf[100]; sprintf(cbuf, 
OFFTPC, stp->st_size); - dotdoc.fbytes = cbuf; + dotdoc.pcbytes = cbuf; // Document signature for up to date checks: none. dotdoc.sig.clear(); @@ -452,7 +452,7 @@ BeagleQueueIndexer::processone(const string &path, char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); - doc.fbytes = cbuf; + doc.pcbytes = cbuf; // Document signature for up to date checks: none. doc.sig.clear(); doc.url = dotdoc.url; @@ -466,7 +466,9 @@ BeagleQueueIndexer::processone(const string &path, { // doc fields not in meta, needing saving to the cache dotfile.m_fields.set("fmtime", dotdoc.fmtime, cstr_null); - dotfile.m_fields.set("fbytes", dotdoc.fbytes, cstr_null); + // fbytes is used for historical reasons, should be pcbytes, but makes + // no sense to change. + dotfile.m_fields.set(cstr_fbytes, dotdoc.pcbytes, cstr_null); dotfile.m_fields.set("udi", udi, cstr_null); string fdata; file_to_string(path, fdata); diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index 997cae48..9cd4715f 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -458,7 +458,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); - doc.fbytes = cbuf; + doc.pcbytes = cbuf; // Document signature for up to date checks: concatenate // m/ctime and size. Looking for changes only, no need to // parseback so no need for reversible formatting. Also set, @@ -519,7 +519,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); - fileDoc.fbytes = cbuf; + fileDoc.pcbytes = cbuf; // Document signature for up to date checks. makesig(stp, fileDoc.sig); #ifdef IDX_THREADS diff --git a/src/internfile/Filter.h b/src/internfile/Filter.h index 13f35572..4ed70a1b 100644 --- a/src/internfile/Filter.h +++ b/src/internfile/Filter.h @@ -121,6 +121,16 @@ namespace Dijon */ virtual bool set_document_uri(const std::string &uri) = 0; + /** Set the document size meta_data element. 
This is the size + of the immediate containing file (e.g. a .doc, a .odt), not + the size of, e.g., a containing archive or .gz nor the size + of the extracted text. This is set externally, because the + surrounding code quite often has a better idea about it + (having created a temp file, etc.), and this saves more + stat() calls. The value is stored inside metaData, docsize + key + */ + virtual void set_docsize(size_t size) = 0; // Going from one nested document to the next. diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index b6bdeb7c..4c8235c6 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -239,6 +239,8 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, l_mime = *imime; } + size_t docsize = stp->st_size; + if (!l_mime.empty()) { // Has mime: check for a compressed file. If so, create a // temporary uncompressed file, and rerun the mime type @@ -255,8 +257,16 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, LOGDEB1(("FileInterner:: after ucomp: m_tdir %s, tfile %s\n", m_tdir.dirname(), m_tfile.c_str())); m_fn = m_tfile; - // Note: still using the original file's stat. right ? 
- l_mime = mimetype(m_fn, stp, m_cfg, usfci); + // Stat the uncompressed file, mainly to get the size + struct stat ucstat; + if (stat(m_fn.c_str(), &ucstat) != 0) { + LOGERR(("FileInterner: can't stat the uncompressed file" + "[%s] errno %d\n", m_fn.c_str(), errno)); + return; + } else { + docsize = ucstat.st_size; + } + l_mime = mimetype(m_fn, &ucstat, m_cfg, usfci); if (l_mime.empty() && imime) l_mime = *imime; } else { @@ -294,6 +304,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, reapXAttrs(f); #endif //RCL_USE_XATTR + df->set_docsize(docsize); if (!df->set_document_file(m_fn)) { LOGERR(("FileInterner:: error converting %s\n", m_fn.c_str())); return; @@ -335,6 +346,7 @@ void FileInterner::init(const string &data, RclConfig *cnf, m_forPreview ? "view" : "index"); bool setres = false; + df->set_docsize(data.length()); if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { setres = df->set_document_string(data); } else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) { @@ -652,17 +664,30 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) it != docdata.end(); it++) { if (it->first == cstr_dj_keycontent) { doc.text = it->second; + if (doc.fbytes.empty()) { + // It's normally set by walking the filter stack, in + // collectIpathAndMt, which was called before us. It + // can happen that the doc size is still empty at this + // point if the last container filter is directly + // returning text/plain content, so that there is no + // ipath-less filter at the top + char cbuf[30]; + sprintf(cbuf, "%d", int(doc.text.length())); + doc.fbytes = cbuf; + } } else if (it->first == cstr_dj_keymd) { doc.dmtime = it->second; } else if (it->first == cstr_dj_keyorigcharset) { doc.origcharset = it->second; - } else if (it->first == cstr_dj_keymt || it->first == cstr_dj_keycharset) { + } else if (it->first == cstr_dj_keymt || + it->first == cstr_dj_keycharset) { // don't need/want these. 
} else { doc.meta[it->first] = it->second; } } - if (doc.meta[Rcl::Doc::keyabs].empty() && !doc.meta[cstr_dj_keyds].empty()) { + if (doc.meta[Rcl::Doc::keyabs].empty() && + !doc.meta[cstr_dj_keyds].empty()) { doc.meta[Rcl::Doc::keyabs] = doc.meta[cstr_dj_keyds]; doc.meta.erase(cstr_dj_keyds); } @@ -670,11 +695,20 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) } // Collect the ipath from the current path in the document tree. -// While we're at it, we also set the mimetype and filename, which are special -// properties: we want to get them from the topmost doc -// with an ipath, not the last one which is usually text/plain -// We also set the author and modification time from the last doc -// which has them. +// While we're at it, we also set the mimetype and filename, +// which are special properties: we want to get them from the topmost +// doc with an ipath, not the last one which is usually text/plain. We +// also set the author and modification time from the last doc which +// has them. +// +// The docsize is fetched from the first element without an ipath +// (first non-container). If the last element directly returns +// text/plain so that there is no ipath-less element, the value will +// be set in dijontorcl(). +// +// The whole thing is a bit messy but it's not obvious how it should +// be cleaned up as the "inheritance" rules inside the stack are +// actually complicated. 
void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const { LOGDEB2(("FileInterner::collectIpathAndMT\n")); @@ -702,9 +736,14 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const hasipath = true; getKeyValue(docdata, cstr_dj_keymt, doc.mimetype); getKeyValue(docdata, cstr_dj_keyfn, doc.utf8fn); + } else { + if (doc.fbytes.empty()) + getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes); } doc.ipath += colon_hide(ipathel) + cstr_isep; } else { + if (doc.fbytes.empty()) + getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes); doc.ipath += cstr_isep; } getKeyValue(docdata, cstr_dj_keyauthor, doc.meta[Rcl::Doc::keyau]); @@ -793,6 +832,7 @@ int FileInterner::addHandler() txt = &it->second; } bool setres = false; + newflt->set_docsize(txt->length()); if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { setres = newflt->set_document_string(*txt); } else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) { diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 964ec81e..87640a74 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -18,6 +18,8 @@ #define _MIMEHANDLER_H_INCLUDED_ #include "autoconfig.h" +#include + #include #include using std::string; @@ -66,6 +68,13 @@ public: return set_document_string(string(cp, sz)); } + virtual void set_docsize(size_t size) + { + char csize[30]; + sprintf(csize, "%lld", (long long)size); + m_metaData[cstr_dj_keydocsize] = csize; + } + virtual bool has_documents() const {return m_havedoc;} // Most doc types are single-doc diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 44c87a6d..c83c043f 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -210,6 +210,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, doc.syntabs = true; } parms.get(Doc::keyipt, doc.ipath); + parms.get(Doc::keypcs, doc.pcbytes); parms.get(Doc::keyfs, doc.fbytes); parms.get(Doc::keyds, doc.dbytes); parms.get(Doc::keysig, doc.sig); @@ -1254,17 
+1255,21 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, } RECORD_APPEND(record, Doc::keyoc, doc.origcharset); + if (doc.fbytes.empty()) + doc.fbytes = doc.pcbytes; if (!doc.fbytes.empty()) RECORD_APPEND(record, Doc::keyfs, doc.fbytes); + if (!doc.pcbytes.empty()) + RECORD_APPEND(record, Doc::keypcs, doc.pcbytes); + char sizebuf[30]; + sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); + RECORD_APPEND(record, Doc::keyds, sizebuf); + // Note that we add the signature both as a value and in the data record if (!doc.sig.empty()) RECORD_APPEND(record, Doc::keysig, doc.sig); newdocument.add_value(VALUE_SIG, doc.sig); - char sizebuf[30]; - sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); - RECORD_APPEND(record, Doc::keyds, sizebuf); - if (!doc.ipath.empty()) RECORD_APPEND(record, Doc::keyipt, doc.ipath); diff --git a/src/rcldb/rcldoc.cpp b/src/rcldb/rcldoc.cpp index 070ed505..8a2e9eef 100644 --- a/src/rcldb/rcldoc.cpp +++ b/src/rcldb/rcldoc.cpp @@ -27,6 +27,7 @@ namespace Rcl { const string Doc::keydmt("dmtime"); const string Doc::keymt("mtime"); const string Doc::keyoc("origcharset"); + const string Doc::keypcs("pcbytes"); const string Doc::keyfs("fbytes"); const string Doc::keyds("dbytes"); const string Doc::keysz("size"); @@ -53,6 +54,7 @@ namespace Rcl { LOGDEB(("Rcl::Doc::dump: dmtime: [%s]\n", dmtime.c_str())); LOGDEB(("Rcl::Doc::dump: origcharset: [%s]\n", origcharset.c_str())); LOGDEB(("Rcl::Doc::dump: syntabs: [%d]\n", syntabs)); + LOGDEB(("Rcl::Doc::dump: pcbytes: [%s]\n", pcbytes.c_str())); LOGDEB(("Rcl::Doc::dump: fbytes: [%s]\n", fbytes.c_str())); LOGDEB(("Rcl::Doc::dump: dbytes: [%s]\n", dbytes.c_str())); LOGDEB(("Rcl::Doc::dump: sig: [%s]\n", sig.c_str())); diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index f243a39d..a06dada4 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -87,15 +87,20 @@ class Doc { // as an indicative prefix at the beginning of the abstract (ugly hack) bool syntabs; - // File size. 
Index: Set by caller prior to Db::Add. Query: set by - // rcldb from index doc data. Historically this always has - // represented the whole file size (as from stat()), but there - // would be a need for a 3rd value for multidoc files (file - // size/doc size/ doc text size) - string fbytes; + // File size. This is the size of the compressed file or of the + // external containing archive. + // Index: Set by caller prior to Db::Add. + // Query: not set currently (not stored) + string pcbytes; - // Doc text size. Index: from text.length(). Query: set by rcldb from - // index doc data. + // Document size, ie, size of the .odt or .xls. + // Index: Set in internfile from the filter stack + // Query: set from data record + string fbytes; + + // Doc text size. + // Index: from text.length(). + // Query: set by rcldb from index data record string dbytes; // Doc signature. Used for up to date checks. @@ -126,6 +131,7 @@ class Doc { origcharset.erase(); meta.clear(); syntabs = false; + pcbytes.erase(); fbytes.erase(); dbytes.erase(); sig.erase(); @@ -163,9 +169,10 @@ class Doc { static const string keydmt; // document mtime static const string keymt; // mtime dmtime if set else fmtime static const string keyoc; // original charset - static const string keyfs; // file size - static const string keyds; // document size - static const string keysz; // dbytes if set else fbytes + static const string keypcs; // document outer container size + static const string keyfs; // document size + static const string keyds; // document text size + static const string keysz; // dbytes if set else fbytes else pcbytes static const string keysig; // sig static const string keyrr; // relevancy rating static const string keycc; // Collapse count diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index 6755d0be..8aa43346 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -64,7 +64,8 @@ public: : m_fld(docfToDatf(f) + "=") { m_ismtime = !m_fld.compare("dmtime="); - 
m_issize = !m_fld.compare("fbytes=") || !m_fld.compare("dbytes="); + m_issize = !m_fld.compare("fbytes=") || !m_fld.compare("dbytes=") || + !m_fld.compare("pcbytes="); } virtual std::string operator()(const Xapian::Document& xdoc) const