diff --git a/src/common/beaglequeuecache.cpp b/src/common/beaglequeuecache.cpp index 675e6afe..5478bc60 100644 --- a/src/common/beaglequeuecache.cpp +++ b/src/common/beaglequeuecache.cpp @@ -68,7 +68,7 @@ bool BeagleQueueCache::getFromCache(const string& udi, Rcl::Doc &dotdoc, cf.get(cstr_url, dotdoc.url, cstr_null); cf.get(cstr_bgc_mimetype, dotdoc.mimetype, cstr_null); cf.get(cstr_fmtime, dotdoc.fmtime, cstr_null); - cf.get(cstr_fbytes, dotdoc.fbytes, cstr_null); + cf.get(cstr_fbytes, dotdoc.pcbytes, cstr_null); dotdoc.sig.clear(); list names = cf.getNames(cstr_null); for (list::const_iterator it = names.begin(); diff --git a/src/index/beaglequeue.cpp b/src/index/beaglequeue.cpp index c7797318..8542cb09 100644 --- a/src/index/beaglequeue.cpp +++ b/src/index/beaglequeue.cpp @@ -242,7 +242,7 @@ bool BeagleQueueIndexer::indexFromCache(const string& udi) doc.mimetype = dotdoc.mimetype; doc.fmtime = dotdoc.fmtime; doc.url = dotdoc.url; - doc.fbytes = dotdoc.fbytes; + doc.pcbytes = dotdoc.pcbytes; doc.sig.clear(); doc.meta[Rcl::Doc::keybcknd] = "BGL"; return m_db->addOrUpdate(udi, cstr_null, doc); @@ -405,7 +405,7 @@ BeagleQueueIndexer::processone(const string &path, char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); - dotdoc.fbytes = cbuf; + dotdoc.pcbytes = cbuf; // Document signature for up to date checks: none. dotdoc.sig.clear(); @@ -452,7 +452,7 @@ BeagleQueueIndexer::processone(const string &path, char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); - doc.fbytes = cbuf; + doc.pcbytes = cbuf; // Document signature for up to date checks: none. doc.sig.clear(); doc.url = dotdoc.url; @@ -466,7 +466,9 @@ BeagleQueueIndexer::processone(const string &path, { // doc fields not in meta, needing saving to the cache dotfile.m_fields.set("fmtime", dotdoc.fmtime, cstr_null); - dotfile.m_fields.set("fbytes", dotdoc.fbytes, cstr_null); + // fbytes is used for historical reasons, should be pcbytes, but makes + // no sense to change. 
+ dotfile.m_fields.set(cstr_fbytes, dotdoc.pcbytes, cstr_null); dotfile.m_fields.set("udi", udi, cstr_null); string fdata; file_to_string(path, fdata); diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index 997cae48..9cd4715f 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -458,7 +458,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); - doc.fbytes = cbuf; + doc.pcbytes = cbuf; // Document signature for up to date checks: concatenate // m/ctime and size. Looking for changes only, no need to // parseback so no need for reversible formatting. Also set, @@ -519,7 +519,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, char cbuf[100]; sprintf(cbuf, OFFTPC, stp->st_size); - fileDoc.fbytes = cbuf; + fileDoc.pcbytes = cbuf; // Document signature for up to date checks. makesig(stp, fileDoc.sig); #ifdef IDX_THREADS diff --git a/src/internfile/Filter.h b/src/internfile/Filter.h index 13f35572..4ed70a1b 100644 --- a/src/internfile/Filter.h +++ b/src/internfile/Filter.h @@ -121,6 +121,16 @@ namespace Dijon */ virtual bool set_document_uri(const std::string &uri) = 0; + /** Set the document size meta_data element. This is the size + of the immediate containing file (e.g. a .doc, a .odt), not + the size of, e.g., a containing archive or .gz nor the size + of the extracted text. This is set externally, because the + surrounding code quite often has a better idea about it + (having created a temp file, etc.), and this saves more + stat() calls. The value is stored inside metaData, docsize + key + */ + virtual void set_docsize(size_t size) = 0; // Going from one nested document to the next. 
diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index b6bdeb7c..4c8235c6 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -239,6 +239,8 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, l_mime = *imime; } + size_t docsize = stp->st_size; + if (!l_mime.empty()) { // Has mime: check for a compressed file. If so, create a // temporary uncompressed file, and rerun the mime type @@ -255,8 +257,16 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, LOGDEB1(("FileInterner:: after ucomp: m_tdir %s, tfile %s\n", m_tdir.dirname(), m_tfile.c_str())); m_fn = m_tfile; - // Note: still using the original file's stat. right ? - l_mime = mimetype(m_fn, stp, m_cfg, usfci); + // Stat the uncompressed file, mainly to get the size + struct stat ucstat; + if (stat(m_fn.c_str(), &ucstat) != 0) { + LOGERR(("FileInterner: can't stat the uncompressed file" + "[%s] errno %d\n", m_fn.c_str(), errno)); + return; + } else { + docsize = ucstat.st_size; + } + l_mime = mimetype(m_fn, &ucstat, m_cfg, usfci); if (l_mime.empty() && imime) l_mime = *imime; } else { @@ -294,6 +304,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf, reapXAttrs(f); #endif //RCL_USE_XATTR + df->set_docsize(docsize); if (!df->set_document_file(m_fn)) { LOGERR(("FileInterner:: error converting %s\n", m_fn.c_str())); return; @@ -335,6 +346,7 @@ void FileInterner::init(const string &data, RclConfig *cnf, m_forPreview ? 
"view" : "index"); bool setres = false; + df->set_docsize(data.length()); if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { setres = df->set_document_string(data); } else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) { @@ -652,17 +664,30 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) it != docdata.end(); it++) { if (it->first == cstr_dj_keycontent) { doc.text = it->second; + if (doc.fbytes.empty()) { + // It's normally set by walking the filter stack, in + // collectIpathAndMt, which was called before us. It + // can happen that the doc size is still empty at this + // point if the last container filter is directly + // returning text/plain content, so that there is no + // ipath-less filter at the top + char cbuf[30]; + sprintf(cbuf, "%d", int(doc.text.length())); + doc.fbytes = cbuf; + } } else if (it->first == cstr_dj_keymd) { doc.dmtime = it->second; } else if (it->first == cstr_dj_keyorigcharset) { doc.origcharset = it->second; - } else if (it->first == cstr_dj_keymt || it->first == cstr_dj_keycharset) { + } else if (it->first == cstr_dj_keymt || + it->first == cstr_dj_keycharset) { // don't need/want these. } else { doc.meta[it->first] = it->second; } } - if (doc.meta[Rcl::Doc::keyabs].empty() && !doc.meta[cstr_dj_keyds].empty()) { + if (doc.meta[Rcl::Doc::keyabs].empty() && + !doc.meta[cstr_dj_keyds].empty()) { doc.meta[Rcl::Doc::keyabs] = doc.meta[cstr_dj_keyds]; doc.meta.erase(cstr_dj_keyds); } @@ -670,11 +695,20 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc) } // Collect the ipath from the current path in the document tree. -// While we're at it, we also set the mimetype and filename, which are special -// properties: we want to get them from the topmost doc -// with an ipath, not the last one which is usually text/plain -// We also set the author and modification time from the last doc -// which has them. 
+// While we're at it, we also set the mimetype and filename, +// which are special properties: we want to get them from the topmost +// doc with an ipath, not the last one which is usually text/plain. We +// also set the author and modification time from the last doc which +// has them. +// +// The docsize is fetched from the first element without an ipath +// (first non container). If the last element directly returns +// text/plain so that there is no ipath-less element, the value will +// be set in dijontorcl(). +// +// The whole thing is a bit messy but it's not obvious how it should +// be cleaned up as the "inheritance" rules inside the stack are +// actually complicated. void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const { LOGDEB2(("FileInterner::collectIpathAndMT\n")); @@ -702,9 +736,14 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const hasipath = true; getKeyValue(docdata, cstr_dj_keymt, doc.mimetype); getKeyValue(docdata, cstr_dj_keyfn, doc.utf8fn); + } else { + if (doc.fbytes.empty()) + getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes); } doc.ipath += colon_hide(ipathel) + cstr_isep; } else { + if (doc.fbytes.empty()) + getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes); doc.ipath += cstr_isep; } getKeyValue(docdata, cstr_dj_keyauthor, doc.meta[Rcl::Doc::keyau]); @@ -793,6 +832,7 @@ int FileInterner::addHandler() txt = &it->second; } bool setres = false; + newflt->set_docsize(txt->length()); if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) { setres = newflt->set_document_string(*txt); } else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) { diff --git a/src/internfile/mimehandler.h b/src/internfile/mimehandler.h index 964ec81e..87640a74 100644 --- a/src/internfile/mimehandler.h +++ b/src/internfile/mimehandler.h @@ -18,6 +18,8 @@ #define _MIMEHANDLER_H_INCLUDED_ #include "autoconfig.h" +#include + #include #include using std::string; @@ -66,6 +68,13 @@ public: return set_document_string(string(cp, sz)); } + 
virtual void set_docsize(size_t size) + { + char csize[30]; + sprintf(csize, "%lld", (long long)size); + m_metaData[cstr_dj_keydocsize] = csize; + } + virtual bool has_documents() const {return m_havedoc;} // Most doc types are single-doc diff --git a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp index 44c87a6d..c83c043f 100644 --- a/src/rcldb/rcldb.cpp +++ b/src/rcldb/rcldb.cpp @@ -210,6 +210,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, doc.syntabs = true; } parms.get(Doc::keyipt, doc.ipath); + parms.get(Doc::keypcs, doc.pcbytes); parms.get(Doc::keyfs, doc.fbytes); parms.get(Doc::keyds, doc.dbytes); parms.get(Doc::keysig, doc.sig); @@ -1254,17 +1255,21 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, } RECORD_APPEND(record, Doc::keyoc, doc.origcharset); + if (doc.fbytes.empty()) + doc.fbytes = doc.pcbytes; if (!doc.fbytes.empty()) RECORD_APPEND(record, Doc::keyfs, doc.fbytes); + if (!doc.pcbytes.empty()) + RECORD_APPEND(record, Doc::keypcs, doc.pcbytes); + char sizebuf[30]; + sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); + RECORD_APPEND(record, Doc::keyds, sizebuf); + // Note that we add the signature both as a value and in the data record if (!doc.sig.empty()) RECORD_APPEND(record, Doc::keysig, doc.sig); newdocument.add_value(VALUE_SIG, doc.sig); - char sizebuf[30]; - sprintf(sizebuf, "%u", (unsigned int)doc.text.length()); - RECORD_APPEND(record, Doc::keyds, sizebuf); - if (!doc.ipath.empty()) RECORD_APPEND(record, Doc::keyipt, doc.ipath); diff --git a/src/rcldb/rcldoc.cpp b/src/rcldb/rcldoc.cpp index 070ed505..8a2e9eef 100644 --- a/src/rcldb/rcldoc.cpp +++ b/src/rcldb/rcldoc.cpp @@ -27,6 +27,7 @@ namespace Rcl { const string Doc::keydmt("dmtime"); const string Doc::keymt("mtime"); const string Doc::keyoc("origcharset"); + const string Doc::keypcs("pcbytes"); const string Doc::keyfs("fbytes"); const string Doc::keyds("dbytes"); const string Doc::keysz("size"); @@ -53,6 +54,7 @@ namespace Rcl { 
 LOGDEB(("Rcl::Doc::dump: dmtime: [%s]\n", dmtime.c_str())); LOGDEB(("Rcl::Doc::dump: origcharset: [%s]\n", origcharset.c_str())); LOGDEB(("Rcl::Doc::dump: syntabs: [%d]\n", syntabs)); + LOGDEB(("Rcl::Doc::dump: pcbytes: [%s]\n", pcbytes.c_str())); LOGDEB(("Rcl::Doc::dump: fbytes: [%s]\n", fbytes.c_str())); LOGDEB(("Rcl::Doc::dump: dbytes: [%s]\n", dbytes.c_str())); LOGDEB(("Rcl::Doc::dump: sig: [%s]\n", sig.c_str())); diff --git a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h index f243a39d..a06dada4 100644 --- a/src/rcldb/rcldoc.h +++ b/src/rcldb/rcldoc.h @@ -87,15 +87,20 @@ class Doc { // as an indicative prefix at the beginning of the abstract (ugly hack) bool syntabs; - // File size. Index: Set by caller prior to Db::Add. Query: set by - // rcldb from index doc data. Historically this always has - // represented the whole file size (as from stat()), but there - // would be a need for a 3rd value for multidoc files (file - // size/doc size/ doc text size) - string fbytes; + // File size. This is the size of the compressed file or of the + // external containing archive. + // Index: Set by caller prior to Db::Add. + // Query: set by rcldb from the index data record, when present + string pcbytes; - // Doc text size. Index: from text.length(). Query: set by rcldb from - // index doc data. + // Document size, ie, size of the .odt or .xls. + // Index: Set in internfile from the filter stack + // Query: set from data record + string fbytes; + + // Doc text size. + // Index: from text.length(). + // Query: set by rcldb from index data record string dbytes; // Doc signature. Used for up to date checks. 
@@ -126,6 +131,7 @@ class Doc { origcharset.erase(); meta.clear(); syntabs = false; + pcbytes.erase(); fbytes.erase(); dbytes.erase(); sig.erase(); @@ -163,9 +169,10 @@ class Doc { static const string keydmt; // document mtime static const string keymt; // mtime dmtime if set else fmtime static const string keyoc; // original charset - static const string keyfs; // file size - static const string keyds; // document size - static const string keysz; // dbytes if set else fbytes + static const string keypcs; // document outer container size + static const string keyfs; // document size + static const string keyds; // document text size + static const string keysz; // dbytes if set else fbytes else pcbytes static const string keysig; // sig static const string keyrr; // relevancy rating static const string keycc; // Collapse count diff --git a/src/rcldb/rclquery.cpp b/src/rcldb/rclquery.cpp index 6755d0be..8aa43346 100644 --- a/src/rcldb/rclquery.cpp +++ b/src/rcldb/rclquery.cpp @@ -64,7 +64,8 @@ public: : m_fld(docfToDatf(f) + "=") { m_ismtime = !m_fld.compare("dmtime="); - m_issize = !m_fld.compare("fbytes=") || !m_fld.compare("dbytes="); + m_issize = !m_fld.compare("fbytes=") || !m_fld.compare("dbytes=") || + !m_fld.compare("pcbytes="); } virtual std::string operator()(const Xapian::Document& xdoc) const