Changed the way we handle document sizes. In most cases, the fbytes field should now hold the most "natural" document size; pcbytes holds the size of the top-level external container, and dbytes holds the text size.
This commit is contained in:
parent
638d468796
commit
85166c93b2
@ -68,7 +68,7 @@ bool BeagleQueueCache::getFromCache(const string& udi, Rcl::Doc &dotdoc,
|
||||
cf.get(cstr_url, dotdoc.url, cstr_null);
|
||||
cf.get(cstr_bgc_mimetype, dotdoc.mimetype, cstr_null);
|
||||
cf.get(cstr_fmtime, dotdoc.fmtime, cstr_null);
|
||||
cf.get(cstr_fbytes, dotdoc.fbytes, cstr_null);
|
||||
cf.get(cstr_fbytes, dotdoc.pcbytes, cstr_null);
|
||||
dotdoc.sig.clear();
|
||||
list<string> names = cf.getNames(cstr_null);
|
||||
for (list<string>::const_iterator it = names.begin();
|
||||
|
||||
@ -242,7 +242,7 @@ bool BeagleQueueIndexer::indexFromCache(const string& udi)
|
||||
doc.mimetype = dotdoc.mimetype;
|
||||
doc.fmtime = dotdoc.fmtime;
|
||||
doc.url = dotdoc.url;
|
||||
doc.fbytes = dotdoc.fbytes;
|
||||
doc.pcbytes = dotdoc.pcbytes;
|
||||
doc.sig.clear();
|
||||
doc.meta[Rcl::Doc::keybcknd] = "BGL";
|
||||
return m_db->addOrUpdate(udi, cstr_null, doc);
|
||||
@ -405,7 +405,7 @@ BeagleQueueIndexer::processone(const string &path,
|
||||
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, OFFTPC, stp->st_size);
|
||||
dotdoc.fbytes = cbuf;
|
||||
dotdoc.pcbytes = cbuf;
|
||||
|
||||
// Document signature for up to date checks: none.
|
||||
dotdoc.sig.clear();
|
||||
@ -452,7 +452,7 @@ BeagleQueueIndexer::processone(const string &path,
|
||||
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, OFFTPC, stp->st_size);
|
||||
doc.fbytes = cbuf;
|
||||
doc.pcbytes = cbuf;
|
||||
// Document signature for up to date checks: none.
|
||||
doc.sig.clear();
|
||||
doc.url = dotdoc.url;
|
||||
@ -466,7 +466,9 @@ BeagleQueueIndexer::processone(const string &path,
|
||||
{
|
||||
// doc fields not in meta, needing saving to the cache
|
||||
dotfile.m_fields.set("fmtime", dotdoc.fmtime, cstr_null);
|
||||
dotfile.m_fields.set("fbytes", dotdoc.fbytes, cstr_null);
|
||||
// fbytes is used for historical reasons, should be pcbytes, but makes
|
||||
// no sense to change.
|
||||
dotfile.m_fields.set(cstr_fbytes, dotdoc.pcbytes, cstr_null);
|
||||
dotfile.m_fields.set("udi", udi, cstr_null);
|
||||
string fdata;
|
||||
file_to_string(path, fdata);
|
||||
|
||||
@ -458,7 +458,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, OFFTPC, stp->st_size);
|
||||
doc.fbytes = cbuf;
|
||||
doc.pcbytes = cbuf;
|
||||
// Document signature for up to date checks: concatenate
|
||||
// m/ctime and size. Looking for changes only, no need to
|
||||
// parseback so no need for reversible formatting. Also set,
|
||||
@ -519,7 +519,7 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
|
||||
|
||||
char cbuf[100];
|
||||
sprintf(cbuf, OFFTPC, stp->st_size);
|
||||
fileDoc.fbytes = cbuf;
|
||||
fileDoc.pcbytes = cbuf;
|
||||
// Document signature for up to date checks.
|
||||
makesig(stp, fileDoc.sig);
|
||||
#ifdef IDX_THREADS
|
||||
|
||||
@ -121,6 +121,16 @@ namespace Dijon
|
||||
*/
|
||||
virtual bool set_document_uri(const std::string &uri) = 0;
|
||||
|
||||
/** Set the document size meta_data element. This is the size
|
||||
of the immediate containing file (ie, a .doc, a .odt), not
|
||||
the size of, ie, a containing archive or .gz nor the size
|
||||
of the extracted text. This is set externally, because the
|
||||
surrounding code quite often has a better idea about it
|
||||
(having created a temp file, etc.), and this saves more
|
||||
stat() calls. The value is stored inside metaData, under the docsize
|
||||
key
|
||||
*/
|
||||
virtual void set_docsize(size_t size) = 0;
|
||||
|
||||
// Going from one nested document to the next.
|
||||
|
||||
|
||||
@ -239,6 +239,8 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
l_mime = *imime;
|
||||
}
|
||||
|
||||
size_t docsize = stp->st_size;
|
||||
|
||||
if (!l_mime.empty()) {
|
||||
// Has mime: check for a compressed file. If so, create a
|
||||
// temporary uncompressed file, and rerun the mime type
|
||||
@ -255,8 +257,16 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
LOGDEB1(("FileInterner:: after ucomp: m_tdir %s, tfile %s\n",
|
||||
m_tdir.dirname(), m_tfile.c_str()));
|
||||
m_fn = m_tfile;
|
||||
// Note: still using the original file's stat. right ?
|
||||
l_mime = mimetype(m_fn, stp, m_cfg, usfci);
|
||||
// Stat the uncompressed file, mainly to get the size
|
||||
struct stat ucstat;
|
||||
if (stat(m_fn.c_str(), &ucstat) != 0) {
|
||||
LOGERR(("FileInterner: can't stat the uncompressed file"
|
||||
"[%s] errno %d\n", m_fn.c_str(), errno));
|
||||
return;
|
||||
} else {
|
||||
docsize = ucstat.st_size;
|
||||
}
|
||||
l_mime = mimetype(m_fn, &ucstat, m_cfg, usfci);
|
||||
if (l_mime.empty() && imime)
|
||||
l_mime = *imime;
|
||||
} else {
|
||||
@ -294,6 +304,7 @@ void FileInterner::init(const string &f, const struct stat *stp, RclConfig *cnf,
|
||||
reapXAttrs(f);
|
||||
#endif //RCL_USE_XATTR
|
||||
|
||||
df->set_docsize(docsize);
|
||||
if (!df->set_document_file(m_fn)) {
|
||||
LOGERR(("FileInterner:: error converting %s\n", m_fn.c_str()));
|
||||
return;
|
||||
@ -335,6 +346,7 @@ void FileInterner::init(const string &data, RclConfig *cnf,
|
||||
m_forPreview ? "view" : "index");
|
||||
|
||||
bool setres = false;
|
||||
df->set_docsize(data.length());
|
||||
if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
|
||||
setres = df->set_document_string(data);
|
||||
} else if (df->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {
|
||||
@ -652,17 +664,30 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
||||
it != docdata.end(); it++) {
|
||||
if (it->first == cstr_dj_keycontent) {
|
||||
doc.text = it->second;
|
||||
if (doc.fbytes.empty()) {
|
||||
// It's normally set by walking the filter stack, in
|
||||
// collectIpathAndMt, which was called before us. It
|
||||
// can happen that the doc size is still empty at this
|
||||
// point if the last container filter is directly
|
||||
// returning text/plain content, so that there is no
|
||||
// ipath-less filter at the top
|
||||
char cbuf[30];
|
||||
sprintf(cbuf, "%d", int(doc.text.length()));
|
||||
doc.fbytes = cbuf;
|
||||
}
|
||||
} else if (it->first == cstr_dj_keymd) {
|
||||
doc.dmtime = it->second;
|
||||
} else if (it->first == cstr_dj_keyorigcharset) {
|
||||
doc.origcharset = it->second;
|
||||
} else if (it->first == cstr_dj_keymt || it->first == cstr_dj_keycharset) {
|
||||
} else if (it->first == cstr_dj_keymt ||
|
||||
it->first == cstr_dj_keycharset) {
|
||||
// don't need/want these.
|
||||
} else {
|
||||
doc.meta[it->first] = it->second;
|
||||
}
|
||||
}
|
||||
if (doc.meta[Rcl::Doc::keyabs].empty() && !doc.meta[cstr_dj_keyds].empty()) {
|
||||
if (doc.meta[Rcl::Doc::keyabs].empty() &&
|
||||
!doc.meta[cstr_dj_keyds].empty()) {
|
||||
doc.meta[Rcl::Doc::keyabs] = doc.meta[cstr_dj_keyds];
|
||||
doc.meta.erase(cstr_dj_keyds);
|
||||
}
|
||||
@ -670,11 +695,20 @@ bool FileInterner::dijontorcl(Rcl::Doc& doc)
|
||||
}
|
||||
|
||||
// Collect the ipath from the current path in the document tree.
|
||||
// While we're at it, we also set the mimetype and filename, which are special
|
||||
// properties: we want to get them from the topmost doc
|
||||
// with an ipath, not the last one which is usually text/plain
|
||||
// We also set the author and modification time from the last doc
|
||||
// which has them.
|
||||
// While we're at it, we also set the mimetype and filename,
|
||||
// which are special properties: we want to get them from the topmost
|
||||
// doc with an ipath, not the last one which is usually text/plain. We
|
||||
// also set the author and modification time from the last doc which
|
||||
// has them.
|
||||
//
|
||||
// The docsize is fetched from the first element without an ipath
|
||||
// (first non container). If the last element directly returns
|
||||
// text/plain so that there is no ipath-less element, the value will
|
||||
// be set in dijontorcl().
|
||||
//
|
||||
// The whole thing is a bit messy but it's not obvious how it should
|
||||
// be cleaned up as the "inheritance" rules inside the stack are
|
||||
// actually complicated.
|
||||
void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
|
||||
{
|
||||
LOGDEB2(("FileInterner::collectIpathAndMT\n"));
|
||||
@ -702,9 +736,14 @@ void FileInterner::collectIpathAndMT(Rcl::Doc& doc) const
|
||||
hasipath = true;
|
||||
getKeyValue(docdata, cstr_dj_keymt, doc.mimetype);
|
||||
getKeyValue(docdata, cstr_dj_keyfn, doc.utf8fn);
|
||||
} else {
|
||||
if (doc.fbytes.empty())
|
||||
getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes);
|
||||
}
|
||||
doc.ipath += colon_hide(ipathel) + cstr_isep;
|
||||
} else {
|
||||
if (doc.fbytes.empty())
|
||||
getKeyValue(docdata, cstr_dj_keydocsize, doc.fbytes);
|
||||
doc.ipath += cstr_isep;
|
||||
}
|
||||
getKeyValue(docdata, cstr_dj_keyauthor, doc.meta[Rcl::Doc::keyau]);
|
||||
@ -793,6 +832,7 @@ int FileInterner::addHandler()
|
||||
txt = &it->second;
|
||||
}
|
||||
bool setres = false;
|
||||
newflt->set_docsize(txt->length());
|
||||
if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
|
||||
setres = newflt->set_document_string(*txt);
|
||||
} else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {
|
||||
|
||||
@ -18,6 +18,8 @@
|
||||
#define _MIMEHANDLER_H_INCLUDED_
|
||||
#include "autoconfig.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <string>
|
||||
#include <list>
|
||||
using std::string;
|
||||
@ -66,6 +68,13 @@ public:
|
||||
return set_document_string(string(cp, sz));
|
||||
}
|
||||
|
||||
// Record the document size in m_metaData, formatted as decimal text and
// stored under the cstr_dj_keydocsize key.
// @param size the size value to record (callers visible in this change pass
//        a file size or a data/text length).
virtual void set_docsize(size_t size)
|
||||
{
|
||||
// Scratch buffer for the decimal rendering; 30 chars is ample for a
// long long (presumably 64 bits on the supported platforms).
char csize[30];
|
||||
// size is cast to long long so that it matches the %lld conversion.
sprintf(csize, "%lld", (long long)size);
|
||||
m_metaData[cstr_dj_keydocsize] = csize;
|
||||
}
|
||||
|
||||
virtual bool has_documents() const {return m_havedoc;}
|
||||
|
||||
// Most doc types are single-doc
|
||||
|
||||
@ -210,6 +210,7 @@ bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
||||
doc.syntabs = true;
|
||||
}
|
||||
parms.get(Doc::keyipt, doc.ipath);
|
||||
parms.get(Doc::keypcs, doc.pcbytes);
|
||||
parms.get(Doc::keyfs, doc.fbytes);
|
||||
parms.get(Doc::keyds, doc.dbytes);
|
||||
parms.get(Doc::keysig, doc.sig);
|
||||
@ -1254,17 +1255,21 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
|
||||
}
|
||||
RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
|
||||
|
||||
if (doc.fbytes.empty())
|
||||
doc.fbytes = doc.pcbytes;
|
||||
if (!doc.fbytes.empty())
|
||||
RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
|
||||
if (!doc.pcbytes.empty())
|
||||
RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
|
||||
char sizebuf[30];
|
||||
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
||||
RECORD_APPEND(record, Doc::keyds, sizebuf);
|
||||
|
||||
// Note that we add the signature both as a value and in the data record
|
||||
if (!doc.sig.empty())
|
||||
RECORD_APPEND(record, Doc::keysig, doc.sig);
|
||||
newdocument.add_value(VALUE_SIG, doc.sig);
|
||||
|
||||
char sizebuf[30];
|
||||
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
||||
RECORD_APPEND(record, Doc::keyds, sizebuf);
|
||||
|
||||
if (!doc.ipath.empty())
|
||||
RECORD_APPEND(record, Doc::keyipt, doc.ipath);
|
||||
|
||||
|
||||
@ -27,6 +27,7 @@ namespace Rcl {
|
||||
const string Doc::keydmt("dmtime");
|
||||
const string Doc::keymt("mtime");
|
||||
const string Doc::keyoc("origcharset");
|
||||
const string Doc::keypcs("pcbytes");
|
||||
const string Doc::keyfs("fbytes");
|
||||
const string Doc::keyds("dbytes");
|
||||
const string Doc::keysz("size");
|
||||
@ -53,6 +54,7 @@ namespace Rcl {
|
||||
LOGDEB(("Rcl::Doc::dump: dmtime: [%s]\n", dmtime.c_str()));
|
||||
LOGDEB(("Rcl::Doc::dump: origcharset: [%s]\n", origcharset.c_str()));
|
||||
LOGDEB(("Rcl::Doc::dump: syntabs: [%d]\n", syntabs));
|
||||
LOGDEB(("Rcl::Doc::dump: pcbytes: [%s]\n", pcbytes.c_str()));
|
||||
LOGDEB(("Rcl::Doc::dump: fbytes: [%s]\n", fbytes.c_str()));
|
||||
LOGDEB(("Rcl::Doc::dump: dbytes: [%s]\n", dbytes.c_str()));
|
||||
LOGDEB(("Rcl::Doc::dump: sig: [%s]\n", sig.c_str()));
|
||||
|
||||
@ -87,15 +87,20 @@ class Doc {
|
||||
// as an indicative prefix at the beginning of the abstract (ugly hack)
|
||||
bool syntabs;
|
||||
|
||||
// File size. Index: Set by caller prior to Db::Add. Query: set by
|
||||
// rcldb from index doc data. Historically this always has
|
||||
// represented the whole file size (as from stat()), but there
|
||||
// would be a need for a 3rd value for multidoc files (file
|
||||
// size/doc size/ doc text size)
|
||||
string fbytes;
|
||||
// File size. This is the size of the compressed file or of the
|
||||
// external containing archive.
|
||||
// Index: Set by caller prior to Db::Add.
|
||||
// Query: set by rcldb from the index data record
|
||||
string pcbytes;
|
||||
|
||||
// Doc text size. Index: from text.length(). Query: set by rcldb from
|
||||
// index doc data.
|
||||
// Document size, ie, size of the .odt or .xls.
|
||||
// Index: Set in internfile from the filter stack
|
||||
// Query: set from data record
|
||||
string fbytes;
|
||||
|
||||
// Doc text size.
|
||||
// Index: from text.length().
|
||||
// Query: set by rcldb from index data record
|
||||
string dbytes;
|
||||
|
||||
// Doc signature. Used for up to date checks.
|
||||
@ -126,6 +131,7 @@ class Doc {
|
||||
origcharset.erase();
|
||||
meta.clear();
|
||||
syntabs = false;
|
||||
pcbytes.erase();
|
||||
fbytes.erase();
|
||||
dbytes.erase();
|
||||
sig.erase();
|
||||
@ -163,9 +169,10 @@ class Doc {
|
||||
static const string keydmt; // document mtime
|
||||
static const string keymt; // mtime dmtime if set else fmtime
|
||||
static const string keyoc; // original charset
|
||||
static const string keyfs; // file size
|
||||
static const string keyds; // document size
|
||||
static const string keysz; // dbytes if set else fbytes
|
||||
static const string keypcs; // document outer container size
|
||||
static const string keyfs; // document size
|
||||
static const string keyds; // document text size
|
||||
static const string keysz; // dbytes if set else fbytes else pcbytes
|
||||
static const string keysig; // sig
|
||||
static const string keyrr; // relevancy rating
|
||||
static const string keycc; // Collapse count
|
||||
|
||||
@ -64,7 +64,8 @@ public:
|
||||
: m_fld(docfToDatf(f) + "=")
|
||||
{
|
||||
m_ismtime = !m_fld.compare("dmtime=");
|
||||
m_issize = !m_fld.compare("fbytes=") || !m_fld.compare("dbytes=");
|
||||
m_issize = !m_fld.compare("fbytes=") || !m_fld.compare("dbytes=") ||
|
||||
!m_fld.compare("pcbytes=");
|
||||
}
|
||||
|
||||
virtual std::string operator()(const Xapian::Document& xdoc) const
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user