search/index: fixed a number of bad conversions to properly deal with text documents bigger than 2GB

This commit is contained in:
Jean-Francois Dockes 2011-07-12 08:28:09 -07:00
parent 5e59354535
commit 88685d2e64
11 changed files with 42 additions and 23 deletions

View File

@ -109,3 +109,8 @@
#undef _FILE_OFFSET_BITS #undef _FILE_OFFSET_BITS
#undef _LARGE_FILES #undef _LARGE_FILES
/* OFFTPC: printf conversion used when formatting file sizes/offsets
 * (e.g. st_size). The "long long" form is selected on Apple platforms
 * and when 64-bit file offsets were requested; plain "long" otherwise. */
#if defined(__APPLE__) || _FILE_OFFSET_BITS == 64
#define OFFTPC "%lld"
#else
#define OFFTPC "%ld"
#endif

View File

@ -406,7 +406,7 @@ BeagleQueueIndexer::processone(const string &path,
dotdoc.fmtime = ascdate; dotdoc.fmtime = ascdate;
char cbuf[100]; char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size); sprintf(cbuf, OFFTPC, stp->st_size);
dotdoc.fbytes = cbuf; dotdoc.fbytes = cbuf;
// Document signature for up to date checks: none. // Document signature for up to date checks: none.
@ -453,7 +453,7 @@ BeagleQueueIndexer::processone(const string &path,
doc.fmtime = ascdate; doc.fmtime = ascdate;
char cbuf[100]; char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size); sprintf(cbuf, OFFTPC, stp->st_size);
doc.fbytes = cbuf; doc.fbytes = cbuf;
// Document signature for up to date checks: none. // Document signature for up to date checks: none.
doc.sig = ""; doc.sig = "";

View File

@ -288,6 +288,12 @@ void FsIndexer::setlocalfields(Rcl::Doc& doc)
} }
} }
/// Build the document signature used for up-to-date checks: the file size
/// immediately followed by the RCL_STTIME timestamp, both in decimal.
/// The value is checked against the stored one to detect changes, so the
/// exact text produced here must stay stable (changing it forces a full
/// reindex); it does not need to be parsed back.
/// @param stp stat data for the file; dereferenced unconditionally.
/// @param out receives the signature, replacing any previous contents.
static void makesig(const struct stat *stp, string& out)
{
    char cbuf[100];
    // Widen the size explicitly so the argument always matches its
    // conversion whatever the real width of off_t on this platform
    // (a preprocessor-selected format can disagree with the actual type),
    // and bound the write to the buffer.
    snprintf(cbuf, sizeof(cbuf), "%lld%ld",
             (long long)stp->st_size, (long)stp->RCL_STTIME);
    out = cbuf;
}
/// This method gets called for every file and directory found by the /// This method gets called for every file and directory found by the
/// tree walker. /// tree walker.
@ -340,9 +346,8 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
// for the uptodate check (the value computed here is checked // for the uptodate check (the value computed here is checked
// against the stored one). Changing the computation forces a full // against the stored one). Changing the computation forces a full
// reindex of course. // reindex of course.
char cbuf[100]; string sig;
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME); makesig(stp, sig);
string sig = cbuf;
string udi; string udi;
make_udi(fn, "", udi); make_udi(fn, "", udi);
if (!m_db->needUpdate(udi, sig)) { if (!m_db->needUpdate(udi, sig)) {
@ -420,14 +425,14 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
doc.utf8fn = utf8fn; doc.utf8fn = utf8fn;
char cbuf[100]; char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size); sprintf(cbuf, OFFTPC, stp->st_size);
doc.fbytes = cbuf; doc.fbytes = cbuf;
// Document signature for up to date checks: concatenate // Document signature for up to date checks: concatenate
// m/ctime and size. Looking for changes only, no need to // m/ctime and size. Looking for changes only, no need to
// parseback so no need for reversible formatting. Also set, // parseback so no need for reversible formatting. Also set,
// but never used, for subdocs. // but never used, for subdocs.
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME); makesig(stp, doc.sig);
doc.sig = cbuf;
// If there was an error, ensure indexing will be // If there was an error, ensure indexing will be
// retried. This is for the once missing, later installed // retried. This is for the once missing, later installed
// filter case. It can make indexing much slower (if there are // filter case. It can make indexing much slower (if there are
@ -473,11 +478,10 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp,
fileDoc.url = string("file://") + fn; fileDoc.url = string("file://") + fn;
char cbuf[100]; char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size); sprintf(cbuf, OFFTPC, stp->st_size);
fileDoc.fbytes = cbuf; fileDoc.fbytes = cbuf;
// Document signature for up to date checks. // Document signature for up to date checks.
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME); makesig(stp, fileDoc.sig);
fileDoc.sig = cbuf;
if (!m_db->addOrUpdate(parent_udi, "", fileDoc)) if (!m_db->addOrUpdate(parent_udi, "", fileDoc))
return FsTreeWalker::FtwError; return FsTreeWalker::FtwError;
} }

View File

@ -108,8 +108,8 @@ public:
return -1; return -1;
} }
if (fseeko(fp, cacheoffset(msgnum), SEEK_SET) != 0) { if (fseeko(fp, cacheoffset(msgnum), SEEK_SET) != 0) {
LOGDEB0(("MboxCache::get_offsets: seek %ld errno %d\n", LOGDEB0(("MboxCache::get_offsets: seek %lld errno %d\n",
(long)cacheoffset(msgnum), errno)); cacheoffset(msgnum), errno));
return -1; return -1;
} }
mbhoff_type offset = -1; mbhoff_type offset = -1;

View File

@ -69,11 +69,13 @@ bool MimeHandlerText::set_document_file(const string &fn)
} }
m_pagesz = size_t(ps); m_pagesz = size_t(ps);
string reason; string reason;
LOGDEB(("calling file_to_string\n"));
// file_to_string() takes pagesz == size_t(-1) to mean read all. // file_to_string() takes pagesz == size_t(-1) to mean read all.
if (!file_to_string(fn, m_text, 0, m_pagesz, &reason)) { if (!file_to_string(fn, m_text, 0, m_pagesz, &reason)) {
LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str())); LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
return false; return false;
} }
LOGDEB(("file_to_string OK\n"));
m_offs = m_text.length(); m_offs = m_text.length();
} }

View File

@ -14,6 +14,9 @@
* Free Software Foundation, Inc., * Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/ */
#ifdef HAVE_CONFIG_H
#include "autoconfig.h"
#endif
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@ -163,7 +166,7 @@ void ResListPager::displayDoc(RclConfig *config,
} }
// Size information. We print both doc and file if they differ a lot // Size information. We print both doc and file if they differ a lot
long fsize = -1, dsize = -1; off_t fsize = -1, dsize = -1;
if (!doc.dbytes.empty()) if (!doc.dbytes.empty())
dsize = atol(doc.dbytes.c_str()); dsize = atol(doc.dbytes.c_str());
if (!doc.fbytes.empty()) if (!doc.fbytes.empty())

View File

@ -368,7 +368,7 @@ string& MD5HexScan(const string& xdigest, string& digest)
class FileScanMd5 : public FileScanDo { class FileScanMd5 : public FileScanDo {
public: public:
FileScanMd5(string& d) : digest(d) {} FileScanMd5(string& d) : digest(d) {}
virtual bool init(unsigned int size, string *) virtual bool init(size_t size, string *)
{ {
MD5Init(&ctx); MD5Init(&ctx);
return true; return true;

View File

@ -78,7 +78,7 @@ class FileToString : public FileScanDo {
public: public:
FileToString(string& data) : m_data(data) {} FileToString(string& data) : m_data(data) {}
string& m_data; string& m_data;
bool init(unsigned int size, string *reason) { bool init(size_t size, string *reason) {
if (size > 0) if (size > 0)
m_data.reserve(size); m_data.reserve(size);
return true; return true;
@ -135,10 +135,10 @@ bool file_scan(const string &fn, FileScanDo* doer, off_t startoffs,
noclosing = false; noclosing = false;
} }
if (st.st_size > 0) { if (cnttoread != (size_t)-1 && cnttoread) {
doer->init(st.st_size+1, reason);
} else if (cnttoread) {
doer->init(cnttoread+1, reason); doer->init(cnttoread+1, reason);
} else if (st.st_size > 0) {
doer->init(st.st_size+1, reason);
} else { } else {
doer->init(0, reason); doer->init(0, reason);
} }

View File

@ -29,7 +29,7 @@ using std::string;
class FileScanDo { class FileScanDo {
public: public:
virtual ~FileScanDo() {} virtual ~FileScanDo() {}
virtual bool init(unsigned int size, string *reason) = 0; virtual bool init(size_t size, string *reason) = 0;
virtual bool data(const char *buf, int cnt, string* reason) = 0; virtual bool data(const char *buf, int cnt, string* reason) = 0;
}; };
bool file_scan(const string &filename, FileScanDo* doer, string *reason = 0); bool file_scan(const string &filename, FileScanDo* doer, string *reason = 0);

View File

@ -14,7 +14,11 @@
* Free Software Foundation, Inc., * Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/ */
#ifndef TEST_SMALLUT #ifndef TEST_SMALLUT
#ifdef HAVE_CONFIG_H
#include "autoconfig.h"
#endif
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
@ -556,7 +560,7 @@ bool pcSubst(const string& in, string& out, map<string, string>& subs)
} }
// Convert byte count into unit (KB/MB...) appropriate for display // Convert byte count into unit (KB/MB...) appropriate for display
string displayableBytes(long size) string displayableBytes(off_t size)
{ {
char sizebuf[30]; char sizebuf[30];
const char * unit = " B "; const char * unit = " B ";
@ -568,7 +572,7 @@ string displayableBytes(long size)
unit = " MB "; unit = " MB ";
size /= (1024*1024); size /= (1024*1024);
} }
sprintf(sizebuf, "%ld%s", size, unit); sprintf(sizebuf, OFFTPC "%s", size, unit);
return string(sizebuf); return string(sizebuf);
} }

View File

@ -16,6 +16,7 @@
*/ */
#ifndef _SMALLUT_H_INCLUDED_ #ifndef _SMALLUT_H_INCLUDED_
#define _SMALLUT_H_INCLUDED_ #define _SMALLUT_H_INCLUDED_
#include <stdlib.h> #include <stdlib.h>
#include <string> #include <string>
@ -125,7 +126,7 @@ extern string truncate_to_word(const string &input, string::size_type maxlen);
extern void utf8truncate(string &s, int maxlen); extern void utf8truncate(string &s, int maxlen);
/** Convert byte count into unit (KB/MB...) appropriate for display */ /** Convert byte count into unit (KB/MB...) appropriate for display */
string displayableBytes(long size); string displayableBytes(off_t size);
/** Break big string into lines */ /** Break big string into lines */
string breakIntoLines(const string& in, unsigned int ll = 100, string breakIntoLines(const string& in, unsigned int ll = 100,