From 88685d2e64bc2955a29bceff4e53ca3d5da4eac4 Mon Sep 17 00:00:00 2001 From: "\"Jean-Francois Dockes ext:(%22)" Date: Tue, 12 Jul 2011 08:28:09 -0700 Subject: [PATCH] search/index: fixed a number of bad conversions to properly deal with text documents bigger than 2GB --- src/common/autoconfig.h.in | 5 +++++ src/index/beaglequeue.cpp | 4 ++-- src/index/fsindexer.cpp | 22 +++++++++++++--------- src/internfile/mh_mbox.cpp | 4 ++-- src/internfile/mh_text.cpp | 2 ++ src/query/reslistpager.cpp | 5 ++++- src/utils/md5.cpp | 2 +- src/utils/readfile.cpp | 8 ++++---- src/utils/readfile.h | 2 +- src/utils/smallut.cpp | 8 ++++++-- src/utils/smallut.h | 3 ++- 11 files changed, 42 insertions(+), 23 deletions(-) diff --git a/src/common/autoconfig.h.in b/src/common/autoconfig.h.in index 3ca2a15c..925a50fb 100644 --- a/src/common/autoconfig.h.in +++ b/src/common/autoconfig.h.in @@ -109,3 +109,8 @@ #undef _FILE_OFFSET_BITS #undef _LARGE_FILES +#if _FILE_OFFSET_BITS == 64 || defined(__APPLE__) +#define OFFTPC "%lld" +#else +#define OFFTPC "%ld" +#endif diff --git a/src/index/beaglequeue.cpp b/src/index/beaglequeue.cpp index f4c9defe..b7e19e2d 100644 --- a/src/index/beaglequeue.cpp +++ b/src/index/beaglequeue.cpp @@ -406,7 +406,7 @@ BeagleQueueIndexer::processone(const string &path, dotdoc.fmtime = ascdate; char cbuf[100]; - sprintf(cbuf, "%ld", (long)stp->st_size); + sprintf(cbuf, OFFTPC, stp->st_size); dotdoc.fbytes = cbuf; // Document signature for up to date checks: none. @@ -453,7 +453,7 @@ BeagleQueueIndexer::processone(const string &path, doc.fmtime = ascdate; char cbuf[100]; - sprintf(cbuf, "%ld", (long)stp->st_size); + sprintf(cbuf, OFFTPC, stp->st_size); doc.fbytes = cbuf; // Document signature for up to date checks: none. doc.sig = ""; diff --git a/src/index/fsindexer.cpp b/src/index/fsindexer.cpp index 1ef5774e..bdd5e6f5 100644 --- a/src/index/fsindexer.cpp +++ b/src/index/fsindexer.cpp @@ -288,6 +288,12 @@ void FsIndexer::setlocalfields(Rcl::Doc& doc) } } +static void makesig(const struct stat *stp, string& out) +{ + char cbuf[100]; + sprintf(cbuf, OFFTPC "%ld", stp->st_size, (long)stp->RCL_STTIME); + out = cbuf; +} /// This method gets called for every file and directory found by the /// tree walker. @@ -340,9 +346,8 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, // for the uptodate check (the value computed here is checked // against the stored one). Changing the computation forces a full // reindex of course. - char cbuf[100]; - sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME); - string sig = cbuf; + string sig; + makesig(stp, sig); string udi; make_udi(fn, "", udi); if (!m_db->needUpdate(udi, sig)) { @@ -420,14 +425,14 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, doc.utf8fn = utf8fn; char cbuf[100]; - sprintf(cbuf, "%ld", (long)stp->st_size); + sprintf(cbuf, OFFTPC, stp->st_size); doc.fbytes = cbuf; // Document signature for up to date checks: concatenate // m/ctime and size. Looking for changes only, no need to // parseback so no need for reversible formatting. Also set, // but never used, for subdocs. - sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME); - doc.sig = cbuf; + makesig(stp, doc.sig); + // If there was an error, ensure indexing will be // retried. This is for the once missing, later installed // filter case. It can make indexing much slower (if there are @@ -473,11 +478,10 @@ FsIndexer::processone(const std::string &fn, const struct stat *stp, fileDoc.url = string("file://") + fn; char cbuf[100]; - sprintf(cbuf, "%ld", (long)stp->st_size); + sprintf(cbuf, OFFTPC, stp->st_size); fileDoc.fbytes = cbuf; // Document signature for up to date checks. - sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME); - fileDoc.sig = cbuf; + makesig(stp, fileDoc.sig); if (!m_db->addOrUpdate(parent_udi, "", fileDoc)) return FsTreeWalker::FtwError; } diff --git a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp index eeb4c1aa..2247cb79 100644 --- a/src/internfile/mh_mbox.cpp +++ b/src/internfile/mh_mbox.cpp @@ -108,8 +108,8 @@ public: return -1; } if (fseeko(fp, cacheoffset(msgnum), SEEK_SET) != 0) { - LOGDEB0(("MboxCache::get_offsets: seek %ld errno %d\n", - (long)cacheoffset(msgnum), errno)); + LOGDEB0(("MboxCache::get_offsets: seek %lld errno %d\n", + cacheoffset(msgnum), errno)); return -1; } mbhoff_type offset = -1; diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp index a545d4a5..2b96ec9f 100644 --- a/src/internfile/mh_text.cpp +++ b/src/internfile/mh_text.cpp @@ -69,11 +69,13 @@ bool MimeHandlerText::set_document_file(const string &fn) } m_pagesz = size_t(ps); string reason; + LOGDEB(("calling file_to_string\n")); // file_to_string() takes pagesz == size_t(-1) to mean read all. if (!file_to_string(fn, m_text, 0, m_pagesz, &reason)) { LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str())); return false; } + LOGDEB(("file_to_string OK\n")); m_offs = m_text.length(); } diff --git a/src/query/reslistpager.cpp b/src/query/reslistpager.cpp index 7c19e1d7..4ac22670 100644 --- a/src/query/reslistpager.cpp +++ b/src/query/reslistpager.cpp @@ -14,6 +14,9 @@ * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#ifdef HAVE_CONFIG_H +#include "autoconfig.h" +#endif #include #include @@ -163,7 +166,7 @@ void ResListPager::displayDoc(RclConfig *config, } // Size information. We print both doc and file if they differ a lot - long fsize = -1, dsize = -1; + off_t fsize = -1, dsize = -1; if (!doc.dbytes.empty()) dsize = atol(doc.dbytes.c_str()); if (!doc.fbytes.empty()) diff --git a/src/utils/md5.cpp b/src/utils/md5.cpp index 14501501..9d424580 100644 --- a/src/utils/md5.cpp +++ b/src/utils/md5.cpp @@ -368,7 +368,7 @@ string& MD5HexScan(const string& xdigest, string& digest) class FileScanMd5 : public FileScanDo { public: FileScanMd5(string& d) : digest(d) {} - virtual bool init(unsigned int size, string *) + virtual bool init(size_t size, string *) { MD5Init(&ctx); return true; diff --git a/src/utils/readfile.cpp b/src/utils/readfile.cpp index 444f44b3..2023829f 100644 --- a/src/utils/readfile.cpp +++ b/src/utils/readfile.cpp @@ -78,7 +78,7 @@ class FileToString : public FileScanDo { public: FileToString(string& data) : m_data(data) {} string& m_data; - bool init(unsigned int size, string *reason) { + bool init(size_t size, string *reason) { if (size > 0) m_data.reserve(size); return true; @@ -135,10 +135,10 @@ bool file_scan(const string &fn, FileScanDo* doer, off_t startoffs, noclosing = false; } - if (st.st_size > 0) { - doer->init(st.st_size+1, reason); - } else if (cnttoread) { + if (cnttoread != (size_t)-1 && cnttoread) { doer->init(cnttoread+1, reason); + } else if (st.st_size > 0) { + doer->init(st.st_size+1, reason); } else { doer->init(0, reason); } diff --git a/src/utils/readfile.h b/src/utils/readfile.h index b92d19dc..f0ce5e75 100644 --- a/src/utils/readfile.h +++ b/src/utils/readfile.h @@ -29,7 +29,7 @@ using std::string; class FileScanDo { public: virtual ~FileScanDo() {} - virtual bool init(unsigned int size, string *reason) = 0; + virtual bool init(size_t size, string *reason) = 0; virtual bool data(const char *buf, int cnt, string* reason) = 0; }; bool file_scan(const string &filename, FileScanDo* doer, string *reason = 0); diff --git a/src/utils/smallut.cpp b/src/utils/smallut.cpp index fe7ac3d5..bbcc7acd 100644 --- a/src/utils/smallut.cpp +++ b/src/utils/smallut.cpp @@ -14,7 +14,11 @@ * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + #ifndef TEST_SMALLUT +#ifdef HAVE_CONFIG_H +#include "autoconfig.h" +#endif #include #include #include @@ -556,7 +560,7 @@ bool pcSubst(const string& in, string& out, map& subs) } // Convert byte count into unit (KB/MB...) appropriate for display -string displayableBytes(long size) +string displayableBytes(off_t size) { char sizebuf[30]; const char * unit = " B "; @@ -568,7 +572,7 @@ string displayableBytes(long size) unit = " MB "; size /= (1024*1024); } - sprintf(sizebuf, "%ld%s", size, unit); + sprintf(sizebuf, OFFTPC "%s", size, unit); return string(sizebuf); } diff --git a/src/utils/smallut.h b/src/utils/smallut.h index d43933b2..8e78331a 100644 --- a/src/utils/smallut.h +++ b/src/utils/smallut.h @@ -16,6 +16,7 @@ */ #ifndef _SMALLUT_H_INCLUDED_ #define _SMALLUT_H_INCLUDED_ + #include #include @@ -125,7 +126,7 @@ extern string truncate_to_word(const string &input, string::size_type maxlen); extern void utf8truncate(string &s, int maxlen); /** Convert byte count into unit (KB/MB...) appropriate for display */ -string displayableBytes(long size); +string displayableBytes(off_t size); /** Break big string into lines */ string breakIntoLines(const string& in, unsigned int ll = 100,