From a374b2a7b70a69107d4c6e80d3d4485965a0952d Mon Sep 17 00:00:00 2001
From: dockes
Date: Wed, 30 Sep 2009 15:45:53 +0000
Subject: [PATCH] implemented paged text files

---
 src/internfile/mh_text.cpp         | 107 ++++++++++++++++++++++-------
 src/internfile/mh_text.h           |  14 +++-
 src/qtgui/confgui/confguiindex.cpp |  19 +++--
 src/sampleconf/recoll.conf.in      |   6 ++
 src/utils/readfile.cpp             |  95 +++++++++++++++++++------
 src/utils/readfile.h               |  25 +++++--
 6 files changed, 209 insertions(+), 57 deletions(-)

diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp
index c5673b01..32cdb1c8 100644
--- a/src/internfile/mh_text.cpp
+++ b/src/internfile/mh_text.cpp
@@ -36,36 +36,54 @@ using namespace std;
 #include "rclconfig.h"
 
 const int MB = 1024*1024;
+const int KB = 1024;
 
 // Process a plain text file
 bool MimeHandlerText::set_document_file(const string &fn)
 {
-    RecollFilter::set_document_file(fn);
+    LOGDEB(("MimeHandlerText::set_document_file: [%s]\n", fn.c_str()));
 
-    // file size
+    RecollFilter::set_document_file(fn);
+    m_fn = fn;
+
+    // file size for oversize check
     struct stat st;
-    if (stat(fn.c_str(), &st) < 0) {
+    if (stat(m_fn.c_str(), &st) < 0) {
         LOGERR(("MimeHandlerText::set_document_file: stat(%s) errno %d\n",
-                fn.c_str(), errno));
+                m_fn.c_str(), errno));
         return false;
     }
 
-    // Handle max file size parameter. If it's too big, we just don't index
-    // the text at all (should we index the first maxmbs instead ?)
+    // Max file size parameter: texts over this size are not indexed
     int maxmbs = -1;
     RclConfig::getMainConfig()->getConfParam("textfilemaxmbs", &maxmbs);
-    string otext;
-    if (st.st_size / MB <= maxmbs) {
+    if (maxmbs == -1 || st.st_size / MB <= maxmbs) {
+        // Text file page size: if set, we split text files into
+        // multiple documents
+        int ps = -1;
+        RclConfig::getMainConfig()->getConfParam("textfilepagekbs", &ps);
+        if (ps != -1) {
+            ps *= KB;
+            m_paging = true;
+        }
+        m_pagesz = size_t(ps);
         string reason;
-        if (!file_to_string(fn, otext, &reason)) {
+        // file_to_string() takes pagesz == size_t(-1) to mean read all.
+        if (!file_to_string(fn, m_text, 0, m_pagesz, &reason)) {
             LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
             return false;
         }
+        m_offs = m_text.length();
     }
-    return set_document_string(otext);
+
+    string md5, xmd5;
+    MD5String(m_text, md5);
+    m_metaData["md5"] = MD5HexPrint(md5, xmd5);
+    m_havedoc = true;
+    return true;
 }
-
+
 bool MimeHandlerText::set_document_string(const string& otext)
 {
     m_text = otext;
@@ -76,29 +94,72 @@ bool MimeHandlerText::set_document_string(const string& otext)
     return true;
 }
 
+bool MimeHandlerText::skip_to_document(const string& ipath)
+{
+    sscanf(ipath.c_str(), "%lld", &m_offs);
+    readnext();
+    return true;
+}
+
 bool MimeHandlerText::next_document()
-{
+{
+    LOGDEB(("MimeHandlerText::next_document: m_havedoc %d\n", int(m_havedoc)));
+
     if (m_havedoc == false)
         return false;
-    m_havedoc = false;
+
+    // We transcode even if defcharset is already utf-8:
+    // this validates the encoding.
     LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
              m_defcharset.c_str()));
-
-    // Avoid unneeded copy. This gets a reference to an empty string which is
-    // the entry for "content"
-    string& utf8 = m_metaData["content"];
-
-    // Note that we transcode always even if defcharset is already utf-8:
-    // this validates the encoding.
-    if (!transcode(m_text, utf8, m_defcharset, "UTF-8")) {
+    if (!transcode(m_text, m_metaData["content"], m_defcharset, "UTF-8")) {
         LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
                 "for charset [%s]\n", m_defcharset.c_str()));
-        utf8.erase();
+        m_metaData["content"].erase();
         return false;
     }
-
     m_metaData["origcharset"] = m_defcharset;
     m_metaData["charset"] = "utf-8";
     m_metaData["mimetype"] = "text/plain";
+
+    // If text length is 0 (the file is empty or oversize), or we have
+    // read all at once, we're done
+    if (m_text.length() == 0 || !m_paging) {
+        m_havedoc = false;
+        return true;
+    } else {
+        // Paging: set ipath then read next chunk
+        char buf[20];
+        sprintf(buf, "%lld", m_offs - m_text.length());
+        m_metaData["ipath"] = buf;
+        readnext();
+        return true;
+    }
+}
+
+bool MimeHandlerText::readnext()
+{
+    string reason;
+    m_text.erase();
+    if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
+        LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
+        m_havedoc = false;
+        return false;
+    }
+    if (m_text.length() == 0) {
+        // EOF
+        m_havedoc = false;
+        return true;
+    }
+
+    // If possible, try to adjust the chunk to end right after a line break.
+    // Don't do this for the last chunk.
+    if (m_text.length() == m_pagesz) {
+        string::size_type pos = m_text.find_last_of("\n\r");
+        if (pos != string::npos && pos != 0) {
+            m_text.erase(pos);
+        }
+    }
+    m_offs += m_text.length();
     return true;
 }
diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h
index 94b66bb1..2ce7d101 100644
--- a/src/internfile/mh_text.h
+++ b/src/internfile/mh_text.h
@@ -17,6 +17,7 @@
 #ifndef _MH_TEXT_H_INCLUDED_
 #define _MH_TEXT_H_INCLUDED_
 /* @(#$Id: mh_text.h,v 1.5 2008-10-04 14:26:59 dockes Exp $  (C) 2004 J.F.Dockes */
+#include <sys/types.h>
 #include <string>
 
 using std::string;
@@ -30,7 +31,8 @@ using std::string;
  */
 class MimeHandlerText : public RecollFilter {
  public:
-    MimeHandlerText(const string& mt) : RecollFilter(mt) {}
+    MimeHandlerText(const string& mt)
+        : RecollFilter(mt), m_paging(false), m_offs(0) {}
     virtual ~MimeHandlerText() {}
     virtual bool set_document_file(const string &file_path);
     virtual bool set_document_string(const string&);
@@ -40,13 +42,23 @@ class MimeHandlerText : public RecollFilter {
         return false;
     }
     virtual bool next_document();
+    virtual bool skip_to_document(const string& s);
     virtual void clear() {
+        m_paging = false;
         m_text.erase();
+        m_fn.erase();
+        m_offs = 0;
         RecollFilter::clear();
     }
 
 private:
+    bool m_paging;
     string m_text;
+    string m_fn;
+    off_t m_offs; // Offset of next read in file if we're paging
+    size_t m_pagesz;
+
+    bool readnext();
 };
 
 #endif /* _MH_TEXT_H_INCLUDED_ */
diff --git a/src/qtgui/confgui/confguiindex.cpp b/src/qtgui/confgui/confguiindex.cpp
index bf55de93..8d7de6a8 100644
--- a/src/qtgui/confgui/confguiindex.cpp
+++ b/src/qtgui/confgui/confguiindex.cpp
@@ -354,8 +354,7 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config)
                   "unsupported mime type). Default true"));
     m_widgets.push_back(eafln);
 
-    ConfLink lnkzfmaxkbs(new ConfLinkRclRep(config,
-                                            "compressedfilemaxkbs"));
+    ConfLink lnkzfmaxkbs(new ConfLinkRclRep(config, "compressedfilemaxkbs"));
     ConfParamIntW *ezfmaxkbs = new
         ConfParamIntW(m_groupbox, lnkzfmaxkbs,
                       tr("Max. compressed file size (KB)"),
@@ -365,18 +364,28 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config)
                       -1, 1000000);
     m_widgets.push_back(ezfmaxkbs);
 
-    ConfLink lnktxtmaxmbs(new ConfLinkRclRep(config,
-                                             "textfilemaxmbs"));
+    ConfLink lnktxtmaxmbs(new ConfLinkRclRep(config, "textfilemaxmbs"));
     ConfParamIntW *etxtmaxmbs = new
         ConfParamIntW(m_groupbox, lnktxtmaxmbs,
                       tr("Max. text file size (MB)"),
                       tr("This value sets a threshold beyond which text "
                          "files will not be processed. Set to -1 for no "
-                         "limit. This is for excluding monster "
+                         "limit. \nThis is for excluding monster "
                          "log files from the index."),
                       -1, 1000000);
     m_widgets.push_back(etxtmaxmbs);
 
+    ConfLink lnktxtpagekbs(new ConfLinkRclRep(config, "textfilepagekbs"));
+    ConfParamIntW *etxtpagekbs = new
+        ConfParamIntW(m_groupbox, lnktxtpagekbs,
+                      tr("Text file page size (KB)"),
+                      tr("If this value is set (not equal to -1), text "
+                         "files will be split into chunks of this size for "
+                         "indexing.\nThis will help with searching very big "
+                         "text files (e.g. log files)."),
+                      -1, 1000000);
+    m_widgets.push_back(etxtpagekbs);
+
     vboxLayout->addWidget(m_groupbox);
 
     subDirChanged();
 }
diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in
index ff9c5092..0e93da9b 100644
--- a/src/sampleconf/recoll.conf.in
+++ b/src/sampleconf/recoll.conf.in
@@ -107,6 +107,12 @@ indexallfilenames = 1
 # Size limit for text files. This is for skipping monster logs
 textfilemaxmbs = -1
 
+# Page size for text files. If this is set, text/plain files will be
+# divided into documents of approximately this size. This makes it possible
+# to access pieces of big text files (e.g. big logs) which would be
+# problematic to load into the preview window as one piece.
+textfilepagekbs = -1
+
 # Length of abstracts we store while indexing. Longer will make for a
 # bigger db
 # idxabsmlen = 250
diff --git a/src/utils/readfile.cpp b/src/utils/readfile.cpp
index 2cb43faf..51ff180c 100644
--- a/src/utils/readfile.cpp
+++ b/src/utils/readfile.cpp
@@ -39,6 +39,10 @@ using std::string;
 
 #include "readfile.h"
 
+#ifndef MIN
+#define MIN(A,B) ((A) < (B) ? (A) : (B))
+#endif
+
 static void caterrno(string *reason, const char *what, int _errno)
 {
     if (reason) {
@@ -93,16 +97,28 @@ public:
 };
 
 bool file_to_string(const string &fn, string &data, string *reason)
+{
+    return file_to_string(fn, data, 0, size_t(-1), reason);
+}
+bool file_to_string(const string &fn, string &data, off_t offs, size_t cnt,
+                    string *reason)
 {
     FileToString accum(data);
-    return file_scan(fn, &accum, reason);
+    return file_scan(fn, &accum, offs, cnt, reason);
 }
 
+bool file_scan(const string &fn, FileScanDo* doer, string *reason)
+{
+    return file_scan(fn, doer, 0, size_t(-1), reason);
+}
+
+const int RDBUFSZ = 4096;
 // Note: the fstat() + reserve() (in init()) calls divide cpu usage almost by 2
 // on both linux i586 and macosx (compared to just append())
 // Also tried a version with mmap, but it's actually slower on the mac and not faster on linux.
-bool file_scan(const string &fn, FileScanDo* doer, string *reason)
+bool file_scan(const string &fn, FileScanDo* doer, off_t startoffs,
+               size_t cnttoread, string *reason)
 {
     bool ret = false;
     bool noclosing = true;
@@ -120,13 +136,36 @@
         }
         noclosing = false;
     }
-    if (st.st_size > 0)
+
+    if (st.st_size > 0) {
         doer->init(st.st_size+1, reason);
-    else
+    } else if (cnttoread) {
+        doer->init(cnttoread+1, reason);
+    } else {
         doer->init(0, reason);
-    char buf[4096];
+    }
+
+    off_t curoffs = 0;
+    if (startoffs > 0 && !fn.empty()) {
+        if (lseek(fd, startoffs, SEEK_SET) != startoffs) {
+            caterrno(reason, "lseek", errno);
+            return false;
+        }
+        curoffs = startoffs;
+    }
+
+    char buf[RDBUFSZ];
+    size_t totread = 0;
     for (;;) {
-        int n = read(fd, buf, 4096);
+        size_t toread = RDBUFSZ;
+        if (startoffs > 0 && curoffs < startoffs) {
+            toread = MIN(RDBUFSZ, startoffs - curoffs);
+        }
+
+        if (cnttoread != size_t(-1)) {
+            toread = MIN(toread, cnttoread - totread);
+        }
+        int n = read(fd, buf, toread);
         if (n < 0) {
             caterrno(reason, "read", errno);
             goto out;
@@ -134,9 +173,16 @@
         if (n == 0)
             break;
 
+        curoffs += n;
+        if (curoffs - n < startoffs)
+            continue;
+
         if (!doer->data(buf, n, reason)) {
             goto out;
         }
+        totread += n;
+        if (cnttoread > 0 && totread >= cnttoread)
+            break;
     }
     ret = true;
@@ -150,6 +196,8 @@
 #include
 #include
+#include
+#include
 #include
 #include
@@ -160,11 +208,6 @@ using namespace std;
 
 using namespace std;
 
-static int op_flags;
-#define OPT_MOINS 0x1
-#define OPT_f     0x2
-#define OPT_F     0x4
-
 class myCB : public FsTreeWalkerCB {
  public:
     FsTreeWalker::Status processone(const string &path,
@@ -192,9 +235,14 @@ class myCB : public FsTreeWalkerCB {
     }
 };
 
+static int op_flags;
+#define OPT_MOINS 0x1
+#define OPT_c     0x2
+#define OPT_o     0x4
+
 static const char *thisprog;
 static char usage [] =
-"trreadfile topdirorfile\n\n"
+"trreadfile [-o offs] [-c cnt] topdirorfile\n\n"
 ;
 static void
 Usage(void)
@@ -205,8 +253,8 @@ Usage(void)
 int main(int argc, const char **argv)
 {
-    list<string> patterns;
-    list<string> paths;
+    off_t offs = 0;
+    size_t cnt = size_t(-1);
     thisprog = argv[0];
     argc--; argv++;
 
@@ -217,31 +265,36 @@ int main(int argc, const char **argv)
             Usage();
         while (**argv)
             switch (*(*argv)++) {
-            case 'f': op_flags |= OPT_f;break;
-            case 'F': op_flags |= OPT_F;break;
+            case 'c': op_flags |= OPT_c; if (argc < 2) Usage();
+                cnt = atol(*(++argv)); argc--;
+                goto b1;
+            case 'o': op_flags |= OPT_o; if (argc < 2) Usage();
+                offs = strtoul(*(++argv), 0, 0); argc--;
+                goto b1;
             default: Usage(); break;
             }
-        argc--; argv++;
+    b1: argc--; argv++;
     }
 
     if (argc != 1)
         Usage();
     string top = *argv++;argc--;
+    cerr << "filename " << top << " offs " << offs << " cnt " << cnt << endl;
     struct stat st;
-    if (stat(top.c_str(), &st) < 0) {
+    if (!top.empty() && stat(top.c_str(), &st) < 0) {
         perror("stat");
         exit(1);
     }
-    if (S_ISDIR(st.st_mode)) {
+    if (!top.empty() && S_ISDIR(st.st_mode)) {
         FsTreeWalker walker;
         myCB cb;
         walker.walk(top, cb);
         if (walker.getErrCnt() > 0)
             cout << walker.getReason();
-    } else if (S_ISREG(st.st_mode)) {
+    } else {
         string s, reason;
-        if (!file_to_string(top, s, &reason)) {
+        if (!file_to_string(top, s, offs, cnt, &reason)) {
             cerr << reason << endl;
             exit(1);
         } else {
diff --git a/src/utils/readfile.h b/src/utils/readfile.h
index ca067db7..a28f28a8 100644
--- a/src/utils/readfile.h
+++ b/src/utils/readfile.h
@@ -21,19 +21,30 @@
 #include <string>
 
 using std::string;
 
-/**
- * Read whole file into string.
- * @return true for ok, false else
+/**
+ * Read a file in chunks, calling an accumulator for each chunk. Can be used
+ * for reading in a file, computing an MD5, etc.
  */
-bool file_to_string(const string &filename, string &data, string *reason = 0);
-
 class FileScanDo {
 public:
     virtual ~FileScanDo() {}
     virtual bool init(unsigned int size, string *reason) = 0;
     virtual bool data(const char *buf, int cnt, string* reason) = 0;
 };
-bool file_scan(const std::string &filename, FileScanDo* doer,
-               std::string *reason = 0);
+bool file_scan(const string &filename, FileScanDo* doer, string *reason = 0);
+/* Same, but only process cnt bytes starting at offset offs. Set cnt to
+ * size_t(-1) for no limit */
+bool file_scan(const string &fn, FileScanDo* doer, off_t offs, size_t cnt,
+               string *reason = 0);
+
+/**
+ * Read whole file into string.
+ * @return true for ok, false else
+ */
+bool file_to_string(const string &filename, string &data, string *reason = 0);
+
+/** Read a file chunk into a string. Set cnt to size_t(-1) to read to end of file */
+bool file_to_string(const string &filename, string &data,
+                    off_t offs, size_t cnt, string *reason = 0);
 
 #endif /* _READFILE_H_INCLUDED_ */
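
A minimal usage sketch for the chunked file_to_string() overload declared in readfile.h above. The file path, offset and byte count are arbitrary illustration values, not part of the patch:

// Read at most 4096 bytes starting at byte offset 8192 from an example file.
// Passing size_t(-1) as the count would read up to the end of the file.
#include <iostream>
#include <string>

#include "readfile.h"

int main()
{
    std::string chunk, reason;
    if (!file_to_string("/var/log/messages", chunk, 8192, 4096, &reason)) {
        std::cerr << "file_to_string failed: " << reason << std::endl;
        return 1;
    }
    std::cout << "read " << chunk.size() << " bytes" << std::endl;
    return 0;
}

This mirrors what MimeHandlerText::readnext() does when paging through a big text file: it calls file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason) and advances m_offs by the length actually kept.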