implemented paged text files

This commit is contained in:
dockes 2009-09-30 15:45:53 +00:00
parent 0e1cbddb8b
commit a374b2a7b7
6 changed files with 209 additions and 57 deletions

View File

@ -36,36 +36,54 @@ using namespace std;
#include "rclconfig.h" #include "rclconfig.h"
const int MB = 1024*1024; const int MB = 1024*1024;
const int KB = 1024;
// Process a plain text file // Process a plain text file
bool MimeHandlerText::set_document_file(const string &fn) bool MimeHandlerText::set_document_file(const string &fn)
{ {
RecollFilter::set_document_file(fn); LOGDEB(("MimeHandlerText::set_document_file: [%s]\n", fn.c_str()));
// file size RecollFilter::set_document_file(fn);
m_fn = fn;
// file size for oversize check
struct stat st; struct stat st;
if (stat(fn.c_str(), &st) < 0) { if (stat(m_fn.c_str(), &st) < 0) {
LOGERR(("MimeHandlerText::set_document_file: stat(%s) errno %d\n", LOGERR(("MimeHandlerText::set_document_file: stat(%s) errno %d\n",
fn.c_str(), errno)); m_fn.c_str(), errno));
return false; return false;
} }
// Handle max file size parameter. If it's too big, we just don't index // Max file size parameter: texts over this size are not indexed
// the text at all (should we index the first maxmbs instead ?)
int maxmbs = -1; int maxmbs = -1;
RclConfig::getMainConfig()->getConfParam("textfilemaxmbs", &maxmbs); RclConfig::getMainConfig()->getConfParam("textfilemaxmbs", &maxmbs);
string otext; if (maxmbs == -1 || st.st_size / MB <= maxmbs) {
if (st.st_size / MB <= maxmbs) { // Text file page size: if set, we split text files into
// multiple documents
int ps = -1;
RclConfig::getMainConfig()->getConfParam("textfilepagekbs", &ps);
if (ps != -1) {
ps *= KB;
m_paging = true;
}
m_pagesz = size_t(ps);
string reason; string reason;
if (!file_to_string(fn, otext, &reason)) { // file_to_string() takes pagesz == size_t(-1) to mean read all.
if (!file_to_string(fn, m_text, 0, m_pagesz, &reason)) {
LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str())); LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
return false; return false;
} }
m_offs = m_text.length();
} }
return set_document_string(otext);
string md5, xmd5;
MD5String(m_text, md5);
m_metaData["md5"] = MD5HexPrint(md5, xmd5);
m_havedoc = true;
return true;
} }
bool MimeHandlerText::set_document_string(const string& otext) bool MimeHandlerText::set_document_string(const string& otext)
{ {
m_text = otext; m_text = otext;
@ -76,29 +94,72 @@ bool MimeHandlerText::set_document_string(const string& otext)
return true; return true;
} }
bool MimeHandlerText::skip_to_document(const string& ipath)
{
sscanf(ipath.c_str(), "%lld", &m_offs);
readnext();
return true;
}
bool MimeHandlerText::next_document() bool MimeHandlerText::next_document()
{ {
LOGDEB(("MimeHandlerText::next_document: m_havedoc %d\n", int(m_havedoc)));
if (m_havedoc == false) if (m_havedoc == false)
return false; return false;
m_havedoc = false;
// We transcode even if defcharset is already utf-8:
// this validates the encoding.
LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n", LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
m_defcharset.c_str())); m_defcharset.c_str()));
if (!transcode(m_text, m_metaData["content"], m_defcharset, "UTF-8")) {
// Avoid unneeded copy. This gets a reference to an empty string which is
// the entry for "content"
string& utf8 = m_metaData["content"];
// Note that we transcode always even if defcharset is already utf-8:
// this validates the encoding.
if (!transcode(m_text, utf8, m_defcharset, "UTF-8")) {
LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed " LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
"for charset [%s]\n", m_defcharset.c_str())); "for charset [%s]\n", m_defcharset.c_str()));
utf8.erase(); m_metaData["content"].erase();
return false; return false;
} }
m_metaData["origcharset"] = m_defcharset; m_metaData["origcharset"] = m_defcharset;
m_metaData["charset"] = "utf-8"; m_metaData["charset"] = "utf-8";
m_metaData["mimetype"] = "text/plain"; m_metaData["mimetype"] = "text/plain";
// If text length is 0 (the file is empty or oversize), or we have
// read all at once, we're done
if (m_text.length() == 0 || !m_paging) {
m_havedoc = false;
return true;
} else {
// Paging: set ipath then read next chunk
char buf[20];
sprintf(buf, "%lld", m_offs - m_text.length());
m_metaData["ipath"] = buf;
readnext();
return true;
}
}
bool MimeHandlerText::readnext()
{
string reason;
m_text.erase();
if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
m_havedoc = false;
return false;
}
if (m_text.length() == 0) {
// EOF
m_havedoc = false;
return true;
}
// If possible try to adjust the chunk to end right after a line
// Don't do this for the last chunk
if (m_text.length() == m_pagesz) {
string::size_type pos = m_text.find_last_of("\n\r");
if (pos != string::npos && pos != 0) {
m_text.erase(pos);
}
}
m_offs += m_text.length();
return true; return true;
} }

View File

@ -17,6 +17,7 @@
#ifndef _MH_TEXT_H_INCLUDED_ #ifndef _MH_TEXT_H_INCLUDED_
#define _MH_TEXT_H_INCLUDED_ #define _MH_TEXT_H_INCLUDED_
/* @(#$Id: mh_text.h,v 1.5 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */ /* @(#$Id: mh_text.h,v 1.5 2008-10-04 14:26:59 dockes Exp $ (C) 2004 J.F.Dockes */
#include <sys/types.h>
#include <string> #include <string>
using std::string; using std::string;
@ -30,7 +31,8 @@ using std::string;
*/ */
class MimeHandlerText : public RecollFilter { class MimeHandlerText : public RecollFilter {
public: public:
MimeHandlerText(const string& mt) : RecollFilter(mt) {} MimeHandlerText(const string& mt)
: RecollFilter(mt), m_paging(false), m_offs(0) {}
virtual ~MimeHandlerText() {} virtual ~MimeHandlerText() {}
virtual bool set_document_file(const string &file_path); virtual bool set_document_file(const string &file_path);
virtual bool set_document_string(const string&); virtual bool set_document_string(const string&);
@ -40,13 +42,23 @@ class MimeHandlerText : public RecollFilter {
return false; return false;
} }
virtual bool next_document(); virtual bool next_document();
virtual bool skip_to_document(const string& s);
virtual void clear() virtual void clear()
{ {
m_paging = false;
m_text.erase(); m_text.erase();
m_fn.erase();
m_offs = 0;
RecollFilter::clear(); RecollFilter::clear();
} }
private: private:
bool m_paging;
string m_text; string m_text;
string m_fn;
off_t m_offs; // Offset of next read in file if we're paging
size_t m_pagesz;
bool readnext();
}; };
#endif /* _MH_TEXT_H_INCLUDED_ */ #endif /* _MH_TEXT_H_INCLUDED_ */

View File

@ -354,8 +354,7 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config)
"unsupported mime type). Default true")); "unsupported mime type). Default true"));
m_widgets.push_back(eafln); m_widgets.push_back(eafln);
ConfLink lnkzfmaxkbs(new ConfLinkRclRep(config, ConfLink lnkzfmaxkbs(new ConfLinkRclRep(config, "compressedfilemaxkbs"));
"compressedfilemaxkbs"));
ConfParamIntW *ezfmaxkbs = new ConfParamIntW *ezfmaxkbs = new
ConfParamIntW(m_groupbox, lnkzfmaxkbs, ConfParamIntW(m_groupbox, lnkzfmaxkbs,
tr("Max. compressed file size (KB)"), tr("Max. compressed file size (KB)"),
@ -365,18 +364,28 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config)
-1, 1000000); -1, 1000000);
m_widgets.push_back(ezfmaxkbs); m_widgets.push_back(ezfmaxkbs);
ConfLink lnktxtmaxmbs(new ConfLinkRclRep(config, ConfLink lnktxtmaxmbs(new ConfLinkRclRep(config, "textfilemaxmbs"));
"textfilemaxmbs"));
ConfParamIntW *etxtmaxmbs = new ConfParamIntW *etxtmaxmbs = new
ConfParamIntW(m_groupbox, lnktxtmaxmbs, ConfParamIntW(m_groupbox, lnktxtmaxmbs,
tr("Max. text file size (MB)"), tr("Max. text file size (MB)"),
tr("This value sets a threshold beyond which text " tr("This value sets a threshold beyond which text "
"files will not be processed. Set to -1 for no " "files will not be processed. Set to -1 for no "
"limit. This is for excluding monster " "limit. \nThis is for excluding monster "
"log files from the index."), "log files from the index."),
-1, 1000000); -1, 1000000);
m_widgets.push_back(etxtmaxmbs); m_widgets.push_back(etxtmaxmbs);
ConfLink lnktxtpagekbs(new ConfLinkRclRep(config, "textfilepagekbs"));
ConfParamIntW *etxtpagekbs = new
ConfParamIntW(m_groupbox, lnktxtpagekbs,
tr("Text file page size (KB)"),
tr("If this value is set (not equal to -1), text "
"files will be split in chunks of this size for "
"indexing.\nThis will help searching very big text "
" files (ie: log files)."),
-1, 1000000);
m_widgets.push_back(etxtpagekbs);
vboxLayout->addWidget(m_groupbox); vboxLayout->addWidget(m_groupbox);
subDirChanged(); subDirChanged();
} }

View File

@ -107,6 +107,12 @@ indexallfilenames = 1
# Size limit for text files. This is for skipping monster logs # Size limit for text files. This is for skipping monster logs
textfilemaxmbs = -1 textfilemaxmbs = -1
# Page size for text files, in kilobytes. If this is set (not -1),
# text/plain files will be divided into documents of approximately this
# size. This helps with accessing pieces of big text files (e.g. big logs)
# which would be problematic to load as one piece into the preview window.
textfilepagekbs = -1
# Length of abstracts we store while indexing. Longer will make for a # Length of abstracts we store while indexing. Longer will make for a
# bigger db # bigger db
# idxabsmlen = 250 # idxabsmlen = 250

View File

@ -39,6 +39,10 @@ using std::string;
#include "readfile.h" #include "readfile.h"
#ifndef MIN
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#endif
static void caterrno(string *reason, const char *what, int _errno) static void caterrno(string *reason, const char *what, int _errno)
{ {
if (reason) { if (reason) {
@ -93,16 +97,28 @@ public:
}; };
bool file_to_string(const string &fn, string &data, string *reason) bool file_to_string(const string &fn, string &data, string *reason)
{
return file_to_string(fn, data, 0, size_t(-1), reason);
}
bool file_to_string(const string &fn, string &data, off_t offs, size_t cnt,
string *reason)
{ {
FileToString accum(data); FileToString accum(data);
return file_scan(fn, &accum, reason); return file_scan(fn, &accum, offs, cnt, reason);
} }
bool file_scan(const string &fn, FileScanDo* doer, string *reason)
{
return file_scan(fn, doer, 0, size_t(-1), reason);
}
const int RDBUFSZ = 4096;
// Note: the fstat() + reserve() (in init()) calls divide cpu usage almost by 2 // Note: the fstat() + reserve() (in init()) calls divide cpu usage almost by 2
// on both linux i586 and macosx (compared to just append()) // on both linux i586 and macosx (compared to just append())
// Also tried a version with mmap, but it's actually slower on the mac and not // Also tried a version with mmap, but it's actually slower on the mac and not
// faster on linux. // faster on linux.
bool file_scan(const string &fn, FileScanDo* doer, string *reason) bool file_scan(const string &fn, FileScanDo* doer, off_t startoffs,
size_t cnttoread, string *reason)
{ {
bool ret = false; bool ret = false;
bool noclosing = true; bool noclosing = true;
@ -120,13 +136,36 @@ bool file_scan(const string &fn, FileScanDo* doer, string *reason)
} }
noclosing = false; noclosing = false;
} }
if (st.st_size > 0)
if (st.st_size > 0) {
doer->init(st.st_size+1, reason); doer->init(st.st_size+1, reason);
else } else if (cnttoread) {
doer->init(cnttoread+1, reason);
} else {
doer->init(0, reason); doer->init(0, reason);
char buf[4096]; }
off_t curoffs = 0;
if (startoffs > 0 && !fn.empty()) {
if (lseek(fd, startoffs, SEEK_SET) != startoffs) {
caterrno(reason, "lseek", errno);
return false;
}
curoffs = startoffs;
}
char buf[RDBUFSZ];
size_t totread = 0;
for (;;) { for (;;) {
int n = read(fd, buf, 4096); size_t toread = RDBUFSZ;
if (startoffs > 0 && curoffs < startoffs) {
toread = MIN(RDBUFSZ, startoffs - curoffs);
}
if (cnttoread != size_t(-1)) {
toread = MIN(toread, cnttoread - totread);
}
int n = read(fd, buf, toread);
if (n < 0) { if (n < 0) {
caterrno(reason, "read", errno); caterrno(reason, "read", errno);
goto out; goto out;
@ -134,9 +173,16 @@ bool file_scan(const string &fn, FileScanDo* doer, string *reason)
if (n == 0) if (n == 0)
break; break;
curoffs += n;
if (curoffs - n < startoffs)
continue;
if (!doer->data(buf, n, reason)) { if (!doer->data(buf, n, reason)) {
goto out; goto out;
} }
totread += n;
if (cnttoread > 0 && totread >= cnttoread)
break;
} }
ret = true; ret = true;
@ -150,6 +196,8 @@ bool file_scan(const string &fn, FileScanDo* doer, string *reason)
#include <sys/stat.h> #include <sys/stat.h>
#include <stdlib.h> #include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <string> #include <string>
#include <iostream> #include <iostream>
@ -160,11 +208,6 @@ using namespace std;
using namespace std; using namespace std;
static int op_flags;
#define OPT_MOINS 0x1
#define OPT_f 0x2
#define OPT_F 0x4
class myCB : public FsTreeWalkerCB { class myCB : public FsTreeWalkerCB {
public: public:
FsTreeWalker::Status processone(const string &path, FsTreeWalker::Status processone(const string &path,
@ -192,9 +235,14 @@ class myCB : public FsTreeWalkerCB {
} }
}; };
static int op_flags;
#define OPT_MOINS 0x1
#define OPT_c 0x2
#define OPT_o 0x4
static const char *thisprog; static const char *thisprog;
static char usage [] = static char usage [] =
"trreadfile topdirorfile\n\n" "trreadfile [-o offs] [-c cnt] topdirorfile\n\n"
; ;
static void static void
Usage(void) Usage(void)
@ -205,8 +253,8 @@ Usage(void)
int main(int argc, const char **argv) int main(int argc, const char **argv)
{ {
list<string> patterns; off_t offs = 0;
list<string> paths; size_t cnt = size_t(-1);
thisprog = argv[0]; thisprog = argv[0];
argc--; argv++; argc--; argv++;
@ -217,31 +265,36 @@ int main(int argc, const char **argv)
Usage(); Usage();
while (**argv) while (**argv)
switch (*(*argv)++) { switch (*(*argv)++) {
case 'f': op_flags |= OPT_f;break; case 'c': op_flags |= OPT_c; if (argc < 2) Usage();
case 'F': op_flags |= OPT_F;break; cnt = atol(*(++argv)); argc--;
goto b1;
case 'o': op_flags |= OPT_c; if (argc < 2) Usage();
offs = strtoul(*(++argv), 0, 0); argc--;
goto b1;
default: Usage(); break; default: Usage(); break;
} }
argc--; argv++; b1: argc--; argv++;
} }
if (argc != 1) if (argc != 1)
Usage(); Usage();
string top = *argv++;argc--; string top = *argv++;argc--;
cerr << "filename " << top << " offs " << offs << " cnt " << cnt << endl;
struct stat st; struct stat st;
if (stat(top.c_str(), &st) < 0) { if (!top.empty() && stat(top.c_str(), &st) < 0) {
perror("stat"); perror("stat");
exit(1); exit(1);
} }
if (S_ISDIR(st.st_mode)) { if (!top.empty() && S_ISDIR(st.st_mode)) {
FsTreeWalker walker; FsTreeWalker walker;
myCB cb; myCB cb;
walker.walk(top, cb); walker.walk(top, cb);
if (walker.getErrCnt() > 0) if (walker.getErrCnt() > 0)
cout << walker.getReason(); cout << walker.getReason();
} else if (S_ISREG(st.st_mode)) { } else {
string s, reason; string s, reason;
if (!file_to_string(top, s, &reason)) { if (!file_to_string(top, s, offs, cnt, &reason)) {
cerr << reason << endl; cerr << reason << endl;
exit(1); exit(1);
} else { } else {

View File

@ -21,19 +21,30 @@
#include <string> #include <string>
using std::string; using std::string;
/** /**
* Read whole file into string. * Read file in chunks, calling an accumulator for each chunk. Can be used
* @return true for ok, false else * for reading in a file, computing an md5...
*/ */
bool file_to_string(const string &filename, string &data, string *reason = 0);
class FileScanDo { class FileScanDo {
public: public:
virtual ~FileScanDo() {} virtual ~FileScanDo() {}
virtual bool init(unsigned int size, string *reason) = 0; virtual bool init(unsigned int size, string *reason) = 0;
virtual bool data(const char *buf, int cnt, string* reason) = 0; virtual bool data(const char *buf, int cnt, string* reason) = 0;
}; };
bool file_scan(const std::string &filename, FileScanDo* doer, bool file_scan(const string &filename, FileScanDo* doer, string *reason = 0);
std::string *reason = 0); /* Same but only process count cnt from offset offs. Set cnt to size_t(-1)
* for no limit */
bool file_scan(const string &fn, FileScanDo* doer, off_t offs, size_t cnt,
string *reason = 0);
/**
* Read file into string.
* @return true for ok, false else
*/
bool file_to_string(const string &filename, string &data, string *reason = 0);
/** Read file chunk into string. Set cnt to size_t(-1) for whole file */
bool file_to_string(const string &filename, string &data,
off_t offs, size_t cnt, string *reason = 0);
#endif /* _READFILE_H_INCLUDED_ */ #endif /* _READFILE_H_INCLUDED_ */