From a374b2a7b70a69107d4c6e80d3d4485965a0952d Mon Sep 17 00:00:00 2001
From: dockes
Date: Wed, 30 Sep 2009 15:45:53 +0000
Subject: [PATCH] implemented paged text files

---
 src/internfile/mh_text.cpp         | 107 ++++++++++++++++++++++-------
 src/internfile/mh_text.h           |  14 +++-
 src/qtgui/confgui/confguiindex.cpp |  19 +++--
 src/sampleconf/recoll.conf.in      |   6 ++
 src/utils/readfile.cpp             |  95 +++++++++++++++++++------
 src/utils/readfile.h               |  25 +++++--
 6 files changed, 209 insertions(+), 57 deletions(-)

diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp
index c5673b01..32cdb1c8 100644
--- a/src/internfile/mh_text.cpp
+++ b/src/internfile/mh_text.cpp
@@ -36,36 +36,54 @@ using namespace std;
 #include "rclconfig.h"
 
 const int MB = 1024*1024;
+const int KB = 1024;
 
 // Process a plain text file
 bool MimeHandlerText::set_document_file(const string &fn)
 {
-    RecollFilter::set_document_file(fn);
+    LOGDEB(("MimeHandlerText::set_document_file: [%s]\n", fn.c_str()));
 
-    // file size
+    RecollFilter::set_document_file(fn);
+    m_fn = fn;
+
+    // file size for oversize check
     struct stat st;
-    if (stat(fn.c_str(), &st) < 0) {
+    if (stat(m_fn.c_str(), &st) < 0) {
         LOGERR(("MimeHandlerText::set_document_file: stat(%s) errno %d\n",
-                fn.c_str(), errno));
+                m_fn.c_str(), errno));
         return false;
     }
 
-    // Handle max file size parameter. If it's too big, we just don't index
-    // the text at all (should we index the first maxmbs instead ?)
+    // Max file size parameter: texts over this size are not indexed
     int maxmbs = -1;
     RclConfig::getMainConfig()->getConfParam("textfilemaxmbs", &maxmbs);
-    string otext;
-    if (st.st_size / MB <= maxmbs) {
+    if (maxmbs == -1 || st.st_size / MB <= maxmbs) {
+        // Text file page size: if set, we split text files into
+        // multiple documents
+        int ps = -1;
+        RclConfig::getMainConfig()->getConfParam("textfilepagekbs", &ps);
+        if (ps != -1) {
+            ps *= KB;
+            m_paging = true;
+        }
+        m_pagesz = size_t(ps);
         string reason;
-        if (!file_to_string(fn, otext, &reason)) {
+        // file_to_string() takes pagesz == size_t(-1) to mean read all.
+        if (!file_to_string(fn, m_text, 0, m_pagesz, &reason)) {
             LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
             return false;
         }
+        m_offs = m_text.length();
     }
-    return set_document_string(otext);
+
+    string md5, xmd5;
+    MD5String(m_text, md5);
+    m_metaData["md5"] = MD5HexPrint(md5, xmd5);
+    m_havedoc = true;
+    return true;
 }
-
+
 bool MimeHandlerText::set_document_string(const string& otext)
 {
     m_text = otext;
@@ -76,29 +94,72 @@ bool MimeHandlerText::set_document_string(const string& otext)
     return true;
 }
 
+bool MimeHandlerText::skip_to_document(const string& ipath)
+{
+    sscanf(ipath.c_str(), "%lld", &m_offs);
+    readnext();
+    return true;
+}
+
 bool MimeHandlerText::next_document()
-{
+{
+    LOGDEB(("MimeHandlerText::next_document: m_havedoc %d\n", int(m_havedoc)));
+
     if (m_havedoc == false)
         return false;
-    m_havedoc = false;
+
+    // We transcode even if defcharset is already utf-8:
+    // this validates the encoding.
     LOGDEB1(("MimeHandlerText::mkDoc: transcod from %s to utf-8\n",
              m_defcharset.c_str()));
-
-    // Avoid unneeded copy. This gets a reference to an empty string which is
-    // the entry for "content"
-    string& utf8 = m_metaData["content"];
-
-    // Note that we transcode always even if defcharset is already utf-8:
-    // this validates the encoding.
-    if (!transcode(m_text, utf8, m_defcharset, "UTF-8")) {
+    if (!transcode(m_text, m_metaData["content"], m_defcharset, "UTF-8")) {
         LOGERR(("MimeHandlerText::mkDoc: transcode to utf-8 failed "
                 "for charset [%s]\n", m_defcharset.c_str()));
-        utf8.erase();
+        m_metaData["content"].erase();
         return false;
     }
-
     m_metaData["origcharset"] = m_defcharset;
     m_metaData["charset"] = "utf-8";
     m_metaData["mimetype"] = "text/plain";
+
+    // If text length is 0 (the file is empty or oversize), or we have
+    // read all at once, we're done
+    if (m_text.length() == 0 || !m_paging) {
+        m_havedoc = false;
+        return true;
+    } else {
+        // Paging: set ipath then read next chunk
+        char buf[20];
+        sprintf(buf, "%lld", m_offs - m_text.length());
+        m_metaData["ipath"] = buf;
+        readnext();
+        return true;
+    }
+}
+
+bool MimeHandlerText::readnext()
+{
+    string reason;
+    m_text.erase();
+    if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
+        LOGERR(("MimeHandlerText: can't read file: %s\n", reason.c_str()));
+        m_havedoc = false;
+        return false;
+    }
+    if (m_text.length() == 0) {
+        // EOF
+        m_havedoc = false;
+        return true;
+    }
+
+    // If possible, try to adjust the chunk to end right after a line break.
+    // Don't do this for the last chunk.
+    if (m_text.length() == m_pagesz) {
+        string::size_type pos = m_text.find_last_of("\n\r");
+        if (pos != string::npos && pos != 0) {
+            m_text.erase(pos);
+        }
+    }
+    m_offs += m_text.length();
     return true;
 }
diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h
index 94b66bb1..2ce7d101 100644
--- a/src/internfile/mh_text.h
+++ b/src/internfile/mh_text.h
@@ -17,6 +17,7 @@
 #ifndef _MH_TEXT_H_INCLUDED_
 #define _MH_TEXT_H_INCLUDED_
 /* @(#$Id: mh_text.h,v 1.5 2008-10-04 14:26:59 dockes Exp $  (C) 2004 J.F.Dockes */
+#include <sys/types.h>
 #include <string>
 
 using std::string;
@@ -30,7 +31,8 @@ using std::string;
  */
 class MimeHandlerText : public RecollFilter {
  public:
-    MimeHandlerText(const string& mt) : RecollFilter(mt) {}
+    MimeHandlerText(const string& mt)
+        : RecollFilter(mt), m_paging(false), m_offs(0) {}
     virtual ~MimeHandlerText() {}
     virtual bool set_document_file(const string &file_path);
     virtual bool set_document_string(const string&);
@@ -40,13 +42,23 @@ class MimeHandlerText : public RecollFilter {
         return false;
     }
     virtual bool next_document();
+    virtual bool skip_to_document(const string& s);
     virtual void clear() {
+        m_paging = false;
         m_text.erase();
+        m_fn.erase();
+        m_offs = 0;
         RecollFilter::clear();
     }
 
 private:
+    bool m_paging;
     string m_text;
+    string m_fn;
+    off_t m_offs; // Offset of next read in file if we're paging
+    size_t m_pagesz;
+
+    bool readnext();
 };
 
 #endif /* _MH_TEXT_H_INCLUDED_ */
diff --git a/src/qtgui/confgui/confguiindex.cpp b/src/qtgui/confgui/confguiindex.cpp
index bf55de93..8d7de6a8 100644
--- a/src/qtgui/confgui/confguiindex.cpp
+++ b/src/qtgui/confgui/confguiindex.cpp
@@ -354,8 +354,7 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config)
                   "unsupported mime type). Default true"));
     m_widgets.push_back(eafln);
 
-    ConfLink lnkzfmaxkbs(new ConfLinkRclRep(config,
-                                            "compressedfilemaxkbs"));
+    ConfLink lnkzfmaxkbs(new ConfLinkRclRep(config, "compressedfilemaxkbs"));
     ConfParamIntW *ezfmaxkbs = new
         ConfParamIntW(m_groupbox, lnkzfmaxkbs,
                       tr("Max. compressed file size (KB)"),
@@ -365,18 +364,28 @@ ConfSubPanelW::ConfSubPanelW(QWidget *parent, ConfNull *config)
                       -1, 1000000);
     m_widgets.push_back(ezfmaxkbs);
 
-    ConfLink lnktxtmaxmbs(new ConfLinkRclRep(config,
-                                             "textfilemaxmbs"));
+    ConfLink lnktxtmaxmbs(new ConfLinkRclRep(config, "textfilemaxmbs"));
     ConfParamIntW *etxtmaxmbs = new
         ConfParamIntW(m_groupbox, lnktxtmaxmbs,
                       tr("Max. text file size (MB)"),
                       tr("This value sets a threshold beyond which text "
                          "files will not be processed. Set to -1 for no "
-                         "limit. This is for excluding monster "
+                         "limit. \nThis is for excluding monster "
                          "log files from the index."),
                       -1, 1000000);
     m_widgets.push_back(etxtmaxmbs);
 
+    ConfLink lnktxtpagekbs(new ConfLinkRclRep(config, "textfilepagekbs"));
+    ConfParamIntW *etxtpagekbs = new
+        ConfParamIntW(m_groupbox, lnktxtpagekbs,
+                      tr("Text file page size (KB)"),
+                      tr("If this value is set (not equal to -1), text "
+                         "files will be split into chunks of this size for "
+                         "indexing.\nThis will help with searching very big "
+                         "text files (e.g. log files)."),
+                      -1, 1000000);
+    m_widgets.push_back(etxtpagekbs);
+
     vboxLayout->addWidget(m_groupbox);
 
     subDirChanged();
 }
diff --git a/src/sampleconf/recoll.conf.in b/src/sampleconf/recoll.conf.in
index ff9c5092..0e93da9b 100644
--- a/src/sampleconf/recoll.conf.in
+++ b/src/sampleconf/recoll.conf.in
@@ -107,6 +107,12 @@ indexallfilenames = 1
 # Size limit for text files. This is for skipping monster logs
 textfilemaxmbs = -1
 
+# Page size for text files. If this is set, text/plain files will be
+# divided into documents of approximately this size. This makes it possible
+# to access pieces of big text files (e.g. big logs) which would be
+# problematic to load into the preview window as one piece.
+textfilepagekbs = -1
+
 # Length of abstracts we store while indexing. Longer will make for a
 # bigger db
 # idxabsmlen = 250
diff --git a/src/utils/readfile.cpp b/src/utils/readfile.cpp
index 2cb43faf..51ff180c 100644
--- a/src/utils/readfile.cpp
+++ b/src/utils/readfile.cpp
@@ -39,6 +39,10 @@ using std::string;
 
 #include "readfile.h"
 
+#ifndef MIN
+#define MIN(A,B) ((A) < (B) ? (A) : (B))
+#endif
+
 static void caterrno(string *reason, const char *what, int _errno)
 {
     if (reason) {
@@ -93,16 +97,28 @@ public:
 };
 
 bool file_to_string(const string &fn, string &data, string *reason)
+{
+    return file_to_string(fn, data, 0, size_t(-1), reason);
+}
+bool file_to_string(const string &fn, string &data, off_t offs, size_t cnt,
+                    string *reason)
 {
     FileToString accum(data);
-    return file_scan(fn, &accum, reason);
+    return file_scan(fn, &accum, offs, cnt, reason);
 }
 
+bool file_scan(const string &fn, FileScanDo* doer, string *reason)
+{
+    return file_scan(fn, doer, 0, size_t(-1), reason);
+}
+
+const int RDBUFSZ = 4096;
 // Note: the fstat() + reserve() (in init()) calls divide cpu usage almost by 2
 // on both linux i586 and macosx (compared to just append())
 // Also tried a version with mmap, but it's actually slower on the mac and not faster on linux.
-bool file_scan(const string &fn, FileScanDo* doer, string *reason)
+bool file_scan(const string &fn, FileScanDo* doer, off_t startoffs,
+               size_t cnttoread, string *reason)
 {
     bool ret = false;
     bool noclosing = true;
@@ -120,13 +136,36 @@
         }
         noclosing = false;
     }
-    if (st.st_size > 0)
+
+    if (st.st_size > 0) {
         doer->init(st.st_size+1, reason);
-    else
+    } else if (cnttoread) {
+        doer->init(cnttoread+1, reason);
+    } else {
         doer->init(0, reason);
-    char buf[4096];
+    }
+
+    off_t curoffs = 0;
+    if (startoffs > 0 && !fn.empty()) {
+        if (lseek(fd, startoffs, SEEK_SET) != startoffs) {
+            caterrno(reason, "lseek", errno);
+            return false;
+        }
+        curoffs = startoffs;
+    }
+
+    char buf[RDBUFSZ];
+    size_t totread = 0;
     for (;;) {
-        int n = read(fd, buf, 4096);
+        size_t toread = RDBUFSZ;
+        if (startoffs > 0 && curoffs < startoffs) {
+            toread = MIN(RDBUFSZ, startoffs - curoffs);
+        }
+
+        if (cnttoread != size_t(-1)) {
+            toread = MIN(toread, cnttoread - totread);
+        }
+        int n = read(fd, buf, toread);
         if (n < 0) {
             caterrno(reason, "read", errno);
             goto out;
@@ -134,9 +173,16 @@
         if (n == 0)
             break;
 
+        curoffs += n;
+        if (curoffs - n < startoffs)
+            continue;
+
         if (!doer->data(buf, n, reason)) {
             goto out;
         }
+        totread += n;
+        if (cnttoread > 0 && totread >= cnttoread)
+            break;
     }
     ret = true;
@@ -150,6 +196,8 @@
 #include
 #include
+#include
+#include
 #include
 #include
@@ -160,11 +208,6 @@ using namespace std;
 
 using namespace std;
 
-static int op_flags;
-#define OPT_MOINS 0x1
-#define OPT_f     0x2
-#define OPT_F     0x4
-
 class myCB : public FsTreeWalkerCB {
  public:
     FsTreeWalker::Status processone(const string &path,
@@ -192,9 +235,14 @@ class myCB : public FsTreeWalkerCB {
     }
 };
 
+static int op_flags;
+#define OPT_MOINS 0x1
+#define OPT_c     0x2
+#define OPT_o     0x4
+
 static const char *thisprog;
 static char usage [] =
-"trreadfile topdirorfile\n\n"
+"trreadfile [-o offs] [-c cnt] topdirorfile\n\n"
 ;
 static void
 Usage(void)
@@ -205,8 +253,8 @@ Usage(void)
 int main(int argc, const char **argv)
 {
-    list<string> patterns;
-    list<string> paths;
+    off_t offs = 0;
+    size_t cnt = size_t(-1);
     thisprog = argv[0];
     argc--; argv++;
 
@@ -217,31 +265,36 @@ int main(int argc, const char **argv)
             Usage();
         while (**argv)
             switch (*(*argv)++) {
-            case 'f': op_flags |= OPT_f;break;
-            case 'F': op_flags |= OPT_F;break;
+            case 'c': op_flags |= OPT_c; if (argc < 2) Usage();
+                cnt = atol(*(++argv)); argc--;
+                goto b1;
+            case 'o': op_flags |= OPT_o; if (argc < 2) Usage();
+                offs = strtoul(*(++argv), 0, 0); argc--;
+                goto b1;
             default: Usage(); break;
             }
-        argc--; argv++;
+    b1: argc--; argv++;
     }
 
     if (argc != 1)
         Usage();
     string top = *argv++;argc--;
+    cerr << "filename " << top << " offs " << offs << " cnt " << cnt << endl;
     struct stat st;
-    if (stat(top.c_str(), &st) < 0) {
+    if (!top.empty() && stat(top.c_str(), &st) < 0) {
         perror("stat");
         exit(1);
     }
-    if (S_ISDIR(st.st_mode)) {
+    if (!top.empty() && S_ISDIR(st.st_mode)) {
         FsTreeWalker walker;
         myCB cb;
         walker.walk(top, cb);
         if (walker.getErrCnt() > 0)
             cout << walker.getReason();
-    } else if (S_ISREG(st.st_mode)) {
+    } else {
         string s, reason;
-        if (!file_to_string(top, s, &reason)) {
+        if (!file_to_string(top, s, offs, cnt, &reason)) {
             cerr << reason << endl;
             exit(1);
         } else {
diff --git a/src/utils/readfile.h b/src/utils/readfile.h
index ca067db7..a28f28a8 100644
--- a/src/utils/readfile.h
+++ b/src/utils/readfile.h
@@ -21,19 +21,30 @@
 #include <string>
 
 using std::string;
 
-/**
- * Read whole file into string.
- * @return true for ok, false else
+/**
+ * Read a file in chunks, calling an accumulator for each chunk. Can be used
+ * for reading in a file, computing an MD5, etc.
  */
-bool file_to_string(const string &filename, string &data, string *reason = 0);
-
 class FileScanDo {
 public:
     virtual ~FileScanDo() {}
     virtual bool init(unsigned int size, string *reason) = 0;
     virtual bool data(const char *buf, int cnt, string* reason) = 0;
 };
-bool file_scan(const std::string &filename, FileScanDo* doer,
-               std::string *reason = 0);
+bool file_scan(const string &filename, FileScanDo* doer, string *reason = 0);
+/* Same, but only process cnt bytes starting at offset offs. Set cnt to
+ * size_t(-1) for no limit */
+bool file_scan(const string &fn, FileScanDo* doer, off_t offs, size_t cnt,
+               string *reason = 0);
+
+/**
+ * Read whole file into string.
+ * @return true for ok, false else
+ */
+bool file_to_string(const string &filename, string &data, string *reason = 0);
+
+/** Read a file chunk into a string. Set cnt to size_t(-1) to read to end of file */
+bool file_to_string(const string &filename, string &data,
+                    off_t offs, size_t cnt, string *reason = 0);
 
 #endif /* _READFILE_H_INCLUDED_ */
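
A minimal usage sketch for the chunked file_to_string() overload declared in readfile.h above. The file path, offset and byte count are arbitrary illustration values, not part of the patch:

// Read at most 4096 bytes starting at byte offset 8192 from an example file.
// Passing size_t(-1) as the count would read up to the end of the file.
#include <iostream>
#include <string>

#include "readfile.h"

int main()
{
    std::string chunk, reason;
    if (!file_to_string("/var/log/messages", chunk, 8192, 4096, &reason)) {
        std::cerr << "file_to_string failed: " << reason << std::endl;
        return 1;
    }
    std::cout << "read " << chunk.size() << " bytes" << std::endl;
    return 0;
}

This mirrors what MimeHandlerText::readnext() does when paging through a big text file: it calls file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason) and advances m_offs by the length actually kept.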