From 15924ce03728f384ef92d466ca165b3bf33afd81 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Sat, 15 Aug 2020 10:20:48 +0200
Subject: [PATCH] Process text/plain subdocuments like .txt files (paging big
 ones, etc.)

---
 src/internfile/internfile.cpp  |   5 +-
 src/internfile/mh_execm.cpp    |   9 ++
 src/internfile/mh_text.cpp     | 127 ++++++++++++---------
 src/internfile/mh_text.cpp-old | 201 +++++++++++++++++++++++++++++++++
 src/internfile/mh_text.h       |   6 +-
 src/sampleconf/mimeconf        |   1 +
 6 files changed, 294 insertions(+), 55 deletions(-)
 create mode 100644 src/internfile/mh_text.cpp-old

diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp
index cf5df032..68336a31 100644
--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@@ -939,7 +939,10 @@ breakloop:
     }
     // Keep this AFTER collectIpathAndMT
     dijontorcl(doc);
-
+    // Fix the bogus mtype used to force mh_text processing of text subdocs
+    if (doc.mimetype == "text/plain1") {
+        doc.mimetype = "text/plain";
+    }
     // Possibly destack so that we can test for FIDone. While doing this
     // possibly set aside an ancestor html text (for the GUI preview)
     while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp
index d15975b0..5b5394ee 100644
--- a/src/internfile/mh_execm.cpp
+++ b/src/internfile/mh_execm.cpp
@@ -314,6 +314,15 @@ bool MimeHandlerExecMultiple::next_document()
                     mtype = "application/octet-stream";
                 }
             }
+            /* If we identify text/plain from the suffix (as opposed
+               to the handler setting the type), we use text/plain1
+               instead. As directed in mimeconf, this will cause the
+               text handler to be applied (instead of internfile just
+               ending things there), allowing splitting and default
+               charset conversions. */
+            if (mtype == "text/plain") {
+                mtype = "text/plain1";
+            }
         }
         m_metaData[cstr_dj_keymt] = mtype;
         if (!m_forPreview) {
diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp
index a3fec423..a80008f6 100644
--- a/src/internfile/mh_text.cpp
+++ b/src/internfile/mh_text.cpp
@@ -36,8 +36,23 @@
 
 using namespace std;
 
-const int MB = 1024*1024;
-const int KB = 1024;
+// Read the indexing parameters from the configuration and reset the
+// read offset. Called each time a new file or string document is set.
+void MimeHandlerText::getparams()
+{
+    // Max text size in MBs: bigger texts are not indexed (-1: no limit)
+    m_config->getConfParam("textfilemaxmbs", &m_maxmbs);
+
+    // Text file page size: if set, we split text files into
+    // multiple documents. Any negative value disables paging.
+    int ps = 1000;
+    m_config->getConfParam("textfilepagekbs", &ps);
+    m_paging = ps >= 0;
+    // Multiply as size_t: the int product could overflow. When not
+    // paging, size_t(-1) (the max value) is used as the page size
+    m_pagesz = m_paging ? size_t(ps) * 1024 : size_t(-1);
+    m_offs = 0;
+}
 
 // Process a plain text file
 bool MimeHandlerText::set_document_file_impl(const string&, const string &fn)
@@ -46,13 +61,9 @@ bool MimeHandlerText::set_document_file_impl(const string&, const string &fn)
            m_offs << "\n");
 
     m_fn = fn;
-    // This should not be necessary, but it happens on msw that offset is large
-    // negative at this point, could not find the reason (still trying).
-    m_offs = 0;
-
     // file size for oversize check
-    long long fsize = path_filesize(m_fn);
-    if (fsize < 0) {
+    m_totlen = path_filesize(m_fn);
+    if (m_totlen < 0) {
         LOGERR("MimeHandlerText::set_document_file: stat " << m_fn <<
                " errno " << errno << "\n");
         return false;
@@ -64,31 +75,14 @@ bool MimeHandlerText::set_document_file_impl(const string&, const string &fn)
     pxattr::get(m_fn, "charset", &m_charsetfromxattr);
 #endif
 
-    // Max file size parameter: texts over this size are not indexed
-    int maxmbs = 20;
-    m_config->getConfParam("textfilemaxmbs", &maxmbs);
-
-    if (maxmbs == -1 || fsize / MB <= maxmbs) {
-        // Text file page size: if set, we split text files into
-        // multiple documents
-        int ps = 1000;
-        m_config->getConfParam("textfilepagekbs", &ps);
-        if (ps != -1) {
-            ps *= KB;
-            m_paging = true;
-        }
-        // Note: size_t is guaranteed unsigned, so max if ps is -1
-        m_pagesz = size_t(ps);
-        if (!readnext())
-            return false;
-    } else {
-        LOGINF("MimeHandlerText: file too big (textfilemaxmbs=" << maxmbs <<
+    getparams();
+    if (m_maxmbs != -1 && m_totlen / (1024*1024) > m_maxmbs) {
+        LOGINF("MimeHandlerText: file too big (textfilemaxmbs=" << m_maxmbs <<
                "), contents will not be indexed: " << fn << endl);
-    }
-    if (!m_forPreview) {
-    string md5, xmd5;
-    MD5String(m_text, md5);
-    m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
+    } else {
+        if (!readnext()) {
+            return false;
+        } 
     }
     m_havedoc = true;
     return true;
@@ -97,12 +91,25 @@ bool MimeHandlerText::set_document_file_impl(const string&, const string &fn)
 bool MimeHandlerText::set_document_string_impl(const string&,
                                                const string& otext)
 {
-    m_text = otext;
-    if (!m_forPreview) {
-    string md5, xmd5;
-    MD5String(m_text, md5);
-    m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
+    m_fn.clear(); // An empty file name makes readnext() page through m_alltext
+    m_totlen = otext.size();
+
+    getparams();
+    if (m_maxmbs != -1 && m_totlen / (1024*1024) > m_maxmbs) {
+        LOGINF("MimeHandlerText: text too big (textfilemaxmbs=" << m_maxmbs <<
+               "), contents will not be indexed\n");
+    } else {
+        if (!m_paging || (m_totlen <= (int64_t)m_pagesz)) {
+            // Single chunk: use the text directly, no second copy in m_alltext
+            m_paging = false;
+            m_text = otext;
+            m_offs = m_totlen;
+        } else {
+            m_alltext = otext; // Keep the whole text: readnext() cuts the pages from it
+            readnext(); // Loads the first page into m_text
+        }
     }
+    // m_havedoc is set in all cases, including for a rejected (too big) text
     m_havedoc = true;
     return true;
 }
@@ -112,9 +119,9 @@ bool MimeHandlerText::skip_to_document(const string& ipath)
     char *endptr;
     int64_t t = strtoll(ipath.c_str(), &endptr, 10);
     if (endptr == ipath.c_str()) {
-    LOGERR("MimeHandlerText::skip_to_document: bad ipath offs ["  <<
+        LOGERR("MimeHandlerText::skip_to_document: bad ipath offs ["  <<
                ipath << "]\n");
-    return false;
+        return false;
     }
     m_offs = t;
     readnext();
@@ -126,16 +133,21 @@ bool MimeHandlerText::next_document()
     LOGDEB("MimeHandlerText::next_document: m_havedoc "  << m_havedoc << "\n");
 
     if (m_havedoc == false)
-    return false;
+        return false;
 
     if (m_charsetfromxattr.empty())
-    m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
+        m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
     else 
-    m_metaData[cstr_dj_keyorigcharset] = m_charsetfromxattr;
+        m_metaData[cstr_dj_keyorigcharset] = m_charsetfromxattr;
 
     m_metaData[cstr_dj_keymt] = cstr_textplain;
 
     size_t srclen = m_text.length();
+    if (!m_forPreview) {
+        string md5, xmd5;
+        MD5String(m_text, md5);
+        m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
+    }
     m_metaData[cstr_dj_keycontent].swap(m_text);
 
     // We transcode even if defcharset is supposedly already utf-8:
@@ -143,7 +155,6 @@ bool MimeHandlerText::next_document()
     // txtdcode() truncates the text if transcoding fails
     (void)txtdcode("mh_text");
 
-
     // If the text length is 0 (the file is empty or oversize), or we are 
     // not paging, we're done
     if (srclen == 0 || !m_paging) {
@@ -152,21 +163,26 @@ bool MimeHandlerText::next_document()
     } else {
         // Paging: set ipath then read next chunk. 
 
+        int64_t start_offset = m_offs - srclen;
+        string buf = lltodecstr(start_offset);
+
         // Don't set ipath for the first chunk to avoid having 2
         // records for small files (one for the file, one for the
         // first chunk). This is a hack. The right thing to do would
         // be to use a different mtype for files over the page size,
         // and keep text/plain only for smaller files.
-        string buf = lltodecstr(m_offs - srclen);
-        if (m_offs - srclen != 0)
+        if (start_offset != 0)
             m_metaData[cstr_dj_keyipath] = buf;
+
         readnext();
+
         // This ensures that the first chunk (offs==srclen) of a
         // multi-chunk file does have an ipath. Else it stands for the
-        // whole file, which used to be the case but does not seem
-        // right
+        // whole file (see just above), which used to be the case but
+        // does not seem right
         if (m_havedoc)
             m_metaData[cstr_dj_keyipath] = buf;
+
         return true;
     }
 }
@@ -175,11 +191,16 @@ bool MimeHandlerText::readnext()
 {
     string reason;
     m_text.clear();
-    if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
-        LOGERR("MimeHandlerText: can't read file: "  << reason << "\n" );
-        m_havedoc = false;
-        return false;
+    if (!m_fn.empty()) { // File input: read the next page from the file
+        if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
+            LOGERR("MimeHandlerText: can't read file: "  << reason << "\n" );
+            m_havedoc = false;
+            return false;
+        }
+    } else if (m_offs >= 0 && size_t(m_offs) < m_alltext.size()) { // Memory input
+        m_text = m_alltext.substr(m_offs, m_pagesz); // Guarded: substr() throws past the end
     }
+    // Nothing read: end of data, or an out of range offset (e.g. bad ipath)
     if (m_text.length() == 0) {
         // EOF
         m_havedoc = false;
@@ -189,7 +210,8 @@ bool MimeHandlerText::readnext()
     // If possible try to adjust the chunk to end right after a line
     // Don't do this for the last chunk. Last chunk of exactly the
     // page size might be unduly split, no big deal
-    if (m_text.length() == m_pagesz) {
+    if (m_text.length() == m_pagesz && m_text.back() != '\n' &&
+        m_text.back() != '\r') {
         string::size_type pos = m_text.find_last_of("\n\r");
         if (pos != string::npos && pos != 0) {
             m_text.erase(pos);
@@ -198,4 +220,3 @@ bool MimeHandlerText::readnext()
     m_offs += m_text.length();
     return true;
 }
-
diff --git a/src/internfile/mh_text.cpp-old b/src/internfile/mh_text.cpp-old
new file mode 100644
index 00000000..306a63c8
--- /dev/null
+++ b/src/internfile/mh_text.cpp-old
@@ -0,0 +1,201 @@
+/* Copyright (C) 2005 J.F.Dockes 
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include "autoconfig.h"
+
+#include <stdio.h>
+#include <errno.h>
+#include "safefcntl.h"
+#include <sys/types.h>
+#include "safeunistd.h"
+
+#include <iostream>
+#include <string>
+
+#include "cstr.h"
+#include "mh_text.h"
+#include "log.h"
+#include "readfile.h"
+#include "md5ut.h"
+#include "rclconfig.h"
+#include "pxattr.h"
+#include "pathut.h"
+
+using namespace std;
+
+const int MB = 1024*1024;
+const int KB = 1024;
+
+// Process a plain text file: check its size, then read the first chunk
+bool MimeHandlerText::set_document_file_impl(const string&, const string &fn)
+{
+    LOGDEB("MimeHandlerText::set_document_file: [" << fn << "] offs " <<
+           m_offs << "\n");
+
+    m_fn = fn;
+    // This should not be necessary, but it happens on msw that offset is large
+    // negative at this point, could not find the reason (still trying).
+    m_offs = 0;
+
+    // File size for the oversize check. A negative value is handled as a stat error
+    long long fsize = path_filesize(m_fn);
+    if (fsize < 0) {
+        LOGERR("MimeHandlerText::set_document_file: stat " << m_fn <<
+               " errno " << errno << "\n");
+        return false;
+    }
+
+#ifndef _WIN32
+    // Check for charset defined in extended attribute as per:
+    // http://freedesktop.org/wiki/CommonExtendedAttributes
+    pxattr::get(m_fn, "charset", &m_charsetfromxattr);
+#endif
+
+    // Max file size parameter: texts over this size are not indexed (-1: no limit)
+    int maxmbs = 20;
+    m_config->getConfParam("textfilemaxmbs", &maxmbs);
+
+    if (maxmbs == -1 || fsize / MB <= maxmbs) {
+        // Text file page size: if set, we split text files into
+        // multiple documents
+        int ps = 1000;
+        m_config->getConfParam("textfilepagekbs", &ps);
+        if (ps != -1) {
+            ps *= KB;
+            m_paging = true; // Note: not reset to false here when ps is -1
+        }
+        // Note: size_t is guaranteed unsigned, so max if ps is -1
+        m_pagesz = size_t(ps);
+        if (!readnext()) // Only false when the file could not be read
+            return false;
+    } else {
+        LOGINF("MimeHandlerText: file too big (textfilemaxmbs=" << maxmbs <<
+               "), contents will not be indexed: " << fn << endl);
+    }
+    if (!m_forPreview) { // The MD5 covers m_text, i.e. the first chunk only
+        string md5, xmd5;
+        MD5String(m_text, md5);
+        m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
+    }
+    m_havedoc = true; // Set even when the file was too big to be read
+    return true;
+}
+
+bool MimeHandlerText::set_document_string_impl(const string&,
+                                               const string& otext)
+{ // Stores the whole text: neither the size limit nor the page size is read here
+    m_text = otext;
+    if (!m_forPreview) { // The MD5 is only computed when not previewing
+        string md5, xmd5;
+        MD5String(m_text, md5);
+        m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
+    }
+    m_havedoc = true;
+    return true;
+}
+
+bool MimeHandlerText::skip_to_document(const string& ipath)
+{ // The ipath is parsed as a base 10 offset, which becomes the next read position
+    char *endptr;
+    int64_t t = strtoll(ipath.c_str(), &endptr, 10);
+    if (endptr == ipath.c_str()) { // No digits could be converted
+        LOGERR("MimeHandlerText::skip_to_document: bad ipath offs ["  <<
+               ipath << "]\n");
+        return false;
+    }
+    m_offs = t;
+    readnext(); // Result ignored: true is returned even if the read fails
+    return true;
+}
+
+bool MimeHandlerText::next_document()
+{ // Return the current chunk as a document and, when paging, read the next one
+    LOGDEB("MimeHandlerText::next_document: m_havedoc "  << m_havedoc << "\n");
+
+    if (m_havedoc == false)
+        return false;
+
+    if (m_charsetfromxattr.empty()) // No charset from the file extended attributes
+        m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset;
+    else 
+        m_metaData[cstr_dj_keyorigcharset] = m_charsetfromxattr;
+
+    m_metaData[cstr_dj_keymt] = cstr_textplain;
+
+    size_t srclen = m_text.length(); // Saved before the swap below replaces m_text
+    m_metaData[cstr_dj_keycontent].swap(m_text);
+
+    // We transcode even if defcharset is supposedly already utf-8:
+    // this validates the encoding.
+    // txtdcode() truncates the text if transcoding fails
+    (void)txtdcode("mh_text");
+
+
+    // If the text length is 0 (the file is empty or oversize), or we are 
+    // not paging, we're done
+    if (srclen == 0 || !m_paging) {
+        m_havedoc = false;
+        return true;
+    } else {
+        // Paging: set ipath then read next chunk. 
+
+        // Don't set ipath for the first chunk to avoid having 2
+        // records for small files (one for the file, one for the
+        // first chunk). This is a hack. The right thing to do would
+        // be to use a different mtype for files over the page size,
+        // and keep text/plain only for smaller files.
+        string buf = lltodecstr(m_offs - srclen); // Start offset of the current chunk
+        if (m_offs - srclen != 0)
+            m_metaData[cstr_dj_keyipath] = buf;
+        readnext(); // Sets m_havedoc to false at EOF or on a read error
+        // This ensures that the first chunk (offs==srclen) of a
+        // multi-chunk file does have an ipath. Else it stands for the
+        // whole file, which used to be the case but does not seem
+        // right
+        if (m_havedoc)
+            m_metaData[cstr_dj_keyipath] = buf;
+        return true;
+    }
+}
+
+bool MimeHandlerText::readnext()
+{ // Read the next chunk of m_fn into m_text and advance m_offs past it
+    string reason;
+    m_text.clear();
+    if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) {
+        LOGERR("MimeHandlerText: can't read file: "  << reason << "\n" );
+        m_havedoc = false;
+        return false;
+    }
+    if (m_text.length() == 0) {
+        // EOF: not an error, true is returned with m_havedoc reset
+        m_havedoc = false;
+        return true;
+    }
+
+    // If possible try to adjust the chunk to end right after a line
+    // Don't do this for the last chunk. Last chunk of exactly the
+    // page size might be unduly split, no big deal
+    if (m_text.length() == m_pagesz) {
+        string::size_type pos = m_text.find_last_of("\n\r");
+        if (pos != string::npos && pos != 0) {
+            m_text.erase(pos); // Drops the line break itself and the partial line after it
+        }
+    }
+    m_offs += m_text.length(); // The next read restarts at the erased line break
+    return true;
+}
+
diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h
index 8112857d..e7bb449d 100644
--- a/src/internfile/mh_text.h
+++ b/src/internfile/mh_text.h
@@ -61,12 +61,16 @@ protected:
 private:
     bool   m_paging{false};
     std::string m_text;
+    std::string m_alltext;
     std::string m_fn;
-    int64_t  m_offs{0}; // Offset of next read in file if we're paging
+    int64_t m_offs{0}; // Offset of next read in file if we're paging
+    int64_t m_totlen{0};
     size_t m_pagesz{0};
+    int m_maxmbs{20};
     std::string m_charsetfromxattr; 
 
     bool readnext();
+    void getparams();
 };
 
 #endif /* _MH_TEXT_H_INCLUDED_ */
diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf
index 5e895871..31182e95 100644
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@@ -205,6 +205,7 @@ text/calendar = execm rclics;mimetype=text/plain
 text/css = internal text/plain
 text/html  = internal 
 text/plain = internal 
+text/plain1 = internal 
 text/rtf = exec unrtf --nopict --html;mimetype=text/html
 text/x-bibtex = exec rclbibtex.sh ; mimetype = text/plain
 text/x-c = internal