From 15924ce03728f384ef92d466ca165b3bf33afd81 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Sat, 15 Aug 2020 10:20:48 +0200 Subject: [PATCH] Process text/plain subdocuments like .txt files (paging big ones, etc.) --- src/internfile/internfile.cpp | 5 +- src/internfile/mh_execm.cpp | 9 ++ src/internfile/mh_text.cpp | 127 ++++++++++++--------- src/internfile/mh_text.cpp-old | 201 +++++++++++++++++++++++++++++++++ src/internfile/mh_text.h | 6 +- src/sampleconf/mimeconf | 1 + 6 files changed, 294 insertions(+), 55 deletions(-) create mode 100644 src/internfile/mh_text.cpp-old diff --git a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp index cf5df032..68336a31 100644 --- a/src/internfile/internfile.cpp +++ b/src/internfile/internfile.cpp @@ -939,7 +939,10 @@ breakloop: } // Keep this AFTER collectIpathAndMT dijontorcl(doc); - + // Fix the bogus mtype used to force mh_text processing of text subdocs + if (doc.mimetype == "text/plain1") { + doc.mimetype = "text/plain"; + } // Possibly destack so that we can test for FIDone. While doing this // possibly set aside an ancestor html text (for the GUI preview) while (!m_handlers.empty() && !m_handlers.back()->has_documents()) { diff --git a/src/internfile/mh_execm.cpp b/src/internfile/mh_execm.cpp index d15975b0..5b5394ee 100644 --- a/src/internfile/mh_execm.cpp +++ b/src/internfile/mh_execm.cpp @@ -314,6 +314,15 @@ bool MimeHandlerExecMultiple::next_document() mtype = "application/octet-stream"; } } + /* If we identify text/plain from the suffix (as opposed + to the handler setting the type), we use text/plain1 + instead. As directed in mimeconf, this will cause the + text handler to be applied (instead of internfile just + ending things there), allowing splitting and default + charset conversions. */ + if (mtype == "text/plain") { + mtype = "text/plain1"; + } } m_metaData[cstr_dj_keymt] = mtype; if (!m_forPreview) { diff --git a/src/internfile/mh_text.cpp b/src/internfile/mh_text.cpp index a3fec423..a80008f6 100644 --- a/src/internfile/mh_text.cpp +++ b/src/internfile/mh_text.cpp @@ -36,8 +36,23 @@ using namespace std; -const int MB = 1024*1024; -const int KB = 1024; +void MimeHandlerText::getparams() +{ + m_config->getConfParam("textfilemaxmbs", &m_maxmbs); + + // Text file page size: if set, we split text files into + // multiple documents + int ps = 1000; + m_config->getConfParam("textfilepagekbs", &ps); + if (ps != -1) { + ps *= 1024; + m_paging = true; + } else { + m_paging = false; + } + m_pagesz = size_t(ps); + m_offs = 0; +} // Process a plain text file bool MimeHandlerText::set_document_file_impl(const string&, const string &fn) @@ -46,13 +61,9 @@ bool MimeHandlerText::set_document_file_impl(const string&, const string &fn) m_offs << "\n"); m_fn = fn; - // This should not be necessary, but it happens on msw that offset is large - // negative at this point, could not find the reason (still trying). - m_offs = 0; - // file size for oversize check - long long fsize = path_filesize(m_fn); - if (fsize < 0) { + m_totlen = path_filesize(m_fn); + if (m_totlen < 0) { LOGERR("MimeHandlerText::set_document_file: stat " << m_fn << " errno " << errno << "\n"); return false; @@ -64,31 +75,14 @@ bool MimeHandlerText::set_document_file_impl(const string&, const string &fn) pxattr::get(m_fn, "charset", &m_charsetfromxattr); #endif - // Max file size parameter: texts over this size are not indexed - int maxmbs = 20; - m_config->getConfParam("textfilemaxmbs", &maxmbs); - - if (maxmbs == -1 || fsize / MB <= maxmbs) { - // Text file page size: if set, we split text files into - // multiple documents - int ps = 1000; - m_config->getConfParam("textfilepagekbs", &ps); - if (ps != -1) { - ps *= KB; - m_paging = true; - } - // Note: size_t is guaranteed unsigned, so max if ps is -1 - m_pagesz = size_t(ps); - if (!readnext()) - return false; - } else { - LOGINF("MimeHandlerText: file too big (textfilemaxmbs=" << maxmbs << + getparams(); + if (m_maxmbs != -1 && m_totlen / (1024*1024) > m_maxmbs) { + LOGINF("MimeHandlerText: file too big (textfilemaxmbs=" << m_maxmbs << "), contents will not be indexed: " << fn << endl); - } - if (!m_forPreview) { - string md5, xmd5; - MD5String(m_text, md5); - m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); + } else { + if (!readnext()) { + return false; + } } m_havedoc = true; return true; @@ -97,12 +91,25 @@ bool MimeHandlerText::set_document_file_impl(const string&, const string &fn) bool MimeHandlerText::set_document_string_impl(const string&, const string& otext) { - m_text = otext; - if (!m_forPreview) { - string md5, xmd5; - MD5String(m_text, md5); - m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); + m_fn.clear(); + m_totlen = otext.size(); + + getparams(); + if (m_maxmbs != -1 && m_totlen / (1024*1024) > m_maxmbs) { + LOGINF("MimeHandlerText: text too big (textfilemaxmbs=" << m_maxmbs << + "), contents will not be indexed\n"); + } else { + if (!m_paging || (m_totlen <= (int64_t)m_pagesz)) { + // Avoid copy for texts smaller than page size + m_paging = false; + m_text = otext; + m_offs = m_totlen; + } else { + m_alltext = otext; + readnext(); + } } + m_havedoc = true; return true; } @@ -112,9 +119,9 @@ bool MimeHandlerText::skip_to_document(const string& ipath) char *endptr; int64_t t = strtoll(ipath.c_str(), &endptr, 10); if (endptr == ipath.c_str()) { - LOGERR("MimeHandlerText::skip_to_document: bad ipath offs [" << + LOGERR("MimeHandlerText::skip_to_document: bad ipath offs [" << ipath << "]\n"); - return false; + return false; } m_offs = t; readnext(); @@ -126,16 +133,21 @@ bool MimeHandlerText::next_document() LOGDEB("MimeHandlerText::next_document: m_havedoc " << m_havedoc << "\n"); if (m_havedoc == false) - return false; + return false; if (m_charsetfromxattr.empty()) - m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset; + m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset; else - m_metaData[cstr_dj_keyorigcharset] = m_charsetfromxattr; + m_metaData[cstr_dj_keyorigcharset] = m_charsetfromxattr; m_metaData[cstr_dj_keymt] = cstr_textplain; size_t srclen = m_text.length(); + if (!m_forPreview) { + string md5, xmd5; + MD5String(m_text, md5); + m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); + } m_metaData[cstr_dj_keycontent].swap(m_text); // We transcode even if defcharset is supposedly already utf-8: @@ -143,7 +155,6 @@ bool MimeHandlerText::next_document() // txtdcode() truncates the text if transcoding fails (void)txtdcode("mh_text"); - // If the text length is 0 (the file is empty or oversize), or we are // not paging, we're done if (srclen == 0 || !m_paging) { @@ -152,21 +163,26 @@ bool MimeHandlerText::next_document() } else { // Paging: set ipath then read next chunk. + int64_t start_offset = m_offs - srclen; + string buf = lltodecstr(start_offset); + // Don't set ipath for the first chunk to avoid having 2 // records for small files (one for the file, one for the // first chunk). This is a hack. The right thing to do would // be to use a different mtype for files over the page size, // and keep text/plain only for smaller files. - string buf = lltodecstr(m_offs - srclen); - if (m_offs - srclen != 0) + if (start_offset != 0) m_metaData[cstr_dj_keyipath] = buf; + readnext(); + // This ensures that the first chunk (offs==srclen) of a // multi-chunk file does have an ipath. Else it stands for the - // whole file, which used to be the case but does not seem - // right + // whole file (see just above), which used to be the case but + // does not seem right if (m_havedoc) m_metaData[cstr_dj_keyipath] = buf; + return true; } } @@ -175,11 +191,16 @@ bool MimeHandlerText::readnext() { string reason; m_text.clear(); - if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) { - LOGERR("MimeHandlerText: can't read file: " << reason << "\n" ); - m_havedoc = false; - return false; + if (!m_fn.empty()) { + if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) { + LOGERR("MimeHandlerText: can't read file: " << reason << "\n" ); + m_havedoc = false; + return false; + } + } else { + m_text = m_alltext.substr(m_offs, m_pagesz); } + if (m_text.length() == 0) { // EOF m_havedoc = false; @@ -189,7 +210,8 @@ bool MimeHandlerText::readnext() // If possible try to adjust the chunk to end right after a line // Don't do this for the last chunk. Last chunk of exactly the // page size might be unduly split, no big deal - if (m_text.length() == m_pagesz) { + if (m_text.length() == m_pagesz && m_text.back() != '\n' && + m_text.back() != '\r') { string::size_type pos = m_text.find_last_of("\n\r"); if (pos != string::npos && pos != 0) { m_text.erase(pos); @@ -198,4 +220,3 @@ bool MimeHandlerText::readnext() m_offs += m_text.length(); return true; } - diff --git a/src/internfile/mh_text.cpp-old b/src/internfile/mh_text.cpp-old new file mode 100644 index 00000000..306a63c8 --- /dev/null +++ b/src/internfile/mh_text.cpp-old @@ -0,0 +1,201 @@ +/* Copyright (C) 2005 J.F.Dockes + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include "autoconfig.h" + +#include +#include +#include "safefcntl.h" +#include +#include "safeunistd.h" + +#include +#include + +#include "cstr.h" +#include "mh_text.h" +#include "log.h" +#include "readfile.h" +#include "md5ut.h" +#include "rclconfig.h" +#include "pxattr.h" +#include "pathut.h" + +using namespace std; + +const int MB = 1024*1024; +const int KB = 1024; + +// Process a plain text file +bool MimeHandlerText::set_document_file_impl(const string&, const string &fn) +{ + LOGDEB("MimeHandlerText::set_document_file: [" << fn << "] offs " << + m_offs << "\n"); + + m_fn = fn; + // This should not be necessary, but it happens on msw that offset is large + // negative at this point, could not find the reason (still trying). + m_offs = 0; + + // file size for oversize check + long long fsize = path_filesize(m_fn); + if (fsize < 0) { + LOGERR("MimeHandlerText::set_document_file: stat " << m_fn << + " errno " << errno << "\n"); + return false; + } + +#ifndef _WIN32 + // Check for charset defined in extended attribute as per: + // http://freedesktop.org/wiki/CommonExtendedAttributes + pxattr::get(m_fn, "charset", &m_charsetfromxattr); +#endif + + // Max file size parameter: texts over this size are not indexed + int maxmbs = 20; + m_config->getConfParam("textfilemaxmbs", &maxmbs); + + if (maxmbs == -1 || fsize / MB <= maxmbs) { + // Text file page size: if set, we split text files into + // multiple documents + int ps = 1000; + m_config->getConfParam("textfilepagekbs", &ps); + if (ps != -1) { + ps *= KB; + m_paging = true; + } + // Note: size_t is guaranteed unsigned, so max if ps is -1 + m_pagesz = size_t(ps); + if (!readnext()) + return false; + } else { + LOGINF("MimeHandlerText: file too big (textfilemaxmbs=" << maxmbs << + "), contents will not be indexed: " << fn << endl); + } + if (!m_forPreview) { + string md5, xmd5; + MD5String(m_text, md5); + m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); + } + m_havedoc = true; + return true; +} + +bool MimeHandlerText::set_document_string_impl(const string&, + const string& otext) +{ + m_text = otext; + if (!m_forPreview) { + string md5, xmd5; + MD5String(m_text, md5); + m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5); + } + m_havedoc = true; + return true; +} + +bool MimeHandlerText::skip_to_document(const string& ipath) +{ + char *endptr; + int64_t t = strtoll(ipath.c_str(), &endptr, 10); + if (endptr == ipath.c_str()) { + LOGERR("MimeHandlerText::skip_to_document: bad ipath offs [" << + ipath << "]\n"); + return false; + } + m_offs = t; + readnext(); + return true; +} + +bool MimeHandlerText::next_document() +{ + LOGDEB("MimeHandlerText::next_document: m_havedoc " << m_havedoc << "\n"); + + if (m_havedoc == false) + return false; + + if (m_charsetfromxattr.empty()) + m_metaData[cstr_dj_keyorigcharset] = m_dfltInputCharset; + else + m_metaData[cstr_dj_keyorigcharset] = m_charsetfromxattr; + + m_metaData[cstr_dj_keymt] = cstr_textplain; + + size_t srclen = m_text.length(); + m_metaData[cstr_dj_keycontent].swap(m_text); + + // We transcode even if defcharset is supposedly already utf-8: + // this validates the encoding. + // txtdcode() truncates the text if transcoding fails + (void)txtdcode("mh_text"); + + + // If the text length is 0 (the file is empty or oversize), or we are + // not paging, we're done + if (srclen == 0 || !m_paging) { + m_havedoc = false; + return true; + } else { + // Paging: set ipath then read next chunk. + + // Don't set ipath for the first chunk to avoid having 2 + // records for small files (one for the file, one for the + // first chunk). This is a hack. The right thing to do would + // be to use a different mtype for files over the page size, + // and keep text/plain only for smaller files. + string buf = lltodecstr(m_offs - srclen); + if (m_offs - srclen != 0) + m_metaData[cstr_dj_keyipath] = buf; + readnext(); + // This ensures that the first chunk (offs==srclen) of a + // multi-chunk file does have an ipath. Else it stands for the + // whole file, which used to be the case but does not seem + // right + if (m_havedoc) + m_metaData[cstr_dj_keyipath] = buf; + return true; + } +} + +bool MimeHandlerText::readnext() +{ + string reason; + m_text.clear(); + if (!file_to_string(m_fn, m_text, m_offs, m_pagesz, &reason)) { + LOGERR("MimeHandlerText: can't read file: " << reason << "\n" ); + m_havedoc = false; + return false; + } + if (m_text.length() == 0) { + // EOF + m_havedoc = false; + return true; + } + + // If possible try to adjust the chunk to end right after a line + // Don't do this for the last chunk. Last chunk of exactly the + // page size might be unduly split, no big deal + if (m_text.length() == m_pagesz) { + string::size_type pos = m_text.find_last_of("\n\r"); + if (pos != string::npos && pos != 0) { + m_text.erase(pos); + } + } + m_offs += m_text.length(); + return true; +} + diff --git a/src/internfile/mh_text.h b/src/internfile/mh_text.h index 8112857d..e7bb449d 100644 --- a/src/internfile/mh_text.h +++ b/src/internfile/mh_text.h @@ -61,12 +61,16 @@ protected: private: bool m_paging{false}; std::string m_text; + std::string m_alltext; std::string m_fn; - int64_t m_offs{0}; // Offset of next read in file if we're paging + int64_t m_offs{0}; // Offset of next read in file if we're paging + int64_t m_totlen{0}; size_t m_pagesz{0}; + int m_maxmbs{20}; std::string m_charsetfromxattr; bool readnext(); + void getparams(); }; #endif /* _MH_TEXT_H_INCLUDED_ */ diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 5e895871..31182e95 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -205,6 +205,7 @@ text/calendar = execm rclics;mimetype=text/plain text/css = internal text/plain text/html = internal text/plain = internal +text/plain1 = internal text/rtf = exec unrtf --nopict --html;mimetype=text/html text/x-bibtex = exec rclbibtex.sh ; mimetype = text/plain text/x-c = internal