internal xslt working for single-sheet (abw). Still leaking memory?

2018-12-25 10:57:26 +01:00 · 2018-12-25 10:57:26 +01:00 · 00c0c5168b
commit 00c0c5168b
parent 2bd4b5ef4f
13 changed files with 9812 additions and 271 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -2,6 +2,8 @@
 CXXFLAGS ?= @CXXFLAGS@
 LIBXAPIAN=@LIBXAPIAN@
 XAPIANCXXFLAGS=@XAPIANCXXFLAGS@
+XSLT_CFLAGS=@XSLT_CFLAGS@
+XSLT_LINKADD=@XSLT_LINKADD@
 LIBICONV=@LIBICONV@
 INCICONV=@INCICONV@
 LIBFAM = @LIBFAM@
@ -29,8 +31,10 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
    $(COMMONCPPFLAGS) \
    $(INCICONV) \
    $(XAPIANCXXFLAGS) \
+    $(XSLT_CFLAGS) \
    $(X_CFLAGS) \
    -DRECOLL_DATADIR=\"${pkgdatadir}\" \
+    -DREADFILE_ENABLE_ZLIB -DREADFILE_ENABLE_MINIZ \
    -D_GNU_SOURCE \
    $(DEFS)

@ -121,6 +125,8 @@ internfile/mh_symlink.h \
 internfile/mh_text.cpp \
 internfile/mh_text.h \
 internfile/mh_unknown.h \
+internfile/mh_xslt.cpp \
+internfile/mh_xslt.h \
 internfile/mimehandler.cpp \
 internfile/mimehandler.h \
 internfile/myhtmlparse.cpp \
@ -224,6 +230,8 @@ utils/md5ut.cpp \
 utils/md5ut.h \
 utils/mimeparse.cpp \
 utils/mimeparse.h \
+utils/miniz.cpp \
+utils/miniz.h \
 utils/netcon.cpp \
 utils/netcon.h \
 utils/pathut.cpp \
@ -262,7 +270,7 @@ AM_YFLAGS = -d
 librecoll_la_LDFLAGS = -release $(VERSION) \
    -Wl,--no-undefined -Wl,--warn-unresolved-symbols

-librecoll_la_LIBADD = $(LIBXAPIAN) $(LIBICONV) $(LIBTHREADS)
+librecoll_la_LIBADD = $(XSLT_LINKADD) $(LIBXAPIAN) $(LIBICONV) $(LIBTHREADS)

 # There is probably a better way to do this. The KIO needs to be linked
 # with librecoll, but librecoll is installed into a non-standard place
@ -640,6 +648,7 @@ sampleconf/mimeview
 filterdir = $(pkgdatadir)/filters
 filter_DATA = \
 desktop/hotrecoll.py \
+filters/abiword.xsl \
 filters/rcl7z \
 filters/rclabw.py \
 filters/rclaptosidman \
--- a/src/configure.ac
+++ b/src/configure.ac
@ -321,6 +321,21 @@ XAPIANCXXFLAGS=`$XAPIAN_CONFIG --cxxflags`
 #echo LIBXAPIANSTATICEXTRA: $LIBXAPIANSTATICEXTRA
 #echo XAPIANCXXFLAGS: $XAPIANCXXFLAGS

+
+# Locate the xslt-config command. XSLT_CONFIG can be preset in the
+# environment to select a specific libxslt installation, else the PATH
+# is searched.
+XSLT_CONFIG=${XSLT_CONFIG:-no}
+if test "$XSLT_CONFIG" = "no"; then
+    AC_PATH_PROG(XSLT_CONFIG0, [xslt-config], no)
+    XSLT_CONFIG=$XSLT_CONFIG0
+fi
+if test "$XSLT_CONFIG" = "no" ; then
+   AC_MSG_ERROR([Cannot find xslt-config command in $PATH. Is
+libxslt installed ?])
+   exit 1
+fi
+
+# Query the command which was actually selected above and not whatever
+# xslt-config happens to come first in the PATH.
+XSLT_CFLAGS=`$XSLT_CONFIG --cflags`
+XSLT_LINKADD=`$XSLT_CONFIG --libs`
+
 AC_ARG_ENABLE(xadump, 
    AC_HELP_STRING([--enable-xadump],
   [Enable building the xadump low level Xapian access program.]),
@ -527,6 +542,8 @@ AC_SUBST(QMAKE_DISABLE_ZEITGEIST)
 AC_SUBST(LIBQZEITGEIST)
 AC_SUBST(RCLVERSION)
 AC_SUBST(RCLLIBVERSION)
+AC_SUBST(XSLT_CFLAGS)
+AC_SUBST(XSLT_LINKADD)

 # All object files depend on localdefs which has the cc flags. Avoid
 # changing it unless necessary
--- a/src/filters/abiword.xsl
+++ b/src/filters/abiword.xsl
@ -0,0 +1,88 @@
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:ab="http://www.abisource.com/awml.dtd" 
+  exclude-result-prefixes="ab"
+  >
+
+<xsl:output method="html" encoding="UTF-8"/>
+
+<!-- Root template: builds the output HTML skeleton. The head is
+     produced from the namespaced ab:metadata element only; the body
+     gets one <p> per paragraph found in the sections, for both the
+     non-namespaced and the namespaced forms of the input. -->
+<xsl:template match="/">
+<html>
+  <head>
+    <xsl:apply-templates select="ab:abiword/ab:metadata"/>
+  </head>
+  <body>
+
+    <!-- This is for the older abiword format with no namespaces -->
+    <xsl:for-each select="abiword/section">
+      <xsl:apply-templates select="p"/>
+    </xsl:for-each>
+
+    <!-- Newer namespaced format -->
+    <xsl:for-each select="ab:abiword/ab:section">
+      <xsl:for-each select="ab:p">
+        <p><xsl:value-of select="."/></p><xsl:text>
+        </xsl:text>
+      </xsl:for-each>
+    </xsl:for-each>
+
+  </body>
+</html>
+</xsl:template>
+
+<!-- Non-namespaced paragraph: output its string value inside a <p>
+     element, followed by a line break to keep the result readable. -->
+<xsl:template match="p">
+  <p><xsl:value-of select="."/></p><xsl:text>
+      </xsl:text>
+</xsl:template>
+
+<!-- Metadata: translate the ab:m entries into HTML head elements,
+     according to the value of their "key" attribute:
+       dc.creator                   -> <meta name="author">
+       abiword.keywords, dc.subject -> <meta name="keywords">
+       dc.description               -> <meta name="abstract">
+       dc.title                     -> <title>
+     Entries with any other key produce no output. -->
+<xsl:template match="ab:metadata">
+    <xsl:for-each select="ab:m">
+      <xsl:choose>
+        <xsl:when test="@key = 'dc.creator'">
+	  <meta>
+	    <xsl:attribute name="name">author</xsl:attribute>
+	    <xsl:attribute name="content">
+	    <xsl:value-of select="."/>
+	    </xsl:attribute>
+          </meta><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:when test="@key = 'abiword.keywords'">
+	  <meta>
+	    <xsl:attribute name="name">keywords</xsl:attribute>
+	    <xsl:attribute name="content">
+	    <xsl:value-of select="."/>
+	    </xsl:attribute>
+          </meta><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:when test="@key = 'dc.subject'">
+	  <meta>
+	    <xsl:attribute name="name">keywords</xsl:attribute>
+	    <xsl:attribute name="content">
+	    <xsl:value-of select="."/>
+	    </xsl:attribute>
+          </meta><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:when test="@key = 'dc.description'">
+	  <meta>
+	    <xsl:attribute name="name">abstract</xsl:attribute>
+	    <xsl:attribute name="content">
+	    <xsl:value-of select="."/>
+	    </xsl:attribute>
+          </meta><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:when test="@key = 'dc.title'">
+	  <title><xsl:value-of select="."/></title><xsl:text>
+	    </xsl:text>
+        </xsl:when>
+        <xsl:otherwise>
+        </xsl:otherwise>
+      </xsl:choose>
+    </xsl:for-each>
+</xsl:template>
+
+</xsl:stylesheet>
--- a/src/internfile/mh_mail.h
+++ b/src/internfile/mh_mail.h
@ -38,9 +38,7 @@ public:
    MimeHandlerMail(RclConfig *cnf, const std::string &id);
    virtual ~MimeHandlerMail();
    virtual bool is_data_input_ok(DataInput input) const {
-        if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
-            return true;
-        return false;
+        return (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING);
    }
    virtual bool next_document();
    virtual bool skip_to_document(const std::string& ipath);
--- a/src/internfile/mh_xslt.cpp
+++ b/src/internfile/mh_xslt.cpp
@ -0,0 +1,232 @@
+/* Copyright (C) 2005 J.F.Dockes 
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+#include "autoconfig.h"
+
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+#include <libxslt/transform.h>
+#include <libxslt/xsltInternals.h>
+#include <libxslt/xsltutils.h>
+
+#include "cstr.h"
+#include "mh_xslt.h"
+#include "log.h"
+#include "smallut.h"
+#include "md5ut.h"
+#include "rclconfig.h"
+#include "readfile.h"
+
+using namespace std;
+
+
+// FileScanDo sink which feeds the scanned data to a libxml2 push
+// parser. Usage: hand the object to file_scan(), then call getDoc() to
+// terminate the parse and retrieve the document tree.
+//
+// Ownership: the parser context belongs to this object. The document
+// tree belongs to this object until getDoc() succeeds, at which point
+// it is transferred to the caller (who will xmlFreeDoc() it or hand it
+// to something which does). A tree which was never handed out (scan
+// or final parse failure) is freed by the destructor:
+// xmlFreeParserCtxt() does not do it.
+class FileScanXML : public FileScanDo {
+public:
+    FileScanXML(const string& fn) : m_fn(fn) {}
+    virtual ~FileScanXML() {
+        releaseParser();
+    }
+    // Copying would result in freeing the parser context twice.
+    FileScanXML(const FileScanXML&) = delete;
+    FileScanXML& operator=(const FileScanXML&) = delete;
+
+    // Terminate the parse and return the document tree, or nullptr on
+    // error. The caller becomes the owner of the returned document.
+    xmlDocPtr getDoc() {
+        if (ctxt == nullptr) {
+            LOGERR("FileScanXML: getDoc: no parser context\n");
+            return nullptr;
+        }
+        int ret;
+        if ((ret = xmlParseChunk(ctxt, nullptr, 0, 1))) {
+            LOGERR("FileScanXML: final xmlParseChunk failed with error " <<
+                   ret << " error: " << lastErrorMessage() << "\n");
+            return nullptr;
+        }
+        // Detach the tree from the context so that we can tell in the
+        // destructor that it is not ours to free any more.
+        xmlDocPtr doc = ctxt->myDoc;
+        ctxt->myDoc = nullptr;
+        return doc;
+    }
+
+    // Called before the first data: create the push parser. The size
+    // hint is only used for logging.
+    virtual bool init(int64_t size, string *) {
+        LOGDEB1("FileScanXML: init: size " << size << endl);
+        // Do not leak a previous context if we are reused.
+        releaseParser();
+        ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, m_fn.c_str());
+        if (ctxt == nullptr) {
+            LOGERR("FileScanXML: xmlCreatePushParserCtxt failed\n");
+            return false;
+        } else {
+            return true;
+        }
+    }
+    
+    // Called for each data chunk: push it into the parser.
+    virtual bool data(const char *buf, int cnt, string*) {
+        LOGDEB1("FileScanXML: data: cnt " << cnt << endl);
+        if (ctxt == nullptr) {
+            LOGERR("FileScanXML: data: no parser context\n");
+            return false;
+        }
+        int ret;
+        if ((ret = xmlParseChunk(ctxt, buf, cnt, 0))) {
+            // buf is not null-terminated: it can't be output as a C
+            // string, so only the chunk size is logged.
+            LOGERR("FileScanXML: xmlParseChunk failed with error " <<
+                   ret << " for chunk of " << cnt << " bytes, error " <<
+                   lastErrorMessage() << "\n");
+            return false;
+        } else {
+            LOGDEB1("xmlParseChunk ok (sent " << cnt << " bytes)\n");
+            return true;
+        }
+    }
+
+private:
+    // Text for the last libxml2 error, always safe to output.
+    static const char *lastErrorMessage() {
+        const xmlError *error = xmlGetLastError();
+        if (error == nullptr) {
+            return " null return from xmlGetLastError()";
+        }
+        return error->message ? error->message : " no error message";
+    }
+
+    // Free the parser context and the document if we still own it.
+    void releaseParser() {
+        if (ctxt) {
+            if (ctxt->myDoc) {
+                xmlFreeDoc(ctxt->myDoc);
+                ctxt->myDoc = nullptr;
+            }
+            xmlFreeParserCtxt(ctxt);
+            ctxt = nullptr;
+        }
+    }
+
+    xmlParserCtxtPtr ctxt{nullptr};
+    string m_fn;
+};
+
+// Private data for MimeHandlerXslt: the compiled style sheet(s) and
+// the text produced by the last transformation.
+class MimeHandlerXslt::Internal {
+public:
+    // Release the style sheets which were actually compiled.
+    ~Internal() {
+        if (nullptr != metaOrAllSS)
+            xsltFreeStylesheet(metaOrAllSS);
+        if (nullptr != dataSS)
+            xsltFreeStylesheet(dataSS);
+    }
+    // Set once the constructor has successfully compiled a style sheet
+    bool ok{false};
+    // Single style sheet case: processes the whole document
+    xsltStylesheet *metaOrAllSS{nullptr};
+    // Second style sheet for the two-sheets case (not used yet)
+    xsltStylesheet *dataSS{nullptr};
+    // Transformation output, waiting for next_document()
+    string result;
+};
+
+// Release the private data (the Internal destructor frees the style
+// sheets).
+MimeHandlerXslt::~MimeHandlerXslt()
+{
+    delete m;
+}
+
+// Build a handler from the parameters found in the configuration.
+// @param cnf configuration: the style sheets are looked up in the
+//    "filters" subdirectory of its data directory.
+// @param id handler cache identifier, passed to the base class.
+// @param params "xsltproc stylesheetall" or "xsltproc metamember
+//    stylesheetmeta datamember stylesheetdata". Only the first form is
+//    implemented at the moment.
+// On any error the object is left in a "not ok" state and all
+// set_document/next_document calls will fail.
+MimeHandlerXslt::MimeHandlerXslt(RclConfig *cnf, const std::string& id,
+                                 const std::vector<std::string>& params)
+    : RecollFilter(cnf, id), m(new Internal)
+{
+    LOGDEB("MimeHandlerXslt: params: " << stringsToString(params) << endl);
+    string filtersdir = path_cat(cnf->getDatadir(), "filters");
+
+    // Note: these are process-wide libxml2 settings.
+    xmlSubstituteEntitiesDefault(0);
+    xmlLoadExtDtdDefaultValue = 0;
+
+    // params can be "xslt stylesheetall" or
+    // "xslt metamember stylesheetmeta datamember stylesheetdata"
+    if (params.size() == 2) {
+        string ssfn = path_cat(filtersdir, params[1]);
+        FileScanXML XMLstyle(ssfn);
+        string reason;
+        if (!file_scan(ssfn, &XMLstyle, &reason)) {
+            LOGERR("MimeHandlerXslt: file_scan failed for style sheet " <<
+                   ssfn << " : " << reason << endl);
+            return;
+        }
+        xmlDoc *stl = XMLstyle.getDoc();
+        if (stl == nullptr) {
+            LOGERR("MimeHandlerXslt: getDoc failed for style sheet " <<
+                   ssfn << endl);
+            return;
+        }
+        // On success, the style sheet takes ownership of the document
+        // (freed by xsltFreeStylesheet()). On failure it remains ours
+        // and we must free it.
+        m->metaOrAllSS = xsltParseStylesheetDoc(stl);
+        if (m->metaOrAllSS) {
+            m->ok = true;
+        } else {
+            LOGERR("MimeHandlerXslt: xsltParseStylesheetDoc failed for " <<
+                   "style sheet " << ssfn << endl);
+            xmlFreeDoc(stl);
+        }
+    } else if (params.size() == 4) {
+        // Not implemented: don't fail silently.
+        LOGERR("MimeHandlerXslt: separate meta/data style sheets are not " <<
+               "supported yet: " << stringsToString(params) << endl);
+    } else {
+        LOGERR("MimeHandlerXslt: constructor with wrong param vector: " <<
+               stringsToString(params) << endl);
+    }
+}
+
+// Parse the XML file and run it through the style sheet. The
+// resulting text is stored for next_document().
+// @param mt MIME type (unused).
+// @param file_path the XML file to process.
+// @return false if the handler is not usable or if reading, parsing,
+//    transforming or serializing fails.
+// Except for previews, the file MD5 is computed while reading and
+// stored in the metadata.
+bool MimeHandlerXslt::set_document_file_impl(const std::string& mt, 
+                                             const std::string &file_path)
+{
+    LOGDEB0("MimeHandlerXslt::set_document_file_: fn: " << file_path << endl);
+    if (!m || !m->ok) {
+        return false;
+    }
+    if (nullptr != m->dataSS) {
+        // Two style sheets case. Not implemented, and currently never
+        // set by the constructor. Fail the document rather than kill
+        // the whole process.
+        LOGERR("Not ready for multipart yet\n");
+        return false;
+    }
+    if (nullptr == m->metaOrAllSS) {
+        LOGERR("MimeHandlerXslt::set_document_file_impl: both ss empty??\n");
+        return false;
+    }
+
+    FileScanXML XMLdoc(file_path);
+    string md5, reason;
+    if (!file_scan(file_path, &XMLdoc, 0, -1, &reason,
+                   m_forPreview ? nullptr : &md5)) {
+        LOGERR("MimeHandlerXslt::set_document_file_impl: file_scan failed "
+               "for " << file_path << " : " << reason << endl);
+        return false;
+    }
+    if (!m_forPreview) {
+        m_metaData[cstr_dj_keymd5] = md5;
+    }
+    xmlDocPtr doc = XMLdoc.getDoc();
+    if (nullptr == doc) {
+        LOGERR("MimeHandlerXslt::set_doc_file_impl: no parsed doc\n");
+        return false;
+    }
+    xmlDocPtr transformed = xsltApplyStylesheet(m->metaOrAllSS, doc, NULL);
+    if (nullptr == transformed) {
+        LOGERR("MimeHandlerXslt::set_doc_file_: xslt transform failed\n");
+        xmlFreeDoc(doc);
+        return false;
+    }
+
+    // The output variables are initialized so that nothing undefined
+    // is used if the call fails. Both trees can be freed as soon as the
+    // result has been serialized.
+    xmlChar *outstr = nullptr;
+    int outlen = 0;
+    int ret = xsltSaveResultToString(&outstr, &outlen, transformed,
+                                     m->metaOrAllSS);
+    xmlFreeDoc(transformed);
+    xmlFreeDoc(doc);
+    if (ret < 0) {
+        LOGERR("MimeHandlerXslt::set_doc_file_: xsltSaveResultToString "
+               "failed\n");
+        if (outstr) {
+            xmlFree(outstr);
+        }
+        return false;
+    }
+    // An empty result is returned as a null pointer.
+    if (outstr && outlen > 0) {
+        m->result.assign((const char*)outstr, outlen);
+    } else {
+        m->result.clear();
+    }
+    if (outstr) {
+        xmlFree(outstr);
+    }
+            
+    m_havedoc = true;
+    return true;
+}
+
+// In-memory input. This is a stub at the moment: the data is neither
+// parsed nor transformed and m_havedoc is not set here.
+// NOTE(review): the header's is_data_input_ok() accepts
+// DOCUMENT_STRING, so callers may take this path and get a success
+// return with no document following - confirm that this is intended
+// until the string case is implemented.
+bool MimeHandlerXslt::set_document_string_impl(const string& mt, 
+                                               const string& msgtxt)
+{
+    if (!m || !m->ok) {
+        return false;
+    }
+    return true;
+}
+
+// Deliver the single document produced by the last successful
+// set_document call: the transformation result becomes the content,
+// with an HTML MIME type. Returns false if the handler is not usable
+// or if there is no pending document.
+bool MimeHandlerXslt::next_document()
+{
+    const bool usable = m && m->ok;
+    if (!usable || !m_havedoc) {
+        return false;
+    }
+    // There is only one document per input: consume it.
+    m_havedoc = false;
+    m_metaData[cstr_dj_keymt] = cstr_texthtml;
+    // Exchange instead of copying the possibly big text.
+    m_metaData[cstr_dj_keycontent].swap(m->result);
+    LOGDEB1("MimeHandlerXslt::next_document: result: [" <<
+            m_metaData[cstr_dj_keycontent] << "]\n");
+    return true;
+}
+
+// Reset the per-document state: drop the stored transformation result
+// and the pending document flag. The compiled style sheets are kept.
+void MimeHandlerXslt::clear_impl()
+{
+    m->result.clear();
+    m_havedoc = false;
+}
--- a/src/internfile/mh_xslt.h
+++ b/src/internfile/mh_xslt.h
@ -0,0 +1,49 @@
+/* Copyright (C) 2018 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+#ifndef _MH_XSLT_H_INCLUDED_
+#define _MH_XSLT_H_INCLUDED_
+
+#include <string>
+
+#include "mimehandler.h"
+
+/**
+ * Handler for XML document types which are processed internally by
+ * running them through an XSLT style sheet producing HTML.
+ *
+ * The params constructor argument comes from the configuration
+ * handler definition: the handler name followed by the style sheet
+ * file name (looked up in the "filters" subdirectory of the data
+ * directory).
+ */
+class MimeHandlerXslt : public RecollFilter {
+ public:
+    MimeHandlerXslt(RclConfig *cnf, const std::string& id,
+                    const std::vector<std::string>& params);
+    virtual ~MimeHandlerXslt();
+
+    // The object owns the private data pointer: a copy would end up
+    // deleting it twice.
+    MimeHandlerXslt(const MimeHandlerXslt&) = delete;
+    MimeHandlerXslt& operator=(const MimeHandlerXslt&) = delete;
+
+    /** Return the transformed document (only one per input). */
+    virtual bool next_document() override;
+    /** Forget about the current document. */
+    virtual void clear_impl() override;
+
+    /** Tell which kinds of input are accepted: file name or string */
+    virtual bool is_data_input_ok(DataInput input) const override {
+        return (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING);
+    }
+
+protected:
+    /** Parse and transform the file, storing the result */
+    virtual bool set_document_file_impl(const std::string& mt, 
+                                        const std::string& file_path) override;
+    /** In-memory input */
+    virtual bool set_document_string_impl(const std::string& mt,
+                                          const std::string& data) override;
+
+    class Internal;
+private:
+    Internal *m{nullptr};
+};
+
+
+#endif /* _MH_XSLT_H_INCLUDED_ */
--- a/src/internfile/mimehandler.cpp
+++ b/src/internfile/mimehandler.cpp
@ -41,6 +41,7 @@ using namespace std;
 #include "mh_symlink.h"
 #include "mh_unknown.h"
 #include "mh_null.h"
+#include "mh_xslt.h"

 // Performance help: we use a pool of already known and created
 // handlers. There can be several instances for a given mime type
@ -137,11 +138,17 @@ void clearMimeHandlerCache()

 /** For mime types set as "internal" in mimeconf: 
  * create appropriate handler object. */
-static RecollFilter *mhFactory(RclConfig *config, const string &mime,
+static RecollFilter *mhFactory(RclConfig *config, const string &mimeOrParams,
 				bool nobuild, string& id)
 {
-    LOGDEB2("mhFactory(" << mime << ")\n");
-    string lmime(mime);
+    LOGDEB1("mhFactory(" << mimeOrParams << ")\n");
+    vector<string> lparams;
+    stringToStrings(mimeOrParams, lparams);
+    if (lparams.empty()) {
+        // ??
+        return nullptr;
+    }
+    string lmime(lparams[0]);
    stringtolower(lmime);
    if (cstr_textplain == lmime) {
 	LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
@ -160,11 +167,11 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
 	MD5String("MimeHandlerMail", id);
 	return nobuild ? 0 : new MimeHandlerMail(config, id);
    } else if ("inode/symlink" == lmime) {
-	LOGDEB2("mhFactory(" << mime << "): ret MimeHandlerSymlink\n");
+	LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerSymlink\n");
 	MD5String("MimeHandlerSymlink", id);
 	return nobuild ? 0 : new MimeHandlerSymlink(config, id);
    } else if ("application/x-zerosize" == lmime) {
-	LOGDEB("mhFactory(" << mime << "): ret MimeHandlerNull\n");
+	LOGDEB("mhFactory(" << lmime << "): returning MimeHandlerNull\n");
 	MD5String("MimeHandlerNull", id);
 	return nobuild ? 0 : new MimeHandlerNull(config, id);
    } else if (lmime.find("text/") == 0) {
@ -175,7 +182,11 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
        // exec) but still opening with a specific editor.
 	LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText(x)\n");
 	MD5String("MimeHandlerText", id);
-        return nobuild ? 0 : new MimeHandlerText(config, id); 
+        return nobuild ? 0 : new MimeHandlerText(config, id);
+    } else if ("xsltproc" == lmime) {
+        // XML Types processed with one or several xslt style sheets.
+        MD5String(mimeOrParams, id);
+        return nobuild ? 0 : new MimeHandlerXslt(config, id, lparams);
    } else {
 	// We should not get there. It means that "internal" was set
 	// as a handler in mimeconf for a mime type we actually can't
@ -262,7 +273,7 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,

 /* Get handler/filter object for given mime type: */
 RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg, 
-			      bool filtertypes)
+                             bool filtertypes)
 {
    LOGDEB("getMimeHandler: mtype [" << mtype << "] filtertypes " <<
           filtertypes << "\n");
@ -291,7 +302,7 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
 	}
 	bool internal = !stringlowercmp("internal", handlertype);
 	if (internal) {
-	    // For internal types let the factory compute the id
+	    // For internal types let the factory compute the cache id
 	    mhFactory(cfg, cmdstr.empty() ? mtype : cmdstr, true, id);
 	} else {
 	    // exec/execm: use the md5 of the def line
@ -304,16 +315,15 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
 	    goto out;

 	LOGDEB2("getMimeHandler: " << mtype << " not in cache\n");
-
-	// Not in cache. 
 	if (internal) {
 	    // If there is a parameter after "internal" it's the mime
-	    // type to use. This is so that we can have bogus mime
-	    // types like text/x-purple-html-log (for ie: specific
-	    // icon) and still use the html filter on them. This is
-	    // partly redundant with the localfields/rclaptg, but
-	    // better and the latter will probably go away at some
-	    // point in the future.
+	    // type to use, or the further qualifier (e.g. style sheet
+	    // name for xslt types). This is so that we can have bogus
+	    // mime types like text/x-purple-html-log (e.g. to get a
+	    // specific icon) and still use the html filter on
+	    // them. This is partly redundant with the
+	    // localfields/rclaptg, but better? (and the latter will
+	    // probably go away at some point in the future?).
 	    LOGDEB2("handlertype internal, cmdstr [" << cmdstr << "]\n");
 	    h = mhFactory(cfg, cmdstr.empty() ? mtype : cmdstr, false, id);
 	    goto out;
@ -336,14 +346,10 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
 		goto out;
            }
 	}
-    }
-
-    // We get here if there was no specific error, but there is no
-    // identified mime type, or no handler associated.
-
-    // Finally, unhandled files are either ignored or their name and
-    // generic metadata is indexed, depending on configuration
-    {
+    } else {
+        // No identified mime type, or no handler associated.
+        // Unhandled files are either ignored or their name and
+        // generic metadata is indexed, depending on configuration
 	bool indexunknown = false;
 	cfg->getConfParam("indexallfilenames", &indexunknown);
 	if (indexunknown) {
--- a/src/sampleconf/mimeconf
+++ b/src/sampleconf/mimeconf
@ -110,7 +110,7 @@ application/vnd.sun.xml.writer = execm rclsoff.py
 application/vnd.sun.xml.writer.global = execm rclsoff.py
 application/vnd.sun.xml.writer.template = execm rclsoff.py
 application/vnd.wordperfect = exec wpd2html;mimetype=text/html
-application/x-abiword = execm rclabw.py
+application/x-abiword = internal xsltproc abiword.xsl
 application/x-awk = internal text/plain
 application/x-chm = execm rclchm
 application/x-dia-diagram = execm rcldia;mimetype=text/plain
--- a/src/utils/md5ut.cpp
+++ b/src/utils/md5ut.cpp
@ -20,15 +20,17 @@
 #include <string.h>

 #include "md5ut.h"
-
 #include "readfile.h"

 using namespace std;

-class FileScanMd5 : public FileScanDo {
+// Quite incredibly if this class is named FileScanMd5 like the
+// different one in readfile.cpp, the vtables get mixed up and mh_xslt
+// crashes while calling a virtual function (gcc 6.3 and 7.3)
+class FileScanMd5loc : public FileScanDo {
 public:
-    FileScanMd5(string& d) : digest(d) {}
-    virtual bool init(size_t size, string *)
+    FileScanMd5loc(string& d) : digest(d) {}
+    virtual bool init(int64_t, string *)
    {
 	MD5Init(&ctx);
 	return true;
@ -44,7 +46,7 @@ public:

 bool MD5File(const string& filename, string &digest, string *reason)
 {
-    FileScanMd5 md5er(digest);
+    FileScanMd5loc md5er(digest);
    if (!file_scan(filename, &md5er, reason))
 	return false;
    // We happen to know that digest and md5er.digest are the same object
--- a/src/utils/miniz.cpp
+++ b/src/utils/miniz.cpp
--- a/src/utils/miniz.h
+++ b/src/utils/miniz.h
--- a/src/utils/readfile.cpp
+++ b/src/utils/readfile.cpp
@ -14,7 +14,6 @@
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
-#ifndef TEST_READFILE
 #ifdef BUILDING_RECOLL
 #include "autoconfig.h"
 #else
@ -37,20 +36,33 @@

 #include "readfile.h"
 #include "smallut.h"
+#include "md5.h"

-using std::string;
+#ifdef MDU_INCLUDE_LOG
+#include MDU_INCLUDE_LOG
+#else
+#include "log.h"
+#endif

+using namespace std;
+
+///////////////
+// Implementation of basic interface: read whole file to memory buffer
 class FileToString : public FileScanDo {
 public:
    FileToString(string& data) : m_data(data) {}
-    string& m_data;
-    bool init(size_t size, string *reason) {
+
+    // Note: the fstat() + reserve() (in init()) calls divide cpu
+    // usage almost by 2 on both linux i586 and macosx (compared to
+    // just append()) Also tried a version with mmap, but it's
+    // actually slower on the mac and not faster on linux.
+    virtual bool init(int64_t size, string *reason) {
        if (size > 0) {
            m_data.reserve(size);
        }
        return true;
    }
-    bool data(const char *buf, int cnt, string *reason) {
+    virtual bool data(const char *buf, int cnt, string *reason) {
        try {
            m_data.append(buf, cnt);
        } catch (...) {
@ -59,248 +71,449 @@ public:
        }
        return true;
    }
+
+    string& m_data;
 };

+// Read cnt bytes (or up to the end of the file if cnt is size_t(-1))
+// from file fn, starting at offset offs, appending them to data. This
+// is file_scan() with a string accumulator as sink and no MD5
+// computation. On failure, the explanation goes to *reason.
+bool file_to_string(const string& fn, string& data, int64_t offs, size_t cnt,
+                    string *reason)
+{
+    FileToString collector(data);
+    const bool ok = file_scan(fn, &collector, offs, cnt, reason, nullptr);
+    return ok;
+}
+
 bool file_to_string(const string& fn, string& data, string *reason)
 {
    return file_to_string(fn, data, 0, size_t(-1), reason);
 }
-bool file_to_string(const string& fn, string& data, int64_t offs, size_t cnt,
-                    string *reason)
-{
-    FileToString accum(data);
-    return file_scan(fn, &accum, offs, cnt, reason);
-}

-bool file_scan(const string& fn, FileScanDo* doer, string *reason)
-{
-    return file_scan(fn, doer, 0, size_t(-1), reason);
-}

-const int RDBUFSZ = 8192;
-// Note: the fstat() + reserve() (in init()) calls divide cpu usage almost by 2
-// on both linux i586 and macosx (compared to just append())
-// Also tried a version with mmap, but it's actually slower on the mac and not
-// faster on linux.
-bool file_scan(const string& fn, FileScanDo* doer, int64_t startoffs,
-               size_t cnttoread, string *reason)
-{
-    if (startoffs < 0) {
-        *reason += " file_scan: negative startoffs not allowed";
-        return false;
+/////////////
+//  Callback/filtering interface
+
+// Abstract class base for both source (origin) and filter
+// (midstream). Both have a downstream
+class FileScanUpstream {
+public:
+    // Polymorphic base: make destruction through a base pointer safe.
+    virtual ~FileScanUpstream() = default;
+
+    // Set the element which will receive our output.
+    virtual void setDownstream(FileScanDo *down) {
+        m_down = down;
+    }
+    // Return the element currently receiving our output (can be null).
+    virtual FileScanDo *out() {
+        return m_down;
+    }
+protected:
+    // Not owned.
+    FileScanDo *m_down{nullptr};
+};
+
+// Source element.
+class FileScanSource : public FileScanUpstream {
+public:
+    // Record the element which will receive the data we produce.
+    FileScanSource(FileScanDo *down) {
+        m_down = down;
+    }
+    // Produce the data and push it downstream. Implemented by the
+    // concrete source.
+    virtual bool scan() = 0;
+};
+
+// Inside element of a transformation pipe. The idea is that elements
+// which don't recognize the data get themselves out of the pipe
+// (pop()). Typically, only one of the decompression modules
+// (e.g. gzip/bzip2/xz...) would remain. For now there is only gzip,
+// it pops itself if the data does not have the right magic number
+class FileScanFilter : public FileScanDo, public FileScanUpstream {
+public:
+    virtual void insertAtSink(FileScanDo *sink, FileScanUpstream *upstream) {
+        setDownstream(sink);
+        if (m_down) {
+            m_down->setUpstream(this);
+        }
+        setUpstream(upstream);
+        if (m_up) {
+            m_up->setDownstream(this);
+        }
    }

-    bool ret = false;
-    bool noclosing = true;
-    int fd = 0;
-    struct stat st;
-    // Initialize st_size: if fn.empty() , the fstat() call won't happen.
-    st.st_size = 0;
+    // Remove myself from the pipe. 
+    virtual void pop() {
+        if (m_down) {
+            m_down->setUpstream(m_up);
+        }
+        if (m_up) {
+            m_up->setDownstream(m_down);
+        }
+    }

-    // If we have a file name, open it, else use stdin.
-    if (!fn.empty()) {
-        fd = open(fn.c_str(), O_RDONLY | O_BINARY);
-        if (fd < 0 || fstat(fd, &st) < 0) {
-            catstrerror(reason, "open/stat", errno);
+    virtual void setUpstream(FileScanUpstream *up) override {
+        m_up = up;
+    }
+
+private:
+    FileScanUpstream *m_up{nullptr};
+};
+
+
+#if defined(READFILE_ENABLE_ZLIB)
+#include <zlib.h>
+
+// Pipe element which uncompresses gzip data. The decision is taken on
+// the first data chunk: if it does not begin with the gzip magic
+// number, the filter removes itself from the pipe and passes the
+// chunk unchanged. Else all data is inflated and the output is passed
+// downstream. Only the first gzip member is processed: data found
+// after its end is ignored.
+class GzFilter : public FileScanFilter {
+public:
+    virtual ~GzFilter() {
+        // Only clean up what inflateInit2() actually set up: m_initdone
+        // is also set when the data is not gzip or when the
+        // initialisation failed.
+        if (m_zlibinit) {
+            inflateEnd(&m_stream);
+        }
+    }
+
+    // Nothing to do locally: just propagate downstream. Note that
+    // the size is the compressed one.
+    virtual bool init(int64_t size, string *reason) override {
+        LOGDEB1("GzFilter::init\n");
+        if (out()) {
+            return out()->init(size, reason);
+        }
+        return true;
+    }
+
+    // Process a chunk of input. Returns false on error, with an
+    // explanation appended to *reason if it is set.
+    virtual bool data(const char *buf, int cnt, string *reason) override {
+        LOGDEB1("GzFilter::data: cnt " << cnt << endl);
+
+        int error;
+        m_stream.next_in = (Bytef*)buf;
+        m_stream.avail_in = cnt;
+        
+        if (m_initdone == false) {
+            m_initdone = true;
+            // We do not support a first read cnt < 2. We probably should.
+            if (cnt < 2) {
+                if (reason)
+                    *reason += "GzFilter: first data count < 2";
+                return false;
+            }
+            const unsigned char *ubuf = (const unsigned char *)buf;
+            if (ubuf[0] != 0x1f || ubuf[1] != 0x8b) {
+                LOGDEB1("GzFilter::data: not gzip. out() is " << out() << "\n");
+                pop();
+                if (out()) {
+                    return out()->data(buf, cnt, reason);
+                } else {
+                    return false;
+                }
+            }
+            m_stream.opaque = nullptr;
+            m_stream.zalloc = alloc_func;
+            m_stream.zfree = free_func;
+            m_stream.next_out = (Bytef*)m_obuf;
+            m_stream.avail_out = m_obs;
+            if ((error = inflateInit2(&m_stream, 15+32)) != Z_OK) {
+                LOGERR("inflateInit2 error: " << error << endl);
+                if (reason) {
+                    *reason += " Zlib inflateinit failed";
+                    if (m_stream.msg && *m_stream.msg) {
+                        *reason += string(": ") + m_stream.msg;
+                    }
+                }
+                return false;
+            }
+            m_zlibinit = true;
+        }
+
+        if (!m_zlibinit) {
+            // Called again after a failed initialisation.
+            if (reason) {
+                *reason += " GzFilter: zlib not initialized";
+            }
+            return false;
+        }
+        if (m_streamend) {
+            // Data after the end of the compressed stream: ignore it.
+            LOGDEB1("GzFilter::data: ignoring data after stream end\n");
+            return true;
+        }
+
+        for (;;) {
+            m_stream.next_out = (Bytef*)m_obuf;
+            m_stream.avail_out = m_obs;
+            error = inflate(&m_stream, Z_SYNC_FLUSH);
+            if (error == Z_BUF_ERROR && m_stream.avail_in == 0) {
+                // Not an error: no pending output, more input needed.
+                break;
+            }
+            // Anything else than Z_OK or Z_STREAM_END (this includes
+            // Z_NEED_DICT) means that we can't go on.
+            if (error != Z_OK && error != Z_STREAM_END) {
+                LOGERR("inflate error: " << error << endl);
+                if (reason) {
+                    *reason += " Zlib inflate failed";
+                    if (m_stream.msg && *m_stream.msg) {
+                        *reason += string(": ") + m_stream.msg;
+                    }
+                }
+                return false;
+            }
+            int produced = m_obs - int(m_stream.avail_out);
+            if (produced > 0 && out() &&
+                !out()->data(m_obuf, produced, reason)) {
+                return false;
+            }
+            if (error == Z_STREAM_END) {
+                // Calling inflate() again would return the same code
+                // forever without consuming anything.
+                m_streamend = true;
+                break;
+            }
+            // Go on while there is input left, or while the output
+            // buffer was filled up (there may be more output pending
+            // inside zlib even if the input was all consumed).
+            if (m_stream.avail_in == 0 && m_stream.avail_out != 0) {
+                break;
+            }
+        }
+        return true;
+    }
+    
+    // Memory allocation callbacks for zlib.
+    static voidpf alloc_func(voidpf opaque, uInt items, uInt size) {
+        // Compute the size as a size_t: the uInt product could wrap.
+        return malloc(static_cast<size_t>(items) * size);
+    }
+    static void free_func(voidpf opaque, voidpf address) {
+        free(address);
+    }
+
+    // True once the first data chunk has been examined
+    bool m_initdone{false};
+    // True if inflateInit2() succeeded: inflateEnd() must be called
+    bool m_zlibinit{false};
+    // True once the end of the compressed stream has been reached
+    bool m_streamend{false};
+    z_stream m_stream{};
+    // Output buffer and its size
+    char m_obuf[10000];
+    const int m_obs{10000};
+};
+#endif // GZ
+
+class FileScanMd5 : public FileScanFilter {
+public:
+    FileScanMd5(string& d) : digest(d) {}
+    virtual bool init(int64_t size, string *reason) override {
+        LOGDEB1("FileScanMd5: init\n");
+	MD5Init(&ctx);
+        if (out()) {
+            return out()->init(size, reason);
+        }
+	return true;
+    }
+    virtual bool data(const char *buf, int cnt, string *reason) override {
+        LOGDEB1("FileScanMd5: data. cnt " << cnt << endl);
+	MD5Update(&ctx, (const unsigned char*)buf, cnt);
+        if (out() && !out()->data(buf, cnt, reason)) {
            return false;
        }
-        noclosing = false;
+	return true;
    }
+    bool finish() {
+        LOGDEB1("FileScanMd5: finish\n");
+        MD5Final(digest, &ctx);
+        return true;
+    }
+    string &digest;
+    MD5_CTX ctx;
+};
+
+
+// Source taking data from a regular file, or from stdin if the file
+// name is empty.
+class FileScanSourceFile : public FileScanSource {
+public:
+    // @param next downstream element receiving the data (may be null).
+    // @param fn file name, empty for stdin.
+    // @param startoffs offset of the first byte to deliver.
+    // @param cnttoread maximum count of bytes to deliver, -1 for no limit.
+    // @param reason if not null, error messages are appended to it.
+    FileScanSourceFile(FileScanDo *next, const string& fn, int64_t startoffs,
+                       int64_t cnttoread, string *reason)
+        : FileScanSource(next), m_fn(fn), m_startoffs(startoffs),
+          m_cnttoread(cnttoread), m_reason(reason) { }
+
+    // Read the input and push it downstream in chunks of at most
+    // RDBUFSZ bytes.
+    // @return true if the input was read up to eof or the requested
+    //   count, false on system call error or if the downstream
+    //   init()/data() reports failure. A file descriptor opened here is
+    //   closed on every exit path.
+    virtual bool scan() {
+        LOGDEB1("FileScanSourceFile: reading " << m_fn << " offs " <<
+               m_startoffs<< " cnt " << m_cnttoread << " out " << out() << endl);
+        const int RDBUFSZ = 8192;
+        bool ret = false;
+        bool noclosing = true;
+        int fd = 0;
+        // These are declared ahead of the first 'goto out' so that no
+        // jump crosses an initialization.
+        int64_t curoffs = 0;
+        int64_t totread = 0;
+        char buf[RDBUFSZ];
+        struct stat st;
+        // Initialize st_size: if fn.empty() , the fstat() call won't happen.
+        st.st_size = 0;
+
+        // If we have a file name, open it, else use stdin.
+        if (!m_fn.empty()) {
+            fd = open(m_fn.c_str(), O_RDONLY | O_BINARY);
+            if (fd < 0 || fstat(fd, &st) < 0) {
+                catstrerror(m_reason, "open/stat", errno);
+                // open() may have succeeded and fstat() failed: do not
+                // leak the descriptor.
+                if (fd >= 0) {
+                    close(fd);
+                }
+                return false;
+            }
+            noclosing = false;
+        }

 #if defined O_NOATIME && O_NOATIME != 0
-    if (fcntl(fd, F_SETFL, O_NOATIME) < 0) {
-        // perror("fcntl");
+        if (fcntl(fd, F_SETFL, O_NOATIME) < 0) {
+            // perror("fcntl");
+        }
+#endif
+        if (out()) {
+            // Size hint for the sink: the requested count if set, else
+            // the file size if known, else 0 (unknown).
+            int64_t sizehint = 0;
+            if (m_cnttoread != -1 && m_cnttoread) {
+                sizehint = m_cnttoread + 1;
+            } else if (st.st_size > 0) {
+                sizehint = st.st_size + 1;
+            }
+            // A false return from init() is an error which ends the scan.
+            if (!out()->init(sizehint, m_reason)) {
+                goto out;
+            }
+        }
+
+        if (m_startoffs > 0 && !m_fn.empty()) {
+            if (lseek(fd, m_startoffs, SEEK_SET) != m_startoffs) {
+                catstrerror(m_reason, "lseek", errno);
+                // Go through the common exit so that fd gets closed.
+                goto out;
+            }
+            curoffs = m_startoffs;
+        }
+
+        for (;;) {
+            size_t toread = RDBUFSZ;
+            // Not at the start offset yet (stdin case: we could not
+            // seek): never read past it, so that a chunk is either
+            // entirely skipped or entirely delivered.
+            if (m_startoffs > 0 && curoffs < m_startoffs) {
+                toread = size_t(MIN(RDBUFSZ, m_startoffs - curoffs));
+            }
+
+            if (m_cnttoread != -1) {
+                toread = MIN(toread, (uint64_t)(m_cnttoread - totread));
+            }
+            ssize_t n = static_cast<ssize_t>(read(fd, buf, toread));
+            if (n < 0) {
+                catstrerror(m_reason, "read", errno);
+                goto out;
+            }
+            if (n == 0) {
+                break;
+            }
+            curoffs += n;
+            // Chunk located before the start offset: discard.
+            if (curoffs - n < m_startoffs) {
+                continue;
+            }
+            // The sink is optional, same as for the init() call above.
+            if (out() && !out()->data(buf, n, m_reason)) {
+                goto out;
+            }
+            totread += n;
+            if (m_cnttoread > 0 && totread >= m_cnttoread) {
+                break;
+            }
+        }
+
+        ret = true;
+    out:
+        // Only close what we opened, never stdin.
+        if (fd >= 0 && !noclosing) {
+            close(fd);
+        }
+        return ret;
+    }
+    
+protected:
+    string m_fn;
+    int64_t m_startoffs;
+    int64_t m_cnttoread;
+    string *m_reason;
+};
+
+
+#if defined(READFILE_ENABLE_MINIZ)
+#include "miniz.h"
+
+// Source taking data from a ZIP archive member
+class FileScanSourceZip : public FileScanSource {
+public:
+    // @param next downstream element receiving the data (may be null).
+    // @param fn path of the ZIP archive.
+    // @param member name of the archive member to extract.
+    // @param reason if not null, error messages are appended to it.
+    FileScanSourceZip(FileScanDo *next, const string& fn, const string& member,
+                       string *reason)
+        : FileScanSource(next), m_fn(fn), m_member(member),
+          m_reason(reason) { }
+
+    // Open the archive, locate the member and push its uncompressed
+    // contents downstream through write_cb().
+    // @return true if the member was found and fully extracted, false
+    //   on any miniz error or if the downstream init() fails.
+    virtual bool scan() {
+        bool ret = false;
+        mz_zip_archive zip;
+        mz_zip_zero_struct(&zip);
+
+        if (!mz_zip_reader_init_file(&zip, m_fn.c_str(), 0)) {
+            addError("mz_zip_reader_init_file() failed: ", zip);
+            return false;
+        }
+        mz_uint32 file_index;
+        // mz_zip_reader_locate_file_v2() returns an mz_bool (zero for
+        // failure): it must not be tested for a negative value, else a
+        // missing member goes undetected.
+        if (!mz_zip_reader_locate_file_v2(&zip, m_member.c_str(), NULL, 0,
+                                          &file_index)) {
+            addError("mz_zip_reader_locate_file() failed: ", zip);
+            goto out;
+        }
+
+        mz_zip_archive_file_stat zstat;
+        if (!mz_zip_reader_file_stat(&zip, file_index, &zstat)) {
+            addError("mz_zip_reader_file_stat() failed: ", zip);
+            goto out;
+        }
+        // The uncompressed size is the size hint for the sink.
+        if (out()) {
+            if (!out()->init(zstat.m_uncomp_size, m_reason)) {
+                goto out;
+            }
+        }
+                
+        if (!mz_zip_reader_extract_to_callback(
+                &zip, file_index, write_cb, this, 0)) {
+            addError("mz_zip_reader_extract_to_callback() failed: ", zip);
+            goto out;
+        }
+        
+        ret = true;
+    out:
+        mz_zip_reader_end(&zip);
+        return ret;
+    }
+
+    // Extraction callback handed to miniz: forwards one chunk of
+    // uncompressed data to the downstream element, if any.
+    // @param pOpaque the FileScanSourceZip object performing the scan.
+    // @return n for success. A value different from n tells miniz to
+    //   abort the extraction.
+    static size_t write_cb(void *pOpaque, mz_uint64 file_ofs,
+                           const void *pBuf, size_t n) {
+        const char *cp = (const char*)pBuf;
+        LOGDEB1("write_cb: ofs " << file_ofs << " cnt " << n << " data: " <<
+                string(cp, n) << endl);
+        FileScanSourceZip *ths = (FileScanSourceZip *)pOpaque;
+        if (ths->out()) {
+            if (!ths->out()->data(cp, n, ths->m_reason)) {
+                return (size_t)-1;
+            }
+        }
+        return n;
+    }
+    
+protected:
+    string m_fn;
+    string m_member;
+    string *m_reason;
+
+private:
+    // Append 'what' followed by the text for the archive's last miniz
+    // error to the reason string, if there is one.
+    void addError(const char *what, const mz_zip_archive& zip) {
+        if (m_reason) {
+            *m_reason += what;
+            *m_reason += string(mz_zip_get_error_string(zip.m_last_error));
+        }
+    }
+};
+
+// Scan either a plain file (empty member name) or a ZIP archive member.
+bool file_scan(const std::string& filename, const std::string& membername,
+               FileScanDo* doer, std::string *reason)
+{
+    // No member name: regular file, read whole, no MD5 requested.
+    if (membername.empty()) {
+        return file_scan(filename, doer, 0, -1, reason, nullptr);
+    }
+    FileScanSourceZip zipsource(doer, filename, membername, reason);
+    return zipsource.scan();
+}
+
+#endif // READFILE_ENABLE_MINIZ
+
+// Scan a regular file (stdin if fn is empty): build the chain
+// source -> [filters] -> doer, then run the source.
+// @param startoffs start offset. Any non-zero value disables the gzip
+//   filter; a negative value is then treated as 0.
+// @param cnttoread passed to the file source as the byte count limit.
+// @param reason passed to the source for storing error messages.
+// @param md5p if not null, receives the hex form of the digest
+//   computed by the MD5 filter.
+// @return the result of the source scan() call.
+bool file_scan(const string& fn, FileScanDo* doer, int64_t startoffs,
+               int64_t cnttoread, string *reason, string *md5p)
+{
+    LOGDEB("file_scan: doer " << doer << endl);
+#if defined(READFILE_ENABLE_ZLIB)
+    // Must be computed before startoffs is clamped below: -1 means
+    // "start at 0 with no decompression".
+    bool nodecomp = startoffs != 0;
+#endif
+    if (startoffs < 0) {
+        startoffs = 0;
+    }
+    
+    FileScanSourceFile source(doer, fn, startoffs, cnttoread, reason);
+    FileScanUpstream *up = &source;
+
+    // We compute the MD5 on the uncompressed data, so insert this
+    // right at the source.
+    // NOTE(review): the MD5 filter is inserted before the gzip filter.
+    // If insertAtSink() places each new filter between 'up' and the
+    // sink, the digest is computed on the raw (still compressed) bytes,
+    // which would contradict the statement above -- confirm against
+    // the insertAtSink() implementation.
+    string digest;
+    FileScanMd5 md5filter(digest);
+    if (md5p) {
+        md5filter.insertAtSink(doer, up);
+        up = &md5filter;
+    }
+    
+#if defined(READFILE_ENABLE_ZLIB)
+    GzFilter gzfilter;
+    if (!nodecomp) {
+        gzfilter.insertAtSink(doer, up);
+        up = &gzfilter;
    }
 #endif

-    if (cnttoread != (size_t) - 1 && cnttoread) {
-        doer->init(cnttoread + 1, reason);
-    } else if (st.st_size > 0) {
-        doer->init(size_t(st.st_size + 1), reason);
-    } else {
-        doer->init(0, reason);
-    }
+    bool ret = source.scan();

-    int64_t curoffs = 0;
-    if (startoffs > 0 && !fn.empty()) {
-        if (lseek(fd, startoffs, SEEK_SET) != startoffs) {
-            catstrerror(reason, "lseek", errno);
-            return false;
-        }
-        curoffs = startoffs;
-    }
-
-    char buf[RDBUFSZ];
-    size_t totread = 0;
-    for (;;) {
-        size_t toread = RDBUFSZ;
-        if (startoffs > 0 && curoffs < startoffs) {
-            toread = size_t(MIN(RDBUFSZ, startoffs - curoffs));
-        }
-
-        if (cnttoread != size_t(-1)) {
-            toread = MIN(toread, cnttoread - totread);
-        }
-        ssize_t n = static_cast<ssize_t>(read(fd, buf, toread));
-        if (n < 0) {
-            catstrerror(reason, "read", errno);
-            goto out;
-        }
-        if (n == 0) {
-            break;
-        }
-
-        curoffs += n;
-        if (curoffs - n < startoffs) {
-            continue;
-        }
-
-        if (!doer->data(buf, n, reason)) {
-            goto out;
-        }
-        totread += n;
-        if (cnttoread > 0 && totread >= cnttoread) {
-            break;
-        }
-    }
-
-    ret = true;
-out:
-    if (fd >= 0 && !noclosing) {
-        close(fd);
+    // The digest is finalized and converted even if the scan failed.
+    if (md5p) {
+        md5filter.finish();
+        MD5HexPrint(digest, *md5p);
    }
    return ret;
 }

-#else // Test
-#include "autoconfig.h"
-
-#include <stdio.h>
-#include <sys/types.h>
-#include "safesysstat.h"
-#include <stdlib.h>
-
-#include <string>
-#include <iostream>
-using namespace std;
-
-#include "readfile.h"
-#include "fstreewalk.h"
-
-using namespace std;
-
-class myCB : public FsTreeWalkerCB {
-public:
-    FsTreeWalker::Status processone(const string& path,
-                                    const struct stat *st,
-                                    FsTreeWalker::CbFlag flg) {
-        if (flg == FsTreeWalker::FtwDirEnter) {
-            //cout << "[Entering " << path << "]" << endl;
-        } else if (flg == FsTreeWalker::FtwDirReturn) {
-            //cout << "[Returning to " << path << "]" << endl;
-        } else if (flg == FsTreeWalker::FtwRegular) {
-            //cout << path << endl;
-            string s, reason;
-            if (!file_to_string(path, s, &reason)) {
-                cerr << "Failed: " << reason << " : " << path << endl;
-            } else {
-                //cout <<
-                //"================================================" << endl;
-                cout << path << endl;
-                //      cout << s;
-            }
-            reason.clear();
-        }
-        return FsTreeWalker::FtwOk;
-    }
-};
-
-static int     op_flags;
-#define OPT_MOINS 0x1
-#define OPT_c     0x2
-#define OPT_o     0x4
-
-static const char *thisprog;
-static char usage [] =
-    "trreadfile [-o offs] [-c cnt] topdirorfile\n\n"
-    ;
-static void
-Usage(void)
+// Convenience overload: calls the full file_scan() with start offset 0,
+// no count limit (-1) and no MD5 output.
+bool file_scan(const string& fn, FileScanDo* doer, string *reason)
 {
-    fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
-    exit(1);
+    return file_scan(fn, doer, 0, -1, reason, nullptr);
 }
-
-int main(int argc, const char **argv)
-{
-    int64_t offs = 0;
-    size_t cnt = size_t(-1);
-    thisprog = argv[0];
-    argc--;
-    argv++;
-
-    while (argc > 0 && **argv == '-') {
-        (*argv)++;
-        if (!(**argv))
-            /* Cas du "adb - core" */
-        {
-            Usage();
-        }
-        while (**argv)
-            switch (*(*argv)++) {
-            case 'c':
-                op_flags |= OPT_c;
-                if (argc < 2) {
-                    Usage();
-                }
-                cnt = atoll(*(++argv));
-                argc--;
-                goto b1;
-            case 'o':
-                op_flags |= OPT_o;
-                if (argc < 2) {
-                    Usage();
-                }
-                offs = strtoull(*(++argv), 0, 0);
-                argc--;
-                goto b1;
-            default:
-                Usage();
-                break;
-            }
-b1:
-        argc--;
-        argv++;
-    }
-
-    if (argc != 1) {
-        Usage();
-    }
-    string top = *argv++;
-    argc--;
-    cerr << "filename " << top << " offs " << offs << " cnt " << cnt << endl;
-
-    struct stat st;
-    if (!top.empty() && stat(top.c_str(), &st) < 0) {
-        perror("stat");
-        exit(1);
-    }
-    if (!top.empty() && S_ISDIR(st.st_mode)) {
-        FsTreeWalker walker;
-        myCB cb;
-        walker.walk(top, cb);
-        if (walker.getErrCnt() > 0) {
-            cout << walker.getReason();
-        }
-    } else {
-        string s, reason;
-        if (!file_to_string(top, s, offs, cnt, &reason)) {
-            cerr << reason << endl;
-            exit(1);
-        } else {
-            cout << s;
-        }
-    }
-    exit(0);
-}
-#endif //TEST_READFILE
--- a/src/utils/readfile.h
+++ b/src/utils/readfile.h
@ -21,30 +21,72 @@

 #include <string>

-/**
- * Read file in chunks, calling an accumulator for each chunk. Can be used
- * for reading in a file, computing an md5...
- */
+class FileScanUpstream;
+
+/** Data sink for the file reader. */
 class FileScanDo {
 public:
    virtual ~FileScanDo() {}
-    virtual bool init(size_t size, std::string *reason) = 0;
-    virtual bool data(const char *buf, int cnt, std::string* reason) = 0;
+    /* Initialize and allocate.
+     * @param size if set, lower bound of data size.
+     * @param reason[output] set to error message in case of error.
+     * @return false for error (file_scan will return), true if ok.
+     */
+    virtual bool init(int64_t size, std::string *reason) = 0;
+    /* Process chunk of data
+     * @param buf  the data buffer.
+     * @param cnt byte count.
+     * @param reason[output] set to error message in case of error.
+     * @return false for error (file_scan will return), true if ok.
+     */
+    virtual bool data(const char *buf, int cnt, std::string *reason) = 0;
+    
+    /* Hook receiving a FileScanUpstream pointer. The default
+     * implementation ignores it.
+     * NOTE(review): presumably the element feeding this sink; confirm
+     * against the FileScanUpstream definition.
+     */
+    virtual void setUpstream(FileScanUpstream*) {}
 };
-bool file_scan(const std::string& filename, FileScanDo* doer, std::string *reason = 0);
-/* Same but only process count cnt from offset offs. Set cnt to size_t(-1)
- * for no limit */
-bool file_scan(const std::string& fn, FileScanDo* doer, int64_t offs, size_t cnt,
-               std::string *reason = 0);
+
+/** Open and read file, calling the FileScanDo data() method for each chunk.
+ *
+ * @param filename File name. Use empty value for stdin
+
+ * @param doer the data processor. The init() method will be called
+ * initially with a lower bound of the data size (may be used to
+ * reserve a buffer), or with a 0 size if nothing is known about the
+ * size. The data() method will be called for every chunk of data
+ * read. 
+ * @param startoffs Start offset. If not zero, will disable decompression
+ *             (set to -1 to start at 0 with no decompression).
+ * @param cnttoread Max bytes in output. Set to -1 for no limit.
+ * @param[output] md5p If not null, points to a string to store the hex ascii 
+ *     md5 of the uncompressed data.
+ * @param[output] reason If not null, points to a string for storing an 
+ *     error message if the return value is false.
+ * @return true if the operation ended normally, else false.
+ */
+bool file_scan(const std::string& fn, FileScanDo* doer, int64_t startoffs,
+               int64_t cnttoread, std::string *reason, std::string *md5p);
+
+/** Same as above, with no offset, no count limit and no md5 computation */
+bool file_scan(const std::string& filename, FileScanDo* doer,
+               std::string *reason);
+
+
+#if defined(READFILE_ENABLE_MINIZ)
+/* Process a zip archive member */
+bool file_scan(const std::string& filename, const std::string& membername,
+               FileScanDo* doer, std::string *reason);
+#endif

 /**
 * Read file into string.
 * @return true for ok, false else
 */
-bool file_to_string(const std::string& filename, std::string& data, std::string *reason = 0);
+bool file_to_string(const std::string& filename, std::string& data,
+                    std::string *reason = 0);

-/** Read file chunk into string. Set cnt to size_t(-1) for whole file */
+/** Read file chunk into string. Set cnt to -1 for going to
+ * eof, offs to -1 for going from the start without decompression */
 bool file_to_string(const std::string& filename, std::string& data,
                    int64_t offs, size_t cnt, std::string *reason = 0);

+
 #endif /* _READFILE_H_INCLUDED_ */