internal xslt working for single-sheet (abw). Still leaking memory?

This commit is contained in:
Jean-Francois Dockes 2018-12-25 10:57:26 +01:00
parent 2bd4b5ef4f
commit 00c0c5168b
13 changed files with 9812 additions and 271 deletions

View File

@ -2,6 +2,8 @@
CXXFLAGS ?= @CXXFLAGS@
LIBXAPIAN=@LIBXAPIAN@
XAPIANCXXFLAGS=@XAPIANCXXFLAGS@
XSLT_CFLAGS=@XSLT_CFLAGS@
XSLT_LINKADD=@XSLT_LINKADD@
LIBICONV=@LIBICONV@
INCICONV=@INCICONV@
LIBFAM = @LIBFAM@
@ -29,8 +31,10 @@ AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
$(COMMONCPPFLAGS) \
$(INCICONV) \
$(XAPIANCXXFLAGS) \
$(XSLT_CFLAGS) \
$(X_CFLAGS) \
-DRECOLL_DATADIR=\"${pkgdatadir}\" \
-DREADFILE_ENABLE_ZLIB -DREADFILE_ENABLE_MINIZ \
-D_GNU_SOURCE \
$(DEFS)
@ -121,6 +125,8 @@ internfile/mh_symlink.h \
internfile/mh_text.cpp \
internfile/mh_text.h \
internfile/mh_unknown.h \
internfile/mh_xslt.cpp \
internfile/mh_xslt.h \
internfile/mimehandler.cpp \
internfile/mimehandler.h \
internfile/myhtmlparse.cpp \
@ -224,6 +230,8 @@ utils/md5ut.cpp \
utils/md5ut.h \
utils/mimeparse.cpp \
utils/mimeparse.h \
utils/miniz.cpp \
utils/miniz.h \
utils/netcon.cpp \
utils/netcon.h \
utils/pathut.cpp \
@ -262,7 +270,7 @@ AM_YFLAGS = -d
librecoll_la_LDFLAGS = -release $(VERSION) \
-Wl,--no-undefined -Wl,--warn-unresolved-symbols
librecoll_la_LIBADD = $(LIBXAPIAN) $(LIBICONV) $(LIBTHREADS)
librecoll_la_LIBADD = $(XSLT_LINKADD) $(LIBXAPIAN) $(LIBICONV) $(LIBTHREADS)
# There is probably a better way to do this. The KIO needs to be linked
# with librecoll, but librecoll is installed into a non-standard place
@ -640,6 +648,7 @@ sampleconf/mimeview
filterdir = $(pkgdatadir)/filters
filter_DATA = \
desktop/hotrecoll.py \
filters/abiword.xsl \
filters/rcl7z \
filters/rclabw.py \
filters/rclaptosidman \

View File

@ -321,6 +321,21 @@ XAPIANCXXFLAGS=`$XAPIAN_CONFIG --cxxflags`
#echo LIBXAPIANSTATICEXTRA: $LIBXAPIANSTATICEXTRA
#echo XAPIANCXXFLAGS: $XAPIANCXXFLAGS
# Locate the xslt-config helper. A value preset in the XSLT_CONFIG
# environment variable takes precedence over a search of the PATH.
XSLT_CONFIG=${XSLT_CONFIG:-no}
if test "$XSLT_CONFIG" = "no"; then
    AC_PATH_PROG(XSLT_CONFIG0, [xslt-config], no)
    XSLT_CONFIG=$XSLT_CONFIG0
fi
if test "$XSLT_CONFIG" = "no" ; then
    AC_MSG_ERROR([Cannot find xslt-config command in $PATH. Is
libxslt installed ?])
fi
# Query the command which was actually selected above. Running a bare
# xslt-config here would ignore a user-supplied XSLT_CONFIG value.
XSLT_CFLAGS=`$XSLT_CONFIG --cflags`
XSLT_LINKADD=`$XSLT_CONFIG --libs`
AC_ARG_ENABLE(xadump,
AC_HELP_STRING([--enable-xadump],
[Enable building the xadump low level Xapian access program.]),
@ -527,6 +542,8 @@ AC_SUBST(QMAKE_DISABLE_ZEITGEIST)
AC_SUBST(LIBQZEITGEIST)
AC_SUBST(RCLVERSION)
AC_SUBST(RCLLIBVERSION)
AC_SUBST(XSLT_CFLAGS)
AC_SUBST(XSLT_LINKADD)
# All object files depend on localdefs which has the cc flags. Avoid
# changing it unless necessary

88
src/filters/abiword.xsl Normal file
View File

@ -0,0 +1,88 @@
<?xml version="1.0"?>
<!--
Style sheet turning an AbiWord document into a minimal HTML page:
the recognized metadata entries go into the head (title and meta
elements), and the string value of each paragraph becomes one p
element in the body. Both the older format without namespaces and
the newer namespaced format are handled for the body. Metadata is
only looked up in the namespaced format.
-->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:ab="http://www.abisource.com/awml.dtd"
exclude-result-prefixes="ab"
>
<xsl:output method="html" encoding="UTF-8"/>
<!-- Root template: builds the html skeleton -->
<xsl:template match="/">
<html>
<head>
<xsl:apply-templates select="ab:abiword/ab:metadata"/>
</head>
<body>
<!-- This is for the older abiword format with no namespaces -->
<xsl:for-each select="abiword/section">
<xsl:apply-templates select="p"/>
</xsl:for-each>
<!-- Newer namespaced format -->
<xsl:for-each select="ab:abiword/ab:section">
<xsl:for-each select="ab:p">
<p><xsl:value-of select="."/></p><xsl:text>
</xsl:text>
</xsl:for-each>
</xsl:for-each>
</body>
</html>
</xsl:template>
<!-- Paragraph of the older (no namespace) format: output its string
value as a p element followed by a newline -->
<xsl:template match="p">
<p><xsl:value-of select="."/></p><xsl:text>
</xsl:text>
</xsl:template>
<!-- Metadata: each ab:m entry is selected by its key attribute.
dc.creator yields the author meta element, abiword.keywords and
dc.subject both yield a keywords meta element, dc.description yields
the abstract meta element and dc.title yields the title element.
Entries with any other key produce no output. -->
<xsl:template match="ab:metadata">
<xsl:for-each select="ab:m">
<xsl:choose>
<xsl:when test="@key = 'dc.creator'">
<meta>
<xsl:attribute name="name">author</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:when>
<xsl:when test="@key = 'abiword.keywords'">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:when>
<xsl:when test="@key = 'dc.subject'">
<meta>
<xsl:attribute name="name">keywords</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:when>
<xsl:when test="@key = 'dc.description'">
<meta>
<xsl:attribute name="name">abstract</xsl:attribute>
<xsl:attribute name="content">
<xsl:value-of select="."/>
</xsl:attribute>
</meta><xsl:text>
</xsl:text>
</xsl:when>
<xsl:when test="@key = 'dc.title'">
<title><xsl:value-of select="."/></title><xsl:text>
</xsl:text>
</xsl:when>
<xsl:otherwise>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>
</xsl:template>
</xsl:stylesheet>

View File

@ -38,9 +38,7 @@ public:
MimeHandlerMail(RclConfig *cnf, const std::string &id);
virtual ~MimeHandlerMail();
virtual bool is_data_input_ok(DataInput input) const {
if (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING)
return true;
return false;
return (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING);
}
virtual bool next_document();
virtual bool skip_to_document(const std::string& ipath);

232
src/internfile/mh_xslt.cpp Normal file
View File

@ -0,0 +1,232 @@
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <libxml/parser.h>
#include <libxml/tree.h>
#include <libxslt/transform.h>
#include <libxslt/xsltInternals.h>
#include <libxslt/xsltutils.h>
#include "cstr.h"
#include "mh_xslt.h"
#include "log.h"
#include "smallut.h"
#include "md5ut.h"
#include "rclconfig.h"
#include "readfile.h"
using namespace std;
class FileScanXML : public FileScanDo {
public:
FileScanXML(const string& fn) : m_fn(fn) {}
virtual ~FileScanXML() {
if (ctxt) {
xmlFreeParserCtxt(ctxt);
}
}
xmlDocPtr getDoc() {
int ret;
if ((ret = xmlParseChunk(ctxt, nullptr, 0, 1))) {
xmlError *error = xmlGetLastError();
LOGERR("FileScanXML: final xmlParseChunk failed with error " <<
ret << " error: " <<
(error ? error->message :
" null return from xmlGetLastError()") << "\n");
return nullptr;
}
return ctxt->myDoc;
}
virtual bool init(int64_t size, string *) {
LOGDEB1("FileScanXML: init: size " << size << endl);
ctxt = xmlCreatePushParserCtxt(NULL, NULL, NULL, 0, m_fn.c_str());
if (ctxt == nullptr) {
LOGERR("FileScanXML: xmlCreatePushParserCtxt failed\n");
return false;
} else {
return true;
}
}
virtual bool data(const char *buf, int cnt, string*) {
if (0) {
string dt(buf, cnt);
LOGDEB1("FileScanXML: data: cnt " << cnt << " data " << dt << endl);
} else {
LOGDEB1("FileScanXML: data: cnt " << cnt << endl);
}
int ret;
if ((ret = xmlParseChunk(ctxt, buf, cnt, 0))) {
xmlError *error = xmlGetLastError();
LOGERR("FileScanXML: xmlParseChunk failed with error " <<
ret << " for [" << buf << "] error " <<
(error ? error->message :
" null return from xmlGetLastError()") << "\n");
return false;
} else {
LOGDEB1("xmlParseChunk ok (sent " << cnt << " bytes)\n");
return true;
}
}
private:
xmlParserCtxtPtr ctxt{nullptr};
string m_fn;
};
// Private data for MimeHandlerXslt: the parsed style sheet(s), the
// "usable" flag set by the constructor, and the output of the last
// transformation.
class MimeHandlerXslt::Internal {
public:
    bool ok{false};
    xsltStylesheet *metaOrAllSS{nullptr};
    xsltStylesheet *dataSS{nullptr};
    string result;

    // Free whichever style sheets were loaded.
    ~Internal() {
        xsltStylesheet *sheets[] = {metaOrAllSS, dataSS};
        for (xsltStylesheet *sheet : sheets) {
            if (sheet != nullptr) {
                xsltFreeStylesheet(sheet);
            }
        }
    }
};
// Destroy the private implementation object. Its own destructor frees
// the loaded style sheets.
MimeHandlerXslt::~MimeHandlerXslt()
{
delete m;
}
// Build a handler from the mimeconf parameters. params[0] is the
// handler name, the rest designates the style sheet(s), which are looked
// up in the "filters" subdirectory of the recoll data directory.
// Accepted forms:
//    "xslt stylesheetall"
//    "xslt metamember stylesheetmeta datamember stylesheetdata" (not
//     implemented yet)
// On any failure the object is still constructed, but stays flagged as
// unusable (m->ok false) and the set_document/next_document calls will
// return false.
MimeHandlerXslt::MimeHandlerXslt(RclConfig *cnf, const std::string& id,
                                 const std::vector<std::string>& params)
    : RecollFilter(cnf, id), m(new Internal)
{
    LOGDEB("MimeHandlerXslt: params: " << stringsToString(params) << endl);
    string filtersdir = path_cat(cnf->getDatadir(), "filters");

    // Do not substitute entities nor load external DTDs while parsing.
    xmlSubstituteEntitiesDefault(0);
    xmlLoadExtDtdDefaultValue = 0;

    if (params.size() == 2) {
        string ssfn = path_cat(filtersdir, params[1]);
        FileScanXML XMLstyle(ssfn);
        string reason;
        if (!file_scan(ssfn, &XMLstyle, &reason)) {
            LOGERR("MimeHandlerXslt: file_scan failed for style sheet " <<
                   ssfn << " : " << reason << endl);
            return;
        }
        xmlDoc *stl = XMLstyle.getDoc();
        if (stl == nullptr) {
            LOGERR("MimeHandlerXslt: getDoc failed for style sheet " <<
                   ssfn << endl);
            return;
        }
        m->metaOrAllSS = xsltParseStylesheetDoc(stl);
        if (m->metaOrAllSS) {
            // The style sheet now owns the document, which will be freed
            // by xsltFreeStylesheet().
            m->ok = true;
        } else {
            // On failure the document stays ours: free it.
            LOGERR("MimeHandlerXslt: xsltParseStylesheetDoc failed for "
                   "style sheet " << ssfn << endl);
            xmlFreeDoc(stl);
        }
    } else if (params.size() == 4) {
        // Separate metadata/data style sheets: not implemented, the
        // handler stays unusable.
        LOGERR("MimeHandlerXslt: multiple style sheets not supported yet: " <<
               stringsToString(params) << endl);
    } else {
        LOGERR("MimeHandlerXslt: constructor with wrong param vector: " <<
               stringsToString(params) << endl);
    }
}
// Parse the XML file at file_path, apply the style sheet, and store the
// serialized result for the next next_document() call. When not working
// for preview, the MD5 computed during the file scan is stored in the
// metadata.
// Returns false if the handler is unusable or if reading, parsing,
// transforming or serializing fails.
bool MimeHandlerXslt::set_document_file_impl(const std::string& mt,
                                             const std::string &file_path)
{
    LOGDEB0("MimeHandlerXslt::set_document_file_: fn: " << file_path << endl);
    if (!m || !m->ok) {
        return false;
    }
    if (nullptr != m->dataSS) {
        // Separate metadata/data sheets are not implemented. Fail the
        // document rather than bringing down the whole process.
        LOGERR("Not ready for multipart yet\n");
        return false;
    }
    if (nullptr == m->metaOrAllSS) {
        LOGERR("MimeHandlerXslt::set_document_file_impl: both ss empty??\n");
        return false;
    }

    FileScanXML XMLdoc(file_path);
    string md5, reason;
    if (!file_scan(file_path, &XMLdoc, 0, -1, &reason,
                   m_forPreview ? nullptr : &md5)) {
        LOGERR("MimeHandlerXslt::set_document_file_impl: file_scan failed "
               "for " << file_path << " : " << reason << endl);
        return false;
    }
    if (!m_forPreview) {
        m_metaData[cstr_dj_keymd5] = md5;
    }
    xmlDocPtr doc = XMLdoc.getDoc();
    if (nullptr == doc) {
        LOGERR("MimeHandlerXslt::set_doc_file_impl: no parsed doc\n");
        return false;
    }
    xmlDocPtr transformed = xsltApplyStylesheet(m->metaOrAllSS, doc, NULL);
    if (nullptr == transformed) {
        LOGERR("MimeHandlerXslt::set_doc_file_: xslt transform failed\n");
        xmlFreeDoc(doc);
        return false;
    }

    // Serialize the result tree. The output variables are preset so
    // that they are never read uninitialized.
    xmlChar *outstr = nullptr;
    int outlen = 0;
    const int saveret =
        xsltSaveResultToString(&outstr, &outlen, transformed, m->metaOrAllSS);
    if (saveret >= 0 && outstr != nullptr && outlen > 0) {
        m->result.assign((const char*)outstr, outlen);
    } else {
        // Failure, or empty transformation output.
        m->result.clear();
    }
    if (outstr != nullptr) {
        xmlFree(outstr);
    }
    xmlFreeDoc(transformed);
    xmlFreeDoc(doc);
    if (saveret < 0) {
        LOGERR("MimeHandlerXslt::set_doc_file_: xsltSaveResultToString "
               "failed\n");
        return false;
    }

    m_havedoc = true;
    return true;
}
// In-memory input. Reports success whenever the handler is usable.
// NOTE(review): msgtxt is not parsed or transformed here and m_havedoc
// is not set, so a following next_document() call returns false. This
// looks like a placeholder: confirm, and either implement it or stop
// accepting DOCUMENT_STRING in is_data_input_ok().
bool MimeHandlerXslt::set_document_string_impl(const string& mt,
                                               const string& msgtxt)
{
    return m && m->ok;
}
// Deliver the (single) document produced by the last successful
// set_document call: the MIME type is set to HTML and the
// transformation output is moved into the content field. Returns false
// if the handler is unusable or if there is no pending document.
bool MimeHandlerXslt::next_document()
{
    const bool usable = m && m->ok;
    if (!usable || !m_havedoc) {
        return false;
    }
    // Only one document per input: consume the flag.
    m_havedoc = false;
    m_metaData[cstr_dj_keymt] = cstr_texthtml;
    // Exchange instead of copying the possibly big output.
    m_metaData[cstr_dj_keycontent].swap(m->result);
    LOGDEB1("MimeHandlerXslt::next_document: result: [" <<
            m_metaData[cstr_dj_keycontent] << "]\n");
    return true;
}
// Reset the per-document state: discard the stored transformation
// output and the pending document flag.
void MimeHandlerXslt::clear_impl()
{
    m->result.clear();
    m_havedoc = false;
}

49
src/internfile/mh_xslt.h Normal file
View File

@ -0,0 +1,49 @@
/* Copyright (C) 2018 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _MH_XSLT_H_INCLUDED_
#define _MH_XSLT_H_INCLUDED_
#include <string>
#include <vector>
#include "mimehandler.h"
/**
 * Handler for XML document types which are converted to HTML by applying
 * XSLT style sheet(s). The style sheet names come from the params vector
 * passed to the constructor.
 */
class MimeHandlerXslt : public RecollFilter {
public:
    MimeHandlerXslt(RclConfig *cnf, const std::string& id,
                    const std::vector<std::string>& params);
    virtual ~MimeHandlerXslt();

    // The object owns its private data through a raw pointer which is
    // deleted by the destructor: a copy would cause a double delete.
    MimeHandlerXslt(const MimeHandlerXslt&) = delete;
    MimeHandlerXslt& operator=(const MimeHandlerXslt&) = delete;

    virtual bool next_document() override;
    virtual void clear_impl() override;

    /** Input can come from a file or from an in-memory string. */
    virtual bool is_data_input_ok(DataInput input) const override {
        return (input == DOCUMENT_FILE_NAME || input == DOCUMENT_STRING);
    }

protected:
    virtual bool set_document_file_impl(const std::string& mt,
                                        const std::string& file_path);
    virtual bool set_document_string_impl(const std::string& mt,
                                          const std::string& data);

    class Internal;
private:
    /** Private implementation data, defined in the .cpp file. */
    Internal *m{nullptr};
};
#endif /* _MH_XSLT_H_INCLUDED_ */

View File

@ -41,6 +41,7 @@ using namespace std;
#include "mh_symlink.h"
#include "mh_unknown.h"
#include "mh_null.h"
#include "mh_xslt.h"
// Performance help: we use a pool of already known and created
// handlers. There can be several instances for a given mime type
@ -137,11 +138,17 @@ void clearMimeHandlerCache()
/** For mime types set as "internal" in mimeconf:
* create appropriate handler object. */
static RecollFilter *mhFactory(RclConfig *config, const string &mime,
static RecollFilter *mhFactory(RclConfig *config, const string &mimeOrParams,
bool nobuild, string& id)
{
LOGDEB2("mhFactory(" << mime << ")\n");
string lmime(mime);
LOGDEB1("mhFactory(" << mimeOrParams << ")\n");
vector<string> lparams;
stringToStrings(mimeOrParams, lparams);
if (lparams.empty()) {
// ??
return nullptr;
}
string lmime(lparams[0]);
stringtolower(lmime);
if (cstr_textplain == lmime) {
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText\n");
@ -160,11 +167,11 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
MD5String("MimeHandlerMail", id);
return nobuild ? 0 : new MimeHandlerMail(config, id);
} else if ("inode/symlink" == lmime) {
LOGDEB2("mhFactory(" << mime << "): ret MimeHandlerSymlink\n");
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerSymlink\n");
MD5String("MimeHandlerSymlink", id);
return nobuild ? 0 : new MimeHandlerSymlink(config, id);
} else if ("application/x-zerosize" == lmime) {
LOGDEB("mhFactory(" << mime << "): ret MimeHandlerNull\n");
LOGDEB("mhFactory(" << lmime << "): returning MimeHandlerNull\n");
MD5String("MimeHandlerNull", id);
return nobuild ? 0 : new MimeHandlerNull(config, id);
} else if (lmime.find("text/") == 0) {
@ -175,7 +182,11 @@ static RecollFilter *mhFactory(RclConfig *config, const string &mime,
// exec) but still opening with a specific editor.
LOGDEB2("mhFactory(" << mime << "): returning MimeHandlerText(x)\n");
MD5String("MimeHandlerText", id);
return nobuild ? 0 : new MimeHandlerText(config, id);
return nobuild ? 0 : new MimeHandlerText(config, id);
} else if ("xsltproc" == lmime) {
// XML Types processed with one or several xslt style sheets.
MD5String(mimeOrParams, id);
return nobuild ? 0 : new MimeHandlerXslt(config, id, lparams);
} else {
// We should not get there. It means that "internal" was set
// as a handler in mimeconf for a mime type we actually can't
@ -262,7 +273,7 @@ MimeHandlerExec *mhExecFactory(RclConfig *cfg, const string& mtype, string& hs,
/* Get handler/filter object for given mime type: */
RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
bool filtertypes)
bool filtertypes)
{
LOGDEB("getMimeHandler: mtype [" << mtype << "] filtertypes " <<
filtertypes << "\n");
@ -291,7 +302,7 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
}
bool internal = !stringlowercmp("internal", handlertype);
if (internal) {
// For internal types let the factory compute the id
// For internal types let the factory compute the cache id
mhFactory(cfg, cmdstr.empty() ? mtype : cmdstr, true, id);
} else {
// exec/execm: use the md5 of the def line
@ -304,16 +315,15 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
goto out;
LOGDEB2("getMimeHandler: " << mtype << " not in cache\n");
// Not in cache.
if (internal) {
// If there is a parameter after "internal" it's the mime
// type to use. This is so that we can have bogus mime
// types like text/x-purple-html-log (for ie: specific
// icon) and still use the html filter on them. This is
// partly redundant with the localfields/rclaptg, but
// better and the latter will probably go away at some
// point in the future.
// type to use, or the further qualifier (e.g. style sheet
// name for xslt types). This is so that we can have bogus
// mime types like text/x-purple-html-log (for ie:
// specific icon) and still use the html filter on
// them. This is partly redundant with the
// localfields/rclaptg, but better? (and the latter will
// probably go away at some point in the future?).
LOGDEB2("handlertype internal, cmdstr [" << cmdstr << "]\n");
h = mhFactory(cfg, cmdstr.empty() ? mtype : cmdstr, false, id);
goto out;
@ -336,14 +346,10 @@ RecollFilter *getMimeHandler(const string &mtype, RclConfig *cfg,
goto out;
}
}
}
// We get here if there was no specific error, but there is no
// identified mime type, or no handler associated.
// Finally, unhandled files are either ignored or their name and
// generic metadata is indexed, depending on configuration
{
} else {
// No identified mime type, or no handler associated.
// Unhandled files are either ignored or their name and
// generic metadata is indexed, depending on configuration
bool indexunknown = false;
cfg->getConfParam("indexallfilenames", &indexunknown);
if (indexunknown) {

View File

@ -110,7 +110,7 @@ application/vnd.sun.xml.writer = execm rclsoff.py
application/vnd.sun.xml.writer.global = execm rclsoff.py
application/vnd.sun.xml.writer.template = execm rclsoff.py
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
application/x-abiword = execm rclabw.py
application/x-abiword = internal xsltproc abiword.xsl
application/x-awk = internal text/plain
application/x-chm = execm rclchm
application/x-dia-diagram = execm rcldia;mimetype=text/plain

View File

@ -20,15 +20,17 @@
#include <string.h>
#include "md5ut.h"
#include "readfile.h"
using namespace std;
class FileScanMd5 : public FileScanDo {
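// This class must not be named FileScanMd5: readfile.cpp defines a
// different class with that same name, and having two different classes
// with the same name in one program violates the One Definition Rule.
// The vtables then get mixed up and mh_xslt crashes while calling a
// virtual function (seen with gcc 6.3 and 7.3)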
class FileScanMd5loc : public FileScanDo {
public:
FileScanMd5(string& d) : digest(d) {}
virtual bool init(size_t size, string *)
FileScanMd5loc(string& d) : digest(d) {}
virtual bool init(int64_t, string *)
{
MD5Init(&ctx);
return true;
@ -44,7 +46,7 @@ public:
bool MD5File(const string& filename, string &digest, string *reason)
{
FileScanMd5 md5er(digest);
FileScanMd5loc md5er(digest);
if (!file_scan(filename, &md5er, reason))
return false;
// We happen to know that digest and md5er.digest are the same object

7564
src/utils/miniz.cpp Normal file

File diff suppressed because it is too large Load Diff

1321
src/utils/miniz.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -14,7 +14,6 @@
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef TEST_READFILE
#ifdef BUILDING_RECOLL
#include "autoconfig.h"
#else
@ -37,20 +36,33 @@
#include "readfile.h"
#include "smallut.h"
#include "md5.h"
using std::string;
#ifdef MDU_INCLUDE_LOG
#include MDU_INCLUDE_LOG
#else
#include "log.h"
#endif
using namespace std;
///////////////
// Implementation of basic interface: read whole file to memory buffer
class FileToString : public FileScanDo {
public:
FileToString(string& data) : m_data(data) {}
string& m_data;
bool init(size_t size, string *reason) {
// Note: the fstat() + reserve() (in init()) calls divide cpu
// usage almost by 2 on both linux i586 and macosx (compared to
// just append()) Also tried a version with mmap, but it's
// actually slower on the mac and not faster on linux.
virtual bool init(int64_t size, string *reason) {
if (size > 0) {
m_data.reserve(size);
}
return true;
}
bool data(const char *buf, int cnt, string *reason) {
virtual bool data(const char *buf, int cnt, string *reason) {
try {
m_data.append(buf, cnt);
} catch (...) {
@ -59,248 +71,449 @@ public:
}
return true;
}
string& m_data;
};
bool file_to_string(const string& fn, string& data, int64_t offs, size_t cnt,
string *reason)
{
FileToString accum(data);
return file_scan(fn, &accum, offs, cnt, reason, nullptr);
}
bool file_to_string(const string& fn, string& data, string *reason)
{
return file_to_string(fn, data, 0, size_t(-1), reason);
}
bool file_to_string(const string& fn, string& data, int64_t offs, size_t cnt,
string *reason)
{
FileToString accum(data);
return file_scan(fn, &accum, offs, cnt, reason);
}
bool file_scan(const string& fn, FileScanDo* doer, string *reason)
{
return file_scan(fn, doer, 0, size_t(-1), reason);
}
const int RDBUFSZ = 8192;
// Note: the fstat() + reserve() (in init()) calls divide cpu usage almost by 2
// on both linux i586 and macosx (compared to just append())
// Also tried a version with mmap, but it's actually slower on the mac and not
// faster on linux.
bool file_scan(const string& fn, FileScanDo* doer, int64_t startoffs,
size_t cnttoread, string *reason)
{
if (startoffs < 0) {
*reason += " file_scan: negative startoffs not allowed";
return false;
/////////////
// Callback/filtering interface
// Abstract class base for both source (origin) and filter
// (midstream). Both have a downstream
// Common base for the elements which have something downstream of them
// in the processing pipe: holds and exposes the downstream link.
class FileScanUpstream {
public:
    // Polymorphic base class: make destruction through a base pointer
    // well-defined.
    virtual ~FileScanUpstream() = default;

    // Set the element which will receive our output.
    virtual void setDownstream(FileScanDo *down) {
        m_down = down;
    }
    // Current downstream element, null if none was set.
    virtual FileScanDo *out() {
        return m_down;
    }
protected:
    FileScanDo *m_down{nullptr};
};
// Source element.
// Origin of a processing pipe: produces the data and pushes it to the
// downstream element when scan() is called.
class FileScanSource : public FileScanUpstream {
public:
    FileScanSource(FileScanDo *down) {
        // Same effect as setDownstream(): record the output element.
        m_down = down;
    }
    // Read the whole input, feeding it downstream.
    virtual bool scan() = 0;
};
// Inside element of a transformation pipe. The idea is that elements
// which don't recognize the data get themselves out of the pipe
// (pop()). Typically, only one of the decompression modules
// (e.g. gzip/bzip2/xz...) would remain. For now there is only gzip,
// it pops itself if the data does not have the right magic number
class FileScanFilter : public FileScanDo, public FileScanUpstream {
public:
virtual void insertAtSink(FileScanDo *sink, FileScanUpstream *upstream) {
setDownstream(sink);
if (m_down) {
m_down->setUpstream(this);
}
setUpstream(upstream);
if (m_up) {
m_up->setDownstream(this);
}
}
bool ret = false;
bool noclosing = true;
int fd = 0;
struct stat st;
// Initialize st_size: if fn.empty() , the fstat() call won't happen.
st.st_size = 0;
// Remove myself from the pipe.
virtual void pop() {
if (m_down) {
m_down->setUpstream(m_up);
}
if (m_up) {
m_up->setDownstream(m_down);
}
}
// If we have a file name, open it, else use stdin.
if (!fn.empty()) {
fd = open(fn.c_str(), O_RDONLY | O_BINARY);
if (fd < 0 || fstat(fd, &st) < 0) {
catstrerror(reason, "open/stat", errno);
virtual void setUpstream(FileScanUpstream *up) override {
m_up = up;
}
private:
FileScanUpstream *m_up{nullptr};
};
#if defined(READFILE_ENABLE_ZLIB)
#include <zlib.h>
class GzFilter : public FileScanFilter {
public:
virtual ~GzFilter() {
if (m_initdone) {
inflateEnd(&m_stream);
}
}
virtual bool init(int64_t size, string *reason) override {
LOGDEB1("GzFilter::init\n");
if (out()) {
return out()->init(size, reason);
}
return true;
}
virtual bool data(const char *buf, int cnt, string *reason) override {
LOGDEB1("GzFilter::data: cnt " << cnt << endl);
int error;
m_stream.next_in = (Bytef*)buf;
m_stream.avail_in = cnt;
if (m_initdone == false) {
m_initdone = true;
// We do not support a first read cnt < 2. We probably should.
if (cnt < 2) {
if (reason)
*reason += "GzFilter: first data count < 2";
return false;
}
const unsigned char *ubuf = (const unsigned char *)buf;
if (ubuf[0] != 0x1f || ubuf[1] != 0x8b) {
LOGDEB1("GzFilter::data: not gzip. out() is " << out() << "\n");
pop();
if (out()) {
return out()->data(buf, cnt, reason);
} else {
return false;
}
}
m_stream.opaque = nullptr;
m_stream.zalloc = alloc_func;
m_stream.zfree = free_func;
m_stream.next_out = (Bytef*)m_obuf;
m_stream.avail_out = m_obs;
if ((error = inflateInit2(&m_stream, 15+32)) != Z_OK) {
LOGERR("inflateInit2 error: " << error << endl);
if (reason) {
*reason += " Zlib inflateinit failed";
if (m_stream.msg && *m_stream.msg) {
*reason += string(": ") + m_stream.msg;
}
}
return false;
}
}
while (m_stream.avail_in != 0) {
m_stream.next_out = (Bytef*)m_obuf;
m_stream.avail_out = m_obs;
if ((error = inflate(&m_stream, Z_SYNC_FLUSH)) < Z_OK) {
LOGERR("inflate error: " << error << endl);
if (reason) {
*reason += " Zlib inflate failed";
if (m_stream.msg && *m_stream.msg) {
*reason += string(": ") + m_stream.msg;
}
}
return false;
}
if (out() &&
!out()->data(m_obuf, m_obs - m_stream.avail_out, reason)) {
return false;
}
}
return true;
}
static voidpf alloc_func(voidpf opaque, uInt items, uInt size) {
return malloc(items * size);
}
static void free_func(voidpf opaque, voidpf address) {
free(address);
}
bool m_initdone{false};
z_stream m_stream;
char m_obuf[10000];
const int m_obs{10000};
};
#endif // GZ
class FileScanMd5 : public FileScanFilter {
public:
FileScanMd5(string& d) : digest(d) {}
virtual bool init(int64_t size, string *reason) override {
LOGDEB1("FileScanMd5: init\n");
MD5Init(&ctx);
if (out()) {
return out()->init(size, reason);
}
return true;
}
virtual bool data(const char *buf, int cnt, string *reason) override {
LOGDEB1("FileScanMd5: data. cnt " << cnt << endl);
MD5Update(&ctx, (const unsigned char*)buf, cnt);
if (out() && !out()->data(buf, cnt, reason)) {
return false;
}
noclosing = false;
return true;
}
bool finish() {
LOGDEB1("FileScanMd5: finish\n");
MD5Final(digest, &ctx);
return true;
}
string &digest;
MD5_CTX ctx;
};
// Source taking data from a regular file
class FileScanSourceFile : public FileScanSource {
public:
FileScanSourceFile(FileScanDo *next, const string& fn, int64_t startoffs,
int64_t cnttoread, string *reason)
: FileScanSource(next), m_fn(fn), m_startoffs(startoffs),
m_cnttoread(cnttoread), m_reason(reason) { }
virtual bool scan() {
LOGDEB1("FileScanSourceFile: reading " << m_fn << " offs " <<
m_startoffs<< " cnt " << m_cnttoread << " out " << out() << endl);
const int RDBUFSZ = 8192;
bool ret = false;
bool noclosing = true;
int fd = 0;
struct stat st;
// Initialize st_size: if fn.empty() , the fstat() call won't happen.
st.st_size = 0;
// If we have a file name, open it, else use stdin.
if (!m_fn.empty()) {
fd = open(m_fn.c_str(), O_RDONLY | O_BINARY);
if (fd < 0 || fstat(fd, &st) < 0) {
catstrerror(m_reason, "open/stat", errno);
return false;
}
noclosing = false;
}
#if defined O_NOATIME && O_NOATIME != 0
if (fcntl(fd, F_SETFL, O_NOATIME) < 0) {
// perror("fcntl");
if (fcntl(fd, F_SETFL, O_NOATIME) < 0) {
// perror("fcntl");
}
#endif
if (out()) {
if (m_cnttoread != -1 && m_cnttoread) {
out()->init(m_cnttoread + 1, m_reason);
} else if (st.st_size > 0) {
out()->init(st.st_size + 1, m_reason);
} else {
out()->init(0, m_reason);
}
}
int64_t curoffs = 0;
if (m_startoffs > 0 && !m_fn.empty()) {
if (lseek(fd, m_startoffs, SEEK_SET) != m_startoffs) {
catstrerror(m_reason, "lseek", errno);
return false;
}
curoffs = m_startoffs;
}
char buf[RDBUFSZ];
int64_t totread = 0;
for (;;) {
size_t toread = RDBUFSZ;
if (m_startoffs > 0 && curoffs < m_startoffs) {
toread = size_t(MIN(RDBUFSZ, m_startoffs - curoffs));
}
if (m_cnttoread != -1) {
toread = MIN(toread, (uint64_t)(m_cnttoread - totread));
}
ssize_t n = static_cast<ssize_t>(read(fd, buf, toread));
if (n < 0) {
catstrerror(m_reason, "read", errno);
goto out;
}
if (n == 0) {
break;
}
curoffs += n;
if (curoffs - n < m_startoffs) {
continue;
}
if (!out()->data(buf, n, m_reason)) {
goto out;
}
totread += n;
if (m_cnttoread > 0 && totread >= m_cnttoread) {
break;
}
}
ret = true;
out:
if (fd >= 0 && !noclosing) {
close(fd);
}
return ret;
}
protected:
string m_fn;
int64_t m_startoffs;
int64_t m_cnttoread;
string *m_reason;
};
#if defined(READFILE_ENABLE_MINIZ)
#include "miniz.h"
// Source taking data from a ZIP archive member
class FileScanSourceZip : public FileScanSource {
public:
FileScanSourceZip(FileScanDo *next, const string& fn, const string& member,
string *reason)
: FileScanSource(next), m_fn(fn), m_member(member),
m_reason(reason) { }
virtual bool scan() {
bool ret = false;
mz_zip_archive zip;
mz_zip_zero_struct(&zip);
void *opaque = this;
if (!mz_zip_reader_init_file(&zip, m_fn.c_str(), 0)) {
if (m_reason) {
*m_reason += "mz_zip_reader_init_file() failed: ";
*m_reason += string(mz_zip_get_error_string(zip.m_last_error));
}
return false;
}
mz_uint32 file_index;
if (mz_zip_reader_locate_file_v2(&zip, m_member.c_str(), NULL, 0,
&file_index) < 0) {
if (m_reason) {
*m_reason += "mz_zip_reader_locate_file() failed: ";
*m_reason += string(mz_zip_get_error_string(zip.m_last_error));
}
goto out;
}
mz_zip_archive_file_stat zstat;
if (!mz_zip_reader_file_stat(&zip, file_index, &zstat)) {
if (m_reason) {
*m_reason += "mz_zip_reader_file_stat() failed: ";
*m_reason += string(mz_zip_get_error_string(zip.m_last_error));
}
goto out;
}
if (out()) {
if (!out()->init(zstat.m_uncomp_size, m_reason)) {
goto out;
}
}
if (!mz_zip_reader_extract_to_callback(
&zip, file_index, write_cb, opaque, 0)) {
if (m_reason) {
*m_reason += "mz_zip_reader_extract_to_callback() failed: ";
*m_reason += string(mz_zip_get_error_string(zip.m_last_error));
}
goto out;
}
ret = true;
out:
mz_zip_reader_end(&zip);
return ret;
}
static size_t write_cb(void *pOpaque, mz_uint64 file_ofs,
const void *pBuf, size_t n) {
const char *cp = (const char*)pBuf;
LOGDEB1("write_cb: ofs " << file_ofs << " cnt " << n << " data: " <<
string(cp, n) << endl);
FileScanSourceZip *ths = (FileScanSourceZip *)pOpaque;
if (ths->out()) {
if (!ths->out()->data(cp, n, ths->m_reason)) {
return (size_t)-1;
}
}
return n;
}
protected:
string m_fn;
string m_member;
string *m_reason;
};
bool file_scan(const std::string& filename, const std::string& membername,
FileScanDo* doer, std::string *reason)
{
if (membername.empty()) {
return file_scan(filename, doer, 0, -1, reason, nullptr);
} else {
FileScanSourceZip source(doer, filename, membername, reason);
return source.scan();
}
}
#endif // READFILE_ENABLE_ZIP
bool file_scan(const string& fn, FileScanDo* doer, int64_t startoffs,
int64_t cnttoread, string *reason, string *md5p)
{
LOGDEB("file_scan: doer " << doer << endl);
#if defined(READFILE_ENABLE_ZLIB)
bool nodecomp = startoffs != 0;
#endif
if (startoffs < 0) {
startoffs = 0;
}
FileScanSourceFile source(doer, fn, startoffs, cnttoread, reason);
FileScanUpstream *up = &source;
// We compute the MD5 on the uncompressed data, so insert this
// right at the source.
string digest;
FileScanMd5 md5filter(digest);
if (md5p) {
md5filter.insertAtSink(doer, up);
up = &md5filter;
}
#if defined(READFILE_ENABLE_ZLIB)
GzFilter gzfilter;
if (!nodecomp) {
gzfilter.insertAtSink(doer, up);
up = &gzfilter;
}
#endif
if (cnttoread != (size_t) - 1 && cnttoread) {
doer->init(cnttoread + 1, reason);
} else if (st.st_size > 0) {
doer->init(size_t(st.st_size + 1), reason);
} else {
doer->init(0, reason);
}
bool ret = source.scan();
int64_t curoffs = 0;
if (startoffs > 0 && !fn.empty()) {
if (lseek(fd, startoffs, SEEK_SET) != startoffs) {
catstrerror(reason, "lseek", errno);
return false;
}
curoffs = startoffs;
}
char buf[RDBUFSZ];
size_t totread = 0;
for (;;) {
size_t toread = RDBUFSZ;
if (startoffs > 0 && curoffs < startoffs) {
toread = size_t(MIN(RDBUFSZ, startoffs - curoffs));
}
if (cnttoread != size_t(-1)) {
toread = MIN(toread, cnttoread - totread);
}
ssize_t n = static_cast<ssize_t>(read(fd, buf, toread));
if (n < 0) {
catstrerror(reason, "read", errno);
goto out;
}
if (n == 0) {
break;
}
curoffs += n;
if (curoffs - n < startoffs) {
continue;
}
if (!doer->data(buf, n, reason)) {
goto out;
}
totread += n;
if (cnttoread > 0 && totread >= cnttoread) {
break;
}
}
ret = true;
out:
if (fd >= 0 && !noclosing) {
close(fd);
if (md5p) {
md5filter.finish();
MD5HexPrint(digest, *md5p);
}
return ret;
}
#else // Test
#include "autoconfig.h"
#include <stdio.h>
#include <sys/types.h>
#include "safesysstat.h"
#include <stdlib.h>
#include <string>
#include <iostream>
using namespace std;
#include "readfile.h"
#include "fstreewalk.h"
using namespace std;
/**
 * Tree walker callback for the test driver: tries to load every regular
 * file with file_to_string() and reports the outcome.
 */
class myCB : public FsTreeWalkerCB {
public:
    /**
     * Called by the walker for each entry.
     * @param path entry path.
     * @param st stat data for the entry (unused here).
     * @param flg kind of event; only FtwRegular triggers any work.
     * @return always FsTreeWalker::FtwOk.
     */
    FsTreeWalker::Status processone(const string& path,
                                    const struct stat *st,
                                    FsTreeWalker::CbFlag flg) {
        // Directory enter/return events and any other non-regular entry
        // need no processing.
        if (flg != FsTreeWalker::FtwRegular) {
            return FsTreeWalker::FtwOk;
        }

        string contents, errmsg;
        const bool loaded = file_to_string(path, contents, &errmsg);
        if (loaded) {
            // Success: only the path is printed, not the data.
            cout << path << endl;
        } else {
            cerr << "Failed: " << errmsg << " : " << path << endl;
        }
        return FsTreeWalker::FtwOk;
    }
};
// Command line state for the trreadfile test driver.

// Option bits. main() ors OPT_c and OPT_o into op_flags when it sees the
// corresponding option; OPT_MOINS is not referenced in the visible code.
#define OPT_MOINS 0x1
#define OPT_c     0x2
#define OPT_o     0x4
static int op_flags;

// Program name, set from argv[0] by main().
static const char *thisprog;

// Command synopsis, used as the usage message text.
static char usage[] = "trreadfile [-o offs] [-c cnt] topdirorfile\n\n";
// NOTE(review): two function headers (Usage() and a three-argument
// file_scan()) are followed by a single body here, which is not valid C++ as
// shown. This looks like the test driver's Usage() interleaved with a
// file_scan() convenience overload -- confirm against the real file.
static void
Usage(void)
bool file_scan(const string& fn, FileScanDo* doer, string *reason)
{
// Print the program name and the usage text on stderr, then terminate with
// exit status 1.
fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
exit(1);
// Forward to the six-argument file_scan(): start offset 0, count -1,
// null md5 output pointer.
return file_scan(fn, doer, 0, -1, reason, nullptr);
}
int main(int argc, const char **argv)
{
int64_t offs = 0;
size_t cnt = size_t(-1);
thisprog = argv[0];
argc--;
argv++;
while (argc > 0 && **argv == '-') {
(*argv)++;
if (!(**argv))
/* Cas du "adb - core" */
{
Usage();
}
while (**argv)
switch (*(*argv)++) {
case 'c':
op_flags |= OPT_c;
if (argc < 2) {
Usage();
}
cnt = atoll(*(++argv));
argc--;
goto b1;
case 'o':
op_flags |= OPT_o;
if (argc < 2) {
Usage();
}
offs = strtoull(*(++argv), 0, 0);
argc--;
goto b1;
default:
Usage();
break;
}
b1:
argc--;
argv++;
}
if (argc != 1) {
Usage();
}
string top = *argv++;
argc--;
cerr << "filename " << top << " offs " << offs << " cnt " << cnt << endl;
struct stat st;
if (!top.empty() && stat(top.c_str(), &st) < 0) {
perror("stat");
exit(1);
}
if (!top.empty() && S_ISDIR(st.st_mode)) {
FsTreeWalker walker;
myCB cb;
walker.walk(top, cb);
if (walker.getErrCnt() > 0) {
cout << walker.getReason();
}
} else {
string s, reason;
if (!file_to_string(top, s, offs, cnt, &reason)) {
cerr << reason << endl;
exit(1);
} else {
cout << s;
}
}
exit(0);
}
#endif //TEST_READFILE

View File

@ -21,30 +21,72 @@
#include <string>
/**
* Read file in chunks, calling an accumulator for each chunk. Can be used
* for reading in a file, computing an md5...
*/
// Forward declaration, needed for the setUpstream() parameter below.
class FileScanUpstream;
/**
 * Data sink for the file reader: file_scan() calls init() once, then data()
 * for each chunk.
 *
 * NOTE(review): as shown, init() is declared with both a size_t and an
 * int64_t parameter, and data() is declared twice with the same signature
 * (which a compiler would reject). This looks like old and new versions of
 * the class interleaved -- confirm against the real header.
 */
class FileScanDo {
public:
virtual ~FileScanDo() {}
virtual bool init(size_t size, std::string *reason) = 0;
virtual bool data(const char *buf, int cnt, std::string* reason) = 0;
/** Initialize and allocate.
 * @param size if set, lower bound of data size.
 * @param reason[output] set to error message in case of error.
 * @return false for error (file_scan will return), true if ok.
 */
virtual bool init(int64_t size, std::string *reason) = 0;
/** Process a chunk of data.
 * @param buf the data buffer.
 * @param cnt byte count.
 * @param reason[output] set to error message in case of error.
 * @return false for error (file_scan will return), true if ok.
 */
virtual bool data(const char *buf, int cnt, std::string *reason) = 0;
/** Receive a pointer to the upstream element. The default implementation
 * ignores it. Presumably called when the processing chain is assembled --
 * TODO confirm against the FileScanUpstream implementation.
 */
virtual void setUpstream(FileScanUpstream*) {}
};
// NOTE(review): the first two declarations below (defaulted reason, size_t
// cnt) and the two documented ones after them look like the old and new
// versions of the same interface, interleaved. The first and the last
// declare the same three-parameter signature. Confirm against the real
// header.
/** Scan a whole file, calling the doer for each chunk. */
bool file_scan(const std::string& filename, FileScanDo* doer, std::string *reason = 0);
/** Same, but only process count cnt from offset offs. Set cnt to size_t(-1)
 * for no limit. */
bool file_scan(const std::string& fn, FileScanDo* doer, int64_t offs, size_t cnt,
std::string *reason = 0);
/** Open and read file, calling the FileScanDo data() method for each chunk.
 *
 * @param fn File name. Use empty value for stdin.
 * @param doer the data processor. The init() method will be called
 * initially with a lower bound of the data size (may be used to
 * reserve a buffer), or with a 0 size if nothing is known about the
 * size. The data() method will be called for every chunk of data
 * read.
 * @param startoffs Start offset. If not zero, will disable decompression
 * (set to -1 to start at 0 with no decompression).
 * @param cnttoread Max bytes in output. Set to -1 for no limit.
 * @param[output] reason If not null, points to a string for storing an
 * error message if the return value is false.
 * @param[output] md5p If not null, points to a string to store the hex ascii
 * md5 of the uncompressed data.
 * @return true if the operation ended normally, else false.
 */
bool file_scan(const std::string& fn, FileScanDo* doer, int64_t startoffs,
int64_t cnttoread, std::string *reason, std::string *md5p);
/** Same as above, with no offset/cnt/md5 parameters. */
bool file_scan(const std::string& filename, FileScanDo* doer,
std::string *reason);
#if defined(READFILE_ENABLE_MINIZ)
/** Process a zip archive member. Only declared when READFILE_ENABLE_MINIZ
 * is defined.
 *
 * @param filename presumably the path of the zip archive -- TODO confirm
 * against the implementation.
 * @param membername presumably the name of the member inside the archive.
 * @param doer the data processor receiving the member data.
 * @param reason presumably used for an error message on failure, as for
 * the other file_scan() variants.
 */
bool file_scan(const std::string& filename, const std::string& membername,
FileScanDo* doer, std::string *reason);
#endif
/**
 * Read file into string.
 * @param filename the file to read.
 * @param data the string receiving the file contents.
 * @param reason optional error message storage, defaults to null.
 * @return true for ok, false else
 */
// NOTE(review): the same declaration appears twice below, each time with a
// default argument, which a compiler would reject. This looks like the old
// one-line form and the new wrapped form interleaved -- confirm against the
// real header.
bool file_to_string(const std::string& filename, std::string& data, std::string *reason = 0);
bool file_to_string(const std::string& filename, std::string& data,
std::string *reason = 0);
// NOTE(review): two descriptions of the next declaration follow; they
// disagree on how "no limit" is spelled (size_t(-1) vs -1), which is the
// same value once converted to the size_t cnt parameter.
/** Read file chunk into string. Set cnt to size_t(-1) for whole file */
/** Read file chunk into string. Set cnt to -1 for going to
 * eof, offs to -1 for going from the start without decompression */
bool file_to_string(const std::string& filename, std::string& data,
int64_t offs, size_t cnt, std::string *reason = 0);
#endif /* _READFILE_H_INCLUDED_ */