From 586ff90dc01d63df9ddb3029bf229619b22295de Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Thu, 27 Dec 2018 16:20:12 +0100 Subject: [PATCH] internal xslt: openoffice zip format working --- src/Makefile.am | 2 + src/filters/opendoc-body.xsl | 32 ++++++ src/filters/opendoc-meta.xsl | 67 +++++++++++ src/internfile/mh_xslt.cpp | 213 ++++++++++++++++++++++++----------- src/sampleconf/mimeconf | 30 ++--- src/utils/readfile.cpp | 87 +++++++++++++- src/utils/readfile.h | 6 + 7 files changed, 348 insertions(+), 89 deletions(-) create mode 100644 src/filters/opendoc-body.xsl create mode 100644 src/filters/opendoc-meta.xsl diff --git a/src/Makefile.am b/src/Makefile.am index 2333c5d2..bd2e44af 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -653,7 +653,9 @@ filters/fb2.xsl \ filters/gnumeric.xsl \ filters/msodump.zip \ filters/okular-note.xsl \ +filters/opendoc-body.xsl \ filters/opendoc-flat.xsl \ +filters/opendoc-meta.xsl \ filters/ppt-dump.py \ filters/rcl7z \ filters/rclabw.py \ diff --git a/src/filters/opendoc-body.xsl b/src/filters/opendoc-body.xsl new file mode 100644 index 00000000..b4b6e049 --- /dev/null +++ b/src/filters/opendoc-body.xsl @@ -0,0 +1,32 @@ + + + + + + +

+ +
+ + +

+ +
+ + + + + + +
+
+ + + + + +
diff --git a/src/filters/opendoc-meta.xsl b/src/filters/opendoc-meta.xsl new file mode 100644 index 00000000..ab49b867 --- /dev/null +++ b/src/filters/opendoc-meta.xsl @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + <xsl:value-of select="."/> + + + + + + abstract + + + + + + + + + + keywords + + + + + + + + + + author + + + + + + + + + + keywords + + + + + + + + diff --git a/src/internfile/mh_xslt.cpp b/src/internfile/mh_xslt.cpp index 3d30fa3c..8afbc83a 100644 --- a/src/internfile/mh_xslt.cpp +++ b/src/internfile/mh_xslt.cpp @@ -94,18 +94,31 @@ private: class MimeHandlerXslt::Internal { public: + Internal(MimeHandlerXslt *_p) + : p(_p) {} ~Internal() { if (metaOrAllSS) { xsltFreeStylesheet(metaOrAllSS); } - if (dataSS) { - xsltFreeStylesheet(dataSS); + if (bodySS) { + xsltFreeStylesheet(bodySS); } } + + xsltStylesheet *prepare_stylesheet(const string& ssnm); + bool process_doc_or_string(bool forpv, const string& fn, const string& data); + bool apply_stylesheet( + const string& fn, const string& member, const string& data, + xsltStylesheet *ssp, string& result, string *md5p); + + MimeHandlerXslt *p; bool ok{false}; + string metamember; xsltStylesheet *metaOrAllSS{nullptr}; - xsltStylesheet *dataSS{nullptr}; + string bodymember; + xsltStylesheet *bodySS{nullptr}; string result; + string filtersdir; }; MimeHandlerXslt::~MimeHandlerXslt() @@ -115,99 +128,163 @@ MimeHandlerXslt::~MimeHandlerXslt() MimeHandlerXslt::MimeHandlerXslt(RclConfig *cnf, const std::string& id, const std::vector& params) - : RecollFilter(cnf, id), m(new Internal) + : RecollFilter(cnf, id), m(new Internal(this)) { LOGDEB("MimeHandlerXslt: params: " << stringsToString(params) << endl); - string filtersdir = path_cat(cnf->getDatadir(), "filters"); + m->filtersdir = path_cat(cnf->getDatadir(), "filters"); xmlSubstituteEntitiesDefault(0); xmlLoadExtDtdDefaultValue = 0; // params can be "xslt stylesheetall" or - // "xslt metamember stylesheetmeta datamember stylesheetdata" + // "xslt metamember metastylesheet bodymember bodystylesheet" if (params.size() == 2) { - string ssfn = path_cat(filtersdir, params[1]); - FileScanXML XMLstyle(ssfn); - string reason; - if (!file_scan(ssfn, &XMLstyle, &reason)) { - LOGERR("MimeHandlerXslt: file_scan failed for style sheet " << - ssfn << " : " << reason << endl); - return; - } - xmlDoc *stl = XMLstyle.getDoc(); - if (stl == nullptr) { - LOGERR("MimeHandlerXslt: getDoc failed for style sheet " << - ssfn << endl); - return; - } - m->metaOrAllSS = xsltParseStylesheetDoc(stl); + m->metaOrAllSS = m->prepare_stylesheet(params[1]); if (m->metaOrAllSS) { m->ok = true; } - } else if (params.size() == 4) { + } else if (params.size() == 5) { + m->metamember = params[1]; + m->metaOrAllSS = m->prepare_stylesheet(params[2]); + m->bodymember = params[3]; + m->bodySS = m->prepare_stylesheet(params[4]); + if (m->metaOrAllSS && m->bodySS) { + m->ok = true; + } } else { LOGERR("MimeHandlerXslt: constructor with wrong param vector: " << stringsToString(params) << endl); } } -bool MimeHandlerXslt::set_document_file_impl(const std::string& mt, - const std::string &file_path) +xsltStylesheet *MimeHandlerXslt::Internal::prepare_stylesheet(const string& ssnm) { - LOGDEB0("MimeHandlerXslt::set_document_file_: fn: " << file_path << endl); - if (!m || !m->ok) { + string ssfn = path_cat(filtersdir, ssnm); + FileScanXML XMLstyle(ssfn); + string reason; + if (!file_scan(ssfn, &XMLstyle, &reason)) { + LOGERR("MimeHandlerXslt: file_scan failed for style sheet " << + ssfn << " : " << reason << endl); + return nullptr; + } + xmlDoc *stl = XMLstyle.getDoc(); + if (stl == nullptr) { + LOGERR("MimeHandlerXslt: getDoc failed for style sheet " << + ssfn << endl); + return nullptr; + } + return xsltParseStylesheetDoc(stl); +} + +bool MimeHandlerXslt::Internal::apply_stylesheet( + const string& fn, const string& member, const string& data, + xsltStylesheet *ssp, string& result, string *md5p) +{ + FileScanXML XMLdoc(fn); + string md5, reason; + bool res; + if (!fn.empty()) { + if (member.empty()) { + res = file_scan(fn, &XMLdoc, 0, -1, &reason, md5p); + } else { + res = file_scan(fn, member, &XMLdoc, &reason); + } + } else { + if (member.empty()) { + res = string_scan(data.c_str(), data.size(), &XMLdoc, &reason, md5p); + } else { + res = string_scan(data.c_str(), data.size(), member, &XMLdoc, + &reason); + } + } + if (!res) { + LOGERR("MimeHandlerXslt::set_document_: file_scan failed for "<< + fn << " " << member << " : " << reason << endl); return false; } - if (nullptr == m->dataSS) { - if (nullptr == m->metaOrAllSS) { - LOGERR("MimeHandlerXslt::set_document_file_impl: both ss empty??\n"); - return false; - } - FileScanXML XMLdoc(file_path); - string md5, reason; - if (!file_scan(file_path, &XMLdoc, 0, -1, &reason, - m_forPreview ? nullptr : &md5)) { - LOGERR("MimeHandlerXslt::set_document_file_impl: file_scan failed " - "for " << file_path << " : " << reason << endl); - return false; - } - if (!m_forPreview) { - m_metaData[cstr_dj_keymd5] = md5; - } - xmlDocPtr doc = XMLdoc.getDoc(); - if (nullptr == doc) { - LOGERR("MimeHandlerXslt::set_doc_file_impl: no parsed doc\n"); - return false; - } - xmlDocPtr transformed = xsltApplyStylesheet(m->metaOrAllSS, doc, NULL); - if (nullptr == transformed) { - LOGERR("MimeHandlerXslt::set_doc_file_: xslt transform failed\n"); - xmlFreeDoc(doc); - return false; - } - xmlChar *outstr; - int outlen; - xsltSaveResultToString(&outstr, &outlen, transformed, m->metaOrAllSS); - m->result = string((const char*)outstr, outlen); - xmlFree(outstr); - xmlFreeDoc(transformed); - xmlFreeDoc(doc); - } else { - LOGERR("Not ready for multipart yet\n"); - abort(); + + xmlDocPtr doc = XMLdoc.getDoc(); + if (nullptr == doc) { + LOGERR("MimeHandlerXslt::set_document_: no parsed doc\n"); + return false; } - - m_havedoc = true; + xmlDocPtr transformed = xsltApplyStylesheet(ssp, doc, NULL); + if (nullptr == transformed) { + LOGERR("MimeHandlerXslt::set_document_: xslt transform failed\n"); + xmlFreeDoc(doc); + return false; + } + xmlChar *outstr; + int outlen; + xsltSaveResultToString(&outstr, &outlen, transformed, metaOrAllSS); + result = string((const char*)outstr, outlen); + xmlFree(outstr); + xmlFreeDoc(transformed); + xmlFreeDoc(doc); return true; } -bool MimeHandlerXslt::set_document_string_impl(const string& mt, - const string& msgtxt) +bool MimeHandlerXslt::Internal::process_doc_or_string( + bool forpreview, const string& fn, const string& data) { + if (nullptr == metaOrAllSS && nullptr == bodySS) { + LOGERR("MimeHandlerXslt::set_document_file_impl: both ss empty??\n"); + return false; + } + if (nullptr == bodySS) { + string md5; + if (apply_stylesheet(fn, string(), data, metaOrAllSS, result, + forpreview ? nullptr : &md5)) { + if (!forpreview) { + p->m_metaData[cstr_dj_keymd5] = md5; + } + return true; + } + return false; + } else { + result = "\n\n"; + string part; + if (!apply_stylesheet(fn,metamember, data, metaOrAllSS, part, nullptr)) { + return false; + } + result += part; + result += "\n\n"; + if (!apply_stylesheet(fn, bodymember, data, bodySS, part, nullptr)) { + return false; + } + result += part; + result += ""; + } + return true; +} + +bool MimeHandlerXslt::set_document_file_impl(const std::string& mt, + const std::string &fn) +{ + LOGDEB0("MimeHandlerXslt::set_document_file_: fn: " << fn << endl); if (!m || !m->ok) { return false; } - return true; + bool ret = m->process_doc_or_string(m_forPreview, fn, string()); + if (ret) { + m_havedoc = true; + } + return ret; +} + +bool MimeHandlerXslt::set_document_string_impl(const string& mt, + const string& txt) +{ + LOGDEB0("MimeHandlerXslt::set_document_string_\n"); + if (!m || !m->ok) { + return false; + } + bool ret = m->process_doc_or_string(m_forPreview, string(), txt); + if (ret) { + m_havedoc = true; + } + return ret; } bool MimeHandlerXslt::next_document() diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 1ab2f270..7a0e8850 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -79,11 +79,11 @@ application/postscript = exec rclps application/sql = internal text/plain application/vnd.ms-excel = execm rclxls.py application/vnd.ms-powerpoint = execm rclppt.py -application/vnd.oasis.opendocument.text = execm rclsoff.py -application/vnd.oasis.opendocument.text-template = execm rclsoff.py -application/vnd.oasis.opendocument.presentation = execm rclsoff.py -application/vnd.oasis.opendocument.spreadsheet = execm rclsoff.py -application/vnd.oasis.opendocument.graphics = execm rclsoff.py +application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.oasis.opendocument.presentation = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.oasis.opendocument.spreadsheet = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.oasis.opendocument.graphics = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl application/vnd.oasis.opendocument.presentation-flat-xml = internal xsltproc opendoc-flat.xsl application/vnd.oasis.opendocument.text-flat-xml = internal xsltproc opendoc-flat.xsl application/vnd.oasis.opendocument.spreadsheet-flat-xml = internal xsltproc opendoc-flat.xsl @@ -99,16 +99,16 @@ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \ execm rclopxml.py application/vnd.openxmlformats-officedocument.spreadsheetml.template =\ execm rclopxml.py -application/vnd.sun.xml.calc = execm rclsoff.py -application/vnd.sun.xml.calc.template = execm rclsoff.py -application/vnd.sun.xml.draw = execm rclsoff.py -application/vnd.sun.xml.draw.template = execm rclsoff.py -application/vnd.sun.xml.impress = execm rclsoff.py -application/vnd.sun.xml.impress.template = execm rclsoff.py -application/vnd.sun.xml.math = execm rclsoff.py -application/vnd.sun.xml.writer = execm rclsoff.py -application/vnd.sun.xml.writer.global = execm rclsoff.py -application/vnd.sun.xml.writer.template = execm rclsoff.py +application/vnd.sun.xml.calc = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.sun.xml.calc.template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.sun.xml.draw = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.sun.xml.draw.template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.sun.xml.impress = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.sun.xml.impress.template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.sun.xml.math = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.sun.xml.writer = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.sun.xml.writer.global = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl +application/vnd.sun.xml.writer.template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl application/vnd.wordperfect = exec wpd2html;mimetype=text/html application/x-abiword = internal xsltproc abiword.xsl application/x-awk = internal text/plain diff --git a/src/utils/readfile.cpp b/src/utils/readfile.cpp index 0fa93806..ae2edf16 100644 --- a/src/utils/readfile.cpp +++ b/src/utils/readfile.cpp @@ -381,10 +381,15 @@ protected: // Source taking data from a ZIP archive member class FileScanSourceZip : public FileScanSource { public: - FileScanSourceZip(FileScanDo *next, const string& fn, const string& member, - string *reason) + FileScanSourceZip(FileScanDo *next, const string& fn, + const string& member, string *reason) : FileScanSource(next), m_fn(fn), m_member(member), - m_reason(reason) { } + m_reason(reason) {} + + FileScanSourceZip(const char *data, size_t cnt, FileScanDo *next, + const string& member, string *reason) + : FileScanSource(next), m_data(data), m_cnt(cnt), m_member(member), + m_reason(reason) {} virtual bool scan() { bool ret = false; @@ -392,13 +397,21 @@ public: mz_zip_zero_struct(&zip); void *opaque = this; - if (!mz_zip_reader_init_file(&zip, m_fn.c_str(), 0)) { + bool ret1; + if (m_fn.empty()) { + ret1 = mz_zip_reader_init_mem(&zip, m_data, m_cnt, 0); + } else { + ret1 = mz_zip_reader_init_file(&zip, m_fn.c_str(), 0); + } + if (!ret1) { if (m_reason) { - *m_reason += "mz_zip_reader_init_file() failed: "; - *m_reason += string(mz_zip_get_error_string(zip.m_last_error)); + *m_reason += "mz_zip_reader_init_xx() failed: "; + *m_reason += + string(mz_zip_get_error_string(zip.m_last_error)); } return false; } + mz_uint32 file_index; if (mz_zip_reader_locate_file_v2(&zip, m_member.c_str(), NULL, 0, &file_index) < 0) { @@ -453,6 +466,8 @@ public: } protected: + const char *m_data; + size_t m_cnt; string m_fn; string m_member; string *m_reason; @@ -469,6 +484,17 @@ bool file_scan(const std::string& filename, const std::string& membername, } } +bool string_scan(const char *data, size_t cnt, const std::string& membername, + FileScanDo* doer, std::string *reason) +{ + if (membername.empty()) { + return string_scan(data, cnt, doer, reason, nullptr); + } else { + FileScanSourceZip source(data, cnt, doer, membername, reason); + return source.scan(); + } +} + #endif // READFILE_ENABLE_ZIP bool file_scan(const string& fn, FileScanDo* doer, int64_t startoffs, @@ -515,3 +541,52 @@ bool file_scan(const string& fn, FileScanDo* doer, string *reason) { return file_scan(fn, doer, 0, -1, reason, nullptr); } + + +class FileScanSourceBuffer : public FileScanSource { +public: + FileScanSourceBuffer(FileScanDo *next, const char *data, size_t cnt, + string *reason) + : FileScanSource(next), m_data(data), m_cnt(cnt), m_reason(reason) {} + + virtual bool scan() { + if (out()) { + if (!out()->init(m_cnt, m_reason)) { + return false; + } + return out()->data(m_data, m_cnt, m_reason); + } else { + return true; + } + } + +protected: + const char *m_data{nullptr}; + size_t m_cnt{0}; + string *m_reason{nullptr}; +}; + +bool string_scan(const char *data, size_t cnt, FileScanDo* doer, + std::string *reason, std::string *md5p) +{ + FileScanSourceBuffer source(doer, data, cnt, reason); + FileScanUpstream *up = &source; + + // We compute the MD5 on the uncompressed data, so insert this + // right at the source. + string digest; + FileScanMd5 md5filter(digest); + if (md5p) { + md5filter.insertAtSink(doer, up); + up = &md5filter; + } + + bool ret = source.scan(); + + if (md5p) { + md5filter.finish(); + MD5HexPrint(digest, *md5p); + } + return ret; +} + diff --git a/src/utils/readfile.h b/src/utils/readfile.h index 05dc51d5..64323965 100644 --- a/src/utils/readfile.h +++ b/src/utils/readfile.h @@ -65,6 +65,10 @@ public: bool file_scan(const std::string& fn, FileScanDo* doer, int64_t startoffs, int64_t cnttoread, std::string *reason, std::string *md5p); +/** Same as file_scan, from a memory buffer */ +bool string_scan(const char *data, size_t cnt, FileScanDo* doer, + std::string *reason, std::string *md5p); + /** Same as above, not offset/cnt/md5 */ bool file_scan(const std::string& filename, FileScanDo* doer, std::string *reason); @@ -74,6 +78,8 @@ bool file_scan(const std::string& filename, FileScanDo* doer, /* Process a zip archive member */ bool file_scan(const std::string& filename, const std::string& membername, FileScanDo* doer, std::string *reason); +bool string_scan(const char* data, size_t cnt, const std::string& membername, + FileScanDo* doer, std::string *reason); #endif /**