internal xslt: openoffice zip format working

This commit is contained in:
Jean-Francois Dockes 2018-12-27 16:20:12 +01:00
parent 094a119538
commit 586ff90dc0
7 changed files with 348 additions and 89 deletions

View File

@ -653,7 +653,9 @@ filters/fb2.xsl \
filters/gnumeric.xsl \
filters/msodump.zip \
filters/okular-note.xsl \
filters/opendoc-body.xsl \
filters/opendoc-flat.xsl \
filters/opendoc-meta.xsl \
filters/ppt-dump.py \
filters/rcl7z \
filters/rclabw.py \

View File

@ -0,0 +1,32 @@
<?xml version="1.0"?>
<!-- Body stylesheet for OpenDocument content: maps the text: elements
     handled below to minimal HTML. Nodes without a template here fall
     through to the XSLT built-in rules. -->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
exclude-result-prefixes="text"
>
<xsl:output method="html" encoding="UTF-8"/>
<!-- Paragraphs and headings are both emitted as <p>, each followed by a
     line break in the output. -->
<xsl:template match="text:p | text:h">
<p><xsl:apply-templates/></p><xsl:text>
</xsl:text>
</xsl:template>
<!-- Explicit line break. -->
<xsl:template match="text:line-break">
<br/>
</xsl:template>
<!-- Space run marker: replaced by a single blank. -->
<xsl:template match="text:s">
<xsl:text> </xsl:text>
</xsl:template>
<!-- Tabulation marker: replaced by a single blank. -->
<xsl:template match="text:tab">
<xsl:text> </xsl:text>
</xsl:template>
</xsl:stylesheet>

View File

@ -0,0 +1,67 @@
<?xml version="1.0"?>
<!-- Metadata stylesheet for OpenDocument: turns the office:meta children
     selected below into an HTML <title> and <meta name=... content=...>
     elements. -->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
xmlns:ooo="http://openoffice.org/2004/office"
exclude-result-prefixes="office xlink meta ooo dc"
>
<xsl:output method="html" encoding="UTF-8"/>
<!-- Only these five kinds of element are processed, in this fixed order. -->
<xsl:template match="/office:document-meta">
<xsl:apply-templates select="office:meta/dc:description"/>
<xsl:apply-templates select="office:meta/dc:subject"/>
<xsl:apply-templates select="office:meta/dc:title"/>
<xsl:apply-templates select="office:meta/meta:keyword"/>
<xsl:apply-templates select="office:meta/dc:creator"/>
</xsl:template>
<xsl:template match="dc:title">
<title><xsl:value-of select="."/></title><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:description">
<meta name="abstract" content="{.}"/><xsl:text>
</xsl:text>
</xsl:template>
<!-- The subject and each keyword are all reported under the name
     "keywords", one <meta> element per source element. -->
<xsl:template match="dc:subject | meta:keyword">
<meta name="keywords" content="{.}"/><xsl:text>
</xsl:text>
</xsl:template>
<xsl:template match="dc:creator">
<meta name="author" content="{.}"/><xsl:text>
</xsl:text>
</xsl:template>
</xsl:stylesheet>

View File

@ -94,18 +94,31 @@ private:
// Private state of MimeHandlerXslt: the compiled stylesheets, the names of
// the archive members they apply to, and the output of the last
// transformation.
class MimeHandlerXslt::Internal {
public:
    explicit Internal(MimeHandlerXslt *_p)
        : p(_p) {}
    ~Internal() {
        if (metaOrAllSS) {
            xsltFreeStylesheet(metaOrAllSS);
        }
        if (bodySS) {
            xsltFreeStylesheet(bodySS);
        }
    }
    // The stylesheet pointers are freed in the destructor: a copy would
    // free them a second time, so copying is disabled.
    Internal(const Internal&) = delete;
    Internal& operator=(const Internal&) = delete;

    // Load and compile the stylesheet named ssnm, found in filtersdir.
    // Returns nullptr on failure.
    xsltStylesheet *prepare_stylesheet(const string& ssnm);
    // Transform the input (file fn if not empty, else the data buffer) and
    // store the output in result.
    bool process_doc_or_string(bool forpv, const string& fn, const string& data);
    // Run one stylesheet on the input or on one of its archive members.
    bool apply_stylesheet(
        const string& fn, const string& member, const string& data,
        xsltStylesheet *ssp, string& result, string *md5p);

    // Owning handler object, used to store metadata into it.
    MimeHandlerXslt *p;
    // Set by the MimeHandlerXslt constructor when all needed stylesheets
    // could be loaded.
    bool ok{false};
    // Archive member processed with metaOrAllSS. Empty when a single
    // stylesheet handles the whole input.
    string metamember;
    // Metadata stylesheet, or the only stylesheet in single-stylesheet mode.
    xsltStylesheet *metaOrAllSS{nullptr};
    // Archive member processed with bodySS.
    string bodymember;
    // Body stylesheet. nullptr in single-stylesheet mode.
    xsltStylesheet *bodySS{nullptr};
    // Output of the last successful process_doc_or_string() call.
    string result;
    // Directory where the stylesheets are looked up.
    string filtersdir;
};
MimeHandlerXslt::~MimeHandlerXslt()
@ -115,99 +128,163 @@ MimeHandlerXslt::~MimeHandlerXslt()
// Build the handler and compile the stylesheets named in params.
//
// params[0] is the handler name. It is followed either by a single
// stylesheet name, or by an archive member name and stylesheet name for the
// metadata, then the same for the body. On any error, m->ok stays false and
// the set_document_xxx() methods will fail.
MimeHandlerXslt::MimeHandlerXslt(RclConfig *cnf, const std::string& id,
                                 const std::vector<std::string>& params)
    : RecollFilter(cnf, id), m(new Internal(this))
{
    LOGDEB("MimeHandlerXslt: params: " << stringsToString(params) << endl);
    m->filtersdir = path_cat(cnf->getDatadir(), "filters");

    // libxml2 global parser settings: no entity substitution, no loading
    // of external DTDs.
    xmlSubstituteEntitiesDefault(0);
    xmlLoadExtDtdDefaultValue = 0;

    // params can be "xslt stylesheetall" or
    // "xslt metamember metastylesheet bodymember bodystylesheet"
    if (params.size() == 2) {
        m->metaOrAllSS = m->prepare_stylesheet(params[1]);
        if (m->metaOrAllSS) {
            m->ok = true;
        }
    } else if (params.size() == 5) {
        m->metamember = params[1];
        m->metaOrAllSS = m->prepare_stylesheet(params[2]);
        m->bodymember = params[3];
        m->bodySS = m->prepare_stylesheet(params[4]);
        // Both parts are needed to build the output document.
        if (m->metaOrAllSS && m->bodySS) {
            m->ok = true;
        }
    } else {
        LOGERR("MimeHandlerXslt: constructor with wrong param vector: " <<
               stringsToString(params) << endl);
    }
}
bool MimeHandlerXslt::set_document_file_impl(const std::string& mt,
const std::string &file_path)
// Read and compile a stylesheet.
// @param ssnm stylesheet file name, relative to filtersdir.
// @return the compiled stylesheet, to be released with
//   xsltFreeStylesheet(), or nullptr if the file could not be read, parsed
//   as XML, or compiled as XSLT.
xsltStylesheet *MimeHandlerXslt::Internal::prepare_stylesheet(const string& ssnm)
{
    string ssfn = path_cat(filtersdir, ssnm);
    FileScanXML XMLstyle(ssfn);
    string reason;
    if (!file_scan(ssfn, &XMLstyle, &reason)) {
        LOGERR("MimeHandlerXslt: file_scan failed for style sheet " <<
               ssfn << " : " << reason << endl);
        return nullptr;
    }
    xmlDoc *stl = XMLstyle.getDoc();
    if (stl == nullptr) {
        LOGERR("MimeHandlerXslt: getDoc failed for style sheet " <<
               ssfn << endl);
        return nullptr;
    }
    xsltStylesheet *ss = xsltParseStylesheetDoc(stl);
    if (ss == nullptr) {
        // On success the stylesheet keeps the document and frees it in
        // xsltFreeStylesheet(). On failure libxslt leaves the document to
        // the caller, so release it here.
        LOGERR("MimeHandlerXslt: xsltParseStylesheetDoc failed for style "
               "sheet " << ssfn << endl);
        xmlFreeDoc(stl);
    }
    return ss;
}
bool MimeHandlerXslt::Internal::apply_stylesheet(
const string& fn, const string& member, const string& data,
xsltStylesheet *ssp, string& result, string *md5p)
{
FileScanXML XMLdoc(fn);
string md5, reason;
bool res;
if (!fn.empty()) {
if (member.empty()) {
res = file_scan(fn, &XMLdoc, 0, -1, &reason, md5p);
} else {
res = file_scan(fn, member, &XMLdoc, &reason);
}
} else {
if (member.empty()) {
res = string_scan(data.c_str(), data.size(), &XMLdoc, &reason, md5p);
} else {
res = string_scan(data.c_str(), data.size(), member, &XMLdoc,
&reason);
}
}
if (!res) {
LOGERR("MimeHandlerXslt::set_document_: file_scan failed for "<<
fn << " " << member << " : " << reason << endl);
return false;
}
if (nullptr == m->dataSS) {
if (nullptr == m->metaOrAllSS) {
LOGERR("MimeHandlerXslt::set_document_file_impl: both ss empty??\n");
return false;
}
FileScanXML XMLdoc(file_path);
string md5, reason;
if (!file_scan(file_path, &XMLdoc, 0, -1, &reason,
m_forPreview ? nullptr : &md5)) {
LOGERR("MimeHandlerXslt::set_document_file_impl: file_scan failed "
"for " << file_path << " : " << reason << endl);
return false;
}
if (!m_forPreview) {
m_metaData[cstr_dj_keymd5] = md5;
}
xmlDocPtr doc = XMLdoc.getDoc();
if (nullptr == doc) {
LOGERR("MimeHandlerXslt::set_doc_file_impl: no parsed doc\n");
return false;
}
xmlDocPtr transformed = xsltApplyStylesheet(m->metaOrAllSS, doc, NULL);
if (nullptr == transformed) {
LOGERR("MimeHandlerXslt::set_doc_file_: xslt transform failed\n");
xmlFreeDoc(doc);
return false;
}
xmlChar *outstr;
int outlen;
xsltSaveResultToString(&outstr, &outlen, transformed, m->metaOrAllSS);
m->result = string((const char*)outstr, outlen);
xmlFree(outstr);
xmlFreeDoc(transformed);
xmlFreeDoc(doc);
} else {
LOGERR("Not ready for multipart yet\n");
abort();
xmlDocPtr doc = XMLdoc.getDoc();
if (nullptr == doc) {
LOGERR("MimeHandlerXslt::set_document_: no parsed doc\n");
return false;
}
m_havedoc = true;
xmlDocPtr transformed = xsltApplyStylesheet(ssp, doc, NULL);
if (nullptr == transformed) {
LOGERR("MimeHandlerXslt::set_document_: xslt transform failed\n");
xmlFreeDoc(doc);
return false;
}
xmlChar *outstr;
int outlen;
xsltSaveResultToString(&outstr, &outlen, transformed, metaOrAllSS);
result = string((const char*)outstr, outlen);
xmlFree(outstr);
xmlFreeDoc(transformed);
xmlFreeDoc(doc);
return true;
}
bool MimeHandlerXslt::set_document_string_impl(const string& mt,
const string& msgtxt)
// Transform the input document and store the output in the result member.
//
// With a single stylesheet (no bodySS), the whole input goes through
// metaOrAllSS and, when not previewing, the input MD5 is stored in the
// handler metadata. With two stylesheets, an HTML document is assembled:
// the output of metaOrAllSS on metamember goes inside <head>, the output of
// bodySS on bodymember goes inside <body>.
// @param forpreview if true, no MD5 is requested or stored.
// @param fn input file name. If empty, the input is the data buffer.
// @param data input data, used when fn is empty.
// @return false if no stylesheet is set or if a transformation failed.
bool MimeHandlerXslt::Internal::process_doc_or_string(
    bool forpreview, const string& fn, const string& data)
{
    if (nullptr == metaOrAllSS && nullptr == bodySS) {
        LOGERR("MimeHandlerXslt::set_document_file_impl: both ss empty??\n");
        return false;
    }

    if (nullptr == bodySS) {
        // Single stylesheet: it produces the complete output.
        string md5;
        if (!apply_stylesheet(fn, string(), data, metaOrAllSS, result,
                              forpreview ? nullptr : &md5)) {
            return false;
        }
        if (!forpreview) {
            p->m_metaData[cstr_dj_keymd5] = md5;
        }
        return true;
    }

    // Note the blank after the http-equiv value: the two attributes must
    // be separated by white space.
    result = "<html>\n<head>\n<meta http-equiv=\"Content-Type\" "
        "content=\"text/html; charset=UTF-8\">";
    string part;
    if (!apply_stylesheet(fn, metamember, data, metaOrAllSS, part, nullptr)) {
        return false;
    }
    result += part;
    result += "</head>\n<body>\n";
    if (!apply_stylesheet(fn, bodymember, data, bodySS, part, nullptr)) {
        return false;
    }
    result += part;
    result += "</body></html>";
    return true;
}
// Set the input document from a file: run the transformation right away and
// record that a document is available if it succeeded.
// @param mt MIME type, not used here.
// @param fn path of the file to process.
// @return false if the handler was not correctly set up or if the
//   transformation failed.
bool MimeHandlerXslt::set_document_file_impl(const std::string& mt,
                                             const std::string &fn)
{
    LOGDEB0("MimeHandlerXslt::set_document_file_: fn: " << fn << endl);
    if (!m || !m->ok) {
        return false;
    }
    bool ret = m->process_doc_or_string(m_forPreview, fn, string());
    if (ret) {
        m_havedoc = true;
    }
    return ret;
}
// Set the input document from a memory buffer: run the transformation right
// away and record that a document is available if it succeeded.
// @param mt MIME type, not used here.
// @param txt the document data.
// @return false if the handler was not correctly set up or if the
//   transformation failed.
bool MimeHandlerXslt::set_document_string_impl(const string& mt,
                                               const string& txt)
{
    LOGDEB0("MimeHandlerXslt::set_document_string_\n");
    if (m && m->ok &&
        m->process_doc_or_string(m_forPreview, string(), txt)) {
        m_havedoc = true;
        return true;
    }
    return false;
}
bool MimeHandlerXslt::next_document()

View File

@ -79,11 +79,11 @@ application/postscript = exec rclps
application/sql = internal text/plain
application/vnd.ms-excel = execm rclxls.py
application/vnd.ms-powerpoint = execm rclppt.py
application/vnd.oasis.opendocument.text = execm rclsoff.py
application/vnd.oasis.opendocument.text-template = execm rclsoff.py
application/vnd.oasis.opendocument.presentation = execm rclsoff.py
application/vnd.oasis.opendocument.spreadsheet = execm rclsoff.py
application/vnd.oasis.opendocument.graphics = execm rclsoff.py
# Internal xslt handler, two-stylesheet form. The arguments are: metadata
# member name, metadata stylesheet, body member name, body stylesheet.
application/vnd.oasis.opendocument.text = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.oasis.opendocument.text-template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.oasis.opendocument.presentation = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.oasis.opendocument.spreadsheet = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.oasis.opendocument.graphics = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
# Internal xslt handler, single-stylesheet form: one stylesheet processes
# the whole document.
application/vnd.oasis.opendocument.presentation-flat-xml = internal xsltproc opendoc-flat.xsl
application/vnd.oasis.opendocument.text-flat-xml = internal xsltproc opendoc-flat.xsl
application/vnd.oasis.opendocument.spreadsheet-flat-xml = internal xsltproc opendoc-flat.xsl
@ -99,16 +99,16 @@ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
execm rclopxml.py
application/vnd.openxmlformats-officedocument.spreadsheetml.template =\
execm rclopxml.py
application/vnd.sun.xml.calc = execm rclsoff.py
application/vnd.sun.xml.calc.template = execm rclsoff.py
application/vnd.sun.xml.draw = execm rclsoff.py
application/vnd.sun.xml.draw.template = execm rclsoff.py
application/vnd.sun.xml.impress = execm rclsoff.py
application/vnd.sun.xml.impress.template = execm rclsoff.py
application/vnd.sun.xml.math = execm rclsoff.py
application/vnd.sun.xml.writer = execm rclsoff.py
application/vnd.sun.xml.writer.global = execm rclsoff.py
application/vnd.sun.xml.writer.template = execm rclsoff.py
# Internal xslt handler, two-stylesheet form. The arguments are: metadata
# member name, metadata stylesheet, body member name, body stylesheet.
application/vnd.sun.xml.calc = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.sun.xml.calc.template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.sun.xml.draw = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.sun.xml.draw.template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.sun.xml.impress = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.sun.xml.impress.template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.sun.xml.math = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.sun.xml.writer = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.sun.xml.writer.global = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.sun.xml.writer.template = internal xsltproc meta.xml opendoc-meta.xsl content.xml opendoc-body.xsl
application/vnd.wordperfect = exec wpd2html;mimetype=text/html
application/x-abiword = internal xsltproc abiword.xsl
application/x-awk = internal text/plain

View File

@ -381,10 +381,15 @@ protected:
// Source taking data from a ZIP archive member
class FileScanSourceZip : public FileScanSource {
public:
FileScanSourceZip(FileScanDo *next, const string& fn, const string& member,
string *reason)
FileScanSourceZip(FileScanDo *next, const string& fn,
const string& member, string *reason)
: FileScanSource(next), m_fn(fn), m_member(member),
m_reason(reason) { }
m_reason(reason) {}
FileScanSourceZip(const char *data, size_t cnt, FileScanDo *next,
const string& member, string *reason)
: FileScanSource(next), m_data(data), m_cnt(cnt), m_member(member),
m_reason(reason) {}
virtual bool scan() {
bool ret = false;
@ -392,13 +397,21 @@ public:
mz_zip_zero_struct(&zip);
void *opaque = this;
if (!mz_zip_reader_init_file(&zip, m_fn.c_str(), 0)) {
bool ret1;
if (m_fn.empty()) {
ret1 = mz_zip_reader_init_mem(&zip, m_data, m_cnt, 0);
} else {
ret1 = mz_zip_reader_init_file(&zip, m_fn.c_str(), 0);
}
if (!ret1) {
if (m_reason) {
*m_reason += "mz_zip_reader_init_file() failed: ";
*m_reason += string(mz_zip_get_error_string(zip.m_last_error));
*m_reason += "mz_zip_reader_init_xx() failed: ";
*m_reason +=
string(mz_zip_get_error_string(zip.m_last_error));
}
return false;
}
mz_uint32 file_index;
if (mz_zip_reader_locate_file_v2(&zip, m_member.c_str(), NULL, 0,
&file_index) < 0) {
@ -453,6 +466,8 @@ public:
}
protected:
const char *m_data;
size_t m_cnt;
string m_fn;
string m_member;
string *m_reason;
@ -469,6 +484,17 @@ bool file_scan(const std::string& filename, const std::string& membername,
}
}
bool string_scan(const char *data, size_t cnt, const std::string& membername,
FileScanDo* doer, std::string *reason)
{
if (membername.empty()) {
return string_scan(data, cnt, doer, reason, nullptr);
} else {
FileScanSourceZip source(data, cnt, doer, membername, reason);
return source.scan();
}
}
#endif // READFILE_ENABLE_ZIP
bool file_scan(const string& fn, FileScanDo* doer, int64_t startoffs,
@ -515,3 +541,52 @@ bool file_scan(const string& fn, FileScanDo* doer, string *reason)
{
return file_scan(fn, doer, 0, -1, reason, nullptr);
}
// Source feeding a memory buffer to the scan chain, in a single data() call.
// The buffer is not copied: only the pointer and size are stored.
class FileScanSourceBuffer : public FileScanSource {
public:
    FileScanSourceBuffer(FileScanDo *next, const char *data, size_t cnt,
                         string *reason)
        : FileScanSource(next), m_data(data), m_cnt(cnt), m_reason(reason) {}

    // Announce the data size downstream, then push the whole buffer.
    // Succeeds without doing anything if there is no downstream object.
    virtual bool scan() {
        if (!out()) {
            return true;
        }
        if (out()->init(m_cnt, m_reason)) {
            return out()->data(m_data, m_cnt, m_reason);
        }
        return false;
    }

protected:
    const char *m_data{nullptr};
    size_t m_cnt{0};
    string *m_reason{nullptr};
};
bool string_scan(const char *data, size_t cnt, FileScanDo* doer,
std::string *reason, std::string *md5p)
{
FileScanSourceBuffer source(doer, data, cnt, reason);
FileScanUpstream *up = &source;
// We compute the MD5 on the uncompressed data, so insert this
// right at the source.
string digest;
FileScanMd5 md5filter(digest);
if (md5p) {
md5filter.insertAtSink(doer, up);
up = &md5filter;
}
bool ret = source.scan();
if (md5p) {
md5filter.finish();
MD5HexPrint(digest, *md5p);
}
return ret;
}

View File

@ -65,6 +65,10 @@ public:
bool file_scan(const std::string& fn, FileScanDo* doer, int64_t startoffs,
int64_t cnttoread, std::string *reason, std::string *md5p);
/** Same as file_scan, but the input is the cnt bytes at data instead of
 * a file. If md5p is not null, it receives the printed MD5 digest of the
 * data. */
bool string_scan(const char *data, size_t cnt, FileScanDo* doer,
std::string *reason, std::string *md5p);
/** Same as the file_scan() with offset/cnt/md5 parameters, called with a
 * zero start offset, no size limit (-1) and no md5 computation */
bool file_scan(const std::string& filename, FileScanDo* doer,
std::string *reason);
@ -74,6 +78,8 @@ bool file_scan(const std::string& filename, FileScanDo* doer,
/* Process a zip archive member */
bool file_scan(const std::string& filename, const std::string& membername,
FileScanDo* doer, std::string *reason);
/* Process a member of a zip archive held in memory (cnt bytes at data).
 * If membername is empty, the buffer is processed as plain data, with no
 * md5 computation. */
bool string_scan(const char* data, size_t cnt, const std::string& membername,
FileScanDo* doer, std::string *reason);
#endif
/**