From 19fe03af62744360b668cea218de57f96e6b6714 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Tue, 4 Aug 2020 10:57:13 +0200 Subject: [PATCH] Support visio .vsdx format --- src/filters/rclopxml.py | 78 +++++++++++++++++++++++++++++++++------ src/filters/rclxslt.py | 72 +++++++++++------------------------- src/sampleconf/mimeconf | 3 ++ src/sampleconf/mimemap | 1 + src/sampleconf/mimeview | 6 ++- tests/openxml/openxml.sh | 1 + tests/openxml/openxml.txt | 2 + 7 files changed, 100 insertions(+), 63 deletions(-) diff --git a/src/filters/rclopxml.py b/src/filters/rclopxml.py index 5bd05ed7..97e5bf58 100755 --- a/src/filters/rclopxml.py +++ b/src/filters/rclopxml.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright (C) 2015 J.F.Dockes +# Copyright (C) 2015-2020 J.F.Dockes # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or @@ -15,15 +15,23 @@ # Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ###################################### -from __future__ import print_function + +# Note that .docx and .xlsx are now normally processed by the C++ mh_xslt.cpp +# module. See the openxml-xxx.xsl files for the style sheets used by the C++. +# +# .pptx and .vsdx are processed by this Python module because the C++ module +# can't process their multiple document structure (pages) at the moment. import sys from zipfile import ZipFile -import fnmatch import rclexecm from rclbasehandler import RclBaseHandler import rclxslt +import re +# +# Common style sheet for the openxml metadata +# meta_stylesheet = ''' ''' +##################################### +# .docx definitions. Not used any more by Recoll in its default config + word_tagmatch = 'w:p' word_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006" @@ -90,12 +101,19 @@ xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" word_moretemplates = '' +##################################### +# .xlsx definitions. Not used any more by Recoll in its default config + xl_tagmatch = 'x:t' xl_xmlns_decls='''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main" ''' xl_moretemplates = '' + +##################### +# .pptx definitions + pp_tagmatch = 'a:t' pp_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" @@ -108,6 +126,21 @@ pp_moretemplates = ''' ''' + +##################### +# .vsdx definitions + +vs_tagmatch = 'Text' +vs_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" +xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" +''' +vs_moretemplates = '' + + +############################## +# Common style sheet (with replaceable parts) for .pptx and .vsdx (also .docx +# and .xlsx, but not used by default). + content_stylesheet = ''' @@ -179,16 +212,39 @@ class OXExtractor(RclBaseHandler): pass try: - stl = self.computestylesheet('pp') - # Note that we'd need a numeric sort really (else we get slide1 - # slide11 slide2) - for fn in sorted(zip.namelist()): - if fnmatch.fnmatch(fn, 'ppt/slides/slide*.xml'): - content = zip.read(fn) - docdata += rclxslt.apply_sheet_data(stl, content) - except: + stl = None + # Extract number suffix for numeric sort + prefix = "ppt/slides/slide" + exp = prefix + '[0-9]+' + '.xml' + names = [fn for fn in zip.namelist() if re.match(exp, fn)] + for fn in sorted( + names, + key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])): + if stl is None: + stl = self.computestylesheet('pp') + content = zip.read(fn) + docdata += rclxslt.apply_sheet_data(stl, content) + except Exception as ex: + #self.em.rclog("PPT Exception: %s" % ex) pass + try: + stl = None + # Extract number suffix for numeric sort + prefix = 'visio/pages/page' + exp = prefix + '[0-9]+' + '.xml' + names = [fn for fn in zip.namelist() if re.match(exp, fn)] + for fn in sorted( + names, + key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])): + if stl is None: + stl = self.computestylesheet('vs') + content = zip.read(fn) + docdata += rclxslt.apply_sheet_data(stl, content) + except Exception as ex: + #self.em.rclog("VISIO Exception: %s" % ex) + pass + docdata += b'' return docdata diff --git a/src/filters/rclxslt.py b/src/filters/rclxslt.py index 1c40e336..94679a3a 100644 --- a/src/filters/rclxslt.py +++ b/src/filters/rclxslt.py @@ -1,4 +1,4 @@ -# Copyright (C) 2014 J.F.Dockes +# Copyright (C) 2014-2020 J.F.Dockes # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or @@ -15,58 +15,28 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ###################################### -# Helper module for xslt-based filters - -from __future__ import print_function +# Common code for the remaining Python xslt-based filters (most xslt work is +# now done in the c++ mh_xslt module, the ones remaining don't fit with its +# model). import sys -PY2 = sys.version < '3' +try: + from lxml import etree +except: + print("RECFILTERROR HELPERNOTFOUND python3:lxml") + sys.exit(1); -if PY2: - try: - import libxml2 - import libxslt - libxml2.substituteEntitiesDefault(1) - except: - print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1") - sys.exit(1); - def _apply_sheet_doc(sheet, doc): - styledoc = libxml2.readMemory(sheet, len(sheet), '', '', - options=libxml2.XML_PARSE_NONET) - style = libxslt.parseStylesheetDoc(styledoc) - result = style.applyStylesheet(doc, None) - res = "" - try: - res = style.saveResultToString(result) - except Exception as err: - # print("saveResultToString got exception: %s"%err) - pass - style.freeStylesheet() - doc.freeDoc() - result.freeDoc() - return res - def apply_sheet_data(sheet, data): - doc = libxml2.readMemory(data, len(data), '', '', - options=libxml2.XML_PARSE_NONET) - return _apply_sheet_doc(sheet, doc) - def apply_sheet_file(sheet, fn): - doc = libxml2.readFile(fn, '', options=libxml2.XML_PARSE_NONET) - return _apply_sheet_doc(sheet, doc) -else: - try: - from lxml import etree - except: - print("RECFILTERROR HELPERNOTFOUND python3:lxml") - sys.exit(1); - def _apply_sheet_doc(sheet, doc): - styledoc = etree.fromstring(sheet) - transform = etree.XSLT(styledoc) - return bytes(transform(doc)) - def apply_sheet_data(sheet, data): - doc = etree.fromstring(data) - return _apply_sheet_doc(sheet, doc) - def apply_sheet_file(sheet, fn): - doc = etree.parse(fn) - return _apply_sheet_doc(sheet, doc) +def _apply_sheet_doc(sheet, doc): + styledoc = etree.fromstring(sheet) + transform = etree.XSLT(styledoc) + return bytes(transform(doc)) + +def apply_sheet_data(sheet, data): + doc = etree.fromstring(data) + return _apply_sheet_doc(sheet, doc) + +def apply_sheet_file(sheet, fn): + doc = etree.parse(fn) + return _apply_sheet_doc(sheet, doc) diff --git a/src/sampleconf/mimeconf b/src/sampleconf/mimeconf index 72899b82..5e895871 100644 --- a/src/sampleconf/mimeconf +++ b/src/sampleconf/mimeconf @@ -108,6 +108,7 @@ application/vnd.openxmlformats-officedocument.presentationml.template = \ execm rclopxml.py application/vnd.openxmlformats-officedocument.presentationml.presentation = \ execm rclopxml.py +application/vnd.ms-visio.drawing = execm rclopxml.py application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \ internal xsltproc meta docProps/core.xml openxml-meta.xsl \ body xl/sharedStrings.xml openxml-xls-body.xsl @@ -270,6 +271,7 @@ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = spreadsheet application/vnd.openxmlformats-officedocument.spreadsheetml.template = spreadsheet application/vnd.openxmlformats-officedocument.wordprocessingml.document = wordprocessing application/vnd.openxmlformats-officedocument.wordprocessingml.template = wordprocessing +application/vnd.ms-visio.drawing = document application/vnd.sun.xml.calc = spreadsheet application/vnd.sun.xml.calc.template = spreadsheet application/vnd.sun.xml.draw = drawing @@ -460,6 +462,7 @@ presentation = \ application/vnd.openxmlformats-officedocument.presentationml.template \ application/vnd.sun.xml.impress \ application/vnd.sun.xml.impress.template \ + application/vnd.ms-visio.drawing media = \ application/ogg \ diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap index b034f2ae..e676f6ec 100644 --- a/src/sampleconf/mimemap +++ b/src/sampleconf/mimemap @@ -147,6 +147,7 @@ .ppsx = application/vnd.openxmlformats-officedocument.presentationml.slideshow .pptm = application/vnd.ms-powerpoint.presentation.macroEnabled.12 .pptx = application/vnd.openxmlformats-officedocument.presentationml.presentation +.vsdx = application/vnd.ms-visio.drawing .xlam = application/vnd.ms-excel.addin.macroEnabled.12 .xlsb = application/vnd.ms-excel.sheet.binary.macroEnabled.12 .xlsm = application/vnd.ms-excel.sheet.macroEnabled.12 diff --git a/src/sampleconf/mimeview b/src/sampleconf/mimeview index d691d7c3..4d31d343 100644 --- a/src/sampleconf/mimeview +++ b/src/sampleconf/mimeview @@ -19,9 +19,12 @@ # Use xallexcepts- and xallexcepts+ in a user file to add or remove from # the default xallexcepts list +# Visio xml is in there because the desktops tend to try and open it as an +# archive xallexcepts = application/pdf application/postscript application/x-dvi \ text/html|gnuinfo text/html|chm text/html|epub text/html|rclman \ - application/x-fsdirectory|parentopen inode/directory|parentopen + application/x-fsdirectory|parentopen inode/directory|parentopen \ + application/vnd.ms-visio.drawing [view] @@ -75,6 +78,7 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.template = \ libreoffice %f application/vnd.openxmlformats-officedocument.presentationml.template = \ libreoffice %f +application/vnd.ms-visio.drawing = libreoffice %f application/vnd.openxmlformats-officedocument.presentationml.presentation = \ libreoffice %f application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \ diff --git a/tests/openxml/openxml.sh b/tests/openxml/openxml.sh index 90298065..527ccec3 100644 --- a/tests/openxml/openxml.sh +++ b/tests/openxml/openxml.sh @@ -10,6 +10,7 @@ initvariables $0 recollq author:ben '"Consideration of the high correlation"' recollq '"The Circassian Education Foundation"' date:2008-01-20 recollq author:"Johnny Walker" '"Thin Lizzy"' +recollq '"Objekt steht im Akkusativ"' ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout diff --git a/tests/openxml/openxml.txt b/tests/openxml/openxml.txt index 90221808..c09e0e13 100755 --- a/tests/openxml/openxml.txt +++ b/tests/openxml/openxml.txt @@ -4,3 +4,5 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.document [file:// application/vnd.openxmlformats-officedocument.presentationml.presentation [file:///home/dockes/projets/fulltext/testrecoll/opxml/College_Application_Process_1_18_08.pptx] [College_Application_Process_1_18_08.pptx] 713280 bytes 1 results application/vnd.openxmlformats-officedocument.spreadsheetml.sheet [file:///home/dockes/projets/fulltext/testrecoll/opxml/GuitarHero2_Helper.xlsx] [GuitarHero2_Helper.xlsx] 17147 bytes +1 results +application/vnd.ms-visio.drawing [file:///home/dockes/projets/fulltext/testrecoll/visio/visio2.vsdx] [visio2.vsdx] 96002 bytes