Support visio .vsdx format

This commit is contained in:
Jean-Francois Dockes 2020-08-04 10:57:13 +02:00
parent 5cfd95226d
commit 19fe03af62
7 changed files with 100 additions and 63 deletions

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python3
# Copyright (C) 2015 J.F.Dockes
# Copyright (C) 2015-2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
@ -15,15 +15,23 @@
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
######################################
from __future__ import print_function
# Note that .docx and .xlsx are now normally processed by the C++ mh_xslt.cpp
# module. See the openxml-xxx.xsl files for the style sheets used by the C++
# code.
#
# .pptx and .vsdx are processed by this Python module because the C++ module
# cannot currently handle their multi-part structure (one XML file per
# slide/page).
import sys
from zipfile import ZipFile
import fnmatch
import rclexecm
from rclbasehandler import RclBaseHandler
import rclxslt
import re
#
# Common style sheet for the openxml metadata
#
meta_stylesheet = '''<?xml version="1.0"?>
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
@ -75,6 +83,9 @@ meta_stylesheet = '''<?xml version="1.0"?>
</xsl:stylesheet>
'''
#####################################
# .docx definitions. Not used any more by Recoll in its default config
word_tagmatch = 'w:p'
word_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
@ -90,12 +101,19 @@ xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
word_moretemplates = ''
#####################################
# .xlsx definitions. Not used any more by Recoll in its default config
xl_tagmatch = 'x:t'
xl_xmlns_decls='''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
'''
xl_moretemplates = ''
#####################
# .pptx definitions
pp_tagmatch = 'a:t'
pp_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
@ -108,6 +126,21 @@ pp_moretemplates = '''<xsl:template match="p:attrName">
</xsl:template>
'''
#####################
# .vsdx definitions
vs_tagmatch = 'Text'
vs_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
'''
vs_moretemplates = ''
##############################
# Common style sheet (with replaceable parts) for .pptx and .vsdx (also .docx
# and .xlsx, but not used by default).
content_stylesheet = '''<?xml version="1.0"?>
<xsl:stylesheet @XMLNS_DECLS@ >
@ -179,14 +212,37 @@ class OXExtractor(RclBaseHandler):
pass
try:
stl = None
# Extract number suffix for numeric sort
prefix = "ppt/slides/slide"
exp = prefix + '[0-9]+' + '.xml'
names = [fn for fn in zip.namelist() if re.match(exp, fn)]
for fn in sorted(
names,
key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
if stl is None:
stl = self.computestylesheet('pp')
# Note that we'd need a numeric sort really (else we get slide1
# slide11 slide2)
for fn in sorted(zip.namelist()):
if fnmatch.fnmatch(fn, 'ppt/slides/slide*.xml'):
content = zip.read(fn)
docdata += rclxslt.apply_sheet_data(stl, content)
except:
except Exception as ex:
#self.em.rclog("PPT Exception: %s" % ex)
pass
try:
stl = None
# Extract number suffix for numeric sort
prefix = 'visio/pages/page'
exp = prefix + '[0-9]+' + '.xml'
names = [fn for fn in zip.namelist() if re.match(exp, fn)]
for fn in sorted(
names,
key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
if stl is None:
stl = self.computestylesheet('vs')
content = zip.read(fn)
docdata += rclxslt.apply_sheet_data(stl, content)
except Exception as ex:
#self.em.rclog("VISIO Exception: %s" % ex)
pass
docdata += b'</body></html>'

View File

@ -1,4 +1,4 @@
# Copyright (C) 2014 J.F.Dockes
# Copyright (C) 2014-2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
@ -15,57 +15,27 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
######################################
# Helper module for xslt-based filters
from __future__ import print_function
# Common code for the remaining Python XSLT-based filters. Most XSLT work is
# now done in the C++ mh_xslt module; the filters remaining here do not fit
# its model.
import sys
PY2 = sys.version < '3'
if PY2:
try:
import libxml2
import libxslt
libxml2.substituteEntitiesDefault(1)
except:
print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
sys.exit(1);
def _apply_sheet_doc(sheet, doc):
styledoc = libxml2.readMemory(sheet, len(sheet), '', '',
options=libxml2.XML_PARSE_NONET)
style = libxslt.parseStylesheetDoc(styledoc)
result = style.applyStylesheet(doc, None)
res = ""
try:
res = style.saveResultToString(result)
except Exception as err:
# print("saveResultToString got exception: %s"%err)
pass
style.freeStylesheet()
doc.freeDoc()
result.freeDoc()
return res
def apply_sheet_data(sheet, data):
doc = libxml2.readMemory(data, len(data), '', '',
options=libxml2.XML_PARSE_NONET)
return _apply_sheet_doc(sheet, doc)
def apply_sheet_file(sheet, fn):
doc = libxml2.readFile(fn, '', options=libxml2.XML_PARSE_NONET)
return _apply_sheet_doc(sheet, doc)
else:
try:
from lxml import etree
except:
print("RECFILTERROR HELPERNOTFOUND python3:lxml")
sys.exit(1);
def _apply_sheet_doc(sheet, doc):
styledoc = etree.fromstring(sheet)
transform = etree.XSLT(styledoc)
return bytes(transform(doc))
def apply_sheet_data(sheet, data):
doc = etree.fromstring(data)
return _apply_sheet_doc(sheet, doc)
def apply_sheet_file(sheet, fn):
doc = etree.parse(fn)
return _apply_sheet_doc(sheet, doc)

View File

@ -108,6 +108,7 @@ application/vnd.openxmlformats-officedocument.presentationml.template = \
execm rclopxml.py
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
execm rclopxml.py
application/vnd.ms-visio.drawing = execm rclopxml.py
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
internal xsltproc meta docProps/core.xml openxml-meta.xsl \
body xl/sharedStrings.xml openxml-xls-body.xsl
@ -270,6 +271,7 @@ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = spreadsheet
application/vnd.openxmlformats-officedocument.spreadsheetml.template = spreadsheet
application/vnd.openxmlformats-officedocument.wordprocessingml.document = wordprocessing
application/vnd.openxmlformats-officedocument.wordprocessingml.template = wordprocessing
application/vnd.ms-visio.drawing = document
application/vnd.sun.xml.calc = spreadsheet
application/vnd.sun.xml.calc.template = spreadsheet
application/vnd.sun.xml.draw = drawing
@ -460,6 +462,7 @@ presentation = \
application/vnd.openxmlformats-officedocument.presentationml.template \
application/vnd.sun.xml.impress \
application/vnd.sun.xml.impress.template \
application/vnd.ms-visio.drawing
media = \
application/ogg \

View File

@ -147,6 +147,7 @@
.ppsx = application/vnd.openxmlformats-officedocument.presentationml.slideshow
.pptm = application/vnd.ms-powerpoint.presentation.macroEnabled.12
.pptx = application/vnd.openxmlformats-officedocument.presentationml.presentation
.vsdx = application/vnd.ms-visio.drawing
.xlam = application/vnd.ms-excel.addin.macroEnabled.12
.xlsb = application/vnd.ms-excel.sheet.binary.macroEnabled.12
.xlsm = application/vnd.ms-excel.sheet.macroEnabled.12

View File

@ -19,9 +19,12 @@
# Use xallexcepts- and xallexcepts+ in a user file to add or remove from
# the default xallexcepts list
# Visio XML (.vsdx) is in the list because desktop environments tend to try
# to open it as an archive.
xallexcepts = application/pdf application/postscript application/x-dvi \
text/html|gnuinfo text/html|chm text/html|epub text/html|rclman \
application/x-fsdirectory|parentopen inode/directory|parentopen
application/x-fsdirectory|parentopen inode/directory|parentopen \
application/vnd.ms-visio.drawing
[view]
@ -75,6 +78,7 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
libreoffice %f
application/vnd.openxmlformats-officedocument.presentationml.template = \
libreoffice %f
application/vnd.ms-visio.drawing = libreoffice %f
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
libreoffice %f
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \

View File

@ -10,6 +10,7 @@ initvariables $0
recollq author:ben '"Consideration of the high correlation"'
recollq '"The Circassian Education Foundation"' date:2008-01-20
recollq author:"Johnny Walker" '"Thin Lizzy"'
recollq '"Objekt steht im Akkusativ"'
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout

View File

@ -4,3 +4,5 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.document [file://
application/vnd.openxmlformats-officedocument.presentationml.presentation [file:///home/dockes/projets/fulltext/testrecoll/opxml/College_Application_Process_1_18_08.pptx] [College_Application_Process_1_18_08.pptx] 713280 bytes
1 results
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet [file:///home/dockes/projets/fulltext/testrecoll/opxml/GuitarHero2_Helper.xlsx] [GuitarHero2_Helper.xlsx] 17147 bytes
1 results
application/vnd.ms-visio.drawing [file:///home/dockes/projets/fulltext/testrecoll/visio/visio2.vsdx] [visio2.vsdx] 96002 bytes