Support visio .vsdx format

This commit is contained in:
Jean-Francois Dockes 2020-08-04 10:57:13 +02:00
parent 5cfd95226d
commit 19fe03af62
7 changed files with 100 additions and 63 deletions

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# Copyright (C) 2015 J.F.Dockes # Copyright (C) 2015-2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or # the Free Software Foundation; either version 2 of the License, or
@ -15,15 +15,23 @@
# Free Software Foundation, Inc., # Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
###################################### ######################################
from __future__ import print_function
# Note that .docx and .xlsx are now normally processed by the C++ mh_xslt.cpp
# module. See the openxml-xxx.xsl files for the style sheets used by the C++
# code.
#
# .pptx and .vsdx are still processed by this Python module because the C++
# module cannot currently handle their multi-part structure (one XML file per
# slide or page).
import sys import sys
from zipfile import ZipFile from zipfile import ZipFile
import fnmatch
import rclexecm import rclexecm
from rclbasehandler import RclBaseHandler from rclbasehandler import RclBaseHandler
import rclxslt import rclxslt
import re
#
# Common style sheet for the openxml metadata
#
meta_stylesheet = '''<?xml version="1.0"?> meta_stylesheet = '''<?xml version="1.0"?>
<xsl:stylesheet <xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
@ -75,6 +83,9 @@ meta_stylesheet = '''<?xml version="1.0"?>
</xsl:stylesheet> </xsl:stylesheet>
''' '''
#####################################
# .docx definitions. Not used any more by Recoll in its default config
word_tagmatch = 'w:p' word_tagmatch = 'w:p'
word_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" word_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
@ -90,12 +101,19 @@ xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
word_moretemplates = '' word_moretemplates = ''
#####################################
# .xlsx definitions. Not used any more by Recoll in its default config
xl_tagmatch = 'x:t' xl_tagmatch = 'x:t'
xl_xmlns_decls='''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" xl_xmlns_decls='''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
''' '''
xl_moretemplates = '' xl_moretemplates = ''
#####################
# .pptx definitions
pp_tagmatch = 'a:t' pp_tagmatch = 'a:t'
pp_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" pp_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
@ -108,6 +126,21 @@ pp_moretemplates = '''<xsl:template match="p:attrName">
</xsl:template> </xsl:template>
''' '''
#####################
# .vsdx definitions
vs_tagmatch = 'Text'
vs_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
'''
vs_moretemplates = ''
##############################
# Common style sheet (with replaceable parts) for .pptx and .vsdx (also .docx
# and .xlsx, but not used by default).
content_stylesheet = '''<?xml version="1.0"?> content_stylesheet = '''<?xml version="1.0"?>
<xsl:stylesheet @XMLNS_DECLS@ > <xsl:stylesheet @XMLNS_DECLS@ >
@ -179,16 +212,39 @@ class OXExtractor(RclBaseHandler):
pass pass
try: try:
stl = self.computestylesheet('pp') stl = None
# Note that we'd need a numeric sort really (else we get slide1 # Extract number suffix for numeric sort
# slide11 slide2) prefix = "ppt/slides/slide"
for fn in sorted(zip.namelist()): exp = prefix + '[0-9]+' + '.xml'
if fnmatch.fnmatch(fn, 'ppt/slides/slide*.xml'): names = [fn for fn in zip.namelist() if re.match(exp, fn)]
content = zip.read(fn) for fn in sorted(
docdata += rclxslt.apply_sheet_data(stl, content) names,
except: key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
if stl is None:
stl = self.computestylesheet('pp')
content = zip.read(fn)
docdata += rclxslt.apply_sheet_data(stl, content)
except Exception as ex:
#self.em.rclog("PPT Exception: %s" % ex)
pass pass
try:
stl = None
# Extract number suffix for numeric sort
prefix = 'visio/pages/page'
exp = prefix + '[0-9]+' + '.xml'
names = [fn for fn in zip.namelist() if re.match(exp, fn)]
for fn in sorted(
names,
key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
if stl is None:
stl = self.computestylesheet('vs')
content = zip.read(fn)
docdata += rclxslt.apply_sheet_data(stl, content)
except Exception as ex:
#self.em.rclog("VISIO Exception: %s" % ex)
pass
docdata += b'</body></html>' docdata += b'</body></html>'
return docdata return docdata

View File

@ -1,4 +1,4 @@
# Copyright (C) 2014 J.F.Dockes # Copyright (C) 2014-2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or # the Free Software Foundation; either version 2 of the License, or
@ -15,58 +15,28 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
###################################### ######################################
# Helper module for xslt-based filters # Common code for the remaining Python xslt-based filters (most xslt work is
# now done in the c++ mh_xslt module, the ones remaining don't fit with its
from __future__ import print_function # model).
import sys import sys
PY2 = sys.version < '3' try:
from lxml import etree
except:
print("RECFILTERROR HELPERNOTFOUND python3:lxml")
sys.exit(1);
if PY2: def _apply_sheet_doc(sheet, doc):
try: styledoc = etree.fromstring(sheet)
import libxml2 transform = etree.XSLT(styledoc)
import libxslt return bytes(transform(doc))
libxml2.substituteEntitiesDefault(1)
except: def apply_sheet_data(sheet, data):
print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1") doc = etree.fromstring(data)
sys.exit(1); return _apply_sheet_doc(sheet, doc)
def _apply_sheet_doc(sheet, doc):
styledoc = libxml2.readMemory(sheet, len(sheet), '', '', def apply_sheet_file(sheet, fn):
options=libxml2.XML_PARSE_NONET) doc = etree.parse(fn)
style = libxslt.parseStylesheetDoc(styledoc) return _apply_sheet_doc(sheet, doc)
result = style.applyStylesheet(doc, None)
res = ""
try:
res = style.saveResultToString(result)
except Exception as err:
# print("saveResultToString got exception: %s"%err)
pass
style.freeStylesheet()
doc.freeDoc()
result.freeDoc()
return res
def apply_sheet_data(sheet, data):
doc = libxml2.readMemory(data, len(data), '', '',
options=libxml2.XML_PARSE_NONET)
return _apply_sheet_doc(sheet, doc)
def apply_sheet_file(sheet, fn):
doc = libxml2.readFile(fn, '', options=libxml2.XML_PARSE_NONET)
return _apply_sheet_doc(sheet, doc)
else:
try:
from lxml import etree
except:
print("RECFILTERROR HELPERNOTFOUND python3:lxml")
sys.exit(1);
def _apply_sheet_doc(sheet, doc):
styledoc = etree.fromstring(sheet)
transform = etree.XSLT(styledoc)
return bytes(transform(doc))
def apply_sheet_data(sheet, data):
doc = etree.fromstring(data)
return _apply_sheet_doc(sheet, doc)
def apply_sheet_file(sheet, fn):
doc = etree.parse(fn)
return _apply_sheet_doc(sheet, doc)

View File

@ -108,6 +108,7 @@ application/vnd.openxmlformats-officedocument.presentationml.template = \
execm rclopxml.py execm rclopxml.py
application/vnd.openxmlformats-officedocument.presentationml.presentation = \ application/vnd.openxmlformats-officedocument.presentationml.presentation = \
execm rclopxml.py execm rclopxml.py
application/vnd.ms-visio.drawing = execm rclopxml.py
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
internal xsltproc meta docProps/core.xml openxml-meta.xsl \ internal xsltproc meta docProps/core.xml openxml-meta.xsl \
body xl/sharedStrings.xml openxml-xls-body.xsl body xl/sharedStrings.xml openxml-xls-body.xsl
@ -270,6 +271,7 @@ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = spreadsheet
application/vnd.openxmlformats-officedocument.spreadsheetml.template = spreadsheet application/vnd.openxmlformats-officedocument.spreadsheetml.template = spreadsheet
application/vnd.openxmlformats-officedocument.wordprocessingml.document = wordprocessing application/vnd.openxmlformats-officedocument.wordprocessingml.document = wordprocessing
application/vnd.openxmlformats-officedocument.wordprocessingml.template = wordprocessing application/vnd.openxmlformats-officedocument.wordprocessingml.template = wordprocessing
application/vnd.ms-visio.drawing = document
application/vnd.sun.xml.calc = spreadsheet application/vnd.sun.xml.calc = spreadsheet
application/vnd.sun.xml.calc.template = spreadsheet application/vnd.sun.xml.calc.template = spreadsheet
application/vnd.sun.xml.draw = drawing application/vnd.sun.xml.draw = drawing
@ -460,6 +462,7 @@ presentation = \
application/vnd.openxmlformats-officedocument.presentationml.template \ application/vnd.openxmlformats-officedocument.presentationml.template \
application/vnd.sun.xml.impress \ application/vnd.sun.xml.impress \
application/vnd.sun.xml.impress.template \ application/vnd.sun.xml.impress.template \
application/vnd.ms-visio.drawing
media = \ media = \
application/ogg \ application/ogg \

View File

@ -147,6 +147,7 @@
.ppsx = application/vnd.openxmlformats-officedocument.presentationml.slideshow .ppsx = application/vnd.openxmlformats-officedocument.presentationml.slideshow
.pptm = application/vnd.ms-powerpoint.presentation.macroEnabled.12 .pptm = application/vnd.ms-powerpoint.presentation.macroEnabled.12
.pptx = application/vnd.openxmlformats-officedocument.presentationml.presentation .pptx = application/vnd.openxmlformats-officedocument.presentationml.presentation
.vsdx = application/vnd.ms-visio.drawing
.xlam = application/vnd.ms-excel.addin.macroEnabled.12 .xlam = application/vnd.ms-excel.addin.macroEnabled.12
.xlsb = application/vnd.ms-excel.sheet.binary.macroEnabled.12 .xlsb = application/vnd.ms-excel.sheet.binary.macroEnabled.12
.xlsm = application/vnd.ms-excel.sheet.macroEnabled.12 .xlsm = application/vnd.ms-excel.sheet.macroEnabled.12

View File

@ -19,9 +19,12 @@
# Use xallexcepts- and xallexcepts+ in a user file to add or remove from # Use xallexcepts- and xallexcepts+ in a user file to add or remove from
# the default xallexcepts list # the default xallexcepts list
# The Visio .vsdx type is listed here because desktop environments tend to
# try to open it as an archive
xallexcepts = application/pdf application/postscript application/x-dvi \ xallexcepts = application/pdf application/postscript application/x-dvi \
text/html|gnuinfo text/html|chm text/html|epub text/html|rclman \ text/html|gnuinfo text/html|chm text/html|epub text/html|rclman \
application/x-fsdirectory|parentopen inode/directory|parentopen application/x-fsdirectory|parentopen inode/directory|parentopen \
application/vnd.ms-visio.drawing
[view] [view]
@ -75,6 +78,7 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
libreoffice %f libreoffice %f
application/vnd.openxmlformats-officedocument.presentationml.template = \ application/vnd.openxmlformats-officedocument.presentationml.template = \
libreoffice %f libreoffice %f
application/vnd.ms-visio.drawing = libreoffice %f
application/vnd.openxmlformats-officedocument.presentationml.presentation = \ application/vnd.openxmlformats-officedocument.presentationml.presentation = \
libreoffice %f libreoffice %f
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \

View File

@ -10,6 +10,7 @@ initvariables $0
recollq author:ben '"Consideration of the high correlation"' recollq author:ben '"Consideration of the high correlation"'
recollq '"The Circassian Education Foundation"' date:2008-01-20 recollq '"The Circassian Education Foundation"' date:2008-01-20
recollq author:"Johnny Walker" '"Thin Lizzy"' recollq author:"Johnny Walker" '"Thin Lizzy"'
recollq '"Objekt steht im Akkusativ"'
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout ) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout

View File

@ -4,3 +4,5 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.document [file://
application/vnd.openxmlformats-officedocument.presentationml.presentation [file:///home/dockes/projets/fulltext/testrecoll/opxml/College_Application_Process_1_18_08.pptx] [College_Application_Process_1_18_08.pptx] 713280 bytes application/vnd.openxmlformats-officedocument.presentationml.presentation [file:///home/dockes/projets/fulltext/testrecoll/opxml/College_Application_Process_1_18_08.pptx] [College_Application_Process_1_18_08.pptx] 713280 bytes
1 results 1 results
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet [file:///home/dockes/projets/fulltext/testrecoll/opxml/GuitarHero2_Helper.xlsx] [GuitarHero2_Helper.xlsx] 17147 bytes application/vnd.openxmlformats-officedocument.spreadsheetml.sheet [file:///home/dockes/projets/fulltext/testrecoll/opxml/GuitarHero2_Helper.xlsx] [GuitarHero2_Helper.xlsx] 17147 bytes
1 results
application/vnd.ms-visio.drawing [file:///home/dockes/projets/fulltext/testrecoll/visio/visio2.vsdx] [visio2.vsdx] 96002 bytes