Support visio .vsdx format
This commit is contained in:
parent
5cfd95226d
commit
19fe03af62
@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (C) 2015 J.F.Dockes
|
||||
# Copyright (C) 2015-2020 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
@ -15,15 +15,23 @@
|
||||
# Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
######################################
|
||||
from __future__ import print_function
|
||||
|
||||
# Note that .docx and .xlsx are now normally processed by the C++ mh_xslt.cpp
|
||||
# module. See the openxml-xxx.xsl files for the style sheets used by the C++.
|
||||
#
|
||||
# .pptx and .vsdx are processed by this Python module because the C++ module
|
||||
# can't process their multi-part document structure (pages) at the moment.
|
||||
|
||||
import sys
|
||||
from zipfile import ZipFile
|
||||
import fnmatch
|
||||
import rclexecm
|
||||
from rclbasehandler import RclBaseHandler
|
||||
import rclxslt
|
||||
import re
|
||||
|
||||
#
|
||||
# Common style sheet for the openxml metadata
|
||||
#
|
||||
meta_stylesheet = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||
@ -75,6 +83,9 @@ meta_stylesheet = '''<?xml version="1.0"?>
|
||||
</xsl:stylesheet>
|
||||
'''
|
||||
|
||||
#####################################
|
||||
# .docx definitions. Not used any more by Recoll in its default config
|
||||
|
||||
word_tagmatch = 'w:p'
|
||||
word_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
|
||||
@ -90,12 +101,19 @@ xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
|
||||
word_moretemplates = ''
|
||||
|
||||
|
||||
#####################################
|
||||
# .xlsx definitions. Not used any more by Recoll in its default config
|
||||
|
||||
xl_tagmatch = 'x:t'
|
||||
xl_xmlns_decls='''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
|
||||
'''
|
||||
xl_moretemplates = ''
|
||||
|
||||
|
||||
#####################
|
||||
# .pptx definitions
|
||||
|
||||
pp_tagmatch = 'a:t'
|
||||
pp_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
||||
@ -108,6 +126,21 @@ pp_moretemplates = '''<xsl:template match="p:attrName">
|
||||
</xsl:template>
|
||||
'''
|
||||
|
||||
|
||||
#####################
|
||||
# .vsdx definitions
|
||||
|
||||
vs_tagmatch = 'Text'
|
||||
vs_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||
'''
|
||||
vs_moretemplates = ''
|
||||
|
||||
|
||||
##############################
|
||||
# Common style sheet (with replaceable parts) for .pptx and .vsdx (also .docx
|
||||
# and .xlsx, but not used by default).
|
||||
|
||||
content_stylesheet = '''<?xml version="1.0"?>
|
||||
<xsl:stylesheet @XMLNS_DECLS@ >
|
||||
|
||||
@ -179,14 +212,37 @@ class OXExtractor(RclBaseHandler):
|
||||
pass
|
||||
|
||||
try:
|
||||
stl = None
|
||||
# Extract number suffix for numeric sort
|
||||
prefix = "ppt/slides/slide"
|
||||
exp = prefix + '[0-9]+' + '.xml'
|
||||
names = [fn for fn in zip.namelist() if re.match(exp, fn)]
|
||||
for fn in sorted(
|
||||
names,
|
||||
key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
|
||||
if stl is None:
|
||||
stl = self.computestylesheet('pp')
|
||||
# Note that we'd need a numeric sort really (else we get slide1
|
||||
# slide11 slide2)
|
||||
for fn in sorted(zip.namelist()):
|
||||
if fnmatch.fnmatch(fn, 'ppt/slides/slide*.xml'):
|
||||
content = zip.read(fn)
|
||||
docdata += rclxslt.apply_sheet_data(stl, content)
|
||||
except:
|
||||
except Exception as ex:
|
||||
#self.em.rclog("PPT Exception: %s" % ex)
|
||||
pass
|
||||
|
||||
try:
|
||||
stl = None
|
||||
# Extract number suffix for numeric sort
|
||||
prefix = 'visio/pages/page'
|
||||
exp = prefix + '[0-9]+' + '.xml'
|
||||
names = [fn for fn in zip.namelist() if re.match(exp, fn)]
|
||||
for fn in sorted(
|
||||
names,
|
||||
key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
|
||||
if stl is None:
|
||||
stl = self.computestylesheet('vs')
|
||||
content = zip.read(fn)
|
||||
docdata += rclxslt.apply_sheet_data(stl, content)
|
||||
except Exception as ex:
|
||||
#self.em.rclog("VISIO Exception: %s" % ex)
|
||||
pass
|
||||
|
||||
docdata += b'</body></html>'
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
# Copyright (C) 2014 J.F.Dockes
|
||||
# Copyright (C) 2014-2020 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
@ -15,57 +15,27 @@
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
######################################
|
||||
|
||||
# Helper module for xslt-based filters
|
||||
|
||||
from __future__ import print_function
|
||||
# Common code for the remaining Python xslt-based filters (most xslt work is
|
||||
# now done in the C++ mh_xslt module; the ones remaining don't fit with its
|
||||
# model).
|
||||
|
||||
import sys
|
||||
|
||||
PY2 = sys.version < '3'
|
||||
|
||||
if PY2:
|
||||
try:
|
||||
import libxml2
|
||||
import libxslt
|
||||
libxml2.substituteEntitiesDefault(1)
|
||||
except:
|
||||
print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
|
||||
sys.exit(1);
|
||||
def _apply_sheet_doc(sheet, doc):
|
||||
styledoc = libxml2.readMemory(sheet, len(sheet), '', '',
|
||||
options=libxml2.XML_PARSE_NONET)
|
||||
style = libxslt.parseStylesheetDoc(styledoc)
|
||||
result = style.applyStylesheet(doc, None)
|
||||
res = ""
|
||||
try:
|
||||
res = style.saveResultToString(result)
|
||||
except Exception as err:
|
||||
# print("saveResultToString got exception: %s"%err)
|
||||
pass
|
||||
style.freeStylesheet()
|
||||
doc.freeDoc()
|
||||
result.freeDoc()
|
||||
return res
|
||||
def apply_sheet_data(sheet, data):
|
||||
doc = libxml2.readMemory(data, len(data), '', '',
|
||||
options=libxml2.XML_PARSE_NONET)
|
||||
return _apply_sheet_doc(sheet, doc)
|
||||
def apply_sheet_file(sheet, fn):
|
||||
doc = libxml2.readFile(fn, '', options=libxml2.XML_PARSE_NONET)
|
||||
return _apply_sheet_doc(sheet, doc)
|
||||
else:
|
||||
# lxml provides the XML parser and XSLT engine used by the functions below.
# If it cannot be imported, print the RECFILTERROR HELPERNOTFOUND line naming
# the missing helper and exit with status 1.
try:
    from lxml import etree
except ImportError:
    # Catch only import failures: a bare except would also intercept
    # KeyboardInterrupt and SystemExit and misreport them as a missing helper.
    print("RECFILTERROR HELPERNOTFOUND python3:lxml")
    sys.exit(1)
|
||||
|
||||
def _apply_sheet_doc(sheet, doc):
    """Apply the XSLT style sheet text `sheet` to the parsed document `doc`.

    The sheet is parsed with etree.fromstring() and compiled with
    etree.XSLT(). The transformation result is returned converted to bytes.
    """
    transform = etree.XSLT(etree.fromstring(sheet))
    result = transform(doc)
    return bytes(result)
|
||||
|
||||
def apply_sheet_data(sheet, data):
    """Parse the in-memory XML `data` and transform it with `sheet`.

    Returns whatever _apply_sheet_doc() returns for the parsed document.
    """
    return _apply_sheet_doc(sheet, etree.fromstring(data))
|
||||
|
||||
def apply_sheet_file(sheet, fn):
    """Parse the XML file `fn` and transform it with `sheet`.

    Returns whatever _apply_sheet_doc() returns for the parsed document.
    """
    return _apply_sheet_doc(sheet, etree.parse(fn))
|
||||
|
||||
@ -108,6 +108,7 @@ application/vnd.openxmlformats-officedocument.presentationml.template = \
|
||||
execm rclopxml.py
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
||||
execm rclopxml.py
|
||||
application/vnd.ms-visio.drawing = execm rclopxml.py
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
||||
internal xsltproc meta docProps/core.xml openxml-meta.xsl \
|
||||
body xl/sharedStrings.xml openxml-xls-body.xsl
|
||||
@ -270,6 +271,7 @@ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = spreadsheet
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.template = spreadsheet
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document = wordprocessing
|
||||
application/vnd.openxmlformats-officedocument.wordprocessingml.template = wordprocessing
|
||||
application/vnd.ms-visio.drawing = document
|
||||
application/vnd.sun.xml.calc = spreadsheet
|
||||
application/vnd.sun.xml.calc.template = spreadsheet
|
||||
application/vnd.sun.xml.draw = drawing
|
||||
@ -460,6 +462,7 @@ presentation = \
|
||||
application/vnd.openxmlformats-officedocument.presentationml.template \
|
||||
application/vnd.sun.xml.impress \
|
||||
application/vnd.sun.xml.impress.template \
|
||||
application/vnd.ms-visio.drawing
|
||||
|
||||
media = \
|
||||
application/ogg \
|
||||
|
||||
@ -147,6 +147,7 @@
|
||||
.ppsx = application/vnd.openxmlformats-officedocument.presentationml.slideshow
|
||||
.pptm = application/vnd.ms-powerpoint.presentation.macroEnabled.12
|
||||
.pptx = application/vnd.openxmlformats-officedocument.presentationml.presentation
|
||||
.vsdx = application/vnd.ms-visio.drawing
|
||||
.xlam = application/vnd.ms-excel.addin.macroEnabled.12
|
||||
.xlsb = application/vnd.ms-excel.sheet.binary.macroEnabled.12
|
||||
.xlsm = application/vnd.ms-excel.sheet.macroEnabled.12
|
||||
|
||||
@ -19,9 +19,12 @@
|
||||
# Use xallexcepts- and xallexcepts+ in a user file to add or remove from
|
||||
# the default xallexcepts list
|
||||
|
||||
# Visio xml is in there because the desktops tend to try and open it as an
|
||||
# archive
|
||||
xallexcepts = application/pdf application/postscript application/x-dvi \
|
||||
text/html|gnuinfo text/html|chm text/html|epub text/html|rclman \
|
||||
application/x-fsdirectory|parentopen inode/directory|parentopen
|
||||
application/x-fsdirectory|parentopen inode/directory|parentopen \
|
||||
application/vnd.ms-visio.drawing
|
||||
|
||||
|
||||
[view]
|
||||
@ -75,6 +78,7 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
|
||||
libreoffice %f
|
||||
application/vnd.openxmlformats-officedocument.presentationml.template = \
|
||||
libreoffice %f
|
||||
application/vnd.ms-visio.drawing = libreoffice %f
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
||||
libreoffice %f
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
||||
|
||||
@ -10,6 +10,7 @@ initvariables $0
|
||||
recollq author:ben '"Consideration of the high correlation"'
|
||||
recollq '"The Circassian Education Foundation"' date:2008-01-20
|
||||
recollq author:"Johnny Walker" '"Thin Lizzy"'
|
||||
recollq '"Objekt steht im Akkusativ"'
|
||||
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
|
||||
@ -4,3 +4,5 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.document [file://
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation [file:///home/dockes/projets/fulltext/testrecoll/opxml/College_Application_Process_1_18_08.pptx] [College_Application_Process_1_18_08.pptx] 713280 bytes
|
||||
1 results
|
||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet [file:///home/dockes/projets/fulltext/testrecoll/opxml/GuitarHero2_Helper.xlsx] [GuitarHero2_Helper.xlsx] 17147 bytes
|
||||
1 results
|
||||
application/vnd.ms-visio.drawing [file:///home/dockes/projets/fulltext/testrecoll/visio/visio2.vsdx] [visio2.vsdx] 96002 bytes
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user