Support visio .vsdx format
This commit is contained in:
parent
5cfd95226d
commit
19fe03af62
@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# Copyright (C) 2015 J.F.Dockes
|
# Copyright (C) 2015-2020 J.F.Dockes
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License as published by
|
# it under the terms of the GNU General Public License as published by
|
||||||
# the Free Software Foundation; either version 2 of the License, or
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -15,15 +15,23 @@
|
|||||||
# Free Software Foundation, Inc.,
|
# Free Software Foundation, Inc.,
|
||||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
######################################
|
######################################
|
||||||
from __future__ import print_function
|
|
||||||
|
# Note that .docx and .xlsx are now normally processed by the C++ mh_xslt.cpp
|
||||||
|
# module. See the openxml-xxx.xsl files for the style sheets used by the C++.
|
||||||
|
#
|
||||||
|
# .pptx and .vsdx are processed by this Python module because the C++ module
|
||||||
|
# can't process their multiple document structure (pages) at the moment.
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
import fnmatch
|
|
||||||
import rclexecm
|
import rclexecm
|
||||||
from rclbasehandler import RclBaseHandler
|
from rclbasehandler import RclBaseHandler
|
||||||
import rclxslt
|
import rclxslt
|
||||||
|
import re
|
||||||
|
|
||||||
|
#
|
||||||
|
# Common style sheet for the openxml metadata
|
||||||
|
#
|
||||||
meta_stylesheet = '''<?xml version="1.0"?>
|
meta_stylesheet = '''<?xml version="1.0"?>
|
||||||
<xsl:stylesheet
|
<xsl:stylesheet
|
||||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||||
@ -75,6 +83,9 @@ meta_stylesheet = '''<?xml version="1.0"?>
|
|||||||
</xsl:stylesheet>
|
</xsl:stylesheet>
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
#####################################
|
||||||
|
# .docx definitions. Not used any more by Recoll in its default config
|
||||||
|
|
||||||
word_tagmatch = 'w:p'
|
word_tagmatch = 'w:p'
|
||||||
word_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
word_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||||
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
|
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
|
||||||
@ -90,12 +101,19 @@ xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
|
|||||||
word_moretemplates = ''
|
word_moretemplates = ''
|
||||||
|
|
||||||
|
|
||||||
|
#####################################
|
||||||
|
# .xlsx definitions. Not used any more by Recoll in its default config
|
||||||
|
|
||||||
xl_tagmatch = 'x:t'
|
xl_tagmatch = 'x:t'
|
||||||
xl_xmlns_decls='''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
xl_xmlns_decls='''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||||
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
|
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
|
||||||
'''
|
'''
|
||||||
xl_moretemplates = ''
|
xl_moretemplates = ''
|
||||||
|
|
||||||
|
|
||||||
|
#####################
|
||||||
|
# .pptx definitions
|
||||||
|
|
||||||
pp_tagmatch = 'a:t'
|
pp_tagmatch = 'a:t'
|
||||||
pp_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
pp_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||||
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
||||||
@ -108,6 +126,21 @@ pp_moretemplates = '''<xsl:template match="p:attrName">
|
|||||||
</xsl:template>
|
</xsl:template>
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
#####################
|
||||||
|
# .vsdx definitions
|
||||||
|
|
||||||
|
vs_tagmatch = 'Text'
|
||||||
|
vs_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
||||||
|
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||||
|
'''
|
||||||
|
vs_moretemplates = ''
|
||||||
|
|
||||||
|
|
||||||
|
##############################
|
||||||
|
# Common style sheet (with replaceable parts) for .pptx and .vsdx (also .docx
|
||||||
|
# and .xlsx, but not used by default).
|
||||||
|
|
||||||
content_stylesheet = '''<?xml version="1.0"?>
|
content_stylesheet = '''<?xml version="1.0"?>
|
||||||
<xsl:stylesheet @XMLNS_DECLS@ >
|
<xsl:stylesheet @XMLNS_DECLS@ >
|
||||||
|
|
||||||
@ -179,16 +212,39 @@ class OXExtractor(RclBaseHandler):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
stl = self.computestylesheet('pp')
|
stl = None
|
||||||
# Note that we'd need a numeric sort really (else we get slide1
|
# Extract number suffix for numeric sort
|
||||||
# slide11 slide2)
|
prefix = "ppt/slides/slide"
|
||||||
for fn in sorted(zip.namelist()):
|
exp = prefix + '[0-9]+' + '.xml'
|
||||||
if fnmatch.fnmatch(fn, 'ppt/slides/slide*.xml'):
|
names = [fn for fn in zip.namelist() if re.match(exp, fn)]
|
||||||
content = zip.read(fn)
|
for fn in sorted(
|
||||||
docdata += rclxslt.apply_sheet_data(stl, content)
|
names,
|
||||||
except:
|
key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
|
||||||
|
if stl is None:
|
||||||
|
stl = self.computestylesheet('pp')
|
||||||
|
content = zip.read(fn)
|
||||||
|
docdata += rclxslt.apply_sheet_data(stl, content)
|
||||||
|
except Exception as ex:
|
||||||
|
#self.em.rclog("PPT Exception: %s" % ex)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
stl = None
|
||||||
|
# Extract number suffix for numeric sort
|
||||||
|
prefix = 'visio/pages/page'
|
||||||
|
exp = prefix + '[0-9]+' + '.xml'
|
||||||
|
names = [fn for fn in zip.namelist() if re.match(exp, fn)]
|
||||||
|
for fn in sorted(
|
||||||
|
names,
|
||||||
|
key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
|
||||||
|
if stl is None:
|
||||||
|
stl = self.computestylesheet('vs')
|
||||||
|
content = zip.read(fn)
|
||||||
|
docdata += rclxslt.apply_sheet_data(stl, content)
|
||||||
|
except Exception as ex:
|
||||||
|
#self.em.rclog("VISIO Exception: %s" % ex)
|
||||||
|
pass
|
||||||
|
|
||||||
docdata += b'</body></html>'
|
docdata += b'</body></html>'
|
||||||
|
|
||||||
return docdata
|
return docdata
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
# Copyright (C) 2014 J.F.Dockes
|
# Copyright (C) 2014-2020 J.F.Dockes
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License as published by
|
# it under the terms of the GNU General Public License as published by
|
||||||
# the Free Software Foundation; either version 2 of the License, or
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -15,58 +15,28 @@
|
|||||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
######################################
|
######################################
|
||||||
|
|
||||||
# Helper module for xslt-based filters
|
# Common code for the remaining Python xslt-based filters (most xslt work is
|
||||||
|
# now done in the c++ mh_xslt module, the ones remaining don't fit with its
|
||||||
from __future__ import print_function
|
# model).
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
PY2 = sys.version < '3'
|
try:
|
||||||
|
from lxml import etree
|
||||||
|
except:
|
||||||
|
print("RECFILTERROR HELPERNOTFOUND python3:lxml")
|
||||||
|
sys.exit(1);
|
||||||
|
|
||||||
if PY2:
|
def _apply_sheet_doc(sheet, doc):
|
||||||
try:
|
styledoc = etree.fromstring(sheet)
|
||||||
import libxml2
|
transform = etree.XSLT(styledoc)
|
||||||
import libxslt
|
return bytes(transform(doc))
|
||||||
libxml2.substituteEntitiesDefault(1)
|
|
||||||
except:
|
def apply_sheet_data(sheet, data):
|
||||||
print("RECFILTERROR HELPERNOTFOUND python:libxml2/python:libxslt1")
|
doc = etree.fromstring(data)
|
||||||
sys.exit(1);
|
return _apply_sheet_doc(sheet, doc)
|
||||||
def _apply_sheet_doc(sheet, doc):
|
|
||||||
styledoc = libxml2.readMemory(sheet, len(sheet), '', '',
|
def apply_sheet_file(sheet, fn):
|
||||||
options=libxml2.XML_PARSE_NONET)
|
doc = etree.parse(fn)
|
||||||
style = libxslt.parseStylesheetDoc(styledoc)
|
return _apply_sheet_doc(sheet, doc)
|
||||||
result = style.applyStylesheet(doc, None)
|
|
||||||
res = ""
|
|
||||||
try:
|
|
||||||
res = style.saveResultToString(result)
|
|
||||||
except Exception as err:
|
|
||||||
# print("saveResultToString got exception: %s"%err)
|
|
||||||
pass
|
|
||||||
style.freeStylesheet()
|
|
||||||
doc.freeDoc()
|
|
||||||
result.freeDoc()
|
|
||||||
return res
|
|
||||||
def apply_sheet_data(sheet, data):
|
|
||||||
doc = libxml2.readMemory(data, len(data), '', '',
|
|
||||||
options=libxml2.XML_PARSE_NONET)
|
|
||||||
return _apply_sheet_doc(sheet, doc)
|
|
||||||
def apply_sheet_file(sheet, fn):
|
|
||||||
doc = libxml2.readFile(fn, '', options=libxml2.XML_PARSE_NONET)
|
|
||||||
return _apply_sheet_doc(sheet, doc)
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
from lxml import etree
|
|
||||||
except:
|
|
||||||
print("RECFILTERROR HELPERNOTFOUND python3:lxml")
|
|
||||||
sys.exit(1);
|
|
||||||
def _apply_sheet_doc(sheet, doc):
|
|
||||||
styledoc = etree.fromstring(sheet)
|
|
||||||
transform = etree.XSLT(styledoc)
|
|
||||||
return bytes(transform(doc))
|
|
||||||
def apply_sheet_data(sheet, data):
|
|
||||||
doc = etree.fromstring(data)
|
|
||||||
return _apply_sheet_doc(sheet, doc)
|
|
||||||
def apply_sheet_file(sheet, fn):
|
|
||||||
doc = etree.parse(fn)
|
|
||||||
return _apply_sheet_doc(sheet, doc)
|
|
||||||
|
|
||||||
|
|||||||
@ -108,6 +108,7 @@ application/vnd.openxmlformats-officedocument.presentationml.template = \
|
|||||||
execm rclopxml.py
|
execm rclopxml.py
|
||||||
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
||||||
execm rclopxml.py
|
execm rclopxml.py
|
||||||
|
application/vnd.ms-visio.drawing = execm rclopxml.py
|
||||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
||||||
internal xsltproc meta docProps/core.xml openxml-meta.xsl \
|
internal xsltproc meta docProps/core.xml openxml-meta.xsl \
|
||||||
body xl/sharedStrings.xml openxml-xls-body.xsl
|
body xl/sharedStrings.xml openxml-xls-body.xsl
|
||||||
@ -270,6 +271,7 @@ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = spreadsheet
|
|||||||
application/vnd.openxmlformats-officedocument.spreadsheetml.template = spreadsheet
|
application/vnd.openxmlformats-officedocument.spreadsheetml.template = spreadsheet
|
||||||
application/vnd.openxmlformats-officedocument.wordprocessingml.document = wordprocessing
|
application/vnd.openxmlformats-officedocument.wordprocessingml.document = wordprocessing
|
||||||
application/vnd.openxmlformats-officedocument.wordprocessingml.template = wordprocessing
|
application/vnd.openxmlformats-officedocument.wordprocessingml.template = wordprocessing
|
||||||
|
application/vnd.ms-visio.drawing = document
|
||||||
application/vnd.sun.xml.calc = spreadsheet
|
application/vnd.sun.xml.calc = spreadsheet
|
||||||
application/vnd.sun.xml.calc.template = spreadsheet
|
application/vnd.sun.xml.calc.template = spreadsheet
|
||||||
application/vnd.sun.xml.draw = drawing
|
application/vnd.sun.xml.draw = drawing
|
||||||
@ -460,6 +462,7 @@ presentation = \
|
|||||||
application/vnd.openxmlformats-officedocument.presentationml.template \
|
application/vnd.openxmlformats-officedocument.presentationml.template \
|
||||||
application/vnd.sun.xml.impress \
|
application/vnd.sun.xml.impress \
|
||||||
application/vnd.sun.xml.impress.template \
|
application/vnd.sun.xml.impress.template \
|
||||||
|
application/vnd.ms-visio.drawing
|
||||||
|
|
||||||
media = \
|
media = \
|
||||||
application/ogg \
|
application/ogg \
|
||||||
|
|||||||
@ -147,6 +147,7 @@
|
|||||||
.ppsx = application/vnd.openxmlformats-officedocument.presentationml.slideshow
|
.ppsx = application/vnd.openxmlformats-officedocument.presentationml.slideshow
|
||||||
.pptm = application/vnd.ms-powerpoint.presentation.macroEnabled.12
|
.pptm = application/vnd.ms-powerpoint.presentation.macroEnabled.12
|
||||||
.pptx = application/vnd.openxmlformats-officedocument.presentationml.presentation
|
.pptx = application/vnd.openxmlformats-officedocument.presentationml.presentation
|
||||||
|
.vsdx = application/vnd.ms-visio.drawing
|
||||||
.xlam = application/vnd.ms-excel.addin.macroEnabled.12
|
.xlam = application/vnd.ms-excel.addin.macroEnabled.12
|
||||||
.xlsb = application/vnd.ms-excel.sheet.binary.macroEnabled.12
|
.xlsb = application/vnd.ms-excel.sheet.binary.macroEnabled.12
|
||||||
.xlsm = application/vnd.ms-excel.sheet.macroEnabled.12
|
.xlsm = application/vnd.ms-excel.sheet.macroEnabled.12
|
||||||
|
|||||||
@ -19,9 +19,12 @@
|
|||||||
# Use xallexcepts- and xallexcepts+ in a user file to add or remove from
|
# Use xallexcepts- and xallexcepts+ in a user file to add or remove from
|
||||||
# the default xallexcepts list
|
# the default xallexcepts list
|
||||||
|
|
||||||
|
# Visio xml is in there because the desktops tend to try and open it as an
|
||||||
|
# archive
|
||||||
xallexcepts = application/pdf application/postscript application/x-dvi \
|
xallexcepts = application/pdf application/postscript application/x-dvi \
|
||||||
text/html|gnuinfo text/html|chm text/html|epub text/html|rclman \
|
text/html|gnuinfo text/html|chm text/html|epub text/html|rclman \
|
||||||
application/x-fsdirectory|parentopen inode/directory|parentopen
|
application/x-fsdirectory|parentopen inode/directory|parentopen \
|
||||||
|
application/vnd.ms-visio.drawing
|
||||||
|
|
||||||
|
|
||||||
[view]
|
[view]
|
||||||
@ -75,6 +78,7 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.template = \
|
|||||||
libreoffice %f
|
libreoffice %f
|
||||||
application/vnd.openxmlformats-officedocument.presentationml.template = \
|
application/vnd.openxmlformats-officedocument.presentationml.template = \
|
||||||
libreoffice %f
|
libreoffice %f
|
||||||
|
application/vnd.ms-visio.drawing = libreoffice %f
|
||||||
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
application/vnd.openxmlformats-officedocument.presentationml.presentation = \
|
||||||
libreoffice %f
|
libreoffice %f
|
||||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet = \
|
||||||
|
|||||||
@ -10,6 +10,7 @@ initvariables $0
|
|||||||
recollq author:ben '"Consideration of the high correlation"'
|
recollq author:ben '"Consideration of the high correlation"'
|
||||||
recollq '"The Circassian Education Foundation"' date:2008-01-20
|
recollq '"The Circassian Education Foundation"' date:2008-01-20
|
||||||
recollq author:"Johnny Walker" '"Thin Lizzy"'
|
recollq author:"Johnny Walker" '"Thin Lizzy"'
|
||||||
|
recollq '"Objekt steht im Akkusativ"'
|
||||||
|
|
||||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||||
|
|
||||||
|
|||||||
@ -4,3 +4,5 @@ application/vnd.openxmlformats-officedocument.wordprocessingml.document [file://
|
|||||||
application/vnd.openxmlformats-officedocument.presentationml.presentation [file:///home/dockes/projets/fulltext/testrecoll/opxml/College_Application_Process_1_18_08.pptx] [College_Application_Process_1_18_08.pptx] 713280 bytes
|
application/vnd.openxmlformats-officedocument.presentationml.presentation [file:///home/dockes/projets/fulltext/testrecoll/opxml/College_Application_Process_1_18_08.pptx] [College_Application_Process_1_18_08.pptx] 713280 bytes
|
||||||
1 results
|
1 results
|
||||||
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet [file:///home/dockes/projets/fulltext/testrecoll/opxml/GuitarHero2_Helper.xlsx] [GuitarHero2_Helper.xlsx] 17147 bytes
|
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet [file:///home/dockes/projets/fulltext/testrecoll/opxml/GuitarHero2_Helper.xlsx] [GuitarHero2_Helper.xlsx] 17147 bytes
|
||||||
|
1 results
|
||||||
|
application/vnd.ms-visio.drawing [file:///home/dockes/projets/fulltext/testrecoll/visio/visio2.vsdx] [visio2.vsdx] 96002 bytes
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user