diff --git a/src/filters/rclopxml.py b/src/filters/rclopxml.py
index 5bd05ed7..97e5bf58 100755
--- a/src/filters/rclopxml.py
+++ b/src/filters/rclopxml.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
-# Copyright (C) 2015 J.F.Dockes
+# Copyright (C) 2015-2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
@@ -15,15 +15,23 @@
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
######################################
-from __future__ import print_function
+
+# Note that .docx and .xlsx are now normally processed by the C++ module
+# (mh_xslt.cpp). See the openxml-xxx.xsl files for its style sheets.
+#
+# .pptx and .vsdx are processed by this Python module because the C++
+# module cannot yet handle their multi-page structure (one XML file per page).
import sys
from zipfile import ZipFile
-import fnmatch
import rclexecm
from rclbasehandler import RclBaseHandler
import rclxslt
+import re
+#
+# Common style sheet for the openxml metadata
+#
meta_stylesheet = '''
'''
+#####################################
+# .docx definitions. Not used any more by Recoll in its default config
+
word_tagmatch = 'w:p'
word_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
@@ -90,12 +101,19 @@ xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
word_moretemplates = ''
+#####################################
+# .xlsx definitions. Not used any more by Recoll in its default config
+
xl_tagmatch = 'x:t'
xl_xmlns_decls='''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
'''
xl_moretemplates = ''
+
+#####################
+# .pptx definitions
+
pp_tagmatch = 'a:t'
pp_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
@@ -108,6 +126,21 @@ pp_moretemplates = '''
'''
+
+#####################
+# .vsdx definitions
+
+vs_tagmatch = 'Text'
+vs_xmlns_decls = '''xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
+xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+'''
+vs_moretemplates = ''
+
+
+##############################
+# Common style sheet (with replaceable parts) for .pptx and .vsdx (also .docx
+# and .xlsx, but not used by default).
+
content_stylesheet = '''
@@ -179,16 +212,39 @@ class OXExtractor(RclBaseHandler):
pass
try:
- stl = self.computestylesheet('pp')
- # Note that we'd need a numeric sort really (else we get slide1
- # slide11 slide2)
- for fn in sorted(zip.namelist()):
- if fnmatch.fnmatch(fn, 'ppt/slides/slide*.xml'):
- content = zip.read(fn)
- docdata += rclxslt.apply_sheet_data(stl, content)
- except:
+ stl = None
+ # Extract number suffix for numeric sort
+ prefix = "ppt/slides/slide"
+ exp = prefix + '[0-9]+' + '.xml'
+ names = [fn for fn in zip.namelist() if re.match(exp, fn)]
+ for fn in sorted(
+ names,
+ key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
+ if stl is None:
+ stl = self.computestylesheet('pp')
+ content = zip.read(fn)
+ docdata += rclxslt.apply_sheet_data(stl, content)
+ except Exception as ex:
+ #self.em.rclog("PPT Exception: %s" % ex)
pass
+ try:
+ stl = None
+ # Extract number suffix for numeric sort
+ prefix = 'visio/pages/page'
+ exp = prefix + '[0-9]+' + '.xml'
+ names = [fn for fn in zip.namelist() if re.match(exp, fn)]
+ for fn in sorted(
+ names,
+ key=lambda e,prefix=prefix: int(e[len(prefix):len(e)-4])):
+ if stl is None:
+ stl = self.computestylesheet('vs')
+ content = zip.read(fn)
+ docdata += rclxslt.apply_sheet_data(stl, content)
+ except Exception as ex:
+ #self.em.rclog("VISIO Exception: %s" % ex)
+ pass
+
docdata += b'