From e71d7f183fee3bd30096d5947e907321ac70e046 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Mon, 25 Mar 2019 11:30:50 +0100
Subject: [PATCH] Python filters: using list append + join instead of string
 append improves performance hugely for big (book-sized) documents. The impact
 on a typical PDF mix is moderate, though.
---
src/filters/rcldoc.py | 24 ++++++++++++------------
src/filters/rclexec1.py | 2 +-
src/filters/rclpdf.py | 10 +++++-----
src/filters/rclppt.py | 14 +++++++-------
src/filters/rclrtf.py | 14 +++++++-------
src/filters/rclxls.py | 20 ++++++++++----------
src/filters/xlsxmltocsv.py | 17 +++++++++--------
7 files changed, 51 insertions(+), 50 deletions(-)
diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py
index d6bd06fe..104c420d 100755
--- a/src/filters/rcldoc.py
+++ b/src/filters/rcldoc.py
@@ -12,7 +12,7 @@ import os
class WordProcessData:
def __init__(self, em):
self.em = em
- self.out = b''
+ self.out = []
self.cont = b''
self.gotdata = False
# Line with continued word (ending in -)
@@ -26,10 +26,10 @@ class WordProcessData:
if not self.gotdata:
if line == b'':
return
- self.out = b'' + \
+ self.out.append(b'' + \
b'' + \
- b''
+ b'
')
self.gotdata = True
if self.cont:
@@ -37,7 +37,7 @@ class WordProcessData:
self.cont = ""
if line == b'\f':
- self.out += '
'
+ self.out.append('
')
return
if self.patcont.search(line):
@@ -51,30 +51,30 @@ class WordProcessData:
line = b''
if line:
- self.out += self.em.htmlescape(line) + b'
'
+ self.out.append(self.em.htmlescape(line) + b'
')
else:
- self.out += b'
'
+ self.out.append(b'
')
def wrapData(self):
if self.gotdata:
- self.out += b'
'
+ self.out.append(b'
')
self.em.setmimetype("text/html")
- return self.out
+ return b'\n'.join(self.out)
-# Null data accumulator. We use this when antiword has fail, and the
+# Null data accumulator. We use this when antiword has failed, and the
# data actually comes from rclrtf, rcltext or vwWare, which all
# output HTML
class WordPassData:
def __init__(self, em):
- self.out = b''
+ self.out = []
self.em = em
def takeLine(self, line):
- self.out += line
+ self.out.append(line)
def wrapData(self):
self.em.setmimetype("text/html")
- return self.out
+ return b'\n'.join(self.out)
# Filter for msword docs. Try antiword, and if this fails, check for
diff --git a/src/filters/rclexec1.py b/src/filters/rclexec1.py
index a9e9847f..dc4f818e 100644
--- a/src/filters/rclexec1.py
+++ b/src/filters/rclexec1.py
@@ -56,7 +56,7 @@ class Executor(RclBaseHandler):
except Exception as err:
self.em.rclog("runCmd: error reading %s: %s"%(filename, err))
return(False, "")
- for line in data.split('\n'):
+ for line in data.split(b'\n'):
postproc.takeLine(line)
return True, postproc.wrapData()
else:
diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py
index 4104cfc1..af92d057 100755
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@@ -322,7 +322,7 @@ class PDFExtractor:
inheader = False
inbody = False
didcs = False
- output = b''
+ output = []
isempty = True
for line in input.split(b'\n'):
if re.search(b'', line):
@@ -331,8 +331,8 @@ class PDFExtractor:
inbody = False
if inheader:
if not didcs:
- output += b'\n'
+ output.append(b'\n')
didcs = True
if self.needescape:
m = re.search(b'''(.*)(.*)(<\/title>.*)''', line)
@@ -361,9 +361,9 @@ class PDFExtractor:
if re.search(b'', line):
inbody = True
- output += line + b'\n'
+ output.append(line)
- return output, isempty
+ return b'\n'.join(output), isempty
def _metatag(self, nm, val):
return "''' + \
- b'''''' + \
- b''''''
+ self.out.append(b'' + \
+ b'' + \
+ b'')
self.gotdata = True
- self.out += self.em.htmlescape(line) + b"
\n"
+ self.out.append(self.em.htmlescape(line))
def wrapData(self):
- return self.out + b''''''
+ return b'\n'.join(self.out) + b'''