Python filters: using list append + join instead of string append hugely improves performance for big (book-sized) documents. The impact on a typical PDF mix is only moderate, though.

This commit is contained in:
Jean-Francois Dockes 2019-03-25 11:30:50 +01:00
parent 79724b1d28
commit e71d7f183f
7 changed files with 51 additions and 50 deletions

View File

@ -12,7 +12,7 @@ import os
class WordProcessData: class WordProcessData:
def __init__(self, em): def __init__(self, em):
self.em = em self.em = em
self.out = b'' self.out = []
self.cont = b'' self.cont = b''
self.gotdata = False self.gotdata = False
# Line with continued word (ending in -) # Line with continued word (ending in -)
@ -26,10 +26,10 @@ class WordProcessData:
if not self.gotdata: if not self.gotdata:
if line == b'': if line == b'':
return return
self.out = b'<html><head><title></title>' + \ self.out.append(b'<html><head><title></title>' + \
b'<meta http-equiv="Content-Type"' + \ b'<meta http-equiv="Content-Type"' + \
b'content="text/html;charset=UTF-8">' + \ b'content="text/html;charset=UTF-8">' + \
b'</head><body><p>' b'</head><body><p>')
self.gotdata = True self.gotdata = True
if self.cont: if self.cont:
@ -37,7 +37,7 @@ class WordProcessData:
self.cont = "" self.cont = ""
if line == b'\f': if line == b'\f':
self.out += '</p><hr><p>' self.out.append('</p><hr><p>')
return return
if self.patcont.search(line): if self.patcont.search(line):
@ -51,30 +51,30 @@ class WordProcessData:
line = b'' line = b''
if line: if line:
self.out += self.em.htmlescape(line) + b'<br>' self.out.append(self.em.htmlescape(line) + b'<br>')
else: else:
self.out += b'<br>' self.out.append(b'<br>')
def wrapData(self): def wrapData(self):
if self.gotdata: if self.gotdata:
self.out += b'</p></body></html>' self.out.append(b'</p></body></html>')
self.em.setmimetype("text/html") self.em.setmimetype("text/html")
return self.out return b'\n'.join(self.out)
# Null data accumulator. We use this when antiword has fail, and the # Null data accumulator. We use this when antiword has failed, and the
# data actually comes from rclrtf, rcltext or wvWare, which all # data actually comes from rclrtf, rcltext or wvWare, which all
# output HTML # output HTML
class WordPassData: class WordPassData:
def __init__(self, em): def __init__(self, em):
self.out = b'' self.out = []
self.em = em self.em = em
def takeLine(self, line): def takeLine(self, line):
self.out += line self.out.append(line)
def wrapData(self): def wrapData(self):
self.em.setmimetype("text/html") self.em.setmimetype("text/html")
return self.out return b'\n'.join(self.out)
# Filter for msword docs. Try antiword, and if this fails, check for # Filter for msword docs. Try antiword, and if this fails, check for

View File

@ -56,7 +56,7 @@ class Executor(RclBaseHandler):
except Exception as err: except Exception as err:
self.em.rclog("runCmd: error reading %s: %s"%(filename, err)) self.em.rclog("runCmd: error reading %s: %s"%(filename, err))
return(False, "") return(False, "")
for line in data.split('\n'): for line in data.split(b'\n'):
postproc.takeLine(line) postproc.takeLine(line)
return True, postproc.wrapData() return True, postproc.wrapData()
else: else:

View File

@ -322,7 +322,7 @@ class PDFExtractor:
inheader = False inheader = False
inbody = False inbody = False
didcs = False didcs = False
output = b'' output = []
isempty = True isempty = True
for line in input.split(b'\n'): for line in input.split(b'\n'):
if re.search(b'</head>', line): if re.search(b'</head>', line):
@ -331,8 +331,8 @@ class PDFExtractor:
inbody = False inbody = False
if inheader: if inheader:
if not didcs: if not didcs:
output += b'<meta http-equiv="Content-Type"' + \ output.append(b'<meta http-equiv="Content-Type"' + \
b'content="text/html; charset=UTF-8">\n' b'content="text/html; charset=UTF-8">\n')
didcs = True didcs = True
if self.needescape: if self.needescape:
m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line) m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
@ -361,9 +361,9 @@ class PDFExtractor:
if re.search(b'<pre>', line): if re.search(b'<pre>', line):
inbody = True inbody = True
output += line + b'\n' output.append(line)
return output, isempty return b'\n'.join(output), isempty
def _metatag(self, nm, val): def _metatag(self, nm, val):
return "<meta name=\"" + nm + "\" content=\"" + \ return "<meta name=\"" + nm + "\" content=\"" + \

View File

@ -13,20 +13,20 @@ import os
class PPTProcessData: class PPTProcessData:
def __init__(self, em): def __init__(self, em):
self.em = em self.em = em
self.out = b"" self.out = []
self.gotdata = 0 self.gotdata = 0
def takeLine(self, line): def takeLine(self, line):
if not self.gotdata: if not self.gotdata:
self.out += b'''<html><head>''' + \ self.out.append(b'<html><head>' + \
b'''<meta http-equiv="Content-Type" ''' + \ b'<meta http-equiv="Content-Type" ' + \
b'''content="text/html;charset=UTF-8">''' + \ b'content="text/html;charset=UTF-8">' + \
b'''</head><body><pre>''' b'</head><body><pre>')
self.gotdata = True self.gotdata = True
self.out += self.em.htmlescape(line) + b"<br>\n" self.out.append(self.em.htmlescape(line))
def wrapData(self): def wrapData(self):
return self.out + b'''</pre></body></html>''' return b'\n'.join(self.out) + b'''</pre></body></html>'''
class PPTFilter: class PPTFilter:
def __init__(self, em): def __init__(self, em):

View File

@ -11,7 +11,7 @@ import os
class RTFProcessData: class RTFProcessData:
def __init__(self, em): def __init__(self, em):
self.em = em self.em = em
self.out = b'' self.out = []
self.gothead = 0 self.gothead = 0
self.patendhead = re.compile(b'''</head>''') self.patendhead = re.compile(b'''</head>''')
self.patcharset = re.compile(b'''^<meta http-equiv=''') self.patcharset = re.compile(b'''^<meta http-equiv=''')
@ -21,17 +21,17 @@ class RTFProcessData:
def takeLine(self, line): def takeLine(self, line):
if not self.gothead: if not self.gothead:
if self.patendhead.search(line): if self.patendhead.search(line):
self.out += b'<meta http-equiv="Content-Type" ' + \ self.out.append(b'<meta http-equiv="Content-Type" ' + \
b'content="text/html;charset=UTF-8">' + b'\n' b'content="text/html;charset=UTF-8">')
self.out += line + b'\n' self.out.append(line)
self.gothead = 1 self.gothead = 1
elif not self.patcharset.search(line): elif not self.patcharset.search(line):
self.out += line + b'\n' self.out.append(line)
else: else:
self.out += line + b'\n' self.out.append(line)
def wrapData(self): def wrapData(self):
return self.out return b'\n'.join(self.out)
class RTFFilter: class RTFFilter:
def __init__(self, em): def __init__(self, em):

View File

@ -13,30 +13,30 @@ import xml.sax
class XLSProcessData: class XLSProcessData:
def __init__(self, em, ishtml = False): def __init__(self, em, ishtml = False):
self.em = em self.em = em
self.out = b"" self.out = []
self.gotdata = 0 self.gotdata = 0
self.xmldata = b"" self.xmldata = []
self.ishtml = ishtml self.ishtml = ishtml
def takeLine(self, line): def takeLine(self, line):
if self.ishtml: if self.ishtml:
self.out += line + "\n" self.out.append(line)
return return
if not self.gotdata: if not self.gotdata:
self.out += b'''<html><head>''' + \ self.out.append(b'''<html><head>''' + \
b'''<meta http-equiv="Content-Type" ''' + \ b'''<meta http-equiv="Content-Type" ''' + \
b'''content="text/html;charset=UTF-8">''' + \ b'''content="text/html;charset=UTF-8">''' + \
b'''</head><body><pre>''' b'''</head><body><pre>''')
self.gotdata = True self.gotdata = True
self.xmldata += line self.xmldata.append(line)
def wrapData(self): def wrapData(self):
if self.ishtml: if self.ishtml:
return self.out return b'\n'.join(self.out)
handler = xlsxmltocsv.XlsXmlHandler() handler = xlsxmltocsv.XlsXmlHandler()
xml.sax.parseString(self.xmldata, handler) xml.sax.parseString(b'\n'.join(self.xmldata), handler)
self.out += self.em.htmlescape(handler.output) self.out.append(self.em.htmlescape(b'\n'.join(handler.output)))
return self.out + b'''</pre></body></html>''' return b'\n'.join(self.out) + b'</pre></body></html>'
class XLSFilter: class XLSFilter:
def __init__(self, em): def __init__(self, em):

View File

@ -39,12 +39,12 @@ else:
class XlsXmlHandler(xml.sax.handler.ContentHandler): class XlsXmlHandler(xml.sax.handler.ContentHandler):
def __init__(self): def __init__(self):
self.output = b'' self.output = []
def startElement(self, name, attrs): def startElement(self, name, attrs):
if name == "worksheet": if name == "worksheet":
if "name" in attrs: if "name" in attrs:
self.output += b"%s\n" % attrs["name"].encode("UTF-8") self.output.append(b"%s\n" % attrs["name"].encode("UTF-8"))
elif name == "row": elif name == "row":
self.cells = dict() self.cells = dict()
elif name == "label-cell" or name == "number-cell": elif name == "label-cell" or name == "number-cell":
@ -56,7 +56,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
self.cells[int(attrs["col"])] = value self.cells[int(attrs["col"])] = value
else: else:
#?? #??
self.output += b"%s%s" % (value.encode("UTF-8"), sepstring) self.output.append(b"%s%s" % (value.encode("UTF-8"), sepstring))
elif name == "formula-cell": elif name == "formula-cell":
if "formula-result" in attrs and "col" in attrs: if "formula-result" in attrs and "col" in attrs:
self.cells[int(attrs["col"])] = \ self.cells[int(attrs["col"])] = \
@ -65,20 +65,21 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
def endElement(self, name, ): def endElement(self, name, ):
if name == "row": if name == "row":
curidx = 0 curidx = 0
line = []
for idx, value in self.cells.items(): for idx, value in self.cells.items():
self.output += sepstring * (idx - curidx) line.append(sepstring * (idx - curidx))
self.output += b"%s%s%s" % (dquote, value, dquote) line.append(b"%s%s%s" % (dquote, value, dquote))
curidx = idx curidx = idx
self.output += b"\n" self.output.append(b''.join(line))
elif name == "worksheet": elif name == "worksheet":
self.output += b"\n" self.output.append(b'')
if __name__ == '__main__': if __name__ == '__main__':
try: try:
handler = XlsXmlHandler() handler = XlsXmlHandler()
xml.sax.parse(sys.stdin, handler) xml.sax.parse(sys.stdin, handler)
print(handler.output) print(b'\n'.join(handler.output))
except BaseException as err: except BaseException as err:
print("xml-parse: %s\n" % (str(sys.exc_info()[:2]),), file=sys.stderr) print("xml-parse: %s\n" % (str(sys.exc_info()[:2]),), file=sys.stderr)
sys.exit(1) sys.exit(1)