Python filters: using list append + join instead of string append hugely improves performance for big (book-sized) documents. The impact on a typical PDF mix is only moderate, though.
This commit is contained in:
parent
79724b1d28
commit
e71d7f183f
@ -12,7 +12,7 @@ import os
|
|||||||
class WordProcessData:
|
class WordProcessData:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.out = b''
|
self.out = []
|
||||||
self.cont = b''
|
self.cont = b''
|
||||||
self.gotdata = False
|
self.gotdata = False
|
||||||
# Line with continued word (ending in -)
|
# Line with continued word (ending in -)
|
||||||
@ -26,10 +26,10 @@ class WordProcessData:
|
|||||||
if not self.gotdata:
|
if not self.gotdata:
|
||||||
if line == b'':
|
if line == b'':
|
||||||
return
|
return
|
||||||
self.out = b'<html><head><title></title>' + \
|
self.out.append(b'<html><head><title></title>' + \
|
||||||
b'<meta http-equiv="Content-Type"' + \
|
b'<meta http-equiv="Content-Type"' + \
|
||||||
b'content="text/html;charset=UTF-8">' + \
|
b'content="text/html;charset=UTF-8">' + \
|
||||||
b'</head><body><p>'
|
b'</head><body><p>')
|
||||||
self.gotdata = True
|
self.gotdata = True
|
||||||
|
|
||||||
if self.cont:
|
if self.cont:
|
||||||
@ -37,7 +37,7 @@ class WordProcessData:
|
|||||||
self.cont = ""
|
self.cont = ""
|
||||||
|
|
||||||
if line == b'\f':
|
if line == b'\f':
|
||||||
self.out += '</p><hr><p>'
|
self.out.append('</p><hr><p>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.patcont.search(line):
|
if self.patcont.search(line):
|
||||||
@ -51,30 +51,30 @@ class WordProcessData:
|
|||||||
line = b''
|
line = b''
|
||||||
|
|
||||||
if line:
|
if line:
|
||||||
self.out += self.em.htmlescape(line) + b'<br>'
|
self.out.append(self.em.htmlescape(line) + b'<br>')
|
||||||
else:
|
else:
|
||||||
self.out += b'<br>'
|
self.out.append(b'<br>')
|
||||||
|
|
||||||
def wrapData(self):
|
def wrapData(self):
|
||||||
if self.gotdata:
|
if self.gotdata:
|
||||||
self.out += b'</p></body></html>'
|
self.out.append(b'</p></body></html>')
|
||||||
self.em.setmimetype("text/html")
|
self.em.setmimetype("text/html")
|
||||||
return self.out
|
return b'\n'.join(self.out)
|
||||||
|
|
||||||
# Null data accumulator. We use this when antiword has fail, and the
|
# Null data accumulator. We use this when antiword has failed, and the
|
||||||
# data actually comes from rclrtf, rcltext or vwWare, which all
|
# data actually comes from rclrtf, rcltext or vwWare, which all
|
||||||
# output HTML
|
# output HTML
|
||||||
class WordPassData:
|
class WordPassData:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.out = b''
|
self.out = []
|
||||||
self.em = em
|
self.em = em
|
||||||
|
|
||||||
def takeLine(self, line):
|
def takeLine(self, line):
|
||||||
self.out += line
|
self.out.append(line)
|
||||||
|
|
||||||
def wrapData(self):
|
def wrapData(self):
|
||||||
self.em.setmimetype("text/html")
|
self.em.setmimetype("text/html")
|
||||||
return self.out
|
return b'\n'.join(self.out)
|
||||||
|
|
||||||
|
|
||||||
# Filter for msword docs. Try antiword, and if this fails, check for
|
# Filter for msword docs. Try antiword, and if this fails, check for
|
||||||
|
|||||||
@ -56,7 +56,7 @@ class Executor(RclBaseHandler):
|
|||||||
except Exception as err:
|
except Exception as err:
|
||||||
self.em.rclog("runCmd: error reading %s: %s"%(filename, err))
|
self.em.rclog("runCmd: error reading %s: %s"%(filename, err))
|
||||||
return(False, "")
|
return(False, "")
|
||||||
for line in data.split('\n'):
|
for line in data.split(b'\n'):
|
||||||
postproc.takeLine(line)
|
postproc.takeLine(line)
|
||||||
return True, postproc.wrapData()
|
return True, postproc.wrapData()
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -322,7 +322,7 @@ class PDFExtractor:
|
|||||||
inheader = False
|
inheader = False
|
||||||
inbody = False
|
inbody = False
|
||||||
didcs = False
|
didcs = False
|
||||||
output = b''
|
output = []
|
||||||
isempty = True
|
isempty = True
|
||||||
for line in input.split(b'\n'):
|
for line in input.split(b'\n'):
|
||||||
if re.search(b'</head>', line):
|
if re.search(b'</head>', line):
|
||||||
@ -331,8 +331,8 @@ class PDFExtractor:
|
|||||||
inbody = False
|
inbody = False
|
||||||
if inheader:
|
if inheader:
|
||||||
if not didcs:
|
if not didcs:
|
||||||
output += b'<meta http-equiv="Content-Type"' + \
|
output.append(b'<meta http-equiv="Content-Type"' + \
|
||||||
b'content="text/html; charset=UTF-8">\n'
|
b'content="text/html; charset=UTF-8">\n')
|
||||||
didcs = True
|
didcs = True
|
||||||
if self.needescape:
|
if self.needescape:
|
||||||
m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
|
m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
|
||||||
@ -361,9 +361,9 @@ class PDFExtractor:
|
|||||||
if re.search(b'<pre>', line):
|
if re.search(b'<pre>', line):
|
||||||
inbody = True
|
inbody = True
|
||||||
|
|
||||||
output += line + b'\n'
|
output.append(line)
|
||||||
|
|
||||||
return output, isempty
|
return b'\n'.join(output), isempty
|
||||||
|
|
||||||
def _metatag(self, nm, val):
|
def _metatag(self, nm, val):
|
||||||
return "<meta name=\"" + nm + "\" content=\"" + \
|
return "<meta name=\"" + nm + "\" content=\"" + \
|
||||||
|
|||||||
@ -13,20 +13,20 @@ import os
|
|||||||
class PPTProcessData:
|
class PPTProcessData:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.out = b""
|
self.out = []
|
||||||
self.gotdata = 0
|
self.gotdata = 0
|
||||||
|
|
||||||
def takeLine(self, line):
|
def takeLine(self, line):
|
||||||
if not self.gotdata:
|
if not self.gotdata:
|
||||||
self.out += b'''<html><head>''' + \
|
self.out.append(b'<html><head>' + \
|
||||||
b'''<meta http-equiv="Content-Type" ''' + \
|
b'<meta http-equiv="Content-Type" ' + \
|
||||||
b'''content="text/html;charset=UTF-8">''' + \
|
b'content="text/html;charset=UTF-8">' + \
|
||||||
b'''</head><body><pre>'''
|
b'</head><body><pre>')
|
||||||
self.gotdata = True
|
self.gotdata = True
|
||||||
self.out += self.em.htmlescape(line) + b"<br>\n"
|
self.out.append(self.em.htmlescape(line))
|
||||||
|
|
||||||
def wrapData(self):
|
def wrapData(self):
|
||||||
return self.out + b'''</pre></body></html>'''
|
return b'\n'.join(self.out) + b'''</pre></body></html>'''
|
||||||
|
|
||||||
class PPTFilter:
|
class PPTFilter:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import os
|
|||||||
class RTFProcessData:
|
class RTFProcessData:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.out = b''
|
self.out = []
|
||||||
self.gothead = 0
|
self.gothead = 0
|
||||||
self.patendhead = re.compile(b'''</head>''')
|
self.patendhead = re.compile(b'''</head>''')
|
||||||
self.patcharset = re.compile(b'''^<meta http-equiv=''')
|
self.patcharset = re.compile(b'''^<meta http-equiv=''')
|
||||||
@ -21,17 +21,17 @@ class RTFProcessData:
|
|||||||
def takeLine(self, line):
|
def takeLine(self, line):
|
||||||
if not self.gothead:
|
if not self.gothead:
|
||||||
if self.patendhead.search(line):
|
if self.patendhead.search(line):
|
||||||
self.out += b'<meta http-equiv="Content-Type" ' + \
|
self.out.append(b'<meta http-equiv="Content-Type" ' + \
|
||||||
b'content="text/html;charset=UTF-8">' + b'\n'
|
b'content="text/html;charset=UTF-8">')
|
||||||
self.out += line + b'\n'
|
self.out.append(line)
|
||||||
self.gothead = 1
|
self.gothead = 1
|
||||||
elif not self.patcharset.search(line):
|
elif not self.patcharset.search(line):
|
||||||
self.out += line + b'\n'
|
self.out.append(line)
|
||||||
else:
|
else:
|
||||||
self.out += line + b'\n'
|
self.out.append(line)
|
||||||
|
|
||||||
def wrapData(self):
|
def wrapData(self):
|
||||||
return self.out
|
return b'\n'.join(self.out)
|
||||||
|
|
||||||
class RTFFilter:
|
class RTFFilter:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
|
|||||||
@ -13,30 +13,30 @@ import xml.sax
|
|||||||
class XLSProcessData:
|
class XLSProcessData:
|
||||||
def __init__(self, em, ishtml = False):
|
def __init__(self, em, ishtml = False):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.out = b""
|
self.out = []
|
||||||
self.gotdata = 0
|
self.gotdata = 0
|
||||||
self.xmldata = b""
|
self.xmldata = []
|
||||||
self.ishtml = ishtml
|
self.ishtml = ishtml
|
||||||
|
|
||||||
def takeLine(self, line):
|
def takeLine(self, line):
|
||||||
if self.ishtml:
|
if self.ishtml:
|
||||||
self.out += line + "\n"
|
self.out.append(line)
|
||||||
return
|
return
|
||||||
if not self.gotdata:
|
if not self.gotdata:
|
||||||
self.out += b'''<html><head>''' + \
|
self.out.append(b'''<html><head>''' + \
|
||||||
b'''<meta http-equiv="Content-Type" ''' + \
|
b'''<meta http-equiv="Content-Type" ''' + \
|
||||||
b'''content="text/html;charset=UTF-8">''' + \
|
b'''content="text/html;charset=UTF-8">''' + \
|
||||||
b'''</head><body><pre>'''
|
b'''</head><body><pre>''')
|
||||||
self.gotdata = True
|
self.gotdata = True
|
||||||
self.xmldata += line
|
self.xmldata.append(line)
|
||||||
|
|
||||||
def wrapData(self):
|
def wrapData(self):
|
||||||
if self.ishtml:
|
if self.ishtml:
|
||||||
return self.out
|
return b'\n'.join(self.out)
|
||||||
handler = xlsxmltocsv.XlsXmlHandler()
|
handler = xlsxmltocsv.XlsXmlHandler()
|
||||||
xml.sax.parseString(self.xmldata, handler)
|
xml.sax.parseString(b'\n'.join(self.xmldata), handler)
|
||||||
self.out += self.em.htmlescape(handler.output)
|
self.out.append(self.em.htmlescape(b'\n'.join(handler.output)))
|
||||||
return self.out + b'''</pre></body></html>'''
|
return b'\n'.join(self.out) + b'</pre></body></html>'
|
||||||
|
|
||||||
class XLSFilter:
|
class XLSFilter:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
|
|||||||
@ -39,12 +39,12 @@ else:
|
|||||||
|
|
||||||
class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.output = b''
|
self.output = []
|
||||||
|
|
||||||
def startElement(self, name, attrs):
|
def startElement(self, name, attrs):
|
||||||
if name == "worksheet":
|
if name == "worksheet":
|
||||||
if "name" in attrs:
|
if "name" in attrs:
|
||||||
self.output += b"%s\n" % attrs["name"].encode("UTF-8")
|
self.output.append(b"%s\n" % attrs["name"].encode("UTF-8"))
|
||||||
elif name == "row":
|
elif name == "row":
|
||||||
self.cells = dict()
|
self.cells = dict()
|
||||||
elif name == "label-cell" or name == "number-cell":
|
elif name == "label-cell" or name == "number-cell":
|
||||||
@ -56,7 +56,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
|||||||
self.cells[int(attrs["col"])] = value
|
self.cells[int(attrs["col"])] = value
|
||||||
else:
|
else:
|
||||||
#??
|
#??
|
||||||
self.output += b"%s%s" % (value.encode("UTF-8"), sepstring)
|
self.output.append(b"%s%s" % (value.encode("UTF-8"), sepstring))
|
||||||
elif name == "formula-cell":
|
elif name == "formula-cell":
|
||||||
if "formula-result" in attrs and "col" in attrs:
|
if "formula-result" in attrs and "col" in attrs:
|
||||||
self.cells[int(attrs["col"])] = \
|
self.cells[int(attrs["col"])] = \
|
||||||
@ -65,20 +65,21 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler):
|
|||||||
def endElement(self, name, ):
|
def endElement(self, name, ):
|
||||||
if name == "row":
|
if name == "row":
|
||||||
curidx = 0
|
curidx = 0
|
||||||
|
line = []
|
||||||
for idx, value in self.cells.items():
|
for idx, value in self.cells.items():
|
||||||
self.output += sepstring * (idx - curidx)
|
line.append(sepstring * (idx - curidx))
|
||||||
self.output += b"%s%s%s" % (dquote, value, dquote)
|
line.append(b"%s%s%s" % (dquote, value, dquote))
|
||||||
curidx = idx
|
curidx = idx
|
||||||
self.output += b"\n"
|
self.output.append(b''.join(line))
|
||||||
elif name == "worksheet":
|
elif name == "worksheet":
|
||||||
self.output += b"\n"
|
self.output.append(b'')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
try:
|
try:
|
||||||
handler = XlsXmlHandler()
|
handler = XlsXmlHandler()
|
||||||
xml.sax.parse(sys.stdin, handler)
|
xml.sax.parse(sys.stdin, handler)
|
||||||
print(handler.output)
|
print(b'\n'.join(handler.output))
|
||||||
except BaseException as err:
|
except BaseException as err:
|
||||||
print("xml-parse: %s\n" % (str(sys.exc_info()[:2]),), file=sys.stderr)
|
print("xml-parse: %s\n" % (str(sys.exc_info()[:2]),), file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user