diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py index d6bd06fe..104c420d 100755 --- a/src/filters/rcldoc.py +++ b/src/filters/rcldoc.py @@ -12,7 +12,7 @@ import os class WordProcessData: def __init__(self, em): self.em = em - self.out = b'' + self.out = [] self.cont = b'' self.gotdata = False # Line with continued word (ending in -) @@ -26,10 +26,10 @@ class WordProcessData: if not self.gotdata: if line == b'': return - self.out = b'' + \ + self.out.append(b'' + \ b'' + \ - b'

' + b'

') self.gotdata = True if self.cont: @@ -37,7 +37,7 @@ class WordProcessData: self.cont = "" if line == b'\f': - self.out += '


' + self.out.append('


') return if self.patcont.search(line): @@ -51,30 +51,30 @@ class WordProcessData: line = b'' if line: - self.out += self.em.htmlescape(line) + b'
' + self.out.append(self.em.htmlescape(line) + b'
') else: - self.out += b'
' + self.out.append(b'
') def wrapData(self): if self.gotdata: - self.out += b'

' + self.out.append(b'

') self.em.setmimetype("text/html") - return self.out + return b'\n'.join(self.out) -# Null data accumulator. We use this when antiword has fail, and the +# Null data accumulator. We use this when antiword has failed, and the # data actually comes from rclrtf, rcltext or vwWare, which all # output HTML class WordPassData: def __init__(self, em): - self.out = b'' + self.out = [] self.em = em def takeLine(self, line): - self.out += line + self.out.append(line) def wrapData(self): self.em.setmimetype("text/html") - return self.out + return b'\n'.join(self.out) # Filter for msword docs. Try antiword, and if this fails, check for diff --git a/src/filters/rclexec1.py b/src/filters/rclexec1.py index a9e9847f..dc4f818e 100644 --- a/src/filters/rclexec1.py +++ b/src/filters/rclexec1.py @@ -56,7 +56,7 @@ class Executor(RclBaseHandler): except Exception as err: self.em.rclog("runCmd: error reading %s: %s"%(filename, err)) return(False, "") - for line in data.split('\n'): + for line in data.split(b'\n'): postproc.takeLine(line) return True, postproc.wrapData() else: diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 4104cfc1..af92d057 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -322,7 +322,7 @@ class PDFExtractor: inheader = False inbody = False didcs = False - output = b'' + output = [] isempty = True for line in input.split(b'\n'): if re.search(b'', line): @@ -331,8 +331,8 @@ class PDFExtractor: inbody = False if inheader: if not didcs: - output += b'\n' + output.append(b'\n') didcs = True if self.needescape: m = re.search(b'''(.*)(.*)(<\/title>.*)''', line) @@ -361,9 +361,9 @@ class PDFExtractor: if re.search(b'<pre>', line): inbody = True - output += line + b'\n' + output.append(line) - return output, isempty + return b'\n'.join(output), isempty def _metatag(self, nm, val): return "<meta name=\"" + nm + "\" content=\"" + \ diff --git a/src/filters/rclppt.py b/src/filters/rclppt.py index 684b8907..ea282990 100755 --- a/src/filters/rclppt.py +++ b/src/filters/rclppt.py @@ -13,20 +13,20 @@ import os class PPTProcessData: def __init__(self, em): self.em = em - self.out = b"" + self.out = [] self.gotdata = 0 def takeLine(self, line): if not self.gotdata: - self.out += b'''<html><head>''' + \ - b'''<meta http-equiv="Content-Type" ''' + \ - b'''content="text/html;charset=UTF-8">''' + \ - b'''</head><body><pre>''' + self.out.append(b'<html><head>' + \ + b'<meta http-equiv="Content-Type" ' + \ + b'content="text/html;charset=UTF-8">' + \ + b'</head><body><pre>') self.gotdata = True - self.out += self.em.htmlescape(line) + b"<br>\n" + self.out.append(self.em.htmlescape(line)) def wrapData(self): - return self.out + b'''</pre></body></html>''' + return b'\n'.join(self.out) + b'''</pre></body></html>''' class PPTFilter: def __init__(self, em): diff --git a/src/filters/rclrtf.py b/src/filters/rclrtf.py index e5652440..25be690e 100755 --- a/src/filters/rclrtf.py +++ b/src/filters/rclrtf.py @@ -11,7 +11,7 @@ import os class RTFProcessData: def __init__(self, em): self.em = em - self.out = b'' + self.out = [] self.gothead = 0 self.patendhead = re.compile(b'''</head>''') self.patcharset = re.compile(b'''^<meta http-equiv=''') @@ -21,17 +21,17 @@ class RTFProcessData: def takeLine(self, line): if not self.gothead: if self.patendhead.search(line): - self.out += b'<meta http-equiv="Content-Type" ' + \ - b'content="text/html;charset=UTF-8">' + b'\n' - self.out += line + b'\n' + self.out.append(b'<meta http-equiv="Content-Type" ' + \ + b'content="text/html;charset=UTF-8">') + self.out.append(line) self.gothead = 1 elif not self.patcharset.search(line): - self.out += line + b'\n' + self.out.append(line) else: - self.out += line + b'\n' + self.out.append(line) def wrapData(self): - return self.out + return b'\n'.join(self.out) class RTFFilter: def __init__(self, em): diff --git a/src/filters/rclxls.py b/src/filters/rclxls.py index ec8ad6a5..213bcce3 100755 --- a/src/filters/rclxls.py +++ b/src/filters/rclxls.py @@ -13,30 +13,30 @@ import xml.sax class XLSProcessData: def __init__(self, em, ishtml = False): self.em = em - self.out = b"" + self.out = [] self.gotdata = 0 - self.xmldata = b"" + self.xmldata = [] self.ishtml = ishtml def takeLine(self, line): if self.ishtml: - self.out += line + "\n" + self.out.append(line) return if not self.gotdata: - self.out += b'''<html><head>''' + \ + self.out.append(b'''<html><head>''' + \ b'''<meta http-equiv="Content-Type" ''' + \ b'''content="text/html;charset=UTF-8">''' + \ - b'''</head><body><pre>''' + b'''</head><body><pre>''') self.gotdata = True - self.xmldata += line + self.xmldata.append(line) def wrapData(self): if self.ishtml: - return self.out + return b'\n'.join(self.out) handler = xlsxmltocsv.XlsXmlHandler() - xml.sax.parseString(self.xmldata, handler) - self.out += self.em.htmlescape(handler.output) - return self.out + b'''</pre></body></html>''' + xml.sax.parseString(b'\n'.join(self.xmldata), handler) + self.out.append(self.em.htmlescape(b'\n'.join(handler.output))) + return b'\n'.join(self.out) + b'</pre></body></html>' class XLSFilter: def __init__(self, em): diff --git a/src/filters/xlsxmltocsv.py b/src/filters/xlsxmltocsv.py index 1a69f503..094184fc 100755 --- a/src/filters/xlsxmltocsv.py +++ b/src/filters/xlsxmltocsv.py @@ -39,12 +39,12 @@ else: class XlsXmlHandler(xml.sax.handler.ContentHandler): def __init__(self): - self.output = b'' + self.output = [] def startElement(self, name, attrs): if name == "worksheet": if "name" in attrs: - self.output += b"%s\n" % attrs["name"].encode("UTF-8") + self.output.append(b"%s\n" % attrs["name"].encode("UTF-8")) elif name == "row": self.cells = dict() elif name == "label-cell" or name == "number-cell": @@ -56,7 +56,7 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler): self.cells[int(attrs["col"])] = value else: #?? - self.output += b"%s%s" % (value.encode("UTF-8"), sepstring) + self.output.append(b"%s%s" % (value.encode("UTF-8"), sepstring)) elif name == "formula-cell": if "formula-result" in attrs and "col" in attrs: self.cells[int(attrs["col"])] = \ @@ -65,20 +65,21 @@ class XlsXmlHandler(xml.sax.handler.ContentHandler): def endElement(self, name, ): if name == "row": curidx = 0 + line = [] for idx, value in self.cells.items(): - self.output += sepstring * (idx - curidx) - self.output += b"%s%s%s" % (dquote, value, dquote) + line.append(sepstring * (idx - curidx)) + line.append(b"%s%s%s" % (dquote, value, dquote)) curidx = idx - self.output += b"\n" + self.output.append(b''.join(line)) elif name == "worksheet": - self.output += b"\n" + self.output.append(b'') if __name__ == '__main__': try: handler = XlsXmlHandler() xml.sax.parse(sys.stdin, handler) - print(handler.output) + print(b'\n'.join(handler.output)) except BaseException as err: print("xml-parse: %s\n" % (str(sys.exc_info()[:2]),), file=sys.stderr) sys.exit(1)