#!/usr/bin/env python2 from __future__ import print_function import rclexecm import rclexec1 import re import sys import os # Processing the output from antiword: create html header and tail, process # continuation lines escape, HTML special characters, accumulate the data. class WordProcessData: def __init__(self, em): self.em = em self.out = b'' self.cont = b'' self.gotdata = False # Line with continued word (ending in -) # we strip the - which is not nice for actually hyphenated word. # What to do ? self.patcont = re.compile(b'''[\w][-]$''') # Pattern for breaking continuation at last word start self.patws = re.compile(b'''([\s])([\w]+)(-)$''') def takeLine(self, line): if not self.gotdata: if line == b'': return self.out = b'' + \ b'' + \ b'

' self.gotdata = True if self.cont: line = self.cont + line self.cont = "" if line == b'\f': self.out += '

' return if self.patcont.search(line): # Break at last whitespace match = self.patws.search(line) if match: self.cont = line[match.start(2):match.end(2)] line = line[0:match.start(1)] else: self.cont = line line = b'' if line: self.out += self.em.htmlescape(line) + b'
' else: self.out += b'
' def wrapData(self): if self.gotdata: self.out += b'

' self.em.setmimetype("text/html") return self.out # Null data accumulator. We use this when antiword has fail, and the # data actually comes from rclrtf, rcltext or vwWare, which all # output HTML class WordPassData: def __init__(self, em): self.out = b'' self.em = em def takeLine(self, line): self.out += line def wrapData(self): self.em.setmimetype("text/html") return self.out # Filter for msword docs. Try antiword, and if this fails, check for # an rtf or text document (.doc are sometimes like this...). Also try # vwWare if the doc is actually a word doc class WordFilter: def __init__(self, em, td): self.em = em self.ntry = 0 self.execdir = td def reset(self): self.ntry = 0 def hasControlChars(self, data): for c in data: if c < chr(32) and c != '\n' and c != '\t' and \ c != '\f' and c != '\r': return True return False def mimetype(self, fn): rtfprolog = b'{\\rtf1' docprolog = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1' try: f = open(fn, "rb") except: return "" data = f.read(100) if data[0:6] == rtfprolog: return "text/rtf" elif data[0:8] == docprolog: return "application/msword" elif self.hasControlChars(data): return "application/octet-stream" else: return "text/plain" def getCmd(self, fn): '''Return command to execute, and postprocessor, according to our state: first try antiword, then others depending on mime identification. Do 2 tries at most''' if self.ntry == 0: self.ntry = 1 cmd = rclexecm.which("antiword") if cmd: return ([cmd, "-t", "-i", "1", "-m", "UTF-8"], WordProcessData(self.em)) else: return ([],None) elif self.ntry == 1: self.ntry = 2 # antiword failed. Check for an rtf file, or text and # process accordingly. It the doc is actually msword, try # wvWare. mt = self.mimetype(fn) self.em.rclog("rcldoc.py: actual MIME type %s" % mt) if mt == "text/plain": return ([sys.executable, os.path.join(self.execdir, "rcltext.py")], WordPassData(self.em)) elif mt == "text/rtf": cmd = [sys.executable, os.path.join(self.execdir, "rclrtf.py"), "-s"] self.em.rclog("rcldoc.py: returning cmd %s" % cmd) return (cmd, WordPassData(self.em)) elif mt == "application/msword": cmd = rclexecm.which("wvWare") if cmd: return ([cmd, "--nographics", "--charset=utf-8"], WordPassData(self.em)) else: return ([],None) else: return ([],None) else: return ([],None) if __name__ == '__main__': # Remember where we execute filters from, in case we need to exec another execdir = os.path.dirname(sys.argv[0]) # Check that we have antiword. We could fallback to wvWare, but # this is not what the old filter did. if not rclexecm.which("antiword"): print("RECFILTERROR HELPERNOTFOUND antiword") sys.exit(1) proto = rclexecm.RclExecM() filter = WordFilter(proto, execdir) extract = rclexec1.Executor(proto, filter) rclexecm.main(proto, extract)