From e40cf64e667f0a7e3559bf08eae05822f291599f Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Mon, 7 Sep 2015 11:16:20 +0200 Subject: [PATCH] New python-based msword filter + basic arch to convert the others --HG-- branch : WINDOWSPORT --- src/filters/rcldoc.py | 24 ++++++++++++++---------- src/filters/rclexecm.py | 1 + 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py index 4499c4ea..f6dcf334 100755 --- a/src/filters/rcldoc.py +++ b/src/filters/rcldoc.py @@ -6,7 +6,7 @@ import sys import os # Processing the output from antiword: create html header and tail, process -# continuation lines escape HTML special characters, accumulate the data +# continuation lines escape, HTML special characters, accumulate the data. class WordProcessData: def __init__(self, em): self.em = em @@ -74,15 +74,19 @@ class WordPassData: self.em.setmimetype("text/html") return self.out + # Filter for msword docs. Try antiword, and if this fails, check for -# an rtf or text document (.doc are sometimes like this). Also try +# an rtf or text document (.doc are sometimes like this...). Also try # vwWare if the doc is actually a word doc class WordFilter: def __init__(self, em, td): self.em = em self.ntry = 0 - self.thisdir = td - + self.execdir = td + + def reset(self): + self.ntry = 0 + def hasControlChars(self, data): for c in data: if c < chr(32) and c != '\n' and c != '\t' and \ @@ -108,7 +112,7 @@ class WordFilter: return "text/plain" def getCmd(self, fn): - '''Return command to execute and postprocessor according to + '''Return command to execute, and postprocessor, according to our state: first try antiword, then others depending on mime identification. Do 2 tries at most''' if self.ntry == 0: @@ -116,16 +120,16 @@ class WordFilter: return (["antiword", "-t", "-i", "1", "-m", "UTF-8"], WordProcessData(self.em)) elif self.ntry == 1: - ntry = 2 + self.ntry = 2 # antiword failed. Check for an rtf file, or text and # process accordingly. It the doc is actually msword, try # wvWare. mt = self.mimetype(fn) if mt == "text/plain": - return ([os.path.join(self.thisdir,"rcltext")], + return ([os.path.join(self.execdir,"rcltext")], WordPassData(self.em)) elif mt == "text/rtf": - return ([os.path.join(self.thisdir, "rclrtf")], + return ([os.path.join(self.execdir, "rclrtf")], WordPassData(self.em)) elif mt == "application/msword": return (["wvWare", "--nographics", "--charset=utf-8"], @@ -136,8 +140,8 @@ class WordFilter: return ([],None) if __name__ == '__main__': - thisdir = os.path.dirname(sys.argv[0]) + execdir = os.path.dirname(sys.argv[0]) proto = rclexecm.RclExecM() - filter = WordFilter(proto, thisdir) + filter = WordFilter(proto, execdir) extract = rclexecm.Executor(proto, filter) rclexecm.main(proto, extract) diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py index 148a571a..e31753df 100644 --- a/src/filters/rclexecm.py +++ b/src/filters/rclexecm.py @@ -233,6 +233,7 @@ class Executor: def extractone(self, params): #self.em.rclog("extractone %s %s" % (params["filename:"], \ # params["mimetype:"])) + self.flt.reset() ok = False if not params.has_key("filename:"): self.em.rclog("extractone: no mime or file name")