New python-based msword filter + basic arch to convert the others

--HG--
branch : WINDOWSPORT
This commit is contained in:
Jean-Francois Dockes 2015-09-07 11:16:20 +02:00
parent 5d0e8330fb
commit e40cf64e66
2 changed files with 15 additions and 10 deletions

View File

@ -6,7 +6,7 @@ import sys
import os
# Processing the output from antiword: create html header and tail, process
# continuation lines escape HTML special characters, accumulate the data
# continuation lines, escape HTML special characters, accumulate the data.
class WordProcessData:
def __init__(self, em):
self.em = em
@ -74,15 +74,19 @@ class WordPassData:
self.em.setmimetype("text/html")
return self.out
# Filter for msword docs. Try antiword, and if this fails, check for
# an rtf or text document (.doc are sometimes like this). Also try
# an rtf or text document (.doc are sometimes like this...). Also try
# wvWare if the doc is actually a word doc
class WordFilter:
def __init__(self, em, td):
self.em = em
self.ntry = 0
self.thisdir = td
self.execdir = td
def reset(self):
self.ntry = 0
def hasControlChars(self, data):
for c in data:
if c < chr(32) and c != '\n' and c != '\t' and \
@ -108,7 +112,7 @@ class WordFilter:
return "text/plain"
def getCmd(self, fn):
'''Return command to execute and postprocessor according to
'''Return command to execute, and postprocessor, according to
our state: first try antiword, then others depending on mime
identification. Do 2 tries at most'''
if self.ntry == 0:
@ -116,16 +120,16 @@ class WordFilter:
return (["antiword", "-t", "-i", "1", "-m", "UTF-8"],
WordProcessData(self.em))
elif self.ntry == 1:
ntry = 2
self.ntry = 2
# antiword failed. Check for an rtf file, or text and
# process accordingly. If the doc is actually msword, try
# wvWare.
mt = self.mimetype(fn)
if mt == "text/plain":
return ([os.path.join(self.thisdir,"rcltext")],
return ([os.path.join(self.execdir,"rcltext")],
WordPassData(self.em))
elif mt == "text/rtf":
return ([os.path.join(self.thisdir, "rclrtf")],
return ([os.path.join(self.execdir, "rclrtf")],
WordPassData(self.em))
elif mt == "application/msword":
return (["wvWare", "--nographics", "--charset=utf-8"],
@ -136,8 +140,8 @@ class WordFilter:
return ([],None)
if __name__ == '__main__':
thisdir = os.path.dirname(sys.argv[0])
execdir = os.path.dirname(sys.argv[0])
proto = rclexecm.RclExecM()
filter = WordFilter(proto, thisdir)
filter = WordFilter(proto, execdir)
extract = rclexecm.Executor(proto, filter)
rclexecm.main(proto, extract)

View File

@ -233,6 +233,7 @@ class Executor:
def extractone(self, params):
#self.em.rclog("extractone %s %s" % (params["filename:"], \
# params["mimetype:"]))
self.flt.reset()
ok = False
if not params.has_key("filename:"):
self.em.rclog("extractone: no mime or file name")