New python-based msword filter + basic arch to convert the others
--HG-- branch : WINDOWSPORT
This commit is contained in:
parent
5d0e8330fb
commit
e40cf64e66
@ -6,7 +6,7 @@ import sys
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
# Processing the output from antiword: create html header and tail, process
|
# Processing the output from antiword: create html header and tail, process
|
||||||
# continuation lines escape HTML special characters, accumulate the data
|
# continuation lines, escape HTML special characters, accumulate the data.
|
||||||
class WordProcessData:
|
class WordProcessData:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.em = em
|
self.em = em
|
||||||
@ -74,14 +74,18 @@ class WordPassData:
|
|||||||
self.em.setmimetype("text/html")
|
self.em.setmimetype("text/html")
|
||||||
return self.out
|
return self.out
|
||||||
|
|
||||||
|
|
||||||
# Filter for msword docs. Try antiword, and if this fails, check for
|
# Filter for msword docs. Try antiword, and if this fails, check for
|
||||||
# an rtf or text document (.doc are sometimes like this). Also try
|
# an rtf or text document (.doc are sometimes like this...). Also try
|
||||||
# wvWare if the doc is actually a word doc
|
# wvWare if the doc is actually a word doc
|
||||||
class WordFilter:
|
class WordFilter:
|
||||||
def __init__(self, em, td):
|
def __init__(self, em, td):
|
||||||
self.em = em
|
self.em = em
|
||||||
self.ntry = 0
|
self.ntry = 0
|
||||||
self.thisdir = td
|
self.execdir = td
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.ntry = 0
|
||||||
|
|
||||||
def hasControlChars(self, data):
|
def hasControlChars(self, data):
|
||||||
for c in data:
|
for c in data:
|
||||||
@ -108,7 +112,7 @@ class WordFilter:
|
|||||||
return "text/plain"
|
return "text/plain"
|
||||||
|
|
||||||
def getCmd(self, fn):
|
def getCmd(self, fn):
|
||||||
'''Return command to execute and postprocessor according to
|
'''Return command to execute, and postprocessor, according to
|
||||||
our state: first try antiword, then others depending on mime
|
our state: first try antiword, then others depending on mime
|
||||||
identification. Do 2 tries at most'''
|
identification. Do 2 tries at most'''
|
||||||
if self.ntry == 0:
|
if self.ntry == 0:
|
||||||
@ -116,16 +120,16 @@ class WordFilter:
|
|||||||
return (["antiword", "-t", "-i", "1", "-m", "UTF-8"],
|
return (["antiword", "-t", "-i", "1", "-m", "UTF-8"],
|
||||||
WordProcessData(self.em))
|
WordProcessData(self.em))
|
||||||
elif self.ntry == 1:
|
elif self.ntry == 1:
|
||||||
ntry = 2
|
self.ntry = 2
|
||||||
# antiword failed. Check for an rtf file, or text and
|
# antiword failed. Check for an rtf file, or text and
|
||||||
# process accordingly. If the doc is actually msword, try
|
# process accordingly. If the doc is actually msword, try
|
||||||
# wvWare.
|
# wvWare.
|
||||||
mt = self.mimetype(fn)
|
mt = self.mimetype(fn)
|
||||||
if mt == "text/plain":
|
if mt == "text/plain":
|
||||||
return ([os.path.join(self.thisdir,"rcltext")],
|
return ([os.path.join(self.execdir,"rcltext")],
|
||||||
WordPassData(self.em))
|
WordPassData(self.em))
|
||||||
elif mt == "text/rtf":
|
elif mt == "text/rtf":
|
||||||
return ([os.path.join(self.thisdir, "rclrtf")],
|
return ([os.path.join(self.execdir, "rclrtf")],
|
||||||
WordPassData(self.em))
|
WordPassData(self.em))
|
||||||
elif mt == "application/msword":
|
elif mt == "application/msword":
|
||||||
return (["wvWare", "--nographics", "--charset=utf-8"],
|
return (["wvWare", "--nographics", "--charset=utf-8"],
|
||||||
@ -136,8 +140,8 @@ class WordFilter:
|
|||||||
return ([],None)
|
return ([],None)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
thisdir = os.path.dirname(sys.argv[0])
|
execdir = os.path.dirname(sys.argv[0])
|
||||||
proto = rclexecm.RclExecM()
|
proto = rclexecm.RclExecM()
|
||||||
filter = WordFilter(proto, thisdir)
|
filter = WordFilter(proto, execdir)
|
||||||
extract = rclexecm.Executor(proto, filter)
|
extract = rclexecm.Executor(proto, filter)
|
||||||
rclexecm.main(proto, extract)
|
rclexecm.main(proto, extract)
|
||||||
|
|||||||
@ -233,6 +233,7 @@ class Executor:
|
|||||||
def extractone(self, params):
|
def extractone(self, params):
|
||||||
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
#self.em.rclog("extractone %s %s" % (params["filename:"], \
|
||||||
# params["mimetype:"]))
|
# params["mimetype:"]))
|
||||||
|
self.flt.reset()
|
||||||
ok = False
|
ok = False
|
||||||
if not params.has_key("filename:"):
|
if not params.has_key("filename:"):
|
||||||
self.em.rclog("extractone: no mime or file name")
|
self.em.rclog("extractone: no mime or file name")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user