From e40cf64e667f0a7e3559bf08eae05822f291599f Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes <jf@dockes.org>
Date: Mon, 7 Sep 2015 11:16:20 +0200
Subject: [PATCH] New python-based msword filter + basic arch to convert the
 others

--HG--
branch : WINDOWSPORT
---
 src/filters/rcldoc.py   | 24 ++++++++++++++----------
 src/filters/rclexecm.py |  1 +
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/filters/rcldoc.py b/src/filters/rcldoc.py
index 4499c4ea..f6dcf334 100755
--- a/src/filters/rcldoc.py
+++ b/src/filters/rcldoc.py
@@ -6,7 +6,7 @@ import sys
 import os
 
 # Processing the output from antiword: create html header and tail, process
-# continuation lines escape HTML special characters, accumulate the data
+# continuation lines, escape HTML special characters, accumulate the data.
 class WordProcessData:
     def __init__(self, em):
         self.em = em
@@ -74,15 +74,19 @@ class WordPassData:
         self.em.setmimetype("text/html")
         return self.out
         
+
 # Filter for msword docs. Try antiword, and if this fails, check for
-# an rtf or text document (.doc are sometimes like this). Also try
+# an rtf or text document (.doc are sometimes like this...). Also try
 # vwWare if the doc is actually a word doc
 class WordFilter:
     def __init__(self, em, td):
         self.em = em
         self.ntry = 0
-        self.thisdir = td
-        
+        self.execdir = td
+
+    def reset(self):
+        self.ntry = 0
+            
     def hasControlChars(self, data):
         for c in data:
             if c < chr(32) and c != '\n' and c != '\t' and \
@@ -108,7 +112,7 @@ class WordFilter:
             return "text/plain"
 
     def getCmd(self, fn):
-        '''Return command to execute and postprocessor according to
+        '''Return command to execute, and postprocessor, according to
         our state: first try antiword, then others depending on mime
         identification. Do 2 tries at most'''
         if self.ntry == 0:
@@ -116,16 +120,16 @@ class WordFilter:
             return (["antiword", "-t", "-i", "1", "-m", "UTF-8"],
                     WordProcessData(self.em))
         elif self.ntry == 1:
-            ntry = 2
+            self.ntry = 2
             # antiword failed. Check for an rtf file, or text and
             # process accordingly. It the doc is actually msword, try
             # wvWare.
             mt = self.mimetype(fn)
             if mt == "text/plain":
-                return ([os.path.join(self.thisdir,"rcltext")],
+                return ([os.path.join(self.execdir,"rcltext")],
                        WordPassData(self.em))
             elif mt == "text/rtf":
-                return ([os.path.join(self.thisdir, "rclrtf")],
+                return ([os.path.join(self.execdir, "rclrtf")],
                         WordPassData(self.em))
             elif mt == "application/msword":
                 return (["wvWare", "--nographics", "--charset=utf-8"],
@@ -136,8 +140,8 @@ class WordFilter:
             return ([],None)
 
 if __name__ == '__main__':
-    thisdir = os.path.dirname(sys.argv[0])
+    execdir = os.path.dirname(sys.argv[0])
     proto = rclexecm.RclExecM()
-    filter = WordFilter(proto, thisdir)
+    filter = WordFilter(proto, execdir)
     extract = rclexecm.Executor(proto, filter)
     rclexecm.main(proto, extract)
diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py
index 148a571a..e31753df 100644
--- a/src/filters/rclexecm.py
+++ b/src/filters/rclexecm.py
@@ -233,6 +233,7 @@ class Executor:
     def extractone(self, params):
         #self.em.rclog("extractone %s %s" % (params["filename:"], \
         # params["mimetype:"]))
+        self.flt.reset()
         ok = False
         if not params.has_key("filename:"):
             self.em.rclog("extractone: no mime or file name")