diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py index 5007df9e..d2a2e897 100644 --- a/src/filters/rclexecm.py +++ b/src/filters/rclexecm.py @@ -31,8 +31,26 @@ import rclconfig import cmdtalk PY3 = (sys.version > '3') -_mswindows = (sys.platform == "win32") -_execdir = os.path.dirname(sys.argv[0]) +_g_mswindows = (sys.platform == "win32") +_g_execdir = os.path.dirname(sys.argv[0]) + +_g_config = rclconfig.RclConfig() +_g_debugfile = _g_config.getConfParam("filterdebuglog") +_g_errfout = None + + +def logmsg(msg): + global _g_debugfile, _g_errfout + if _g_debugfile and not _g_errfout: + try: + _g_errfout = open(_g_debugfile, "a") + except: + pass + if _g_errfout: + print("%s" % msg, file=_g_errfout) + elif not _g_mswindows: + print("%s" % msg, file=sys.stderr) + # Convert to bytes if not already such. def makebytes(data): @@ -40,6 +58,7 @@ def makebytes(data): return data.encode("UTF-8") return data + # Possibly decode binary file name for use as subprocess argument, # depending on platform. def subprocfile(fn): @@ -48,26 +67,22 @@ def subprocfile(fn): # to convert. # On Unix all list elements get converted to bytes in the C # _posixsubprocess module, nothing to do. - if PY3 and _mswindows and type(fn) != type(''): + if PY3 and _g_mswindows and type(fn) != type(''): return fn.decode('UTF-8') else: return fn + # Check for truthness of rclconfig value. def configparamtrue(value): if not value: return False try: ivalue = int(value) - if ivalue: - return True - else: - return False + return True if ivalue else False except: - pass - if value[0] in 'tT': - return True - return False + return True if value[0] in 'tT' else False + # Escape special characters in plain text for inclusion in HTML doc. # Note: tried replacing this with a multiple replacer according to @@ -84,8 +99,6 @@ def htmlescape(txt): return txt -my_config = rclconfig.RclConfig() - ############################################ # RclExecM implements the communication protocol with the recollindex # process. It calls the object specific of the document type to @@ -109,7 +122,7 @@ class RclExecM(cmdtalk.CmdTalk): self.maxmembersize = self.maxmembersize * 1024 # Tell cmdtalk where to log - self.debugfile = my_config.getConfParam("filterdebuglog") + self.debugfile = _g_config.getConfParam("filterdebuglog") # Some of our params are binary, cmdtalk should not decode them self.nodecodeinput = True @@ -222,7 +235,7 @@ def which(program): def path_candidates(): yield os.path.dirname(sys.argv[0]) - rclpath = my_config.getConfParam("recollhelperpath") + rclpath = _g_config.getConfParam("recollhelperpath") if rclpath: for path in rclpath.split(os.pathsep): yield path @@ -244,9 +257,9 @@ def which(program): def execPythonScript(icmd): import subprocess cmd = list(icmd) - if _mswindows: + if _g_mswindows: if not os.path.isabs(cmd[0]): - cmd[0] = os.path.join(_execdir, cmd[0]) + cmd[0] = os.path.join(_g_execdir, cmd[0]) cmd = [sys.executable] + cmd return subprocess.check_output(cmd) @@ -347,8 +360,8 @@ def main(proto, extract): # Some filters (e.g. rclaudio) need/get a MIME type from the indexer. # We make a half-assed attempt to emulate: - mimetype = my_config.mimeType(path) - if not mimetype and not _mswindows: + mimetype = _g_config.mimeType(path) + if not mimetype and not _g_mswindows: mimetype = mimetype_with_file(path) if mimetype: params['mimetype'] = mimetype diff --git a/src/filters/rclmidi.py b/src/filters/rclmidi.py index 8295643c..f11a61ba 100644 --- a/src/filters/rclmidi.py +++ b/src/filters/rclmidi.py @@ -29,9 +29,6 @@ import sys from struct import unpack, pack import six -def debug(s): - print("%s"%s, file=sys.stderr) - PY3 = sys.version > '3' if PY3: diff --git a/src/filters/rclocr.py b/src/filters/rclocr.py index 40f4d26a..332231b4 100755 --- a/src/filters/rclocr.py +++ b/src/filters/rclocr.py @@ -30,11 +30,10 @@ import importlib.util import rclconfig import rclocrcache +import rclexecm -_mswindows = (sys.platform == "win32") def _deb(s): - if not _mswindows: - print("rclocr: %s" % s, file=sys.stderr) + rclexecm.logmsg(s) def Usage(): _deb("Usage: rclocr.py ") @@ -57,6 +56,7 @@ def breakwrite(f, data): offset += tow total -= tow + if len(sys.argv) != 2: Usage() diff --git a/src/filters/rclocrabbyy.py b/src/filters/rclocrabbyy.py index e3450a09..529d0f74 100755 --- a/src/filters/rclocrabbyy.py +++ b/src/filters/rclocrabbyy.py @@ -40,8 +40,7 @@ abbyyocrcmd = "" abbyocrdir = "" def _deb(s): - if not _mswindows: - print("rclocrabbyy: %s" % s, file=sys.stderr) + rclexecm.logmsg(s) # Return true if abbyy appears to be available def ocrpossible(config, path): diff --git a/src/filters/rclocrcache.py b/src/filters/rclocrcache.py index cdece6b2..6d71994e 100755 --- a/src/filters/rclocrcache.py +++ b/src/filters/rclocrcache.py @@ -61,9 +61,12 @@ import urllib.parse import zlib import glob +import rclexecm + def _deb(s): - print("rclocrcache: %s" %s, file=sys.stderr) + rclexecm.logmsg(s) + class OCRCache(object): def __init__(self, conf): self.config = conf @@ -324,4 +327,3 @@ if __name__ == '__main__': # if not incache: # trystore(path) # - diff --git a/src/filters/rclocrtesseract.py b/src/filters/rclocrtesseract.py index 4c8d2c9d..0dce9277 100755 --- a/src/filters/rclocrtesseract.py +++ b/src/filters/rclocrtesseract.py @@ -39,9 +39,10 @@ _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg') tesseractcmd = None pdftoppmcmd = None + def _deb(s): - if not _mswindows: - print("rclocrtesseract: %s" % s, file=sys.stderr) + rclexecm.logmsg(s) + def vacuumdir(dir): if dir: @@ -51,6 +52,7 @@ def vacuumdir(dir): os.unlink(path) return True + tmpdir = None def _maybemaketmpdir(): global tmpdir @@ -61,13 +63,16 @@ def _maybemaketmpdir(): else: tmpdir = tempfile.mkdtemp(prefix='rclmpdf') + def finalcleanup(): if tmpdir: vacuumdir(tmpdir) os.rmdir(tmpdir) + atexit.register(finalcleanup) + # Return true if tesseract and the appropriate conversion program for # the file type (e.g. pdftoppt for pdf) appear to be available def ocrpossible(config, path): @@ -145,6 +150,7 @@ def _guesstesseractlang(config, path): _deb("Tesseract lang (guessed): %s" % tesseractlang) return tesseractlang + # Process pdf file: use pdftoppm to split it into ppm pages, then run # tesseract on each and concatenate the result. It would probably be # possible instead to use pdftocairo to produce a tiff, buf pdftocairo diff --git a/src/filters/rcluncomp.py b/src/filters/rcluncomp.py index 746c65e6..8bb9edc1 100644 --- a/src/filters/rcluncomp.py +++ b/src/filters/rcluncomp.py @@ -9,24 +9,26 @@ import platform import subprocess import glob -ftrace = sys.stderr -#ftrace = open("C:/Users/Bill/log-uncomp.txt", "w") + +def _msg(s): + rclexecm.logmsg(s) + sysplat = platform.system() if sysplat != "Windows": - print("rcluncomp.py: only for Windows", file = ftrace) + _msg("rcluncomp.py: only for Windows") sys.exit(1) try: import msvcrt msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) except Exception as err: - print("setmode binary failed: %s" % str(err), file = ftrace) + _msg("setmode binary failed: %s" % str(err)) sevenz = rclexecm.which("7z") if not sevenz: - print("rcluncomp.py: can't find 7z exe. Maybe set recollhelperpath " \ - "in recoll.conf ?", file=ftrace) + _msg("rcluncomp.py: can't find 7z exe. Maybe set recollhelperpath " \ + "in recoll.conf ?") sys.exit(2) # Params: uncompression program, input file name, temp directory. @@ -34,7 +36,7 @@ if not sevenz: infile = sys.argv[2] outdir = sys.argv[3] -# print("rcluncomp.py infile [%s], outdir [%s]" % (infile, outdir), file = ftrace) +# _msg("rcluncomp.py infile [%s], outdir [%s]" % (infile, outdir)) # There is apparently no way to suppress 7z output. Hopefully the # possible deadlock described by the subprocess module doc can't occur @@ -47,7 +49,7 @@ try: # There should be only one file in there.. print(outputname[0]) except Exception as err: - print("%s" % (str(err),), file = ftrace) + _msg("%s" % (str(err),)) sys.exit(4) sys.exit(0)