From 1593b1d87ff02120f1e70175daf626660ab440e3 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 3 Dec 2021 10:49:44 +0100 Subject: [PATCH] Change the way rclpdf executes rclocr to avoid the command being killed before it can clean up when a signal is raised (e.g. timeout or kbd interrupt) --- src/filters/rclocr.py | 27 +++++++++++++++++++++++++-- src/filters/rclocrabbyy.py | 3 +++ src/filters/rclocrtesseract.py | 16 ++++++---------- src/filters/rclpdf.py | 16 +++++++++++++--- 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/src/filters/rclocr.py b/src/filters/rclocr.py index c6fa5b06..a576fc6e 100755 --- a/src/filters/rclocr.py +++ b/src/filters/rclocr.py @@ -26,6 +26,8 @@ import os import sys +import atexit +import signal import importlib.util import rclconfig @@ -33,7 +35,27 @@ import rclocrcache import rclexecm def _deb(s): - rclexecm.logmsg(s) + rclexecm.logmsg("rclocr: %s" % s) + +ocrcleanupmodule = None +@atexit.register +def finalcleanup(): + if ocrcleanupmodule: + ocrcleanupmodule.cleanocr() + +def signal_handler(sig, frame): + sys.exit(1) + +# Not all signals necessarily exist on all systems, use try/except +try: signal.signal(signal.SIGHUP, signal_handler) +except: pass +try: signal.signal(signal.SIGINT, signal_handler) +except: pass +try: signal.signal(signal.SIGQUIT, signal_handler) +except: pass +try: signal.signal(signal.SIGTERM, signal_handler) +except: pass + def Usage(): _deb("Usage: rclocr.py ") @@ -72,7 +94,7 @@ if incache: try: breakwrite(sys.stdout.buffer, data) except Exception as e: - _deb("RCLOCR error writing: %s" % e) + _deb("error writing: %s" % e) sys.exit(1) sys.exit(0) @@ -112,6 +134,7 @@ if not ok: # The OCR module will retrieve its specific parameters from the # configuration +ocrcleanupmodule = ocr status, data = ocr.runocr(config, path) if not status: diff --git a/src/filters/rclocrabbyy.py b/src/filters/rclocrabbyy.py index 529d0f74..8da9b73d 100755 --- a/src/filters/rclocrabbyy.py +++ 
b/src/filters/rclocrabbyy.py @@ -42,6 +42,9 @@ abbyocrdir = "" def _deb(s): rclexecm.logmsg(s) +def cleanocr(): + pass + # Return true if abbyy appears to be available def ocrpossible(config, path): global abbyyocrcmd diff --git a/src/filters/rclocrtesseract.py b/src/filters/rclocrtesseract.py index c20026b2..42c99466 100755 --- a/src/filters/rclocrtesseract.py +++ b/src/filters/rclocrtesseract.py @@ -21,7 +21,6 @@ import os import sys -import atexit import tempfile import subprocess import glob @@ -41,8 +40,7 @@ pdftoppmcmd = None def _deb(s): - rclexecm.logmsg(s) - + rclexecm.logmsg("rclocrtesseract: %s" % s) def vacuumdir(dir): if dir: @@ -61,18 +59,16 @@ def _maybemaketmpdir(): _deb("openfile: vacuumdir %s failed" % tmpdir) return False else: - tmpdir = tempfile.mkdtemp(prefix='rclmpdf') + tmpdir = tempfile.mkdtemp(prefix='rclocrtmp') -def finalcleanup(): +def cleanocr(): + global tmpdir if tmpdir: vacuumdir(tmpdir) os.rmdir(tmpdir) - - -atexit.register(finalcleanup) - - + tmpdir = None + # Return true if tesseract and the appropriate conversion program for # the file type (e.g. 
pdftoppt for pdf) appear to be available def ocrpossible(config, path): diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index cb95c11e..8263c91c 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -33,6 +33,7 @@ import glob import traceback import atexit import signal +import time import rclexecm import rclconfig @@ -66,11 +67,18 @@ _htmlprefix =b''' _htmlsuffix = b'''''' def finalcleanup(): + global tmpdir if tmpdir: vacuumdir(tmpdir) os.rmdir(tmpdir) + tmpdir = None +ocrproc = None def signal_handler(signal, frame): + global ocrproc + if ocrproc: + ocrproc.wait() + ocrproc = None sys.exit(1) atexit.register(finalcleanup) @@ -491,9 +499,11 @@ class PDFExtractor: s = self.config.getConfParam("pdfocr") if rclexecm.configparamtrue(s): try: - cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"), - self.filename] - data = subprocess.check_output(cmd) + cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"), self.filename] + global ocrproc + ocrproc = subprocess.Popen(cmd, stdout=subprocess.PIPE) + data, stderr = ocrproc.communicate() + ocrproc = None html = _htmlprefix + rclexecm.htmlescape(data) + _htmlsuffix except Exception as e: self.em.rclog("%s failed: %s" % (cmd, e))