From e121695a3c5eb541e61004d71d04a531f4c515c3 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dockes Date: Fri, 3 Dec 2021 11:03:23 +0100 Subject: [PATCH] Python handlers: factorise tmp dir code --- src/filters/rclexecm.py | 29 ++++++++++++++++++++--------- src/filters/rclocrtesseract.py | 28 ++++++++++------------------ src/filters/rclpdf.py | 32 +++++++++++--------------------- 3 files changed, 41 insertions(+), 48 deletions(-) diff --git a/src/filters/rclexecm.py b/src/filters/rclexecm.py index 6c075709..42d2ff76 100644 --- a/src/filters/rclexecm.py +++ b/src/filters/rclexecm.py @@ -265,19 +265,30 @@ def execPythonScript(icmd): # Temp dir helper class SafeTmpDir: - def __init__(self, em): + def __init__(self, tag, em=None): + self.tag = tag self.em = em - self.toptmp = "" - self.tmpdir = "" + self.toptmp = None + self.tmpdir = None def __del__(self): - try: - if self.toptmp: - shutil.rmtree(self.tmpdir, True) + if self.toptmp: + try: + if self.tmpdir: + shutil.rmtree(self.tmpdir, True) os.rmdir(self.toptmp) - except Exception as err: - self.em.rclog("delete dir failed for " + self.toptmp) + except Exception as err: + if self.em: + self.em.rclog("delete dir failed for " + self.toptmp) + def vacuumdir(self): + if self.tmpdir: + for fn in os.listdir(self.tmpdir): + path = os.path.join(self.tmpdir, fn) + if os.path.isfile(path): + os.unlink(path) + return True + def getpath(self): if not self.tmpdir: envrcltmp = os.getenv('RECOLL_TMPDIR') @@ -286,7 +297,7 @@ class SafeTmpDir: else: self.toptmp = tempfile.mkdtemp(prefix='rcltmp') - self.tmpdir = os.path.join(self.toptmp, 'rclsofftmp') + self.tmpdir = os.path.join(self.toptmp, self.tag) os.makedirs(self.tmpdir) return self.tmpdir diff --git a/src/filters/rclocrtesseract.py b/src/filters/rclocrtesseract.py index 42c99466..f8974c14 100755 --- a/src/filters/rclocrtesseract.py +++ b/src/filters/rclocrtesseract.py @@ -42,33 +42,25 @@ pdftoppmcmd = None def _deb(s): rclexecm.logmsg("rclocrtesseract: %s" % s) -def vacuumdir(dir): - if dir: - for fn in os.listdir(dir): - path = os.path.join(dir, fn) - if os.path.isfile(path): - os.unlink(path) - return True - - tmpdir = None + def _maybemaketmpdir(): global tmpdir if tmpdir: - if not vacuumdir(tmpdir): - _deb("openfile: vacuumdir %s failed" % tmpdir) + if not tmpdir.vacuumdir(): + _deb("openfile: vacuumdir %s failed" % tmpdir.getpath()) return False else: - tmpdir = tempfile.mkdtemp(prefix='rclocrtmp') + tmpdir = rclexecm.SafeTmpDir("rclocrtesseract") def cleanocr(): global tmpdir if tmpdir: - vacuumdir(tmpdir) - os.rmdir(tmpdir) + del tmpdir tmpdir = None - + + # Return true if tesseract and the appropriate conversion program for # the file type (e.g. pdftoppt for pdf) appear to be available def ocrpossible(config, path): @@ -165,12 +157,12 @@ def _pdftesseract(config, path): tesseractlang = _guesstesseractlang(config, path) - #tesserrorfile = os.path.join(tmpdir, "tesserrorfile") - tmpfile = os.path.join(tmpdir, "ocrXXXXXX") + #tesserrorfile = os.path.join(tmpdir.getpath(), "tesserrorfile") + tmpfile = os.path.join(tmpdir.getpath(), "ocrXXXXXX") # Split pdf pages try: - vacuumdir(tmpdir) + tmpdir.vacuumdir() cmd = [pdftoppmcmd, "-r", "300", path, tmpfile] #_deb("Executing %s" % cmd) subprocess.check_call(cmd) diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 8263c91c..0feaf741 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -69,8 +69,7 @@ _htmlsuffix = b'''''' def finalcleanup(): global tmpdir if tmpdir: - vacuumdir(tmpdir) - os.rmdir(tmpdir) + del tmpdir tmpdir = None ocrproc = None @@ -93,14 +92,6 @@ except: pass try: signal.signal(signal.SIGTERM, signal_handler) except: pass -def vacuumdir(dir): - if dir: - for fn in os.listdir(dir): - path = os.path.join(dir, fn) - if os.path.isfile(path): - os.unlink(path) - return True - class PDFExtractor: def __init__(self, em): self.currentindex = 0 @@ -221,7 +212,7 @@ class PDFExtractor: # no big deal return True try: - vacuumdir(tmpdir) + tmpdir.vacuumdir() # Note: the java version of pdftk sometimes/often fails # here with writing to stdout: # Error occurred during initialization of VM @@ -231,9 +222,9 @@ class PDFExtractor: # output, until we fix the error or preferably find a way # to do it with poppler... subprocess.check_call( - [self.pdftk, self.filename, "unpack_files", "output", - tmpdir], stdout=sys.stderr) - self.attachlist = sorted(os.listdir(tmpdir)) + [self.pdftk, self.filename, "unpack_files", "output", tmpdir.getpath()], + stdout=sys.stderr) + self.attachlist = sorted(os.listdir(tmpdir.getpath())) return True except Exception as e: self.em.rclog("extractAttach: failed: %s" % e) @@ -407,11 +398,12 @@ class PDFExtractor: def maybemaketmpdir(self): global tmpdir if tmpdir: - if not vacuumdir(tmpdir): - self.em.rclog("openfile: vacuumdir %s failed" % tmpdir) + if not tmpdir.vacuumdir(): + self.em.rclog("openfile: vacuumdir %s failed" % tmpdir.getpath()) return False else: - tmpdir = tempfile.mkdtemp(prefix='rclmpdf') + tmpdir = rclexecm.SafeTmpDir("rclpdf", self.em) + #self.em.rclog("Using temporary directory %s" % tmpdir.getpath()) if self.pdftk and re.match("/snap/", self.pdftk): # We know this is Unix (Ubuntu actually). Check that tmpdir # belongs to the user as snap commands can't use /tmp to share @@ -423,9 +415,7 @@ class PDFExtractor: if st.st_uid == os.getuid(): ok = True if not ok: - self.em.rclog( - "pdftk is a snap command and needs TMPDIR to be " - "a directory you own") + self.em.rclog("pdftk is a snap command and needs TMPDIR to be owned by you") def _process_annotations(self, html): doc = Poppler.Document.new_from_file( @@ -530,7 +520,7 @@ class PDFExtractor: if not self.attextractdone: if not self.extractAttach(): return (False, "", "", rclexecm.RclExecM.eofnow) - path = os.path.join(tmpdir, ipath) + path = os.path.join(tmpdir.getpath(), ipath) if os.path.isfile(path): f = open(path, "rb") docdata = f.read();