Python handlers: factorise tmp dir code

This commit is contained in:
Jean-Francois Dockes 2021-12-03 11:03:23 +01:00
parent d942b23c85
commit e121695a3c
3 changed files with 41 additions and 48 deletions

View File

@ -265,19 +265,30 @@ def execPythonScript(icmd):
# Temp dir helper
class SafeTmpDir:
def __init__(self, em):
def __init__(self, tag, em=None):
self.tag = tag
self.em = em
self.toptmp = ""
self.tmpdir = ""
self.toptmp = None
self.tmpdir = None
def __del__(self):
try:
if self.toptmp:
shutil.rmtree(self.tmpdir, True)
if self.toptmp:
try:
if self.tmpdir:
shutil.rmtree(self.tmpdir, True)
os.rmdir(self.toptmp)
except Exception as err:
self.em.rclog("delete dir failed for " + self.toptmp)
except Exception as err:
if self.em:
self.em.rclog("delete dir failed for " + self.toptmp)
def vacuumdir(self):
if self.tmpdir:
for fn in os.listdir(self.tmpdir):
path = os.path.join(self.tmpdir, fn)
if os.path.isfile(path):
os.unlink(path)
return True
def getpath(self):
if not self.tmpdir:
envrcltmp = os.getenv('RECOLL_TMPDIR')
@ -286,7 +297,7 @@ class SafeTmpDir:
else:
self.toptmp = tempfile.mkdtemp(prefix='rcltmp')
self.tmpdir = os.path.join(self.toptmp, 'rclsofftmp')
self.tmpdir = os.path.join(self.toptmp, self.tag)
os.makedirs(self.tmpdir)
return self.tmpdir

View File

@ -42,33 +42,25 @@ pdftoppmcmd = None
def _deb(s):
    """Send *s* to the rclexecm log, prefixed with this handler's name."""
    message = "rclocrtesseract: %s" % s
    rclexecm.logmsg(message)
def vacuumdir(dir):
    """Delete the regular files located directly inside *dir*.

    Entries that are not regular files (e.g. subdirectories) are left in
    place. Nothing is done when *dir* is empty or None.

    Returns True in every case; errors raised by the filesystem calls are
    not caught here and propagate to the caller.
    """
    if not dir:
        return True
    # The directory listing is taken once, before anything is removed.
    candidates = [os.path.join(dir, name) for name in os.listdir(dir)]
    for candidate in candidates:
        if os.path.isfile(candidate):
            os.unlink(candidate)
    return True
tmpdir = None
def _maybemaketmpdir():
global tmpdir
if tmpdir:
if not vacuumdir(tmpdir):
_deb("openfile: vacuumdir %s failed" % tmpdir)
if not tmpdir.vacuumdir():
_deb("openfile: vacuumdir %s failed" % tmpdir.getpath())
return False
else:
tmpdir = tempfile.mkdtemp(prefix='rclocrtmp')
tmpdir = rclexecm.SafeTmpDir("rclocrtesseract")
def cleanocr():
global tmpdir
if tmpdir:
vacuumdir(tmpdir)
os.rmdir(tmpdir)
del tmpdir
tmpdir = None
# Return true if tesseract and the appropriate conversion program for
# the file type (e.g. pdftoppm for pdf) appear to be available
def ocrpossible(config, path):
@ -165,12 +157,12 @@ def _pdftesseract(config, path):
tesseractlang = _guesstesseractlang(config, path)
#tesserrorfile = os.path.join(tmpdir, "tesserrorfile")
tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
#tesserrorfile = os.path.join(tmpdir.getpath(), "tesserrorfile")
tmpfile = os.path.join(tmpdir.getpath(), "ocrXXXXXX")
# Split pdf pages
try:
vacuumdir(tmpdir)
tmpdir.vacuumdir()
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
#_deb("Executing %s" % cmd)
subprocess.check_call(cmd)

View File

@ -69,8 +69,7 @@ _htmlsuffix = b'''</pre></body></html>'''
def finalcleanup():
global tmpdir
if tmpdir:
vacuumdir(tmpdir)
os.rmdir(tmpdir)
del tmpdir
tmpdir = None
ocrproc = None
@ -93,14 +92,6 @@ except: pass
try: signal.signal(signal.SIGTERM, signal_handler)
except: pass
def vacuumdir(dir):
    """Remove every regular file found at the top level of *dir*.

    Subdirectories and other non-file entries are not touched. A falsy
    *dir* (None or the empty string) results in no filesystem access.

    Always returns True; exceptions from the os calls are not handled
    here.
    """
    names = os.listdir(dir) if dir else []
    for name in names:
        target = os.path.join(dir, name)
        if not os.path.isfile(target):
            continue
        os.unlink(target)
    return True
class PDFExtractor:
def __init__(self, em):
self.currentindex = 0
@ -221,7 +212,7 @@ class PDFExtractor:
# no big deal
return True
try:
vacuumdir(tmpdir)
tmpdir.vacuumdir()
# Note: the java version of pdftk sometimes/often fails
# here with writing to stdout:
# Error occurred during initialization of VM
@ -231,9 +222,9 @@ class PDFExtractor:
# output, until we fix the error or preferably find a way
# to do it with poppler...
subprocess.check_call(
[self.pdftk, self.filename, "unpack_files", "output",
tmpdir], stdout=sys.stderr)
self.attachlist = sorted(os.listdir(tmpdir))
[self.pdftk, self.filename, "unpack_files", "output", tmpdir.getpath()],
stdout=sys.stderr)
self.attachlist = sorted(os.listdir(tmpdir.getpath()))
return True
except Exception as e:
self.em.rclog("extractAttach: failed: %s" % e)
@ -407,11 +398,12 @@ class PDFExtractor:
def maybemaketmpdir(self):
global tmpdir
if tmpdir:
if not vacuumdir(tmpdir):
self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
if not tmpdir.vacuumdir():
self.em.rclog("openfile: vacuumdir %s failed" % tmpdir.getpath())
return False
else:
tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
tmpdir = rclexecm.SafeTmpDir("rclpdf", self.em)
#self.em.rclog("Using temporary directory %s" % tmpdir.getpath())
if self.pdftk and re.match("/snap/", self.pdftk):
# We know this is Unix (Ubuntu actually). Check that tmpdir
# belongs to the user as snap commands can't use /tmp to share
@ -423,9 +415,7 @@ class PDFExtractor:
if st.st_uid == os.getuid():
ok = True
if not ok:
self.em.rclog(
"pdftk is a snap command and needs TMPDIR to be "
"a directory you own")
self.em.rclog("pdftk is a snap command and needs TMPDIR to be owned by you")
def _process_annotations(self, html):
doc = Poppler.Document.new_from_file(
@ -530,7 +520,7 @@ class PDFExtractor:
if not self.attextractdone:
if not self.extractAttach():
return (False, "", "", rclexecm.RclExecM.eofnow)
path = os.path.join(tmpdir, ipath)
path = os.path.join(tmpdir.getpath(), ipath)
if os.path.isfile(path):
f = open(path, "rb")
docdata = f.read();