Python handlers: factorise tmp dir code

This commit is contained in:
Jean-Francois Dockes 2021-12-03 11:03:23 +01:00
parent d942b23c85
commit e121695a3c
3 changed files with 41 additions and 48 deletions

View File

@ -265,19 +265,30 @@ def execPythonScript(icmd):
# Temp dir helper # Temp dir helper
class SafeTmpDir: class SafeTmpDir:
def __init__(self, em): def __init__(self, tag, em=None):
self.tag = tag
self.em = em self.em = em
self.toptmp = "" self.toptmp = None
self.tmpdir = "" self.tmpdir = None
def __del__(self): def __del__(self):
try: if self.toptmp:
if self.toptmp: try:
shutil.rmtree(self.tmpdir, True) if self.tmpdir:
shutil.rmtree(self.tmpdir, True)
os.rmdir(self.toptmp) os.rmdir(self.toptmp)
except Exception as err: except Exception as err:
self.em.rclog("delete dir failed for " + self.toptmp) if self.em:
self.em.rclog("delete dir failed for " + self.toptmp)
def vacuumdir(self):
if self.tmpdir:
for fn in os.listdir(self.tmpdir):
path = os.path.join(self.tmpdir, fn)
if os.path.isfile(path):
os.unlink(path)
return True
def getpath(self): def getpath(self):
if not self.tmpdir: if not self.tmpdir:
envrcltmp = os.getenv('RECOLL_TMPDIR') envrcltmp = os.getenv('RECOLL_TMPDIR')
@ -286,7 +297,7 @@ class SafeTmpDir:
else: else:
self.toptmp = tempfile.mkdtemp(prefix='rcltmp') self.toptmp = tempfile.mkdtemp(prefix='rcltmp')
self.tmpdir = os.path.join(self.toptmp, 'rclsofftmp') self.tmpdir = os.path.join(self.toptmp, self.tag)
os.makedirs(self.tmpdir) os.makedirs(self.tmpdir)
return self.tmpdir return self.tmpdir

View File

@ -42,33 +42,25 @@ pdftoppmcmd = None
def _deb(s): def _deb(s):
rclexecm.logmsg("rclocrtesseract: %s" % s) rclexecm.logmsg("rclocrtesseract: %s" % s)
def vacuumdir(dir):
if dir:
for fn in os.listdir(dir):
path = os.path.join(dir, fn)
if os.path.isfile(path):
os.unlink(path)
return True
tmpdir = None tmpdir = None
def _maybemaketmpdir(): def _maybemaketmpdir():
global tmpdir global tmpdir
if tmpdir: if tmpdir:
if not vacuumdir(tmpdir): if not tmpdir.vacuumdir():
_deb("openfile: vacuumdir %s failed" % tmpdir) _deb("openfile: vacuumdir %s failed" % tmpdir.getpath())
return False return False
else: else:
tmpdir = tempfile.mkdtemp(prefix='rclocrtmp') tmpdir = rclexecm.SafeTmpDir("rclocrtesseract")
def cleanocr(): def cleanocr():
global tmpdir global tmpdir
if tmpdir: if tmpdir:
vacuumdir(tmpdir) del tmpdir
os.rmdir(tmpdir)
tmpdir = None tmpdir = None
# Return true if tesseract and the appropriate conversion program for # Return true if tesseract and the appropriate conversion program for
# the file type (e.g. pdftoppm for pdf) appear to be available # the file type (e.g. pdftoppm for pdf) appear to be available
def ocrpossible(config, path): def ocrpossible(config, path):
@ -165,12 +157,12 @@ def _pdftesseract(config, path):
tesseractlang = _guesstesseractlang(config, path) tesseractlang = _guesstesseractlang(config, path)
#tesserrorfile = os.path.join(tmpdir, "tesserrorfile") #tesserrorfile = os.path.join(tmpdir.getpath(), "tesserrorfile")
tmpfile = os.path.join(tmpdir, "ocrXXXXXX") tmpfile = os.path.join(tmpdir.getpath(), "ocrXXXXXX")
# Split pdf pages # Split pdf pages
try: try:
vacuumdir(tmpdir) tmpdir.vacuumdir()
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile] cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
#_deb("Executing %s" % cmd) #_deb("Executing %s" % cmd)
subprocess.check_call(cmd) subprocess.check_call(cmd)

View File

@ -69,8 +69,7 @@ _htmlsuffix = b'''</pre></body></html>'''
def finalcleanup(): def finalcleanup():
global tmpdir global tmpdir
if tmpdir: if tmpdir:
vacuumdir(tmpdir) del tmpdir
os.rmdir(tmpdir)
tmpdir = None tmpdir = None
ocrproc = None ocrproc = None
@ -93,14 +92,6 @@ except: pass
try: signal.signal(signal.SIGTERM, signal_handler) try: signal.signal(signal.SIGTERM, signal_handler)
except: pass except: pass
def vacuumdir(dir):
if dir:
for fn in os.listdir(dir):
path = os.path.join(dir, fn)
if os.path.isfile(path):
os.unlink(path)
return True
class PDFExtractor: class PDFExtractor:
def __init__(self, em): def __init__(self, em):
self.currentindex = 0 self.currentindex = 0
@ -221,7 +212,7 @@ class PDFExtractor:
# no big deal # no big deal
return True return True
try: try:
vacuumdir(tmpdir) tmpdir.vacuumdir()
# Note: the java version of pdftk sometimes/often fails # Note: the java version of pdftk sometimes/often fails
# here with writing to stdout: # here with writing to stdout:
# Error occurred during initialization of VM # Error occurred during initialization of VM
@ -231,9 +222,9 @@ class PDFExtractor:
# output, until we fix the error or preferably find a way # output, until we fix the error or preferably find a way
# to do it with poppler... # to do it with poppler...
subprocess.check_call( subprocess.check_call(
[self.pdftk, self.filename, "unpack_files", "output", [self.pdftk, self.filename, "unpack_files", "output", tmpdir.getpath()],
tmpdir], stdout=sys.stderr) stdout=sys.stderr)
self.attachlist = sorted(os.listdir(tmpdir)) self.attachlist = sorted(os.listdir(tmpdir.getpath()))
return True return True
except Exception as e: except Exception as e:
self.em.rclog("extractAttach: failed: %s" % e) self.em.rclog("extractAttach: failed: %s" % e)
@ -407,11 +398,12 @@ class PDFExtractor:
def maybemaketmpdir(self): def maybemaketmpdir(self):
global tmpdir global tmpdir
if tmpdir: if tmpdir:
if not vacuumdir(tmpdir): if not tmpdir.vacuumdir():
self.em.rclog("openfile: vacuumdir %s failed" % tmpdir) self.em.rclog("openfile: vacuumdir %s failed" % tmpdir.getpath())
return False return False
else: else:
tmpdir = tempfile.mkdtemp(prefix='rclmpdf') tmpdir = rclexecm.SafeTmpDir("rclpdf", self.em)
#self.em.rclog("Using temporary directory %s" % tmpdir.getpath())
if self.pdftk and re.match("/snap/", self.pdftk): if self.pdftk and re.match("/snap/", self.pdftk):
# We know this is Unix (Ubuntu actually). Check that tmpdir # We know this is Unix (Ubuntu actually). Check that tmpdir
# belongs to the user as snap commands can't use /tmp to share # belongs to the user as snap commands can't use /tmp to share
@ -423,9 +415,7 @@ class PDFExtractor:
if st.st_uid == os.getuid(): if st.st_uid == os.getuid():
ok = True ok = True
if not ok: if not ok:
self.em.rclog( self.em.rclog("pdftk is a snap command and needs TMPDIR to be owned by you")
"pdftk is a snap command and needs TMPDIR to be "
"a directory you own")
def _process_annotations(self, html): def _process_annotations(self, html):
doc = Poppler.Document.new_from_file( doc = Poppler.Document.new_from_file(
@ -530,7 +520,7 @@ class PDFExtractor:
if not self.attextractdone: if not self.attextractdone:
if not self.extractAttach(): if not self.extractAttach():
return (False, "", "", rclexecm.RclExecM.eofnow) return (False, "", "", rclexecm.RclExecM.eofnow)
path = os.path.join(tmpdir, ipath) path = os.path.join(tmpdir.getpath(), ipath)
if os.path.isfile(path): if os.path.isfile(path):
f = open(path, "rb") f = open(path, "rb")
docdata = f.read(); docdata = f.read();