Python handlers: factorise tmp dir code
This commit is contained in:
parent
d942b23c85
commit
e121695a3c
@ -265,19 +265,30 @@ def execPythonScript(icmd):
|
|||||||
|
|
||||||
# Temp dir helper
|
# Temp dir helper
|
||||||
class SafeTmpDir:
|
class SafeTmpDir:
|
||||||
def __init__(self, em):
|
def __init__(self, tag, em=None):
|
||||||
|
self.tag = tag
|
||||||
self.em = em
|
self.em = em
|
||||||
self.toptmp = ""
|
self.toptmp = None
|
||||||
self.tmpdir = ""
|
self.tmpdir = None
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
try:
|
|
||||||
if self.toptmp:
|
if self.toptmp:
|
||||||
|
try:
|
||||||
|
if self.tmpdir:
|
||||||
shutil.rmtree(self.tmpdir, True)
|
shutil.rmtree(self.tmpdir, True)
|
||||||
os.rmdir(self.toptmp)
|
os.rmdir(self.toptmp)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
|
if self.em:
|
||||||
self.em.rclog("delete dir failed for " + self.toptmp)
|
self.em.rclog("delete dir failed for " + self.toptmp)
|
||||||
|
|
||||||
|
def vacuumdir(self):
|
||||||
|
if self.tmpdir:
|
||||||
|
for fn in os.listdir(self.tmpdir):
|
||||||
|
path = os.path.join(self.tmpdir, fn)
|
||||||
|
if os.path.isfile(path):
|
||||||
|
os.unlink(path)
|
||||||
|
return True
|
||||||
|
|
||||||
def getpath(self):
|
def getpath(self):
|
||||||
if not self.tmpdir:
|
if not self.tmpdir:
|
||||||
envrcltmp = os.getenv('RECOLL_TMPDIR')
|
envrcltmp = os.getenv('RECOLL_TMPDIR')
|
||||||
@ -286,7 +297,7 @@ class SafeTmpDir:
|
|||||||
else:
|
else:
|
||||||
self.toptmp = tempfile.mkdtemp(prefix='rcltmp')
|
self.toptmp = tempfile.mkdtemp(prefix='rcltmp')
|
||||||
|
|
||||||
self.tmpdir = os.path.join(self.toptmp, 'rclsofftmp')
|
self.tmpdir = os.path.join(self.toptmp, self.tag)
|
||||||
os.makedirs(self.tmpdir)
|
os.makedirs(self.tmpdir)
|
||||||
|
|
||||||
return self.tmpdir
|
return self.tmpdir
|
||||||
|
|||||||
@ -42,33 +42,25 @@ pdftoppmcmd = None
|
|||||||
def _deb(s):
|
def _deb(s):
|
||||||
rclexecm.logmsg("rclocrtesseract: %s" % s)
|
rclexecm.logmsg("rclocrtesseract: %s" % s)
|
||||||
|
|
||||||
def vacuumdir(dir):
|
|
||||||
if dir:
|
|
||||||
for fn in os.listdir(dir):
|
|
||||||
path = os.path.join(dir, fn)
|
|
||||||
if os.path.isfile(path):
|
|
||||||
os.unlink(path)
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
tmpdir = None
|
tmpdir = None
|
||||||
|
|
||||||
def _maybemaketmpdir():
|
def _maybemaketmpdir():
|
||||||
global tmpdir
|
global tmpdir
|
||||||
if tmpdir:
|
if tmpdir:
|
||||||
if not vacuumdir(tmpdir):
|
if not tmpdir.vacuumdir():
|
||||||
_deb("openfile: vacuumdir %s failed" % tmpdir)
|
_deb("openfile: vacuumdir %s failed" % tmpdir.getpath())
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
tmpdir = tempfile.mkdtemp(prefix='rclocrtmp')
|
tmpdir = rclexecm.SafeTmpDir("rclocrtesseract")
|
||||||
|
|
||||||
|
|
||||||
def cleanocr():
|
def cleanocr():
|
||||||
global tmpdir
|
global tmpdir
|
||||||
if tmpdir:
|
if tmpdir:
|
||||||
vacuumdir(tmpdir)
|
del tmpdir
|
||||||
os.rmdir(tmpdir)
|
|
||||||
tmpdir = None
|
tmpdir = None
|
||||||
|
|
||||||
|
|
||||||
# Return true if tesseract and the appropriate conversion program for
|
# Return true if tesseract and the appropriate conversion program for
|
||||||
# the file type (e.g. pdftoppm for pdf) appear to be available
|
# the file type (e.g. pdftoppm for pdf) appear to be available
|
||||||
def ocrpossible(config, path):
|
def ocrpossible(config, path):
|
||||||
@ -165,12 +157,12 @@ def _pdftesseract(config, path):
|
|||||||
|
|
||||||
tesseractlang = _guesstesseractlang(config, path)
|
tesseractlang = _guesstesseractlang(config, path)
|
||||||
|
|
||||||
#tesserrorfile = os.path.join(tmpdir, "tesserrorfile")
|
#tesserrorfile = os.path.join(tmpdir.getpath(), "tesserrorfile")
|
||||||
tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
|
tmpfile = os.path.join(tmpdir.getpath(), "ocrXXXXXX")
|
||||||
|
|
||||||
# Split pdf pages
|
# Split pdf pages
|
||||||
try:
|
try:
|
||||||
vacuumdir(tmpdir)
|
tmpdir.vacuumdir()
|
||||||
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
|
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
|
||||||
#_deb("Executing %s" % cmd)
|
#_deb("Executing %s" % cmd)
|
||||||
subprocess.check_call(cmd)
|
subprocess.check_call(cmd)
|
||||||
|
|||||||
@ -69,8 +69,7 @@ _htmlsuffix = b'''</pre></body></html>'''
|
|||||||
def finalcleanup():
|
def finalcleanup():
|
||||||
global tmpdir
|
global tmpdir
|
||||||
if tmpdir:
|
if tmpdir:
|
||||||
vacuumdir(tmpdir)
|
del tmpdir
|
||||||
os.rmdir(tmpdir)
|
|
||||||
tmpdir = None
|
tmpdir = None
|
||||||
|
|
||||||
ocrproc = None
|
ocrproc = None
|
||||||
@ -93,14 +92,6 @@ except: pass
|
|||||||
try: signal.signal(signal.SIGTERM, signal_handler)
|
try: signal.signal(signal.SIGTERM, signal_handler)
|
||||||
except: pass
|
except: pass
|
||||||
|
|
||||||
def vacuumdir(dir):
|
|
||||||
if dir:
|
|
||||||
for fn in os.listdir(dir):
|
|
||||||
path = os.path.join(dir, fn)
|
|
||||||
if os.path.isfile(path):
|
|
||||||
os.unlink(path)
|
|
||||||
return True
|
|
||||||
|
|
||||||
class PDFExtractor:
|
class PDFExtractor:
|
||||||
def __init__(self, em):
|
def __init__(self, em):
|
||||||
self.currentindex = 0
|
self.currentindex = 0
|
||||||
@ -221,7 +212,7 @@ class PDFExtractor:
|
|||||||
# no big deal
|
# no big deal
|
||||||
return True
|
return True
|
||||||
try:
|
try:
|
||||||
vacuumdir(tmpdir)
|
tmpdir.vacuumdir()
|
||||||
# Note: the java version of pdftk sometimes/often fails
|
# Note: the java version of pdftk sometimes/often fails
|
||||||
# here with writing to stdout:
|
# here with writing to stdout:
|
||||||
# Error occurred during initialization of VM
|
# Error occurred during initialization of VM
|
||||||
@ -231,9 +222,9 @@ class PDFExtractor:
|
|||||||
# output, until we fix the error or preferably find a way
|
# output, until we fix the error or preferably find a way
|
||||||
# to do it with poppler...
|
# to do it with poppler...
|
||||||
subprocess.check_call(
|
subprocess.check_call(
|
||||||
[self.pdftk, self.filename, "unpack_files", "output",
|
[self.pdftk, self.filename, "unpack_files", "output", tmpdir.getpath()],
|
||||||
tmpdir], stdout=sys.stderr)
|
stdout=sys.stderr)
|
||||||
self.attachlist = sorted(os.listdir(tmpdir))
|
self.attachlist = sorted(os.listdir(tmpdir.getpath()))
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.em.rclog("extractAttach: failed: %s" % e)
|
self.em.rclog("extractAttach: failed: %s" % e)
|
||||||
@ -407,11 +398,12 @@ class PDFExtractor:
|
|||||||
def maybemaketmpdir(self):
|
def maybemaketmpdir(self):
|
||||||
global tmpdir
|
global tmpdir
|
||||||
if tmpdir:
|
if tmpdir:
|
||||||
if not vacuumdir(tmpdir):
|
if not tmpdir.vacuumdir():
|
||||||
self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
|
self.em.rclog("openfile: vacuumdir %s failed" % tmpdir.getpath())
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
|
tmpdir = rclexecm.SafeTmpDir("rclpdf", self.em)
|
||||||
|
#self.em.rclog("Using temporary directory %s" % tmpdir.getpath())
|
||||||
if self.pdftk and re.match("/snap/", self.pdftk):
|
if self.pdftk and re.match("/snap/", self.pdftk):
|
||||||
# We know this is Unix (Ubuntu actually). Check that tmpdir
|
# We know this is Unix (Ubuntu actually). Check that tmpdir
|
||||||
# belongs to the user as snap commands can't use /tmp to share
|
# belongs to the user as snap commands can't use /tmp to share
|
||||||
@ -423,9 +415,7 @@ class PDFExtractor:
|
|||||||
if st.st_uid == os.getuid():
|
if st.st_uid == os.getuid():
|
||||||
ok = True
|
ok = True
|
||||||
if not ok:
|
if not ok:
|
||||||
self.em.rclog(
|
self.em.rclog("pdftk is a snap command and needs TMPDIR to be owned by you")
|
||||||
"pdftk is a snap command and needs TMPDIR to be "
|
|
||||||
"a directory you own")
|
|
||||||
|
|
||||||
def _process_annotations(self, html):
|
def _process_annotations(self, html):
|
||||||
doc = Poppler.Document.new_from_file(
|
doc = Poppler.Document.new_from_file(
|
||||||
@ -530,7 +520,7 @@ class PDFExtractor:
|
|||||||
if not self.attextractdone:
|
if not self.attextractdone:
|
||||||
if not self.extractAttach():
|
if not self.extractAttach():
|
||||||
return (False, "", "", rclexecm.RclExecM.eofnow)
|
return (False, "", "", rclexecm.RclExecM.eofnow)
|
||||||
path = os.path.join(tmpdir, ipath)
|
path = os.path.join(tmpdir.getpath(), ipath)
|
||||||
if os.path.isfile(path):
|
if os.path.isfile(path):
|
||||||
f = open(path, "rb")
|
f = open(path, "rb")
|
||||||
docdata = f.read();
|
docdata = f.read();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user