OCR cache: do not create Path entries for temporary files

This commit is contained in:
Jean-Francois Dockes 2022-06-18 16:06:51 +02:00
parent 3071ea203e
commit 9a7561517f


@@ -22,37 +22,63 @@
 # OCR is extremely slow, caching the results is necessary.
 #
 # The cache stores 2 kinds of objects:
-# - Path files are named from the hash of the image file path and
-#   contain the image data hash, the modification time and size of the
-#   image file at the time the OCR'd data was stored in the cache, and
-#   the image path itself (the last is for purging only).
-# - Data files are named with the hash of the image data and contain
-#   the zlib-compressed OCR'd data.
+# - Path files are named from the hash of the image file path and contain the
+#   image data hash, the modification time and size of the image file at the
+#   time the OCR'd data was stored in the cache, and the image path itself (the
+#   last is for purging only).
+# - Data files are named with the hash of the image data and contain the
+#   zlib-compressed OCR'd data.
+# - The cache Path and Data files are stored under top subdirectories: objects/
+#   and paths/.
 #
 # When retrieving data from the cache:
-# - We first use the image file size and modification time: if an
-#   entry exists for the imagepath/mtime/size triplet, and is up to
-#   date, the corresponding data is obtained from the data file and
-#   returned.
-# - Else we then use the image data: if an entry exists for the
-#   computed hashed value of the data, it is returned. This allows
-#   moving files around without needing to run OCR again, but of
-#   course, it is more expensive than the first step
+# - We first use the image file size and modification time: if an entry exists
+#   for the imagepath/mtime/size triplet, and is up to date, the corresponding
+#   data is obtained from the data file and returned.
+# - Else we then use the image data: if an entry exists for the computed hashed
+#   value of the data, it is returned. This allows moving files around without
+#   needing to run OCR again, but of course, it is more expensive than the
+#   first step.
+#
+# In both cases, the paths are hashed with sha1, and the first two characters of
+# the hash are used as a top level directory, the rest as a file name. E.g. for
+# pd, pf = self._hashpath(path), the result would be stored under pd/pf.
 #
-# If we need to use the second step, as a side effect, a path file is
-# created or updated so that the data will be found with the first
-# step next time around.
+# If we need to use the second step, as a side effect, a path file is created or
+# updated so that the data will be found with the first step next time around.
 #
-# Purging the cache of obsolete data.
+# When processing embedded documents like email attachments, recoll uses
+# temporary copies in TMPDIR (which defaults to /tmp) or RECOLL_TMPDIR. Of
+# course the paths of the temporary files change when re-processing a given
+# document. We do not store the Path file for data stored in TMPDIR or
+# RECOLL_TMPDIR, because doing so would cause an indefinite accumulation of
+# unusable Path files. This means that access to the OCR data for these
+# documents always causes the computation of the data hash, and is slower. With
+# recent Recoll versions which cache the text content in the index, this only
+# occurs when reindexing (with older versions, this could also occur for
+# Preview).
 #
-# - The cache path and data files are stored under 2 different
-#   directories (objects, paths) to make purging easier.
-# - Purging the paths tree just involves walking it, reading the
-#   files, and checking the existence of the recorded paths.
-# - There is no easy way to purge the data tree. The only possibility
-#   is to input a list of possible source files (e.g. result of a
-#   find in the image files area), and compute all the hashes. Data
-#   files which do not match one of the hashes are deleted.
+# Purging the cache of obsolete data:
+#
+# This can be done by running this file as a top level script with a --purge
+# option (possibly completed by a --purgedata option, but see below).
+# - Purging the paths tree just involves walking it, reading the files, and
+#   checking the existence of the recorded paths. Path files for non-existent
+#   files are deleted.
+# - Purging the data tree: we make a list of all Data files referenced by at
+#   least one Path file, then walk the data tree, deleting unreferenced
+#   files. This means that Data files from temporary document copies (see
+#   above) will be deleted, which is quite unsatisfying. This would be
+#   difficult to change:
+#   - There is no way to detect the affected files because the Data files store
+#     no origin information.
+#   - Even if we wanted to store an indication that the data file comes from a
+#     temporary document, we'd have no way to access the original document
+#     because the full ipath is not available. Changing this would be close to
+#     impossible because internfile...
+# In consequence the --purgedata option must be explicitly added for a data
+# purge to be performed. Only set it if re-OCRing all embedded documents is
+# reasonable.
 
 import sys
 import os
@@ -61,10 +87,18 @@ import urllib.parse
 import zlib
 import glob
 
-import rclexecm
+from rclexecm import logmsg as _deb
 
-def _deb(s):
-    rclexecm.logmsg(s)
+def _catslash(p):
+    if p and p[-1] != "/":
+        p += "/"
+    return p
+
+_tmpdir = os.environ["TMPDIR"] if "TMPDIR" in os.environ else "/tmp"
+_tmpdir = _catslash(_tmpdir)
+_recoll_tmpdir = os.environ["RECOLL_TMPDIR"] if "RECOLL_TMPDIR" in os.environ else None
+_recoll_tmpdir = _catslash(_recoll_tmpdir)
 
 class OCRCache(object):
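The trailing slash added by _catslash matters for the prefix test used later in _updatepathfile: without it, TMPDIR=/tmp would also match files under an unrelated /tmpfoo directory. A quick standalone check of the behavior (hypothetical paths):

    def _catslash(p):    # same logic as above, repeated so the check runs standalone
        if p and p[-1] != "/":
            p += "/"
        return p

    assert _catslash("/tmp") == "/tmp/"
    assert "/tmp/rcltmpabc/attach.jpg".startswith("/tmp/")    # temporary copy
    assert not "/tmpfoo/scan.jpg".startswith("/tmp/")         # unrelated directory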
@@ -90,7 +124,7 @@ class OCRCache(object):
     # Compute sha1 of path data contents, as two parts of 2 and 38 chars
     def _hashdata(self, path):
-        #_deb("Hashing DATA")
+        # _deb("Hashing DATA")
         m = hashlib.sha1()
         with open(path, "rb") as f:
             while True:
@@ -101,35 +135,35 @@ class OCRCache(object):
         h = m.hexdigest()
         return h[0:2], h[2:]
 
     def _readpathfile(self, ppf):
         '''Read path file and return values. We do not decode the image path
         as this is only used for purging'''
         with open(ppf, 'r') as f:
             line = f.read()
-        dd,df,tm,sz,pth = line.split()
+        dd, df, tm, sz, pth = line.split()
         tm = int(tm)
         sz = int(sz)
-        return dd,df,tm,sz,pth
+        return dd, df, tm, sz, pth
 
     # Try to read the stored attributes for a given path: data hash,
     # modification time and size. If this fails, the path itself is
     # not cached (but the data still might be, maybe the file was moved)
     def _cachedpathattrs(self, path):
-        pd,pf = self._hashpath(path)
+        pd, pf = self._hashpath(path)
         pathfilepath = os.path.join(self.pathdir, pd, pf)
         if not os.path.exists(pathfilepath):
             return False, None, None, None, None
         try:
             dd, df, tm, sz, pth = self._readpathfile(pathfilepath)
             return True, dd, df, tm, sz
-        except:
+        except Exception as ex:
+            _deb(f"Error while trying to access pathfile {pathfilepath}: {ex}")
             return False, None, None, None, None
 
     # Compute the path hash, and get the mtime and size for given
     # path, for updating the cache path file
     def _newpathattrs(self, path):
-        pd,pf = self._hashpath(path)
+        pd, pf = self._hashpath(path)
         tm = int(os.path.getmtime(path))
         sz = int(os.path.getsize(path))
         return pd, pf, tm, sz
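Read together with _newpathattrs, a Path file holds one whitespace-separated record: the Data file directory and name parts, the integer mtime and size, and the image path. A hypothetical record (hash shortened; the path is assumed to be stored in a quoted form so it contains no spaces, which is what keeps line.split() safe):

    line = "3f a2c4e9 1655554011 2097152 /home/me/Images/scan%20one.jpg"
    dd, df, tm, sz, pth = line.split()
    tm, sz = int(tm), int(sz)    # 1655554011 (mtime), 2097152 (size in bytes)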
@@ -142,31 +176,25 @@ class OCRCache(object):
         if not ret:
             return False, None, None
         pd, pf, ntm, nsz = self._newpathattrs(path)
-        #_deb(" tm %d sz %d" % (ntm, nsz))
-        #_deb("otm %d osz %d" % (otm, osz))
+        # _deb(" tm %d sz %d" % (ntm, nsz))
+        # _deb("otm %d osz %d" % (otm, osz))
         if otm != ntm or osz != nsz:
             return False, None, None
         return True, od, of
 
-    # Check if cache appears up to date for path (no data check),
-    # return True/False
-    def pathincache(self, path):
-        ret, dd, df = self._pathincache(path)
-        return ret
-
     # Compute the data file name for path. Expensive: we compute the data hash.
     # Return both the data file path and path elements (for storage in path file)
     def _datafilename(self, path):
         d, f = self._hashdata(path)
         return os.path.join(self.objdir, d, f), d, f
 
-    # Check if the data for path is in cache: expensive, needs to
-    # compute the hash for the path's data contents. Returns True/False
-    def dataincache(self, path):
-        return os.path.exists(self._datafilename(path)[0])
-
     # Create path file with given elements.
     def _updatepathfile(self, pd, pf, dd, df, tm, sz, path):
+        global _tmpdir, _recoll_tmpdir
+        if (_tmpdir and path.startswith(_tmpdir)) or \
+           (_recoll_tmpdir and path.startswith(_recoll_tmpdir)):
+            _deb(f"ocrcache: not storing path data for temporary file {path}")
+            return
         dir = os.path.join(self.pathdir, pd)
         if not os.path.exists(dir):
             os.makedirs(dir)
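This guard is the point of the commit. With the hypothetical values below, store() still writes the Data file for a temporary copy, but no Path file, so nothing stale is left behind when the temporary path changes at the next run:

    _tmpdir = "/tmp/"                                # from TMPDIR, slash-terminated
    path = "/tmp/rcltmp1234/attachment.jpg"          # hypothetical extracted copy
    skip = _tmpdir and path.startswith(_tmpdir)      # True: Path file not written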
@@ -178,7 +206,7 @@ class OCRCache(object):
     # Store data for path. Only rewrite an existing data file if told
     # to do so: this is only useful if we are forcing an OCR re-run.
     def store(self, path, datatostore, force=False):
-        dd,df = self._hashdata(path)
+        dd, df = self._hashdata(path)
         pd, pf, tm, sz = self._newpathattrs(path)
         self._updatepathfile(pd, pf, dd, df, tm, sz, path)
         dir = os.path.join(self.objdir, dd)
@@ -186,7 +214,7 @@ class OCRCache(object):
             os.makedirs(dir)
         dfile = os.path.join(dir, df)
         if force or not os.path.exists(dfile):
-            #_deb("Storing data")
+            # _deb("Storing data")
             cpressed = zlib.compress(datatostore)
             with open(dfile, "wb") as f:
                 f.write(cpressed)
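store() writes the OCR text zlib-compressed; the reading side (in get(), not shown in this hunk) recovers it with zlib.decompress. A minimal roundtrip sketch:

    import zlib

    text = b"OCR'd text for one page\n"
    assert zlib.decompress(zlib.compress(text)) == text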
@@ -203,11 +231,12 @@ class OCRCache(object):
         dfn, dd, df = self._datafilename(path)
         if not os.path.exists(dfn):
+            _deb(f"ocrcache: no existing OCR data file for {path}")
             return False, b""
         if not pincache:
-            # File has moved. create/Update path file for next time
-            _deb("ocrcache::get file %s was moved, updating path data" % path)
+            # File may have moved. Create/Update path file for next time
+            _deb(f"ocrcache::get: data ok but path file for {path} does not exist: creating it")
             pd, pf, tm, sz = self._newpathattrs(path)
             self._updatepathfile(pd, pf, dd, df, tm, sz, path)
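This is the second lookup step from the header comment, with its side effect of recreating the Path file. A toy model of the mechanism (not the actual method body; names are invented):

    pathindex = {}                          # stands in for the Path files
    dataindex = {"k1": b"some OCR text"}    # stands in for the Data files

    def get(path, datakey_of):
        key = pathindex.get(path)           # step 1: cheap path lookup
        if key is None:
            key = datakey_of(path)          # step 2: expensive data hash
            pathindex[path] = key           # side effect: next lookup is cheap
        return dataindex.get(key)

    assert get("/new/place/scan.jpg", lambda p: "k1") == b"some OCR text"
    assert pathindex["/new/place/scan.jpg"] == "k1"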
@@ -223,7 +252,7 @@ class OCRCache(object):
         ntm = int(os.path.getmtime(origpath))
         nsz = int(os.path.getsize(origpath))
         if ntm != otm or nsz != osz:
-            #_deb("Purgepaths otm %d ntm %d osz %d nsz %d"%(otm, ntm, osz, nsz))
+            # _deb("Purgepaths otm %d ntm %d osz %d nsz %d" % (otm, ntm, osz, nsz))
             return True
         return False
@@ -251,15 +280,15 @@ class OCRCache(object):
     def _pgdt_pathcb(self, f):
         '''Get a pathfile name, read it, and record datafile identifier
         (concatenate data file subdir and file name)'''
-        #_deb("_pgdt_pathcb: %s" % f)
+        # _deb("_pgdt_pathcb: %s" % f)
         dd, df, tm, sz, orgpath = self._readpathfile(f)
         self._pgdt_alldatafns.add(dd+df)
 
     def _pgdt_datacb(self, datafn):
         '''Get a datafile name and check that it is referenced by a previously
         seen pathfile'''
-        p1,fn = os.path.split(datafn)
-        p2,dn = os.path.split(p1)
+        p1, fn = os.path.split(datafn)
+        p2, dn = os.path.split(p1)
         tst = dn+fn
         if tst in self._pgdt_alldatafns:
             _deb("purgedata: ok : %s" % datafn)
@@ -282,48 +311,59 @@ class OCRCache(object):
         self._walk(self.objdir, self._pgdt_datacb)
 
 if __name__ == '__main__':
     import rclconfig
-    def _Usage():
-        _deb("Usage: rclocrcache.py --purge")
+    import getopt
+
+    def Usage(f=sys.stderr):
+        print("Usage: rclocrcache.py --purge [--purgedata]", file=f)
+        print("Usage: rclocrcache.py --store <imgdatapath> <ocrdatapath>", file=f)
+        print("Usage: rclocrcache.py --get <imgdatapath>", file=f)
         sys.exit(1)
 
-    if len(sys.argv) != 2:
-        _Usage()
-    if sys.argv[1] != "--purge":
-        _Usage()
     conf = rclconfig.RclConfig()
     cache = OCRCache(conf)
-    cache.purgepaths()
-    cache.purgedata()
-    sys.exit(0)
+
+    opts, args = getopt.getopt(sys.argv[1:], "h", ["help", "purge", "purgedata", "store", "get"])
+    purgedata = False
+    purge = False
+    for opt, arg in opts:
+        if opt in ['-h', '--help']:
+            Usage(sys.stdout)
+        elif opt in ['--purgedata']:
+            purgedata = True
+        elif opt in ['--purge']:
+            if len(args) != 0:
+                Usage()
+            purge = True
+        elif opt in ['--store']:
+            if len(args) != 2:
+                Usage()
+            imgdatapath = args[0]
+            ocrdatapath = args[1]
+            ocrdata = open(ocrdatapath, "rb").read()
+            cache.store(imgdatapath, ocrdata, force=False)
+            sys.exit(0)
+        elif opt in ['--get']:
+            if len(args) != 1:
+                Usage()
+            imgdatapath = args[0]
+            incache, data = cache.get(imgdatapath)
+            if incache:
+                print(f"OCR data from cache: {data}")
+                sys.exit(0)
+            else:
+                print("OCR data was not found in cache", file=sys.stderr)
+                sys.exit(1)
+        else:
+            print(f"Unknown option {opt}", file=sys.stderr)
+            Usage()
+
+    # End options. Need purging?
+    if purge:
+        cache.purgepaths()
+        if purgedata:
+            cache.purgedata()
+        sys.exit(0)
+    Usage()
-
-# def trycache(p):
-#     _deb("== CACHE tests for %s" % p)
-#     ret = cache.pathincache(p)
-#     s = "" if ret else " not"
-#     _deb("path for %s%s in cache" % (p, s))
-#     if not ret:
-#         return False
-#     ret = cache.dataincache(p)
-#     s = "" if ret else " not"
-#     _deb("data for %s%s in cache" % (p, s))
-#     return ret
-# def trystore(p):
-#     _deb("== STORE test for %s" % p)
-#     cache.store(p, b"my OCR'd text is one line\n", force=False)
-# def tryget(p):
-#     _deb("== GET test for %s" % p)
-#     incache, data = cache.get(p)
-#     if incache:
-#         _deb("Data from cache [%s]" % data)
-#     else:
-#         _deb("Data was not found in cache")
-#     return incache, data
-# if False:
-#     path = sys.argv[1]
-#     incache, data = tryget(path)
-#     if not incache:
-#         trystore(path)
-#
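With the new option handling, the script doubles as a small cache maintenance tool, replacing the commented-out test scaffolding it removes. A hypothetical session (file names invented):

    python3 rclocrcache.py --store scan.jpg scan-ocr.txt   # seed the cache for scan.jpg
    python3 rclocrcache.py --get scan.jpg                  # print the cached OCR text
    python3 rclocrcache.py --purge                         # delete stale Path files only
    python3 rclocrcache.py --purge --purgedata             # also sweep unreferenced Data files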