OCR cache: do not create Path entries for temporary files

This commit is contained in:
Jean-Francois Dockes 2022-06-18 16:06:51 +02:00
parent 3071ea203e
commit 9a7561517f


@@ -22,37 +22,63 @@
 # OCR is extremely slow, caching the results is necessary.
 #
 # The cache stores 2 kinds of objects:
-# - Path files are named from the hash of the image file path and
-#   contain the image data hash, the modification time and size of the
-#   image file at the time the OCR'd data was stored in the cache, and
-#   the image path itself (the last is for purging only).
-# - Data files are named with the hash of the image data and contain
-#   the zlib-compressed OCR'd data.
+# - Path files are named from the hash of the image file path and contain the
+#   image data hash, the modification time and size of the image file at the
+#   time the OCR'd data was stored in the cache, and the image path itself (the
+#   last is for purging only).
+# - Data files are named with the hash of the image data and contain the
+#   zlib-compressed OCR'd data.
+# - The cache Path and Data files are stored under top subdirectories: objects/
+#   and paths/.
 #
 # When retrieving data from the cache:
-# - We first use the image file size and modification time: if an
-#   entry exists for the imagepath/mtime/size triplet, and is up to
-#   date, the corresponding data is obtained from the data file and
-#   returned.
-# - Else we then use the image data: if an entry exists for the
-#   computed hashed value of the data, it is returned. This allows
-#   moving files around without needing to run OCR again, but of
-#   course, it is more expensive than the first step
+# - We first use the image file size and modification time: if an entry exists
+#   for the imagepath/mtime/size triplet, and is up to date, the corresponding
+#   data is obtained from the data file and returned.
+# - Else we then use the image data: if an entry exists for the computed hashed
+#   value of the data, it is returned. This allows moving files around without
+#   needing to run OCR again, but of course, it is more expensive than the
+#   first step.
+#
+# In both cases, the paths are hashed with sha1, and the first two characters of
+# the hash are used as a top level directory, the rest as a file name. E.g. for
+# pd, pf = self._hashpath(path), the result would be stored under pd/pf.
 #
-# If we need to use the second step, as a side effect, a path file is
-# created or updated so that the data will be found with the first
-# step next time around.
+# If we need to use the second step, as a side effect, a path file is created or
+# updated so that the data will be found with the first step next time around.
 #
-# Purging the cache of obsolete data.
+# When processing embedded documents like email attachments, recoll uses
+# temporary copies in TMPDIR (which defaults to /tmp) or RECOLL_TMPDIR. Of
+# course the paths of the temporary files change when re-processing a given
+# document. We do not store the Path file for data stored in TMPDIR or
+# RECOLL_TMPDIR, because doing so would cause an indefinite accumulation of
+# unusable Path files. This means that access to the OCR data for these
+# documents always causes the computation of the data hash, and is slower. With
+# recent Recoll versions which cache the text content in the index, this only
+# occurs when reindexing (with older versions, this could also occur for
+# Preview).
 #
-# - The cache path and data files are stored under 2 different
-#   directories (objects, paths) to make purging easier.
-# - Purging the paths tree just involves walking it, reading the
-#   files, and checking the existence of the recorded paths.
-# - There is no easy way to purge the data tree. The only possibility
-#   is to input a list of possible source files (e.g. result of a
-#   find in the image files area), and compute all the hashes. Data
-#   files which do not match one of the hashes are deleted.
+# Purging the cache of obsolete data:
+#
+# This can be done by running this file as a top level script with a --purge
+# option (possibly completed by a --purgedata option, but see below).
+# - Purging the paths tree just involves walking it, reading the files, and
+#   checking the existence of the recorded paths. Path files for non-existent
+#   files are deleted.
+# - Purging the data tree: we make a list of all Data files referenced by at
+#   least one Path file, then walk the data tree, deleting unreferenced
+#   files. This means that Data files from temporary document copies (see
+#   above) will be deleted, which is quite unsatisfying. This would be
+#   difficult to change:
+#   - There is no way to detect the affected files because the Data files store
+#     no origin information.
+#   - Even if we wanted to store an indication that the data file comes from a
+#     temporary document, we'd have no way to access the original document
+#     because the full ipath is not available. Changing this would be close to
+#     impossible because internfile...
+# In consequence the --purgedata option must be explicitly added for a data
+# purge to be performed. Only set it if re-OCRing all embedded documents is
+# reasonable.
 
 import sys
 import os
@@ -61,10 +87,18 @@ import urllib.parse
 import zlib
 import glob
 
-import rclexecm
+from rclexecm import logmsg as _deb
 
-def _deb(s):
-    rclexecm.logmsg(s)
+def _catslash(p):
+    if p and p[-1] != "/":
+        p += "/"
+    return p
+
+_tmpdir = os.environ["TMPDIR"] if "TMPDIR" in os.environ else "/tmp"
+_tmpdir = _catslash(_tmpdir)
+_recoll_tmpdir = os.environ["RECOLL_TMPDIR"] if "RECOLL_TMPDIR" in os.environ else None
+_recoll_tmpdir = _catslash(_recoll_tmpdir)
 
 class OCRCache(object):
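The trailing slash added by _catslash matters for the prefix test used later in _updatepathfile: without it, TMPDIR=/tmp would also match files under an unrelated /tmpfoo directory. A quick standalone check of the behavior (hypothetical paths):

    def _catslash(p):    # same logic as above, repeated so the check runs standalone
        if p and p[-1] != "/":
            p += "/"
        return p

    assert _catslash("/tmp") == "/tmp/"
    assert "/tmp/rcltmpabc/attach.jpg".startswith("/tmp/")    # temporary copy
    assert not "/tmpfoo/scan.jpg".startswith("/tmp/")         # unrelated directory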
@@ -90,7 +124,7 @@ class OCRCache(object):
     # Compute sha1 of path data contents, as two parts of 2 and 38 chars
     def _hashdata(self, path):
-        #_deb("Hashing DATA")
+        # _deb("Hashing DATA")
         m = hashlib.sha1()
         with open(path, "rb") as f:
             while True:
@@ -101,35 +135,35 @@ class OCRCache(object):
         h = m.hexdigest()
         return h[0:2], h[2:]
 
     def _readpathfile(self, ppf):
         '''Read path file and return values. We do not decode the image path
         as this is only used for purging'''
         with open(ppf, 'r') as f:
             line = f.read()
-        dd,df,tm,sz,pth = line.split()
+        dd, df, tm, sz, pth = line.split()
         tm = int(tm)
         sz = int(sz)
-        return dd,df,tm,sz,pth
+        return dd, df, tm, sz, pth
 
     # Try to read the stored attributes for a given path: data hash,
     # modification time and size. If this fails, the path itself is
     # not cached (but the data still might be, maybe the file was moved)
     def _cachedpathattrs(self, path):
-        pd,pf = self._hashpath(path)
+        pd, pf = self._hashpath(path)
         pathfilepath = os.path.join(self.pathdir, pd, pf)
         if not os.path.exists(pathfilepath):
             return False, None, None, None, None
         try:
             dd, df, tm, sz, pth = self._readpathfile(pathfilepath)
             return True, dd, df, tm, sz
-        except:
+        except Exception as ex:
+            _deb(f"Error while trying to access pathfile {pathfilepath}: {ex}")
             return False, None, None, None, None
 
     # Compute the path hash, and get the mtime and size for given
     # path, for updating the cache path file
     def _newpathattrs(self, path):
-        pd,pf = self._hashpath(path)
+        pd, pf = self._hashpath(path)
         tm = int(os.path.getmtime(path))
         sz = int(os.path.getsize(path))
         return pd, pf, tm, sz
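Read together with _newpathattrs, a Path file holds one whitespace-separated record: the Data file directory and name parts, the integer mtime and size, and the image path. A hypothetical record (hash shortened; the path is assumed to be stored in a quoted form so it contains no spaces, which is what keeps line.split() safe):

    line = "3f a2c4e9 1655554011 2097152 /home/me/Images/scan%20one.jpg"
    dd, df, tm, sz, pth = line.split()
    tm, sz = int(tm), int(sz)    # 1655554011 (mtime), 2097152 (size in bytes)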
@@ -142,31 +176,25 @@ class OCRCache(object):
         if not ret:
             return False, None, None
         pd, pf, ntm, nsz = self._newpathattrs(path)
-        #_deb(" tm %d sz %d" % (ntm, nsz))
-        #_deb("otm %d osz %d" % (otm, osz))
+        # _deb(" tm %d sz %d" % (ntm, nsz))
+        # _deb("otm %d osz %d" % (otm, osz))
         if otm != ntm or osz != nsz:
             return False, None, None
         return True, od, of
 
-    # Check if cache appears up to date for path (no data check),
-    # return True/False
-    def pathincache(self, path):
-        ret, dd, df = self._pathincache(path)
-        return ret
-
     # Compute the data file name for path. Expensive: we compute the data hash.
     # Return both the data file path and path elements (for storage in path file)
     def _datafilename(self, path):
         d, f = self._hashdata(path)
         return os.path.join(self.objdir, d, f), d, f
 
-    # Check if the data for path is in cache: expensive, needs to
-    # compute the hash for the path's data contents. Returns True/False
-    def dataincache(self, path):
-        return os.path.exists(self._datafilename(path)[0])
-
     # Create path file with given elements.
     def _updatepathfile(self, pd, pf, dd, df, tm, sz, path):
+        global _tmpdir, _recoll_tmpdir
+        if (_tmpdir and path.startswith(_tmpdir)) or \
+           (_recoll_tmpdir and path.startswith(_recoll_tmpdir)):
+            _deb(f"ocrcache: not storing path data for temporary file {path}")
+            return
         dir = os.path.join(self.pathdir, pd)
         if not os.path.exists(dir):
             os.makedirs(dir)
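This guard is the point of the commit. With the hypothetical values below, store() still writes the Data file for a temporary copy, but no Path file, so nothing stale is left behind when the temporary path changes at the next run:

    _tmpdir = "/tmp/"                                # from TMPDIR, slash-terminated
    path = "/tmp/rcltmp1234/attachment.jpg"          # hypothetical extracted copy
    skip = _tmpdir and path.startswith(_tmpdir)      # True: Path file not written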
@@ -178,7 +206,7 @@ class OCRCache(object):
     # Store data for path. Only rewrite an existing data file if told
     # to do so: this is only useful if we are forcing an OCR re-run.
     def store(self, path, datatostore, force=False):
-        dd,df = self._hashdata(path)
+        dd, df = self._hashdata(path)
         pd, pf, tm, sz = self._newpathattrs(path)
         self._updatepathfile(pd, pf, dd, df, tm, sz, path)
         dir = os.path.join(self.objdir, dd)
@@ -186,7 +214,7 @@ class OCRCache(object):
             os.makedirs(dir)
         dfile = os.path.join(dir, df)
         if force or not os.path.exists(dfile):
-            #_deb("Storing data")
+            # _deb("Storing data")
             cpressed = zlib.compress(datatostore)
             with open(dfile, "wb") as f:
                 f.write(cpressed)
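store() writes the OCR text zlib-compressed; the reading side (in get(), not shown in this hunk) recovers it with zlib.decompress. A minimal roundtrip sketch:

    import zlib

    text = b"OCR'd text for one page\n"
    assert zlib.decompress(zlib.compress(text)) == text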
@@ -203,11 +231,12 @@ class OCRCache(object):
         dfn, dd, df = self._datafilename(path)
         if not os.path.exists(dfn):
+            _deb(f"ocrcache: no existing OCR data file for {path}")
             return False, b""
         if not pincache:
-            # File has moved. create/Update path file for next time
-            _deb("ocrcache::get file %s was moved, updating path data" % path)
+            # File may have moved. Create/Update path file for next time
+            _deb(f"ocrcache::get: data ok but path file for {path} does not exist: creating it")
             pd, pf, tm, sz = self._newpathattrs(path)
             self._updatepathfile(pd, pf, dd, df, tm, sz, path)
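This is the second lookup step from the header comment, with its side effect of recreating the Path file. A toy model of the mechanism (not the actual method body; names are invented):

    pathindex = {}                          # stands in for the Path files
    dataindex = {"k1": b"some OCR text"}    # stands in for the Data files

    def get(path, datakey_of):
        key = pathindex.get(path)           # step 1: cheap path lookup
        if key is None:
            key = datakey_of(path)          # step 2: expensive data hash
            pathindex[path] = key           # side effect: next lookup is cheap
        return dataindex.get(key)

    assert get("/new/place/scan.jpg", lambda p: "k1") == b"some OCR text"
    assert pathindex["/new/place/scan.jpg"] == "k1"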
@@ -223,7 +252,7 @@ class OCRCache(object):
         ntm = int(os.path.getmtime(origpath))
         nsz = int(os.path.getsize(origpath))
         if ntm != otm or nsz != osz:
-            #_deb("Purgepaths otm %d ntm %d osz %d nsz %d"%(otm, ntm, osz, nsz))
+            # _deb("Purgepaths otm %d ntm %d osz %d nsz %d" % (otm, ntm, osz, nsz))
             return True
         return False
@@ -251,15 +280,15 @@ class OCRCache(object):
     def _pgdt_pathcb(self, f):
         '''Get a pathfile name, read it, and record datafile identifier
         (concatenate data file subdir and file name)'''
-        #_deb("_pgdt_pathcb: %s" % f)
+        # _deb("_pgdt_pathcb: %s" % f)
         dd, df, tm, sz, orgpath = self._readpathfile(f)
         self._pgdt_alldatafns.add(dd+df)
 
     def _pgdt_datacb(self, datafn):
         '''Get a datafile name and check that it is referenced by a previously
         seen pathfile'''
-        p1,fn = os.path.split(datafn)
-        p2,dn = os.path.split(p1)
+        p1, fn = os.path.split(datafn)
+        p2, dn = os.path.split(p1)
         tst = dn+fn
         if tst in self._pgdt_alldatafns:
             _deb("purgedata: ok : %s" % datafn)
@@ -282,48 +311,59 @@ class OCRCache(object):
         self._walk(self.objdir, self._pgdt_datacb)
 
 if __name__ == '__main__':
     import rclconfig
-    def _Usage():
-        _deb("Usage: rclocrcache.py --purge")
+    import getopt
+
+    def Usage(f=sys.stderr):
+        print("Usage: rclocrcache.py --purge [--purgedata]", file=f)
+        print("Usage: rclocrcache.py --store <imgdatapath> <ocrdatapath>", file=f)
+        print("Usage: rclocrcache.py --get <imgdatapath>", file=f)
         sys.exit(1)
 
-    if len(sys.argv) != 2:
-        _Usage()
-    if sys.argv[1] != "--purge":
-        _Usage()
     conf = rclconfig.RclConfig()
     cache = OCRCache(conf)
-    cache.purgepaths()
-    cache.purgedata()
-    sys.exit(0)
+
+    opts, args = getopt.getopt(sys.argv[1:], "h", ["help", "purge", "purgedata", "store", "get"])
+    purgedata = False
+    purge = False
+    for opt, arg in opts:
+        if opt in ['-h', '--help']:
+            Usage(sys.stdout)
+        elif opt in ['--purgedata']:
+            purgedata = True
+        elif opt in ['--purge']:
+            if len(args) != 0:
+                Usage()
+            purge = True
+        elif opt in ['--store']:
+            if len(args) != 2:
+                Usage()
+            imgdatapath = args[0]
+            ocrdatapath = args[1]
+            ocrdata = open(ocrdatapath, "rb").read()
+            cache.store(imgdatapath, ocrdata, force=False)
+            sys.exit(0)
+        elif opt in ['--get']:
+            if len(args) != 1:
+                Usage()
+            imgdatapath = args[0]
+            incache, data = cache.get(imgdatapath)
+            if incache:
+                print(f"OCR data from cache: {data}")
+                sys.exit(0)
+            else:
+                print("OCR data was not found in cache", file=sys.stderr)
+                sys.exit(1)
+        else:
+            print(f"Unknown option {opt}", file=sys.stderr)
+            Usage()
+
+    # End options. Need purging?
+    if purge:
+        cache.purgepaths()
+        if purgedata:
+            cache.purgedata()
+        sys.exit(0)
+    Usage()
-
-# def trycache(p):
-#     _deb("== CACHE tests for %s" % p)
-#     ret = cache.pathincache(p)
-#     s = "" if ret else " not"
-#     _deb("path for %s%s in cache" % (p, s))
-#     if not ret:
-#         return False
-#     ret = cache.dataincache(p)
-#     s = "" if ret else " not"
-#     _deb("data for %s%s in cache" % (p, s))
-#     return ret
-# def trystore(p):
-#     _deb("== STORE test for %s" % p)
-#     cache.store(p, b"my OCR'd text is one line\n", force=False)
-# def tryget(p):
-#     _deb("== GET test for %s" % p)
-#     incache, data = cache.get(p)
-#     if incache:
-#         _deb("Data from cache [%s]" % data)
-#     else:
-#         _deb("Data was not found in cache")
-#     return incache, data
-# if False:
-#     path = sys.argv[1]
-#     incache, data = tryget(path)
-#     if not incache:
-#         trystore(path)
-#
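With the new option handling, the script doubles as a small cache maintenance tool, replacing the commented-out test scaffolding it removes. A hypothetical session (file names invented):

    python3 rclocrcache.py --store scan.jpg scan-ocr.txt   # seed the cache for scan.jpg
    python3 rclocrcache.py --get scan.jpg                  # print the cached OCR text
    python3 rclocrcache.py --purge                         # delete stale Path files only
    python3 rclocrcache.py --purge --purgedata             # also sweep unreferenced Data files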