From 9a7561517f1858d9d6f1de781b7f0481c7dcb1e8 Mon Sep 17 00:00:00 2001
From: Jean-Francois Dockes
Date: Sat, 18 Jun 2022 16:06:51 +0200
Subject: [PATCH] OCR cache: do not create Path entries for temporary files

---
 src/filters/rclocrcache.py | 246 +++++++++++++++++++++----------------
 1 file changed, 143 insertions(+), 103 deletions(-)

diff --git a/src/filters/rclocrcache.py b/src/filters/rclocrcache.py
index 6d71994e..f885ca86 100755
--- a/src/filters/rclocrcache.py
+++ b/src/filters/rclocrcache.py
@@ -22,37 +22,63 @@
 # OCR is extremely slow, caching the results is necessary.
 #
 # The cache stores 2 kinds of objects:
-# - Path files are named from the hash of the image file path and
-# contain the image data hash, the modification time and size of the
-# image file at the time the OCR'd data was stored in the cache, and
-# the image path itself (the last is for purging only).
-# - Data files are named with the hash of the image data and contain
-# the zlib-compressed OCR'd data.
+# - Path files are named from the hash of the image file path and contain the
+# image data hash, the modification time and size of the image file at the
+# time the OCR'd data was stored in the cache, and the image path itself (the
+# last is for purging only).
+# - Data files are named with the hash of the image data and contain the
+# zlib-compressed OCR'd data.
+# - The cache Path and Data files are stored under two top subdirectories:
+# objects/ and paths/.
 #
 # When retrieving data from the cache:
-# - We first use the image file size and modification time: if an
-# entry exists for the imagepath/mtime/size triplet, and is up to
-# date, the corresponding data is obtained from the data file and
-# returned.
-# - Else we then use the image data: if an entry exists for the
-# computed hashed value of the data, it is returned. This allows
-# moving files around without needing to run OCR again, but of
-# course, it is more expensive than the first step
+# - We first use the image file size and modification time: if an entry exists
+# for the imagepath/mtime/size triplet, and is up to date, the corresponding
+# data is obtained from the data file and returned.
+# - Else we use the image data: if an entry exists for the computed hash
+# value of the data, it is returned. This allows moving files around without
+# needing to run OCR again, but of course it is more expensive than the
+# first step.
 #
-# If we need to use the second step, as a side effect, a path file is
-# created or updated so that the data will be found with the first
-# step next time around.
+# In both cases, the names are derived from a sha1 hash: the first two
+# characters of the hash are used as a top level directory, the rest as a
+# file name. E.g. for pd, pf = self._hashpath(path), the result would be
+# stored under pd/pf.
 #
-# Purging the cache of obsolete data.
+# If we need to use the second step, as a side effect, a path file is created
+# or updated so that the data will be found with the first step next time
+# around.
 #
-# - The cache path and data files are stored under 2 different
-# directories (objects, paths) to make purging easier.
-# - Purging the paths tree just involves walking it, reading the
-# files, and checking the existence of the recorded paths.
-# - There is no easy way to purge the data tree. The only possibility
-# is to input a list of possible source files (e.g. result of a
-# find in the image files area), and compute all the hashes. Data
-# files which do not match one of the hashes are deleted.
+# When processing embedded documents like email attachments, recoll uses
+# temporary copies in TMPDIR (which defaults to /tmp) or RECOLL_TMPDIR. Of
+# course the paths of the temporary files change when re-processing a given
+# document. We do not store the Path file for data stored in TMPDIR or
+# RECOLL_TMPDIR, because doing so would cause an indefinite accumulation of
+# unusable Path files. This means that access to the OCR data for these
+# documents always causes the computation of the data hash, and is slower.
+# With recent Recoll versions which cache the text content in the index, this
+# only occurs when reindexing (with older versions, this could also occur for
+# Preview).
+#
+# Purging the cache of obsolete data:
+#
+# This can be done by running this file as a top level script with a --purge
+# option (possibly complemented by a --purgedata option, but see below).
+# - Purging the paths tree just involves walking it, reading the files, and
+# checking the existence of the recorded paths. Path files for non-existent
+# files are deleted.
+# - Purging the data tree: we make a list of all Data files referenced by at
+# least one Path file, then walk the data tree, deleting unreferenced
+# files. This means that Data files from temporary document copies (see
+# above) will be deleted, which is quite unsatisfying. This would be
+# difficult to change:
+#   - There is no way to detect the affected files because the Data files
+#     store no origin information.
+#   - Even if we wanted to store an indication that the data file comes from
+#     a temporary document, we'd have no way to access the original document
+#     because the full ipath is not available. Changing this would be close
+#     to impossible because internfile...
+# In consequence the --purgedata option must be explicitly added for a data
+# purge to be performed. Only set it if re-OCRing all embedded documents is
+# reasonable.
 
 import sys
 import os
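As an illustration of the naming scheme described in the comment above, here is a minimal standalone sketch (not part of the patch; the cache root and image path are invented, and the real paths/ and objects/ roots come from the Recoll configuration):

    import hashlib
    import os

    cacheroot = "/home/me/.cache/rclocrcache"   # hypothetical location

    def hashname(key: bytes):
        # sha1 hex digest, split into a 2-char subdirectory and a 38-char file name
        h = hashlib.sha1(key).hexdigest()
        return h[0:2], h[2:]

    pd, pf = hashname(b"/home/me/scans/invoice.png")
    pathfile = os.path.join(cacheroot, "paths", pd, pf)
    # Data files live under objects/ in the same way, keyed on the image data hash.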
@@ -61,11 +87,19 @@
 import urllib.parse
 import zlib
 import glob
-import rclexecm
+from rclexecm import logmsg as _deb
+
+def _catslash(p):
+    if p and p[-1] != "/":
+        p += "/"
+    return p
+
+
+_tmpdir = os.environ["TMPDIR"] if "TMPDIR" in os.environ else "/tmp"
+_tmpdir = _catslash(_tmpdir)
+_recoll_tmpdir = os.environ["RECOLL_TMPDIR"] if "RECOLL_TMPDIR" in os.environ else None
+_recoll_tmpdir = _catslash(_recoll_tmpdir)
-def _deb(s):
-    rclexecm.logmsg(s)
-
 
 class OCRCache(object):
     def __init__(self, conf):
@@ -90,7 +124,7 @@
 
     # Compute sha1 of path data contents, as two parts of 2 and 38 chars
    def _hashdata(self, path):
-        #_deb("Hashing DATA")
+        # _deb("Hashing DATA")
         m = hashlib.sha1()
         with open(path, "rb") as f:
             while True:
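For concreteness, here is a sketch of the one-line record parsed by the _readpathfile() method in the next hunk: five space-separated fields (the field values below are made up). The image path is presumably stored in a whitespace-free form such as urllib.parse.quote() output, since the line is parsed with split():

    import urllib.parse

    dd = "ab"                                      # data hash, directory part
    df = "cdef0123456789abcdef0123456789abcdef01"  # data hash, file name part
    record = " ".join([dd, df, "1655560011", "123456",
                       urllib.parse.quote("/home/me/scans/my invoice.png")])
    dd, df, tm, sz, pth = record.split()
    tm, sz = int(tm), int(sz)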
@@ -101,39 +135,39 @@
         h = m.hexdigest()
         return h[0:2], h[2:]
-
     def _readpathfile(self, ppf):
         '''Read path file and return values. We do not decode the image path
         as this is only used for purging'''
         with open(ppf, 'r') as f:
             line = f.read()
-        dd,df,tm,sz,pth = line.split()
+        dd, df, tm, sz, pth = line.split()
         tm = int(tm)
         sz = int(sz)
-        return dd,df,tm,sz,pth
-
+        return dd, df, tm, sz, pth
+
     # Try to read the stored attributes for a given path: data hash,
     # modification time and size. If this fails, the path itself is
     # not cached (but the data still might be, maybe the file was moved)
     def _cachedpathattrs(self, path):
-        pd,pf = self._hashpath(path)
+        pd, pf = self._hashpath(path)
         pathfilepath = os.path.join(self.pathdir, pd, pf)
         if not os.path.exists(pathfilepath):
             return False, None, None, None, None
         try:
             dd, df, tm, sz, pth = self._readpathfile(pathfilepath)
             return True, dd, df, tm, sz
-        except:
+        except Exception as ex:
+            _deb(f"Error while trying to access pathfile {pathfilepath}: {ex}")
             return False, None, None, None, None
 
     # Compute the path hash, and get the mtime and size for given
     # path, for updating the cache path file
     def _newpathattrs(self, path):
-        pd,pf = self._hashpath(path)
+        pd, pf = self._hashpath(path)
         tm = int(os.path.getmtime(path))
         sz = int(os.path.getsize(path))
         return pd, pf, tm, sz
-
+
     # Check if the cache appears up to date for a given path, only
     # using the modification time and size. Return the data file path
     # elements if we get a hit.
@@ -142,31 +176,25 @@
         if not ret:
             return False, None, None
         pd, pf, ntm, nsz = self._newpathattrs(path)
-        #_deb(" tm %d sz %d" % (ntm, nsz))
-        #_deb("otm %d osz %d" % (otm, osz))
+        # _deb(" tm %d sz %d" % (ntm, nsz))
+        # _deb("otm %d osz %d" % (otm, osz))
         if otm != ntm or osz != nsz:
             return False, None, None
         return True, od, of
 
-    # Check if cache appears up to date for path (no data check),
-    # return True/False
-    def pathincache(self, path):
-        ret, dd, df = self._pathincache(path)
-        return ret
-
     # Compute the data file name for path. Expensive: we compute the data hash.
     # Return both the data file path and path elements (for storage in path file)
     def _datafilename(self, path):
         d, f = self._hashdata(path)
         return os.path.join(self.objdir, d, f), d, f
 
-    # Check if the data for path is in cache: expensive, needs to
-    # compute the hash for the path's data contents. Returns True/False
-    def dataincache(self, path):
-        return os.path.exists(self._datafilename(path)[0])
-
     # Create path file with given elements.
     def _updatepathfile(self, pd, pf, dd, df, tm, sz, path):
+        global _tmpdir, _recoll_tmpdir
+        if (_tmpdir and path.startswith(_tmpdir)) or \
+           (_recoll_tmpdir and path.startswith(_recoll_tmpdir)):
+            _deb(f"ocrcache: not storing path data for temporary file {path}")
+            return
         dir = os.path.join(self.pathdir, pd)
         if not os.path.exists(dir):
             os.makedirs(dir)
@@ -178,7 +206,7 @@
     # Store data for path. Only rewrite an existing data file if told
     # to do so: this is only useful if we are forcing an OCR re-run.
     def store(self, path, datatostore, force=False):
-        dd,df = self._hashdata(path)
+        dd, df = self._hashdata(path)
         pd, pf, tm, sz = self._newpathattrs(path)
         self._updatepathfile(pd, pf, dd, df, tm, sz, path)
         dir = os.path.join(self.objdir, dd)
@@ -186,7 +214,7 @@
             os.makedirs(dir)
         dfile = os.path.join(dir, df)
         if force or not os.path.exists(dfile):
-            #_deb("Storing data")
+            # _deb("Storing data")
             cpressed = zlib.compress(datatostore)
             with open(dfile, "wb") as f:
                 f.write(cpressed)
@@ -203,11 +231,12 @@
         dfn, dd, df = self._datafilename(path)
         if not os.path.exists(dfn):
+            _deb(f"ocrcache: no existing OCR data file for {path}")
             return False, b""
 
         if not pincache:
-            # File has moved. create/Update path file for next time
-            _deb("ocrcache::get file %s was moved, updating path data" % path)
+            # File may have moved. Create/Update path file for next time
+            _deb(f"ocrcache::get: data ok but path file for {path} does not exist: creating it")
             pd, pf, tm, sz = self._newpathattrs(path)
             self._updatepathfile(pd, pf, dd, df, tm, sz, path)
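The core of the patch is the early return added to _updatepathfile() above. Here is a self-contained sketch of just that test, with the same environment-variable defaulting as the patch (the example paths are invented):

    import os

    def _catslash(p):
        if p and p[-1] != "/":
            p += "/"
        return p

    _tmpdir = _catslash(os.environ.get("TMPDIR", "/tmp"))
    _recoll_tmpdir = _catslash(os.environ.get("RECOLL_TMPDIR"))

    def is_temporary_copy(path):
        # True for paths under TMPDIR or RECOLL_TMPDIR: no Path entry is stored for these
        return bool((_tmpdir and path.startswith(_tmpdir)) or
                    (_recoll_tmpdir and path.startswith(_recoll_tmpdir)))

    print(is_temporary_copy(os.path.join(_tmpdir, "rcltmpabc123.jpg")))  # True
    print(is_temporary_copy("/home/me/scans/invoice.png"))               # normally False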
@@ -223,10 +252,10 @@
         ntm = int(os.path.getmtime(origpath))
         nsz = int(os.path.getsize(origpath))
         if ntm != otm or nsz != osz:
-            #_deb("Purgepaths otm %d ntm %d osz %d nsz %d"%(otm, ntm, osz, nsz))
+            # _deb("Purgepaths otm %d ntm %d osz %d nsz %d"%(otm, ntm, osz, nsz))
             return True
         return False
-
+
     def purgepaths(self):
         '''Remove all stale pathfiles: source image does not exist or has been
         changed. Mostly useful for removed files, modified ones would be
@@ -251,15 +280,15 @@
     def _pgdt_pathcb(self, f):
         '''Get a pathfile name, read it, and record datafile identifier
         (concatenate data file subdir and file name)'''
-        #_deb("_pgdt_pathcb: %s" % f)
+        # _deb("_pgdt_pathcb: %s" % f)
         dd, df, tm, sz, orgpath = self._readpathfile(f)
         self._pgdt_alldatafns.add(dd+df)
 
     def _pgdt_datacb(self, datafn):
         '''Get a datafile name and check that it is referenced by a previously
         seen pathfile'''
-        p1,fn = os.path.split(datafn)
-        p2,dn = os.path.split(p1)
+        p1, fn = os.path.split(datafn)
+        p2, dn = os.path.split(p1)
         tst = dn+fn
         if tst in self._pgdt_alldatafns:
             _deb("purgedata: ok : %s" % datafn)
@@ -267,7 +296,7 @@
         else:
             _deb("purgedata: removing : %s" % datafn)
             os.remove(datafn)
-
+
     def purgedata(self):
         '''Remove all data files which do not match any from the input list,
         based on data contents hash. We make a list of all data files
@@ -280,50 +309,61 @@
         self._pgdt_alldatafns = set()
         self._walk(self.pathdir, self._pgdt_pathcb)
         self._walk(self.objdir, self._pgdt_datacb)
-
-
+
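To make the two-pass purge concrete, here is a simplified sketch of what the _pgdt_pathcb()/_pgdt_datacb() callbacks amount to, using plain os.walk() instead of the class's _walk() helper (pathdir and objdir stand for the paths/ and objects/ cache subdirectories):

    import os

    def purge_data_sketch(pathdir, objdir):
        # Pass 1: record the dd+df key of every Data file referenced by a Path file.
        referenced = set()
        for root, dirs, files in os.walk(pathdir):
            for name in files:
                with open(os.path.join(root, name)) as f:
                    dd, df, tm, sz, pth = f.read().split()
                referenced.add(dd + df)
        # Pass 2: delete Data files whose subdir+name key is not referenced.
        for root, dirs, files in os.walk(objdir):
            for name in files:
                if os.path.basename(root) + name not in referenced:
                    os.remove(os.path.join(root, name))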
 if __name__ == '__main__':
     import rclconfig
-    def _Usage():
-        _deb("Usage: rclocrcache.py --purge")
+    import getopt
+
+    def Usage(f=sys.stderr):
+        print("Usage: rclocrcache.py --purge [--purgedata]", file=f)
+        print("Usage: rclocrcache.py --store <imagepath> <ocrdatafile>", file=f)
+        print("Usage: rclocrcache.py --get <imagepath>", file=f)
         sys.exit(1)
-    if len(sys.argv) != 2:
-        _Usage()
-    if sys.argv[1] != "--purge":
-        _Usage()
-
+
     conf = rclconfig.RclConfig()
     cache = OCRCache(conf)
-    cache.purgepaths()
-    cache.purgedata()
-    sys.exit(0)
-
-# def trycache(p):
-#     _deb("== CACHE tests for %s"%p)
-#     ret = cache.pathincache(p)
-#     s = "" if ret else " not"
-#     _deb("path for %s%s in cache" % (p, s))
-#     if not ret:
-#         return False
-#     ret = cache.dataincache(p)
-#     s = "" if ret else " not"
-#     _deb("data for %s%s in cache" % (p, s))
-#     return ret
-# def trystore(p):
-#     _deb("== STORE test for %s" % p)
-#     cache.store(p, b"my OCR'd text is one line\n", force=False)
-# def tryget(p):
-#     _deb("== GET test for %s" % p)
-#     incache, data = cache.get(p)
-#     if incache:
-#         _deb("Data from cache [%s]" % data)
-#     else:
-#         _deb("Data was not found in cache")
-#     return incache, data
-# if False:
-#     path = sys.argv[1]
-#     incache, data = tryget(path)
-#     if not incache:
-#         trystore(path)
-#
+    opts, args = getopt.getopt(sys.argv[1:], "h", ["help", "purge", "purgedata", "store", "get"])
+    purgedata = False
+    purge = False
+
+    for opt, arg in opts:
+        if opt in ['-h', '--help']:
+            Usage(sys.stdout)
+        elif opt in ['--purgedata']:
+            purgedata = True
+        elif opt in ['--purge']:
+            if len(args) != 0:
+                Usage()
+            purge = True
+        elif opt in ['--store']:
+            if len(args) != 2:
+                Usage()
+            imgdatapath = args[0]
+            ocrdatapath = args[1]
+            ocrdata = open(ocrdatapath, "rb").read()
+            cache.store(imgdatapath, ocrdata, force=False)
+            sys.exit(0)
+        elif opt in ['--get']:
+            if len(args) != 1:
+                Usage()
+            imgdatapath = args[0]
+            incache, data = cache.get(imgdatapath)
+            if incache:
+                print(f"OCR data from cache {data}")
+                sys.exit(0)
+            else:
+                print("OCR data was not found in cache", file=sys.stderr)
+                sys.exit(1)
+        else:
+            print(f"Unknown option {opt}", file=sys.stderr)
+            Usage()
+
+    # End of options. Purge if requested.
+    if purge:
+        cache.purgepaths()
+        if purgedata:
+            cache.purgedata()
+        sys.exit(0)
+
+    Usage()
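Finally, a hypothetical driver showing the module API which the --store and --get branches wrap (assumes rclocrcache.py and rclconfig are importable from the current directory; the image path and OCR bytes are invented):

    import rclconfig
    from rclocrcache import OCRCache

    cache = OCRCache(rclconfig.RclConfig())
    path = "/home/me/scans/invoice.png"
    incache, data = cache.get(path)     # returns (found, OCR data bytes)
    if not incache:
        data = b"text produced by some OCR program"
        cache.store(path, data)         # writes the Data and Path files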