diff --git a/src/filters/rclocr.py b/src/filters/rclocr.py
index 45668303..732160ec 100755
--- a/src/filters/rclocr.py
+++ b/src/filters/rclocr.py
@@ -25,11 +25,11 @@ import rclconfig
 import rclocrcache
 import importlib.util
 
-def deb(s):
-    print("%s" % s, file=sys.stderr)
+def _deb(s):
+    print("rclocr: %s" % s, file=sys.stderr)
     
 def Usage():
-    deb("Usage: rclocr.py <imagefilename>")
+    _deb("Usage: rclocr.py <imagefilename>")
     sys.exit(1)
 
 if len(sys.argv) != 2:
@@ -50,9 +50,9 @@ if incache:
 # Retrieve known ocr program names and try to load the corresponding module
 ocrprogs = config.getConfParam("ocrprogs")
 if not ocrprogs:
-    deb("No ocrprogs variable")
+    _deb("No ocrprogs variable in recoll configuration")
     sys.exit(1)
-deb("ocrprogs: %s" % ocrprogs)
+#_deb("ocrprogs: %s" % ocrprogs)
 proglist = ocrprogs.split(" ")
 ok = False
 for ocrprog in proglist:
@@ -63,17 +63,21 @@ for ocrprog in proglist:
             ok = True
             break
     except Exception as err:
-        deb("While loading %s: got: %s" % (modulename, err))
+        _deb("While loading %s: got: %s" % (modulename, err))
         pass
 
 if not ok:
-    deb("No OCR module could be loaded")
+    _deb("No OCR module could be loaded")
     sys.exit(1)
 
-deb("Using ocr module %s" % modulename)
+#_deb("Using ocr module %s" % modulename)
 
-data = ocr.runocr(config, path)
+status, data = ocr.runocr(config, path)
 
+if not status:
+    _deb("runocr failed")
+    sys.exit(1)
+    
 cache.store(path, data)
 sys.stdout.buffer.write(data)
 sys.exit(0)
diff --git a/src/filters/rclocrcache.py b/src/filters/rclocrcache.py
index 72ffa156..1fbb126c 100755
--- a/src/filters/rclocrcache.py
+++ b/src/filters/rclocrcache.py
@@ -18,13 +18,17 @@
 ########################################################
 
 # Caching OCR'd data
-
-# OCR is extremely slow. The cache stores 2 kinds of objects:
-# - Path files are named from the hash of the image path and contain
-#   the image data hash and the modification time and size of the
-#   image at the time the OCR'd data was stored in the cache
+#
+# OCR is extremely slow, so caching the results is necessary.
+#
+# The cache stores 2 kinds of objects:
+# - Path files are named from the hash of the image file path and
+#   contain the image data hash, the modification time and size of the
+#   image file at the time the OCR'd data was stored in the cache, and
+#   the image path itself (the last is for purging only).
 # - Data files are named with the hash of the image data and contain
-#   the OCR'd data
+#   the zlib-compressed OCR'd data.
+#
 # When retrieving data from the cache:
 #  - We first use the image file size and modification time: if an
 #    entry exists for the imagepath/mtime/size triplet, and is up to
@@ -38,10 +42,23 @@
 #  If we need to use the second step, as a side effect, a path file is
 #  created or updated so that the data will be found with the first
 #  step next time around.
+#
+# Purging the cache of obsolete data.
+#
+#  - The cache path and data files are stored under 2 different
+#    directories (objects, paths) to make purging easier.
+#  - Purging the paths tree just involves walking it, reading the
+#    files, and checking the existence of the recorded paths.
+#  - There is no easy way to purge the data tree. The only possibility
+#    is to input a list of possible source files (e.g. result of a
+#    find in the image files area), and compute all the hashes. Data
+#    files which do not match one of the hashes are deleted.
 
 import sys
 import os
 import hashlib
+import urllib.parse
+import zlib
 
 def deb(s):
     print("%s" %s, file=sys.stderr)
@@ -53,8 +70,10 @@ class OCRCache(object):
         if not self.cachedir:
             self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
         self.objdir = os.path.join(self.cachedir, "objects")
-        if not os.path.exists(self.objdir):
-            os.makedirs(self.objdir)
+        self.pathdir = os.path.join(self.cachedir, "paths")
+        for dir in (self.objdir, self.pathdir):
+            if not os.path.exists(dir):
+                os.makedirs(dir)
 
     # Compute sha1 of path, as two parts of 2 and 38 chars
     def _hashpath(self, data):
@@ -83,11 +102,11 @@ class OCRCache(object):
     # not cached (but the data still might be, maybe the file was moved)
     def _cachedpathattrs(self, path):
         pd,pf = self._hashpath(path)
-        o = os.path.join(self.objdir, pd, pf)
+        o = os.path.join(self.pathdir, pd, pf)
         if not os.path.exists(o):
             return False, None, None, None, None
         line = open(o, "r").read()
-        dd,df,tm,sz = line.split()
+        dd,df,tm,sz,pth = line.split()
         tm = int(tm)
         sz = int(sz)
         return True, dd, df, tm, sz
@@ -132,28 +151,30 @@ class OCRCache(object):
         return os.path.exists(self._datafilename(path)[0])
 
     # Create path file with given elements.
-    def _updatepathfile(self, pd, pf, dd, df, tm, sz):
-        dir = os.path.join(self.objdir, pd)
+    def _updatepathfile(self, pd, pf, dd, df, tm, sz, path):
+        dir = os.path.join(self.pathdir, pd)
         if not os.path.exists(dir):
             os.makedirs(dir)
         pfile = os.path.join(dir, pf)
+        codedpath = urllib.parse.quote(path)
         with open(pfile, "w") as f:
-            f.write("%s %s %d %d\n" % (dd, df, tm, sz))
+            f.write("%s %s %d %d %s\n" % (dd, df, tm, sz, codedpath))
 
     # Store data for path. Only rewrite an existing data file if told
     # to do so: this is only useful if we are forcing an OCR re-run.
     def store(self, path, datatostore, force=False):
         dd,df = self._hashdata(path)
         pd, pf, tm, sz = self._newpathattrs(path)
-        self._updatepathfile(pd, pf, dd, df, tm, sz)
+        self._updatepathfile(pd, pf, dd, df, tm, sz, path)
         dir = os.path.join(self.objdir, dd)
         if not os.path.exists(dir):
             os.makedirs(dir)
         dfile = os.path.join(dir, df)
         if force or not os.path.exists(dfile):
             #deb("Storing data")
+            cpressed = zlib.compress(datatostore)
             with open(dfile, "wb") as f:
-                f.write(datatostore)
+                f.write(cpressed)
         return True
 
     # Retrieve cached OCR'd data for image path. Possibly update the
@@ -171,10 +192,14 @@ class OCRCache(object):
 
         if not pincache:
             # File has moved. create/Update path file for next time
+            deb("ocrcache::get file %s was moved, updating path data" % path)
             pd, pf, tm, sz = self._newpathattrs(path)
-            self._updatepathfile(pd, pf, dd, df, tm, sz)
+            self._updatepathfile(pd, pf, dd, df, tm, sz, path)
 
-        return True, open(dfn, "rb").read()
+        with open(dfn, "rb") as f:
+            cpressed = f.read()
+            data = zlib.decompress(cpressed)
+            return True, data
 
 
 
@@ -184,25 +209,33 @@ if __name__ == '__main__':
     conf = rclconfig.RclConfig()
     cache = OCRCache(conf)
     path = sys.argv[1]
-    deb("Using %s" % path)
+
+    def trycache(p):
+        deb("== CACHE tests for %s"%p)
+        ret = cache.pathincache(p)
+        s = "" if ret else " not"
+        deb("path for %s%s in cache" % (p, s))
+        if not ret:
+            return False
+        ret = cache.dataincache(p)
+        s = "" if ret else " not"
+        deb("data for %s%s in cache" % (p, s))
+        return ret
     
-    deb("== CACHE tests")
-    ret = cache.pathincache(path)
-    s = "" if ret else " not"
-    deb("path for %s%s in cache" % (path, s))
+    def trystore(p):
+        deb("== STORE test for %s" % p)
+        cache.store(p, b"my OCR'd text is one line\n", force=False)
 
-    #ret = cache.dataincache(path)
-    #s = "" if ret else " not"
-    #deb("data for %s%s in cache" % (path, s))
+    def tryget(p):
+        deb("== GET test for %s" % p)
+        incache, data = cache.get(p)
+        if incache:
+            deb("Data from cache [%s]" % data)
+        else:
+            deb("Data was not found in cache")
+        return incache, data
 
-    if False:
-        deb("== STORE tests")
-        cache.store(path, b"my OCR'd text is one line\n", force=False)
-
-    deb("== GET tests")
-    incache, data = cache.get(path)
-    if incache:
-        deb("Data from cache [%s]" % data)
-    else:
-        deb("Data was not found in cache")
+    incache, data = tryget(path)
+    if not incache:
+        trystore(path)
         
diff --git a/src/filters/rclocrtesseract.py b/src/filters/rclocrtesseract.py
index 8d47c235..57689675 100755
--- a/src/filters/rclocrtesseract.py
+++ b/src/filters/rclocrtesseract.py
@@ -36,8 +36,7 @@ _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
 
 def _deb(s):
     if not _mswindows:
-        #print("%s" % s, file=sys.stderr)
-        pass
+        print("rclocrtesseract: %s" % s, file=sys.stderr)
 
 def vacuumdir(dir):
     if dir:
@@ -154,8 +153,19 @@ def _pdftesseract(config, path):
         _deb("pdftoppm failed: %s" % e)
         return b""
 
-    files = glob.glob(tmpfile + "*")
-    for f in files:
+    # Note: unfortunately, pdftoppm silently fails if the temp file
+    # system is full. There is no really good way to check for
+    # this, so we treat any empty output file as an error.
+    
+    ppmfiles = glob.glob(tmpfile + "*")
+    for f in ppmfiles:
+        # An empty output file is taken as a sign that pdftoppm failed.
+        if os.path.getsize(f) == 0:
+            _deb("pdftoppm created empty files. "
+                 "Suspecting full file system, failing")
+            return False, b""
+        
+    for f in sorted(ppmfiles):
         out = b''
         try:
             out = subprocess.check_output(
@@ -165,16 +175,16 @@ def _pdftesseract(config, path):
             _deb("tesseract failed: %s" % e)
 
         errlines = out.split(b'\n')
-        if len(errlines) > 2:
-            _deb("Tesseract error: %s" % out)
+        if len(errlines) > 5:
+            _deb("Tesseract error output: %d %s" % (len(errlines),out))
 
     # Concatenate the result files
-    files = glob.glob(tmpfile + "*" + ".txt")
+    txtfiles = glob.glob(tmpfile + "*" + ".txt")
     data = b""
-    for f in files:
+    for f in sorted(txtfiles):
         data += open(f, "rb").read()
 
-    return data
+    return True,data
 
 
 def _simpletesseract(config, path):
@@ -186,8 +196,8 @@ def _simpletesseract(config, path):
             stderr=subprocess.DEVNULL)
     except Exception as e:
         _deb("tesseract failed: %s" % e)
-
-    return out
+        return False, b""  # keep the data element a bytes object
+    return True, out
 
 
 # run ocr on the input path and output the result data.