rclocr ckpt: cache+tesseract indexing working

Jean-Francois Dockes 2020-02-26 17:30:12 +01:00
parent b151dc3624
commit 747e37a980
3 changed files with 102 additions and 55 deletions

rclocr.py (View File)

@@ -25,11 +25,11 @@ import rclconfig
 import rclocrcache
 import importlib.util
 
-def deb(s):
-    print("%s" % s, file=sys.stderr)
+def _deb(s):
+    print("rclocr: %s" % s, file=sys.stderr)
 
 def Usage():
-    deb("Usage: rclocr.py <imagefilename>")
+    _deb("Usage: rclocr.py <imagefilename>")
     sys.exit(1)
 
 if len(sys.argv) != 2:
@@ -50,9 +50,9 @@ if incache:
 # Retrieve known ocr program names and try to load the corresponding module
 ocrprogs = config.getConfParam("ocrprogs")
 if not ocrprogs:
-    deb("No ocrprogs variable")
+    _deb("No ocrprogs variable in recoll configuration")
     sys.exit(1)
-deb("ocrprogs: %s" % ocrprogs)
+#_deb("ocrprogs: %s" % ocrprogs)
 proglist = ocrprogs.split(" ")
 ok = False
 for ocrprog in proglist:
@@ -63,17 +63,21 @@ for ocrprog in proglist:
         ok = True
         break
     except Exception as err:
-        deb("While loading %s: got: %s" % (modulename, err))
+        _deb("While loading %s: got: %s" % (modulename, err))
         pass
 
 if not ok:
-    deb("No OCR module could be loaded")
+    _deb("No OCR module could be loaded")
     sys.exit(1)
 
-deb("Using ocr module %s" % modulename)
-data = ocr.runocr(config, path)
+#_deb("Using ocr module %s" % modulename)
+status, data = ocr.runocr(config, path)
+if not status:
+    _deb("runocr failed")
+    sys.exit(1)
+
 cache.store(path, data)
 sys.stdout.buffer.write(data)
 sys.exit(0)
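Taken together, the rclocr.py changes make the driver cache-first and give runocr() a (status, data) return contract. The following is a minimal sketch of the resulting flow, using the module and configuration names visible in the diff; it is an approximation for readability, not the committed file (in particular, the module loading details are an assumption).

# Sketch only: approximate post-commit driver flow for rclocr.py.
import importlib
import sys

import rclconfig
import rclocrcache

def _deb(s):
    print("rclocr: %s" % s, file=sys.stderr)

config = rclconfig.RclConfig()
cache = rclocrcache.OCRCache(config)
path = sys.argv[1]

# OCR is very slow: serve from the cache whenever possible.
incache, data = cache.get(path)
if not incache:
    # Try the OCR programs listed in the configuration, e.g.
    # ocrprogs = "tesseract" -> module rclocrtesseract.
    ocr = None
    for ocrprog in (config.getConfParam("ocrprogs") or "").split():
        modulename = "rclocr" + ocrprog
        try:
            ocr = importlib.import_module(modulename)
            break
        except Exception as err:
            _deb("While loading %s: got: %s" % (modulename, err))
    if ocr is None:
        _deb("No OCR module could be loaded")
        sys.exit(1)
    # runocr() now returns a status flag in addition to the data.
    status, data = ocr.runocr(config, path)
    if not status:
        _deb("runocr failed")
        sys.exit(1)
    cache.store(path, data)

sys.stdout.buffer.write(data)
sys.exit(0)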

rclocrcache.py (View File)

@@ -18,13 +18,17 @@
 ########################################################
 # Caching OCR'd data
-# OCR is extremely slow. The cache stores 2 kinds of objects:
-# - Path files are named from the hash of the image path and contain
-#   the image data hash and the modification time and size of the
-#   image at the time the OCR'd data was stored in the cache
+#
+# OCR is extremely slow, caching the results is necessary.
+#
+# The cache stores 2 kinds of objects:
+# - Path files are named from the hash of the image file path and
+#   contain the image data hash, the modification time and size of the
+#   image file at the time the OCR'd data was stored in the cache, and
+#   the image path itself (the last is for purging only).
 # - Data files are named with the hash of the image data and contain
-#   the OCR'd data
+#   the zlib-compressed OCR'd data.
+#
 # When retrieving data from the cache:
 # - We first use the image file size and modification time: if an
 #   entry exists for the imagepath/mtime/size triplet, and is up to
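The two-object scheme described in this comment can be illustrated with a short sketch of what a store operation lays out on disk. This is a hedged illustration under assumed names (a plain cachedir argument and a standalone sketch_store function); the real code is the OCRCache class changed in this commit.

# Sketch only: illustrates the on-disk layout described above.
import hashlib
import os
import urllib.parse
import zlib

def _split_hash(hexhash):
    # sha1 hex digest split as two parts of 2 and 38 chars:
    # subdirectory name and file name
    return hexhash[:2], hexhash[2:]

def sketch_store(cachedir, path, ocrdata):
    # ocrdata is the OCR output as bytes
    with open(path, "rb") as f:
        image = f.read()
    dd, df = _split_hash(hashlib.sha1(image).hexdigest())
    pd, pf = _split_hash(hashlib.sha1(path.encode("utf-8")).hexdigest())
    st = os.stat(path)

    # Data file: named from the image data hash, holds zlib-compressed OCR text
    datadir = os.path.join(cachedir, "objects", dd)
    os.makedirs(datadir, exist_ok=True)
    with open(os.path.join(datadir, df), "wb") as f:
        f.write(zlib.compress(ocrdata))

    # Path file: named from the image path hash; records the data hash,
    # mtime, size, and the quoted path (the path is only used for purging)
    pathdir = os.path.join(cachedir, "paths", pd)
    os.makedirs(pathdir, exist_ok=True)
    with open(os.path.join(pathdir, pf), "w") as f:
        f.write("%s %s %d %d %s\n" % (dd, df, int(st.st_mtime),
                                      st.st_size, urllib.parse.quote(path)))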
@@ -38,10 +42,23 @@
 # If we need to use the second step, as a side effect, a path file is
 # created or updated so that the data will be found with the first
 # step next time around.
+#
+# Purging the cache of obsolete data.
+#
+# - The cache path and data files are stored under 2 different
+#   directories (objects, paths) to make purging easier.
+# - Purging the paths tree just involves walking it, reading the
+#   files, and checking the existence of the recorded paths.
+# - There is no easy way to purge the data tree. The only possibility
+#   is to input a list of possible source files (e.g. result of a
+#   find in the image files area), and compute all the hashes. Data
+#   files which do not match one of the hashes are deleted.
 
 import sys
 import os
 import hashlib
+import urllib.parse
+import zlib
 
 def deb(s):
     print("%s" %s, file=sys.stderr)
@@ -53,8 +70,10 @@ class OCRCache(object):
         if not self.cachedir:
             self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
         self.objdir = os.path.join(self.cachedir, "objects")
-        if not os.path.exists(self.objdir):
-            os.makedirs(self.objdir)
+        self.pathdir = os.path.join(self.cachedir, "paths")
+        for dir in (self.objdir, self.pathdir):
+            if not os.path.exists(dir):
+                os.makedirs(dir)
 
     # Compute sha1 of path, as two parts of 2 and 38 chars
     def _hashpath(self, data):
@@ -83,11 +102,11 @@ class OCRCache(object):
     # not cached (but the data still might be, maybe the file was moved)
     def _cachedpathattrs(self, path):
         pd,pf = self._hashpath(path)
-        o = os.path.join(self.objdir, pd, pf)
+        o = os.path.join(self.pathdir, pd, pf)
         if not os.path.exists(o):
             return False, None, None, None, None
         line = open(o, "r").read()
-        dd,df,tm,sz = line.split()
+        dd,df,tm,sz,pth = line.split()
         tm = int(tm)
         sz = int(sz)
         return True, dd, df, tm, sz
@@ -132,28 +151,30 @@ class OCRCache(object):
         return os.path.exists(self._datafilename(path)[0])
 
     # Create path file with given elements.
-    def _updatepathfile(self, pd, pf, dd, df, tm, sz):
-        dir = os.path.join(self.objdir, pd)
+    def _updatepathfile(self, pd, pf, dd, df, tm, sz, path):
+        dir = os.path.join(self.pathdir, pd)
         if not os.path.exists(dir):
             os.makedirs(dir)
         pfile = os.path.join(dir, pf)
+        codedpath = urllib.parse.quote(path)
         with open(pfile, "w") as f:
-            f.write("%s %s %d %d\n" % (dd, df, tm, sz))
+            f.write("%s %s %d %d %s\n" % (dd, df, tm, sz, codedpath))
 
     # Store data for path. Only rewrite an existing data file if told
     # to do so: this is only useful if we are forcing an OCR re-run.
     def store(self, path, datatostore, force=False):
         dd,df = self._hashdata(path)
         pd, pf, tm, sz = self._newpathattrs(path)
-        self._updatepathfile(pd, pf, dd, df, tm, sz)
+        self._updatepathfile(pd, pf, dd, df, tm, sz, path)
         dir = os.path.join(self.objdir, dd)
         if not os.path.exists(dir):
             os.makedirs(dir)
         dfile = os.path.join(dir, df)
         if force or not os.path.exists(dfile):
             #deb("Storing data")
+            cpressed = zlib.compress(datatostore)
             with open(dfile, "wb") as f:
-                f.write(datatostore)
+                f.write(cpressed)
         return True
 
     # Retrieve cached OCR'd data for image path. Possibly update the
@@ -171,10 +192,14 @@ class OCRCache(object):
         if not pincache:
             # File has moved. create/Update path file for next time
+            deb("ocrcache::get file %s was moved, updating path data" % path)
             pd, pf, tm, sz = self._newpathattrs(path)
-            self._updatepathfile(pd, pf, dd, df, tm, sz)
+            self._updatepathfile(pd, pf, dd, df, tm, sz, path)
 
-        return True, open(dfn, "rb").read()
+        with open(dfn, "rb") as f:
+            cpressed = f.read()
+        data = zlib.decompress(cpressed)
+        return True, data
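The interesting part of get() is the two-step lookup: the cheap path-file check first (no need to read the image), then the data-hash fallback that also refreshes the path file for next time. A simplified sketch of that control flow follows; the helper names and signatures are taken from the diff, but this is a simplification, not the committed method.

# Sketch of OCRCache.get()'s two-step lookup; simplified.
import os
import zlib

def sketch_get(cache, path):
    st = os.stat(path)
    found, dd, df, tm, sz = cache._cachedpathattrs(path)
    uptodate = found and tm == int(st.st_mtime) and sz == st.st_size
    if not uptodate:
        # Path entry missing or stale: fall back to hashing the image
        # data itself (the file may just have been moved or touched).
        dd, df = cache._hashdata(path)
        if not os.path.exists(os.path.join(cache.objdir, dd, df)):
            return False, b""
        # Refresh the path file so the cheap lookup works next time.
        pd, pf, tm, sz = cache._newpathattrs(path)
        cache._updatepathfile(pd, pf, dd, df, tm, sz, path)
    with open(os.path.join(cache.objdir, dd, df), "rb") as f:
        return True, zlib.decompress(f.read())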
@@ -184,25 +209,33 @@ if __name__ == '__main__':
     conf = rclconfig.RclConfig()
     cache = OCRCache(conf)
     path = sys.argv[1]
+    deb("Using %s" % path)
 
-    deb("== CACHE tests")
-    ret = cache.pathincache(path)
-    s = "" if ret else " not"
-    deb("path for %s%s in cache" % (path, s))
-    #ret = cache.dataincache(path)
-    #s = "" if ret else " not"
-    #deb("data for %s%s in cache" % (path, s))
-
-    if False:
-        deb("== STORE tests")
-        cache.store(path, b"my OCR'd text is one line\n", force=False)
-
-    deb("== GET tests")
-    incache, data = cache.get(path)
-    if incache:
-        deb("Data from cache [%s]" % data)
-    else:
-        deb("Data was not found in cache")
+    def trycache(p):
+        deb("== CACHE tests for %s"%p)
+        ret = cache.pathincache(p)
+        s = "" if ret else " not"
+        deb("path for %s%s in cache" % (p, s))
+        if not ret:
+            return False
+        ret = cache.dataincache(p)
+        s = "" if ret else " not"
+        deb("data for %s%s in cache" % (p, s))
+        return ret
+
+    def trystore(p):
+        deb("== STORE test for %s" % p)
+        cache.store(p, b"my OCR'd text is one line\n", force=False)
+
+    def tryget(p):
+        deb("== GET test for %s" % p)
+        incache, data = cache.get(p)
+        if incache:
+            deb("Data from cache [%s]" % data)
+        else:
+            deb("Data was not found in cache")
+        return incache, data
+
+    incache, data = tryget(path)
+    if not incache:
+        trystore(path)

rclocrtesseract.py (View File)

@@ -36,8 +36,7 @@ _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
 def _deb(s):
     if not _mswindows:
-        #print("%s" % s, file=sys.stderr)
-        pass
+        print("rclocrtesseract: %s" % s, file=sys.stderr)
 
 def vacuumdir(dir):
     if dir:
@@ -154,8 +153,19 @@ def _pdftesseract(config, path):
         _deb("pdftoppm failed: %s" % e)
         return b""
 
-    files = glob.glob(tmpfile + "*")
-    for f in files:
+    # Note: unfortunately, pdftoppm silently fails if the temp file
+    # system is full. There is no really good way to check for
+    # this. We consider any empty file to signal an error
+    ppmfiles = glob.glob(tmpfile + "*")
+    for f in ppmfiles:
+        size = os.path.getsize(f)
+        if os.path.getsize(f) == 0:
+            _deb("pdftoppm created empty files. "
+                 "Suspecting full file system, failing")
+            return False, ""
+
+    for f in sorted(ppmfiles):
         out = b''
         try:
             out = subprocess.check_output(
@@ -165,16 +175,16 @@ def _pdftesseract(config, path):
             _deb("tesseract failed: %s" % e)
 
         errlines = out.split(b'\n')
-        if len(errlines) > 2:
-            _deb("Tesseract error: %s" % out)
+        if len(errlines) > 5:
+            _deb("Tesseract error output: %d %s" % (len(errlines),out))
 
     # Concatenate the result files
-    files = glob.glob(tmpfile + "*" + ".txt")
+    txtfiles = glob.glob(tmpfile + "*" + ".txt")
     data = b""
-    for f in files:
+    for f in sorted(txtfiles):
         data += open(f, "rb").read()
-    return data
+    return True,data
 
 def _simpletesseract(config, path):
@@ -186,8 +196,8 @@ def _simpletesseract(config, path):
             stderr=subprocess.DEVNULL)
     except Exception as e:
         _deb("tesseract failed: %s" % e)
+        return False, ""
 
-    return out
+    return True, out
 
 # run ocr on the input path and output the result data.
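To recap the PDF path after this change: pdftoppm renders each page to a ppm file, an empty output file is treated as a sign of a full file system, tesseract is run per page in sorted order, and the per-page .txt outputs are concatenated, with the helpers now returning a (status, data) pair. The following is a compressed sketch of that pipeline; the exact command-line options and temporary-file handling are assumptions, not copied from the file.

# Sketch only: approximate shape of the pdftoppm + tesseract pipeline with
# the new (status, data) convention.
import glob
import os
import subprocess
import tempfile

def sketch_pdf_ocr(path, lang="eng"):
    tmpdir = tempfile.mkdtemp()
    tmpfile = os.path.join(tmpdir, "page")
    try:
        subprocess.check_call(["pdftoppm", "-r", "300", path, tmpfile])
    except Exception:
        return False, b""
    # pdftoppm can fail silently on a full file system: empty output
    # files are the only hint, so treat any of them as an error.
    ppmfiles = glob.glob(tmpfile + "*")
    for f in ppmfiles:
        if os.path.getsize(f) == 0:
            return False, b""
    # Run tesseract on each page image; it writes <image>.txt next to it.
    for f in sorted(ppmfiles):
        try:
            subprocess.check_output(
                ["tesseract", f, f, "-l", lang], stderr=subprocess.STDOUT)
        except Exception:
            return False, b""
    # Concatenate the per-page text, in page order.
    data = b""
    for f in sorted(glob.glob(tmpfile + "*" + ".txt")):
        with open(f, "rb") as fh:
            data += fh.read()
    return True, data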