rclocr ckpt: cache+tesseract indexing working
This commit is contained in:
parent
b151dc3624
commit
747e37a980
@ -25,11 +25,11 @@ import rclconfig
|
|||||||
import rclocrcache
|
import rclocrcache
|
||||||
import importlib.util
|
import importlib.util
|
||||||
|
|
||||||
def deb(s):
|
def _deb(s):
|
||||||
print("%s" % s, file=sys.stderr)
|
print("rclocr: %s" % s, file=sys.stderr)
|
||||||
|
|
||||||
def Usage():
|
def Usage():
|
||||||
deb("Usage: rclocr.py <imagefilename>")
|
_deb("Usage: rclocr.py <imagefilename>")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2:
|
||||||
@ -50,9 +50,9 @@ if incache:
|
|||||||
# Retrieve known ocr program names and try to load the corresponding module
|
# Retrieve known ocr program names and try to load the corresponding module
|
||||||
ocrprogs = config.getConfParam("ocrprogs")
|
ocrprogs = config.getConfParam("ocrprogs")
|
||||||
if not ocrprogs:
|
if not ocrprogs:
|
||||||
deb("No ocrprogs variable")
|
_deb("No ocrprogs variable in recoll configuration")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
deb("ocrprogs: %s" % ocrprogs)
|
#_deb("ocrprogs: %s" % ocrprogs)
|
||||||
proglist = ocrprogs.split(" ")
|
proglist = ocrprogs.split(" ")
|
||||||
ok = False
|
ok = False
|
||||||
for ocrprog in proglist:
|
for ocrprog in proglist:
|
||||||
@ -63,17 +63,21 @@ for ocrprog in proglist:
|
|||||||
ok = True
|
ok = True
|
||||||
break
|
break
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
deb("While loading %s: got: %s" % (modulename, err))
|
_deb("While loading %s: got: %s" % (modulename, err))
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if not ok:
|
if not ok:
|
||||||
deb("No OCR module could be loaded")
|
_deb("No OCR module could be loaded")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
deb("Using ocr module %s" % modulename)
|
#_deb("Using ocr module %s" % modulename)
|
||||||
|
|
||||||
data = ocr.runocr(config, path)
|
status, data = ocr.runocr(config, path)
|
||||||
|
|
||||||
|
if not status:
|
||||||
|
_deb("runocr failed")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
cache.store(path, data)
|
cache.store(path, data)
|
||||||
sys.stdout.buffer.write(data)
|
sys.stdout.buffer.write(data)
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|||||||
@ -18,13 +18,17 @@
|
|||||||
########################################################
|
########################################################
|
||||||
|
|
||||||
# Caching OCR'd data
|
# Caching OCR'd data
|
||||||
|
#
|
||||||
# OCR is extremely slow. The cache stores 2 kinds of objects:
|
# OCR is extremely slow, caching the results is necessary.
|
||||||
# - Path files are named from the hash of the image path and contain
|
#
|
||||||
# the image data hash and the modification time and size of the
|
# The cache stores 2 kinds of objects:
|
||||||
# image at the time the OCR'd data was stored in the cache
|
# - Path files are named from the hash of the image file path and
|
||||||
|
# contain the image data hash, the modification time and size of the
|
||||||
|
# image file at the time the OCR'd data was stored in the cache, and
|
||||||
|
# the image path itself (the last is for purging only).
|
||||||
# - Data files are named with the hash of the image data and contain
|
# - Data files are named with the hash of the image data and contain
|
||||||
# the OCR'd data
|
# the zlib-compressed OCR'd data.
|
||||||
|
#
|
||||||
# When retrieving data from the cache:
|
# When retrieving data from the cache:
|
||||||
# - We first use the image file size and modification time: if an
|
# - We first use the image file size and modification time: if an
|
||||||
# entry exists for the imagepath/mtime/size triplet, and is up to
|
# entry exists for the imagepath/mtime/size triplet, and is up to
|
||||||
@ -38,10 +42,23 @@
|
|||||||
# If we need to use the second step, as a side effect, a path file is
|
# If we need to use the second step, as a side effect, a path file is
|
||||||
# created or updated so that the data will be found with the first
|
# created or updated so that the data will be found with the first
|
||||||
# step next time around.
|
# step next time around.
|
||||||
|
#
|
||||||
|
# Purging the cache of obsolete data.
|
||||||
|
#
|
||||||
|
# - The cache path and data files are stored under 2 different
|
||||||
|
# directories (objects, paths) to make purging easier.
|
||||||
|
# - Purging the paths tree just involves walking it, reading the
|
||||||
|
# files, and checking the existence of the recorded paths.
|
||||||
|
# - There is no easy way to purge the data tree. The only possibility
|
||||||
|
# is to input a list of possible source files (e.g. result of a
|
||||||
|
# find in the image files area), and compute all the hashes. Data
|
||||||
|
# files which do not match one of the hashes are deleted.
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import urllib.parse
|
||||||
|
import zlib
|
||||||
|
|
||||||
def deb(s):
|
def deb(s):
|
||||||
print("%s" %s, file=sys.stderr)
|
print("%s" %s, file=sys.stderr)
|
||||||
@ -53,8 +70,10 @@ class OCRCache(object):
|
|||||||
if not self.cachedir:
|
if not self.cachedir:
|
||||||
self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
|
self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
|
||||||
self.objdir = os.path.join(self.cachedir, "objects")
|
self.objdir = os.path.join(self.cachedir, "objects")
|
||||||
if not os.path.exists(self.objdir):
|
self.pathdir = os.path.join(self.cachedir, "paths")
|
||||||
os.makedirs(self.objdir)
|
for dir in (self.objdir, self.pathdir):
|
||||||
|
if not os.path.exists(dir):
|
||||||
|
os.makedirs(dir)
|
||||||
|
|
||||||
# Compute sha1 of path, as two parts of 2 and 38 chars
|
# Compute sha1 of path, as two parts of 2 and 38 chars
|
||||||
def _hashpath(self, data):
|
def _hashpath(self, data):
|
||||||
@ -83,11 +102,11 @@ class OCRCache(object):
|
|||||||
# not cached (but the data still might be, maybe the file was moved)
|
# not cached (but the data still might be, maybe the file was moved)
|
||||||
def _cachedpathattrs(self, path):
|
def _cachedpathattrs(self, path):
|
||||||
pd,pf = self._hashpath(path)
|
pd,pf = self._hashpath(path)
|
||||||
o = os.path.join(self.objdir, pd, pf)
|
o = os.path.join(self.pathdir, pd, pf)
|
||||||
if not os.path.exists(o):
|
if not os.path.exists(o):
|
||||||
return False, None, None, None, None
|
return False, None, None, None, None
|
||||||
line = open(o, "r").read()
|
line = open(o, "r").read()
|
||||||
dd,df,tm,sz = line.split()
|
dd,df,tm,sz,pth = line.split()
|
||||||
tm = int(tm)
|
tm = int(tm)
|
||||||
sz = int(sz)
|
sz = int(sz)
|
||||||
return True, dd, df, tm, sz
|
return True, dd, df, tm, sz
|
||||||
@ -132,28 +151,30 @@ class OCRCache(object):
|
|||||||
return os.path.exists(self._datafilename(path)[0])
|
return os.path.exists(self._datafilename(path)[0])
|
||||||
|
|
||||||
# Create path file with given elements.
|
# Create path file with given elements.
|
||||||
def _updatepathfile(self, pd, pf, dd, df, tm, sz):
|
def _updatepathfile(self, pd, pf, dd, df, tm, sz, path):
|
||||||
dir = os.path.join(self.objdir, pd)
|
dir = os.path.join(self.pathdir, pd)
|
||||||
if not os.path.exists(dir):
|
if not os.path.exists(dir):
|
||||||
os.makedirs(dir)
|
os.makedirs(dir)
|
||||||
pfile = os.path.join(dir, pf)
|
pfile = os.path.join(dir, pf)
|
||||||
|
codedpath = urllib.parse.quote(path)
|
||||||
with open(pfile, "w") as f:
|
with open(pfile, "w") as f:
|
||||||
f.write("%s %s %d %d\n" % (dd, df, tm, sz))
|
f.write("%s %s %d %d %s\n" % (dd, df, tm, sz, codedpath))
|
||||||
|
|
||||||
# Store data for path. Only rewrite an existing data file if told
|
# Store data for path. Only rewrite an existing data file if told
|
||||||
# to do so: this is only useful if we are forcing an OCR re-run.
|
# to do so: this is only useful if we are forcing an OCR re-run.
|
||||||
def store(self, path, datatostore, force=False):
|
def store(self, path, datatostore, force=False):
|
||||||
dd,df = self._hashdata(path)
|
dd,df = self._hashdata(path)
|
||||||
pd, pf, tm, sz = self._newpathattrs(path)
|
pd, pf, tm, sz = self._newpathattrs(path)
|
||||||
self._updatepathfile(pd, pf, dd, df, tm, sz)
|
self._updatepathfile(pd, pf, dd, df, tm, sz, path)
|
||||||
dir = os.path.join(self.objdir, dd)
|
dir = os.path.join(self.objdir, dd)
|
||||||
if not os.path.exists(dir):
|
if not os.path.exists(dir):
|
||||||
os.makedirs(dir)
|
os.makedirs(dir)
|
||||||
dfile = os.path.join(dir, df)
|
dfile = os.path.join(dir, df)
|
||||||
if force or not os.path.exists(dfile):
|
if force or not os.path.exists(dfile):
|
||||||
#deb("Storing data")
|
#deb("Storing data")
|
||||||
|
cpressed = zlib.compress(datatostore)
|
||||||
with open(dfile, "wb") as f:
|
with open(dfile, "wb") as f:
|
||||||
f.write(datatostore)
|
f.write(cpressed)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Retrieve cached OCR'd data for image path. Possibly update the
|
# Retrieve cached OCR'd data for image path. Possibly update the
|
||||||
@ -171,10 +192,14 @@ class OCRCache(object):
|
|||||||
|
|
||||||
if not pincache:
|
if not pincache:
|
||||||
# File has moved. create/Update path file for next time
|
# File has moved. create/Update path file for next time
|
||||||
|
deb("ocrcache::get file %s was moved, updating path data" % path)
|
||||||
pd, pf, tm, sz = self._newpathattrs(path)
|
pd, pf, tm, sz = self._newpathattrs(path)
|
||||||
self._updatepathfile(pd, pf, dd, df, tm, sz)
|
self._updatepathfile(pd, pf, dd, df, tm, sz, path)
|
||||||
|
|
||||||
return True, open(dfn, "rb").read()
|
with open(dfn, "rb") as f:
|
||||||
|
cpressed = f.read()
|
||||||
|
data = zlib.decompress(cpressed)
|
||||||
|
return True, data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -184,25 +209,33 @@ if __name__ == '__main__':
|
|||||||
conf = rclconfig.RclConfig()
|
conf = rclconfig.RclConfig()
|
||||||
cache = OCRCache(conf)
|
cache = OCRCache(conf)
|
||||||
path = sys.argv[1]
|
path = sys.argv[1]
|
||||||
deb("Using %s" % path)
|
|
||||||
|
def trycache(p):
|
||||||
|
deb("== CACHE tests for %s"%p)
|
||||||
|
ret = cache.pathincache(p)
|
||||||
|
s = "" if ret else " not"
|
||||||
|
deb("path for %s%s in cache" % (p, s))
|
||||||
|
if not ret:
|
||||||
|
return False
|
||||||
|
ret = cache.dataincache(p)
|
||||||
|
s = "" if ret else " not"
|
||||||
|
deb("data for %s%s in cache" % (p, s))
|
||||||
|
return ret
|
||||||
|
|
||||||
deb("== CACHE tests")
|
def trystore(p):
|
||||||
ret = cache.pathincache(path)
|
deb("== STORE test for %s" % p)
|
||||||
s = "" if ret else " not"
|
cache.store(p, b"my OCR'd text is one line\n", force=False)
|
||||||
deb("path for %s%s in cache" % (path, s))
|
|
||||||
|
|
||||||
#ret = cache.dataincache(path)
|
def tryget(p):
|
||||||
#s = "" if ret else " not"
|
deb("== GET test for %s" % p)
|
||||||
#deb("data for %s%s in cache" % (path, s))
|
incache, data = cache.get(p)
|
||||||
|
if incache:
|
||||||
|
deb("Data from cache [%s]" % data)
|
||||||
|
else:
|
||||||
|
deb("Data was not found in cache")
|
||||||
|
return incache, data
|
||||||
|
|
||||||
if False:
|
incache, data = tryget(path)
|
||||||
deb("== STORE tests")
|
if not incache:
|
||||||
cache.store(path, b"my OCR'd text is one line\n", force=False)
|
trystore(path)
|
||||||
|
|
||||||
deb("== GET tests")
|
|
||||||
incache, data = cache.get(path)
|
|
||||||
if incache:
|
|
||||||
deb("Data from cache [%s]" % data)
|
|
||||||
else:
|
|
||||||
deb("Data was not found in cache")
|
|
||||||
|
|
||||||
|
|||||||
@ -36,8 +36,7 @@ _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
|
|||||||
|
|
||||||
def _deb(s):
|
def _deb(s):
|
||||||
if not _mswindows:
|
if not _mswindows:
|
||||||
#print("%s" % s, file=sys.stderr)
|
print("rclocrtesseract: %s" % s, file=sys.stderr)
|
||||||
pass
|
|
||||||
|
|
||||||
def vacuumdir(dir):
|
def vacuumdir(dir):
|
||||||
if dir:
|
if dir:
|
||||||
@ -154,8 +153,19 @@ def _pdftesseract(config, path):
|
|||||||
_deb("pdftoppm failed: %s" % e)
|
_deb("pdftoppm failed: %s" % e)
|
||||||
return b""
|
return b""
|
||||||
|
|
||||||
files = glob.glob(tmpfile + "*")
|
# Note: unfortunately, pdftoppm silently fails if the temp file
|
||||||
for f in files:
|
# system is full. There is no really good way to check for
|
||||||
|
# this. We consider any empty file to signal an error
|
||||||
|
|
||||||
|
ppmfiles = glob.glob(tmpfile + "*")
|
||||||
|
for f in ppmfiles:
|
||||||
|
size = os.path.getsize(f)
|
||||||
|
if os.path.getsize(f) == 0:
|
||||||
|
_deb("pdftoppm created empty files. "
|
||||||
|
"Suspecting full file system, failing")
|
||||||
|
return False, ""
|
||||||
|
|
||||||
|
for f in sorted(ppmfiles):
|
||||||
out = b''
|
out = b''
|
||||||
try:
|
try:
|
||||||
out = subprocess.check_output(
|
out = subprocess.check_output(
|
||||||
@ -165,16 +175,16 @@ def _pdftesseract(config, path):
|
|||||||
_deb("tesseract failed: %s" % e)
|
_deb("tesseract failed: %s" % e)
|
||||||
|
|
||||||
errlines = out.split(b'\n')
|
errlines = out.split(b'\n')
|
||||||
if len(errlines) > 2:
|
if len(errlines) > 5:
|
||||||
_deb("Tesseract error: %s" % out)
|
_deb("Tesseract error output: %d %s" % (len(errlines),out))
|
||||||
|
|
||||||
# Concatenate the result files
|
# Concatenate the result files
|
||||||
files = glob.glob(tmpfile + "*" + ".txt")
|
txtfiles = glob.glob(tmpfile + "*" + ".txt")
|
||||||
data = b""
|
data = b""
|
||||||
for f in files:
|
for f in sorted(txtfiles):
|
||||||
data += open(f, "rb").read()
|
data += open(f, "rb").read()
|
||||||
|
|
||||||
return data
|
return True,data
|
||||||
|
|
||||||
|
|
||||||
def _simpletesseract(config, path):
|
def _simpletesseract(config, path):
|
||||||
@ -186,8 +196,8 @@ def _simpletesseract(config, path):
|
|||||||
stderr=subprocess.DEVNULL)
|
stderr=subprocess.DEVNULL)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
_deb("tesseract failed: %s" % e)
|
_deb("tesseract failed: %s" % e)
|
||||||
|
return False, ""
|
||||||
return out
|
return True, out
|
||||||
|
|
||||||
|
|
||||||
# run ocr on the input path and output the result data.
|
# run ocr on the input path and output the result data.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user