1st version of the cached ocr mechanism

This commit is contained in:
Jean-Francois Dockes 2020-02-15 21:19:13 +01:00
parent aa40531bbe
commit 38dfa5f841
4 changed files with 517 additions and 113 deletions

80
src/filters/rclocr.py Executable file
View File

@ -0,0 +1,80 @@
#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################
# Running OCR programs for Recoll
import os
import sys
import rclconfig
import rclocrcache
import importlib.util
# Print a diagnostic message, followed by a newline, on stderr. stdout
# is reserved for the OCR data.
def deb(s):
    message = "%s" % s
    print(message, file=sys.stderr)
# Print the command line synopsis on stderr, then terminate the
# process with exit status 1. Never returns.
def Usage():
    deb("Usage: rclocr.py <imagefilename>")
    raise SystemExit(1)
# Script body: write the OCR'd text for the file named by the single
# command line argument to stdout. The cache is tried first; on a miss,
# the first usable OCR module is run and its output is stored in the
# cache. Exit status is 0 on success, 1 on usage or configuration error
# or if no OCR module can process the file.
if len(sys.argv) != 2:
    Usage()
path = sys.argv[1]

config = rclconfig.RclConfig()
cache = rclocrcache.OCRCache(config)

incache, data = cache.get(path)
if incache:
    sys.stdout.buffer.write(data)
    sys.exit(0)

#### Data not in cache

# Retrieve known ocr program names and try to load the corresponding module
ocrprogs = config.getConfParam("ocrprogs")
if not ocrprogs:
    deb("No ocrprogs variable")
    sys.exit(1)
deb("ocrprogs: %s" % ocrprogs)

# split() without argument: repeated or leading/trailing blanks must not
# yield empty program names. An empty name would produce the module name
# "rclocr", that is, this very script.
proglist = ocrprogs.split()
ok = False
for ocrprog in proglist:
    modulename = "rclocr" + ocrprog
    try:
        ocr = importlib.import_module(modulename)
        if ocr.ocrpossible(path):
            ok = True
            break
    except Exception as err:
        # Best effort: a module which can't be loaded or probed is
        # just skipped, the next one in the list may work.
        deb("While loading %s: got: %s" % (modulename, err))

if not ok:
    deb("No OCR module could be loaded")
    sys.exit(1)

deb("Using ocr module %s" % modulename)

data = ocr.runocr(config, path)
cache.store(path, data)
sys.stdout.buffer.write(data)
sys.exit(0)

208
src/filters/rclocrcache.py Executable file
View File

@ -0,0 +1,208 @@
#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################
# Caching OCR'd data
# OCR is extremely slow. The cache stores 2 kinds of objects:
# - Path files are named from the hash of the image path and contain
# the image data hash and the modification time and size of the
# image at the time the OCR'd data was stored in the cache
# - Data files are named with the hash of the image data and contain
# the OCR'd data
# When retrieving data from the cache:
# - We first use the image file size and modification time: if an
# entry exists for the imagepath/mtime/size triplet, and is up to
# date, the corresponding data is obtained from the data file and
# returned.
# - Else we then use the image data: if an entry exists for the
# computed hashed value of the data, it is returned. This allows
# moving files around without needing to run OCR again, but of
# course, it is more expensive than the first step
#
# If we need to use the second step, as a side effect, a path file is
# created or updated so that the data will be found with the first
# step next time around.
import sys
import os
import hashlib
# Print a diagnostic message, followed by a newline, on stderr.
def deb(s):
    message = "%s" % s
    print(message, file=sys.stderr)
# Cache for OCR'd data. All objects (path files and data files) are
# stored under <cachedir>/objects/, in a subdirectory named from the
# first 2 characters of a sha1 hex digest, the file name being the
# remaining 38 characters.
class OCRCache(object):
    # conf must supply getConfParam(name) and getConfDir(). The cache
    # directory is the value of the "ocrcachedir" parameter if set, else
    # "ocrcache" inside the configuration directory. The objects
    # directory is created if needed.
    def __init__(self, conf):
        self.config = conf
        self.cachedir = conf.getConfParam("ocrcachedir")
        if not self.cachedir:
            self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
        self.objdir = os.path.join(self.cachedir, "objects")
        # exist_ok: no failure if another process creates the directory
        # between a test and the creation.
        os.makedirs(self.objdir, exist_ok=True)

    # Compute sha1 of path, as two parts of 2 and 38 chars. data can be
    # bytes or str.
    def _hashpath(self, data):
        if not isinstance(data, bytes):
            # surrogateescape: a file name which is not valid UTF-8
            # reaches us as a str with lone surrogates, which a strict
            # encode would reject. Valid strings hash as before.
            data = data.encode('utf-8', 'surrogateescape')
        h = hashlib.sha1(data).hexdigest()
        return h[0:2], h[2:]

    # Compute sha1 of path data contents, as two parts of 2 and 38 chars
    def _hashdata(self, path):
        m = hashlib.sha1()
        with open(path, "rb") as f:
            # Read by chunks: the image file is never held in memory
            # as a whole.
            for d in iter(lambda: f.read(8192), b""):
                m.update(d)
        h = m.hexdigest()
        return h[0:2], h[2:]

    # Try to read the stored attributes for a given path: data hash,
    # modification time and size. If this fails, the path itself is
    # not cached (but the data still might be, maybe the file was moved).
    # Returns (True, dd, df, mtime, size) or (False, None, None, None, None).
    # A missing, unreadable or malformed path file is a cache miss, not
    # an error: the path file will be rewritten by the next get()/store().
    def _cachedpathattrs(self, path):
        pd, pf = self._hashpath(path)
        o = os.path.join(self.objdir, pd, pf)
        try:
            with open(o, "r") as f:
                line = f.read()
            dd, df, tm, sz = line.split()
            return True, dd, df, int(tm), int(sz)
        except (OSError, ValueError):
            # ValueError: wrong field count or non-numeric time/size
            # (also covers undecodable contents).
            return False, None, None, None, None

    # Compute the path hash, and get the mtime and size for given
    # path, for updating the cache path file. Raises OSError if path
    # can't be accessed.
    def _newpathattrs(self, path):
        pd, pf = self._hashpath(path)
        tm = int(os.path.getmtime(path))
        sz = int(os.path.getsize(path))
        return pd, pf, tm, sz

    # Check if the cache appears up to date for a given path, only
    # using the modification time and size. Return the data file path
    # elements if we get a hit: (True, dd, df), else (False, None, None)
    def _pathincache(self, path):
        ret, od, of, otm, osz = self._cachedpathattrs(path)
        if not ret:
            return False, None, None
        pd, pf, ntm, nsz = self._newpathattrs(path)
        if otm != ntm or osz != nsz:
            return False, None, None
        return True, od, of

    # Check if cache appears up to date for path (no data check),
    # return True/False
    def pathincache(self, path):
        ret, dd, df = self._pathincache(path)
        return ret

    # Compute the data file name for path. Expensive: we compute the data hash.
    # Return both the data file path and path elements (for storage in path file)
    def _datafilename(self, path):
        d, f = self._hashdata(path)
        return os.path.join(self.objdir, d, f), d, f

    # Check if the data for path is in cache: expensive, needs to
    # compute the hash for the path's data contents. Returns True/False
    def dataincache(self, path):
        return os.path.exists(self._datafilename(path)[0])

    # Create path file with given elements: path hash parts (pd, pf),
    # data hash parts (dd, df), image modification time and size.
    def _updatepathfile(self, pd, pf, dd, df, tm, sz):
        pdir = os.path.join(self.objdir, pd)
        os.makedirs(pdir, exist_ok=True)
        pfile = os.path.join(pdir, pf)
        with open(pfile, "w") as f:
            f.write("%s %s %d %d\n" % (dd, df, tm, sz))

    # Store data for path. Only rewrite an existing data file if told
    # to do so: this is only useful if we are forcing an OCR re-run.
    # datatostore must be bytes. The path file is always (re)written.
    # Returns True.
    def store(self, path, datatostore, force=False):
        dd, df = self._hashdata(path)
        pd, pf, tm, sz = self._newpathattrs(path)
        self._updatepathfile(pd, pf, dd, df, tm, sz)
        ddir = os.path.join(self.objdir, dd)
        os.makedirs(ddir, exist_ok=True)
        dfile = os.path.join(ddir, df)
        if force or not os.path.exists(dfile):
            with open(dfile, "wb") as f:
                f.write(datatostore)
        return True

    # Retrieve cached OCR'd data for image path. Possibly update the
    # path file as a side effect (case where the image has moved, but
    # the data has not changed).
    # Returns (True, data) on a hit, (False, b"") on a miss.
    def get(self, path):
        pincache, dd, df = self._pathincache(path)
        if pincache:
            dfn = os.path.join(self.objdir, dd, df)
        else:
            dfn, dd, df = self._datafilename(path)

        if not os.path.exists(dfn):
            return False, b""

        if not pincache:
            # File has moved. create/Update path file for next time
            pd, pf, tm, sz = self._newpathattrs(path)
            self._updatepathfile(pd, pf, dd, df, tm, sz)

        with open(dfn, "rb") as f:
            return True, f.read()
# Manual test driver: rclocrcache.py <imagepath>
# Reports on stderr whether the path is in the cache, then tries to
# fetch the data for it.
if __name__ == '__main__':
    import rclconfig

    cache = OCRCache(rclconfig.RclConfig())
    path = sys.argv[1]
    deb("Using %s" % path)

    deb("== CACHE tests")
    s = " not"
    if cache.pathincache(path):
        s = ""
    deb("path for %s%s in cache" % (path, s))

    #ret = cache.dataincache(path)
    #s = "" if ret else " not"
    #deb("data for %s%s in cache" % (path, s))

    # Disabled by default: change the condition to populate the cache
    # with test data.
    if False:
        deb("== STORE tests")
        cache.store(path, b"my OCR'd text is one line\n", force=False)

    deb("== GET tests")
    incache, data = cache.get(path)
    if not incache:
        deb("Data was not found in cache")
    else:
        deb("Data from cache [%s]" % data)

217
src/filters/rclocrtesseract.py Executable file
View File

@ -0,0 +1,217 @@
#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################
import os
import sys
import atexit
import tempfile
import subprocess
import glob
import rclexecm
# True when running on MS Windows
_mswindows = sys.platform == "win32"

# Name of the file, looked up in the directory of the file to process,
# which may hold the tesseract language name. The same name is
# currently used on all platforms.
ocrlangfile = ".rclocrlang"

# Extensions of the image files which are handed to tesseract as is
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
# Debug trace hook. Currently does nothing: the print to stderr is
# commented out (and would not be executed on MS Windows anyway).
def _deb(s):
    if _mswindows:
        return
    #print("%s" % s, file=sys.stderr)
# Delete the regular files found directly inside directory dir.
# Anything else (e.g. subdirectories) is left alone. Nothing is done if
# dir is empty or None. Always returns True.
def vacuumdir(dir):
    if not dir:
        return True
    for name in os.listdir(dir):
        candidate = os.path.join(dir, name)
        if not os.path.isfile(candidate):
            continue
        os.unlink(candidate)
    return True
# Temporary directory for intermediate files. Created on first use,
# emptied before each reuse.
tmpdir = None

# Make sure that an empty temporary directory is available: create it
# if it does not exist yet, else remove the files left inside it.
# Returns True on success, False if the existing directory could not be
# emptied. tempfile.mkdtemp() errors are not caught.
def _maybemaketmpdir():
    global tmpdir
    if tmpdir:
        if not vacuumdir(tmpdir):
            _deb("openfile: vacuumdir %s failed" % tmpdir)
            return False
    else:
        tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
    # Explicit success value: falling off the end would return None,
    # which a caller testing the result could not tell from a failure.
    return True
# Exit handler: remove the files from the temporary directory, then the
# directory itself. Does nothing if the directory was never created.
def finalcleanup():
    if not tmpdir:
        return
    vacuumdir(tmpdir)
    os.rmdir(tmpdir)

atexit.register(finalcleanup)
# Return True if tesseract and, where needed, the conversion program
# for the file type (pdftoppm for pdf) appear to be available, and the
# file extension is one we know how to process.
# Side effect: sets the module-level 'tesseract' and 'pdftoppm'
# variables to the values returned by rclexecm.which().
def ocrpossible(path):
    global tesseract, pdftoppm

    # No tesseract, no OCR, whatever the input type
    tesseract = rclexecm.which("tesseract")
    if not tesseract:
        return False

    # The decision then only depends on the (lowercased) extension
    ext = os.path.splitext(path)[1].lower()
    if ext in _okexts:
        return True
    if ext != '.pdf':
        return False

    # PDF: we need pdftoppm. We could use pdftocairo, which can
    # produce a multi-page pdf and make the rest simpler, but the
    # legacy code used pdftoppm for some reason, and it appears
    # that the newest builds from conda-forge do not include
    # pdftocairo. So stay with pdftoppm.
    pdftoppm = rclexecm.which("pdftoppm")
    return bool(pdftoppm)
# Try to guess tesseract language. This should depend on the input
# file, but we have no general way to determine it. So use the
# environment and hope for the best.
# Sources, in order of priority:
#  - The contents of the ocrlangfile file in the directory of path.
#  - The "tesseractlang" configuration parameter, evaluated for the
#    directory of path.
#  - The LANG environment variable (en, de and fr only).
#  - "eng"
# config must supply setKeyDir() and getConfParam(). Returns the
# language name as a non-empty string.
def _guesstesseractlang(config, path):
    dirname = os.path.dirname(path)

    # First look for a language def file in the file's directory
    pdflangfile = os.path.join(dirname, ocrlangfile)
    if os.path.isfile(pdflangfile):
        tesseractlang = ""
        try:
            with open(pdflangfile, "r") as f:
                tesseractlang = f.read().strip()
        except (OSError, UnicodeDecodeError) as err:
            # An unreadable language file is no reason to give up on
            # OCR: fall through to the other sources.
            _deb("Could not read %s: %s" % (pdflangfile, err))
        if tesseractlang:
            _deb("Tesseract lang from file: %s" % tesseractlang)
            return tesseractlang

    # Then look for a config file option.
    config.setKeyDir(dirname)
    tesseractlang = config.getConfParam("tesseractlang")
    if tesseractlang:
        _deb("Tesseract lang from config: %s" % tesseractlang)
        return tesseractlang

    # Half-assed trial to guess from LANG then default to english.
    # Only the part of LANG before the first '_' is considered.
    localelang = os.environ.get("LANG", "").split("_")[0]
    knownlangs = {"en": "eng", "de": "deu", "fr": "fra"}
    tesseractlang = knownlangs.get(localelang, "eng")
    _deb("Tesseract lang (guessed): %s" % tesseractlang)
    return tesseractlang
# Process pdf file: use pdftoppm to split it into ppm pages, then run
# tesseract on each and concatenate the result. It would probably be
# possible instead to use pdftocairo to produce a tiff, but pdftocairo
# is sometimes not available (windows).
# Uses the module-level tmpdir, pdftoppm and tesseract variables.
# Returns the concatenated text as bytes, b"" if there is no temporary
# directory or if pdftoppm fails. A tesseract failure on one page does
# not prevent processing the others.
def _pdftesseract(config, path):
    if not tmpdir:
        return b""

    tesseractlang = _guesstesseractlang(config, path)

    # Common prefix for the page image files, and so for the text files
    tmpfile = os.path.join(tmpdir, "ocrXXXXXX")

    # Split pdf pages
    try:
        vacuumdir(tmpdir)
        subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
    except Exception as e:
        _deb("pdftoppm failed: %s" % e)
        return b""

    # The image file name is also given to tesseract as output base
    # name: the text for page image f ends up in f + ".txt"
    for f in sorted(glob.glob(tmpfile + "*")):
        out = b''
        try:
            out = subprocess.check_output(
                [tesseract, f, f, "-l", tesseractlang],
                stderr=subprocess.STDOUT)
        except Exception as e:
            _deb("tesseract failed: %s" % e)

        # More than a couple of lines of output is taken as the sign
        # of an error.
        if len(out.split(b'\n')) > 2:
            _deb("Tesseract error: %s" % out)

    # Concatenate the result files. glob() returns names in arbitrary
    # order: sort them so that the text does not come out with the
    # pages shuffled.
    # NOTE(review): this assumes that pdftoppm zero-pads the page
    # numbers in the file names, so that name order is page order -
    # confirm against the pdftoppm version in use.
    pages = []
    for f in sorted(glob.glob(tmpfile + "*" + ".txt")):
        with open(f, "rb") as txtfile:
            pages.append(txtfile.read())
    return b"".join(pages)
# Run tesseract directly on an image file, with the text sent to
# stdout. Uses the module-level tesseract variable.
# Returns the text as bytes, or b"" if running tesseract fails (same
# convention as _pdftesseract).
def _simpletesseract(config, path):
    tesseractlang = _guesstesseractlang(config, path)
    try:
        return subprocess.check_output(
            [tesseract, path, 'stdout', '-l', tesseractlang],
            stderr=subprocess.DEVNULL)
    except Exception as e:
        _deb("tesseract failed: %s" % e)
        # No output variable was set in this case: return empty data
        # instead of failing with an UnboundLocalError.
        return b""
# Run OCR on the input path and return the result data. Files with one
# of the known image extensions go directly to tesseract, anything else
# is processed as a pdf. The temporary directory is created or emptied
# first.
def runocr(config, path):
    _maybemaketmpdir()
    ext = os.path.splitext(path)[1].lower()
    handler = _simpletesseract if ext in _okexts else _pdftesseract
    return handler(config, path)
# Manual test: rclocrtesseract.py <path>
# Writes the OCR'd text to stdout, exits with status 1 if OCR is not
# possible for the file.
if __name__ == '__main__':
    import rclconfig

    config = rclconfig.RclConfig()
    path = sys.argv[1]
    if not ocrpossible(path):
        _deb("ocrpossible returned false")
        sys.exit(1)
    data = runocr(config, path)
    sys.stdout.buffer.write(data)

View File

@ -47,8 +47,14 @@ import glob
import traceback
_mswindows = (sys.platform == "win32")
tmpdir = None
_htmlprefix =b'''<html><head>
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
</head><body><pre>'''
_htmlsuffix = b'''</pre></body></html>'''
def finalcleanup():
if tmpdir:
vacuumdir(tmpdir)
@ -120,18 +126,6 @@ class PDFExtractor:
except:
pass
# See if we'll try to perform OCR. Need the commands and the
# either the presence of a file in the config dir (historical)
# or a set config variable.
self.ocrpossible = False
self.tesseract = rclexecm.which("tesseract")
if self.tesseract:
self.pdftoppm = rclexecm.which("pdftoppm")
if self.pdftoppm:
self.ocrpossible = True
self.maybemaketmpdir()
# self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
# Pdftk is optionally used to extract attachments. This takes
# a hit on performance even in the absence of any attachments,
# so it can be disabled in the configuration.
@ -236,100 +230,6 @@ class PDFExtractor:
return (True, docdata, ipath, eof)
# Try to guess tesseract language. This should depend on the input
# file, but we have no general way to determine it. So use the
# environment and hope for the best.
def guesstesseractlang(self):
tesseractlang = ""
# First look for a language def file in the file's directory
pdflangfile = os.path.join(os.path.dirname(self.filename),
b".ocrpdflang")
if os.path.isfile(pdflangfile):
tesseractlang = open(pdflangfile, "r").read().strip()
if tesseractlang:
return tesseractlang
# Then look for a global option. The normal way now that we
# have config reading capability in the handlers is to use the
# config. Then, for backwards compat, environment variable and
# file inside the configuration directory
tesseractlang = self.config.getConfParam("pdfocrlang")
if tesseractlang:
return tesseractlang
tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
if tesseractlang:
return tesseractlang
pdflangfile = os.path.join(self.confdir, "ocrpdf")
if os.path.isfile(pdflangfile):
tesseractlang = open(pdflangfile, "r").read().strip()
if tesseractlang:
return tesseractlang
# Half-assed trial to guess from LANG then default to english
localelang = os.environ.get("LANG", "").split("_")[0]
if localelang == "en":
tesseractlang = "eng"
elif localelang == "de":
tesseractlang = "deu"
elif localelang == "fr":
tesseractlang = "fra"
if tesseractlang:
return tesseractlang
if not tesseractlang:
tesseractlang = "eng"
return tesseractlang
# PDF has no text content and tesseract is available. Give OCR a try
def ocrpdf(self):
global tmpdir
if not tmpdir:
return b""
tesseractlang = self.guesstesseractlang()
# self.em.rclog("tesseractlang %s" % tesseractlang)
tesserrorfile = os.path.join(tmpdir, "tesserrorfile")
tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
# Split pdf pages
try:
vacuumdir(tmpdir)
subprocess.check_call([self.pdftoppm, "-r", "300", self.filename,
tmpfile])
except Exception as e:
self.em.rclog("pdftoppm failed: %s" % e)
return b""
files = glob.glob(tmpfile + "*")
for f in files:
out = b''
try:
out = subprocess.check_output([self.tesseract, f, f, "-l",
tesseractlang],
stderr = subprocess.STDOUT)
except Exception as e:
self.em.rclog("tesseract failed: %s" % e)
errlines = out.split(b'\n')
if len(errlines) > 2:
self.em.rclog("Tesseract error: %s" % out)
# Concatenate the result files
files = glob.glob(tmpfile + "*" + ".txt")
data = b""
for f in files:
data += open(f, "rb").read()
return b'''<html><head>
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
</head><body><pre>''' + \
self.em.htmlescape(data) + \
b'''</pre></body></html>'''
# pdftotext (used to?) badly escape text inside the header
# fields. We do it here. This is not an html parser, and depends a
# lot on the actual format output by pdftotext.
@ -510,13 +410,11 @@ class PDFExtractor:
html, isempty = self._fixhtml(html)
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
if isempty and self.ocrpossible:
self.config.setKeyDir(os.path.dirname(self.filename))
s = self.config.getConfParam("pdfocr")
cf_doocr = rclexecm.configparamtrue(s)
file_doocr = os.path.isfile(os.path.join(self.confdir, "ocrpdf"))
if cf_doocr or file_doocr:
html = self.ocrpdf()
if isempty:
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
self.filename]
data = subprocess.check_output(cmd)
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
if self.extrameta:
try:
@ -592,6 +490,7 @@ class PDFExtractor:
# Main program: create protocol handler and extractor and run them
_execdir = os.path.dirname(sys.argv[0])
proto = rclexecm.RclExecM()
extract = PDFExtractor(proto)
rclexecm.main(proto, extract)