1st version of the cached ocr mechanism

2020-02-15 21:19:13 +01:00 · 2020-02-15 21:19:13 +01:00 · 38dfa5f841
commit 38dfa5f841
parent aa40531bbe
4 changed files with 517 additions and 113 deletions
--- a/src/filters/rclocr.py
+++ b/src/filters/rclocr.py
@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+#################################
+# Copyright (C) 2020 J.F.Dockes
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+########################################################
+
+# Running OCR programs for Recoll
+
+import os
+import sys
+import rclconfig
+import rclocrcache
+import importlib.util
+
+def deb(s):
+    """Write the message *s*, followed by a newline, to stderr."""
+    sys.stderr.write("%s\n" % s)
+    
+def Usage():
+    """Print the command line synopsis on stderr and exit with status 1."""
+    deb("Usage: rclocr.py <imagefilename>")
+    raise SystemExit(1)
+
+# Command line: exactly one argument, the path of the file to OCR.
+if len(sys.argv) != 2:
+    Usage()
+
+path = sys.argv[1]
+
+config = rclconfig.RclConfig()
+cache = rclocrcache.OCRCache(config)
+
+# Fast path: the OCR'd data for this file is already in the cache.
+incache, data = cache.get(path)
+if incache:
+    sys.stdout.buffer.write(data)
+    sys.exit(0)
+
+#### Data not in cache
+
+# Retrieve known ocr program names and try to load the corresponding module
+ocrprogs = config.getConfParam("ocrprogs")
+if not ocrprogs:
+    deb("No ocrprogs variable")
+    sys.exit(1)
+deb("ocrprogs: %s" % ocrprogs)
+# split() with no argument ignores repeated, leading and trailing
+# whitespace. split(" ") would yield empty program names for these, and
+# then try to import a module named just "rclocr".
+proglist = ocrprogs.split()
+ok = False
+for ocrprog in proglist:
+    # Each program "xxx" is handled by a module named rclocrxxx, which
+    # must supply ocrpossible(path) and runocr(config, path).
+    modulename = "rclocr" + ocrprog
+    try:
+        ocr = importlib.import_module(modulename)
+        if ocr.ocrpossible(path):
+            ok = True
+            break
+    except Exception as err:
+        # Best effort: a missing or broken module must not prevent
+        # trying the next one in the list.
+        deb("While loading %s: got: %s" % (modulename, err))
+
+if not ok:
+    deb("No OCR module could be loaded")
+    sys.exit(1)
+
+deb("Using ocr module %s" % modulename)
+
+data = ocr.runocr(config, path)
+
+# Store the result so that the next run for this file hits the cache,
+# then output it.
+cache.store(path, data)
+sys.stdout.buffer.write(data)
+sys.exit(0)
+
--- a/src/filters/rclocrcache.py
+++ b/src/filters/rclocrcache.py
@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+#################################
+# Copyright (C) 2020 J.F.Dockes
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+########################################################
+
+# Caching OCR'd data
+
+# OCR is extremely slow. The cache stores 2 kinds of objects:
+# - Path files are named from the hash of the image path and contain
+#   the image data hash and the modification time and size of the
+#   image at the time the OCR'd data was stored in the cache
+# - Data files are named with the hash of the image data and contain
+#   the OCR'd data
+# When retrieving data from the cache:
+#  - We first use the image file size and modification time: if an
+#    entry exists for the imagepath/mtime/size triplet, and is up to
+#    date, the corresponding data is obtained from the data file and
+#    returned.
+#  - Else we then use the image data: if an entry exists for the
+#    computed hashed value of the data, it is returned. This allows
+#    moving files around without needing to run OCR again, but of
+#    course, it is more expensive than the first step
+#
+#  If we need to use the second step, as a side effect, a path file is
+#  created or updated so that the data will be found with the first
+#  step next time around.
+
+import sys
+import os
+import hashlib
+
+def deb(s):
+    """Print the debug message *s* on stderr."""
+    message = "%s" % s
+    print(message, file=sys.stderr)
+    
+class OCRCache(object):
+    """On-disk cache for OCR'd data.
+
+    All cache files live under <cachedir>/objects/<2 hex chars>/<38 hex chars>,
+    the name being a SHA1 hex digest split in two parts:
+     - Path files are named from the hash of the image path and hold one
+       line: "<data hash part 1> <data hash part 2> <mtime> <size>".
+     - Data files are named from the hash of the image contents and hold
+       the OCR'd data.
+    """
+
+    def __init__(self, conf):
+        """Compute the cache location and create the objects directory.
+
+        conf must supply getConfParam(name) and getConfDir(). The cache
+        directory is the value of the "ocrcachedir" parameter if it is
+        set, else the "ocrcache" subdirectory of the configuration
+        directory.
+        """
+        self.config = conf
+        self.cachedir = conf.getConfParam("ocrcachedir")
+        if not self.cachedir:
+            self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
+        self.objdir = os.path.join(self.cachedir, "objects")
+        # exist_ok: no error if the directory already exists, including
+        # when it gets created between a check and the creation.
+        os.makedirs(self.objdir, exist_ok=True)
+
+    # Compute sha1 of path, as two parts of 2 and 38 chars. The path can
+    # be given as str or bytes.
+    def _hashpath(self, data):
+        if not isinstance(data, bytes):
+            # surrogateescape: same bytes as plain utf-8 for regular
+            # strings, and no failure for file names which could not be
+            # decoded by the interpreter.
+            data = data.encode('utf-8', 'surrogateescape')
+        # The hash must be computed for bytes input too (it used to be
+        # skipped in this case, leaving the result undefined).
+        h = hashlib.sha1(data).hexdigest()
+        return h[0:2], h[2:]
+
+    # Compute sha1 of path data contents, as two parts of 2 and 38 chars
+    def _hashdata(self, path):
+        #deb("Hashing DATA")
+        m = hashlib.sha1()
+        with open(path, "rb") as f:
+            # Read by chunks to avoid loading big images in memory
+            for chunk in iter(lambda: f.read(8192), b""):
+                m.update(chunk)
+        # Only compute the digest once, after the loop: this also yields
+        # a valid value for an empty file.
+        h = m.hexdigest()
+        return h[0:2], h[2:]
+
+    # Try to read the stored attributes for a given path: data hash,
+    # modification time and size. If this fails, the path itself is
+    # not cached (but the data still might be, maybe the file was moved).
+    # Returns (found, datahash1, datahash2, mtime, size). An unreadable
+    # or malformed path file is treated as a cache miss.
+    def _cachedpathattrs(self, path):
+        miss = (False, None, None, None, None)
+        pd, pf = self._hashpath(path)
+        o = os.path.join(self.objdir, pd, pf)
+        try:
+            with open(o, "r") as f:
+                line = f.read()
+        except OSError:
+            return miss
+        try:
+            dd, df, tm, sz = line.split()
+            return True, dd, df, int(tm), int(sz)
+        except ValueError:
+            # Wrong field count or non-numeric time/size
+            return miss
+
+    # Compute the path hash, and get the mtime and size for given
+    # path, for updating the cache path file.
+    # Returns (pathhash1, pathhash2, mtime, size)
+    def _newpathattrs(self, path):
+        pd, pf = self._hashpath(path)
+        tm = int(os.path.getmtime(path))
+        sz = int(os.path.getsize(path))
+        return pd, pf, tm, sz
+
+    # Check if the cache appears up to date for a given path, only
+    # using the modification time and size. Return the data file path
+    # elements if we get a hit: (True, datahash1, datahash2), else
+    # (False, None, None)
+    def _pathincache(self, path):
+        ret, od, of, otm, osz = self._cachedpathattrs(path)
+        if not ret:
+            return False, None, None
+        pd, pf, ntm, nsz = self._newpathattrs(path)
+        #deb(" tm %d  sz %d" % (ntm, nsz))
+        #deb("otm %d osz %d" % (otm, osz))
+        if otm != ntm or osz != nsz:
+            return False, None, None
+        return True, od, of
+
+    # Check if cache appears up to date for path (no data check),
+    # return True/False
+    def pathincache(self, path):
+        ret, dd, df = self._pathincache(path)
+        return ret
+
+    # Compute the data file name for path. Expensive: we compute the data hash.
+    # Return both the data file path and path elements (for storage in path file)
+    def _datafilename(self, path):
+        d, f = self._hashdata(path)
+        return os.path.join(self.objdir, d, f), d, f
+
+    # Check if the data for path is in cache: expensive, needs to
+    # compute the hash for the path's data contents. Returns True/False
+    def dataincache(self, path):
+        return os.path.exists(self._datafilename(path)[0])
+
+    # Create path file with given elements: path hash parts (pd, pf),
+    # data hash parts (dd, df), image modification time and size.
+    def _updatepathfile(self, pd, pf, dd, df, tm, sz):
+        dirpath = os.path.join(self.objdir, pd)
+        os.makedirs(dirpath, exist_ok=True)
+        pfile = os.path.join(dirpath, pf)
+        with open(pfile, "w") as f:
+            f.write("%s %s %d %d\n" % (dd, df, tm, sz))
+
+    # Store data for path. Only rewrite an existing data file if told
+    # to do so: this is only useful if we are forcing an OCR re-run.
+    # The path file is always (re)written. Returns True.
+    def store(self, path, datatostore, force=False):
+        dd, df = self._hashdata(path)
+        pd, pf, tm, sz = self._newpathattrs(path)
+        self._updatepathfile(pd, pf, dd, df, tm, sz)
+        dirpath = os.path.join(self.objdir, dd)
+        os.makedirs(dirpath, exist_ok=True)
+        dfile = os.path.join(dirpath, df)
+        if force or not os.path.exists(dfile):
+            #deb("Storing data")
+            with open(dfile, "wb") as f:
+                f.write(datatostore)
+        return True
+
+    # Retrieve cached OCR'd data for image path. Possibly update the
+    # path file as a side effect (case where the image has moved, but
+    # the data has not changed).
+    # Returns (True, data) on a hit, (False, b"") on a miss.
+    def get(self, path):
+        pincache, dd, df = self._pathincache(path)
+        if pincache:
+            dfn = os.path.join(self.objdir, dd, df)
+        else:
+            # No usable path entry: fall back to hashing the contents
+            dfn, dd, df = self._datafilename(path)
+
+        if not os.path.exists(dfn):
+            return False, b""
+
+        if not pincache:
+            # File has moved. create/Update path file for next time
+            pd, pf, tm, sz = self._newpathattrs(path)
+            self._updatepathfile(pd, pf, dd, df, tm, sz)
+
+        with open(dfn, "rb") as f:
+            return True, f.read()
+
+
+
+if __name__ == '__main__':
+    # Manual test driver. Usage: rclocrcache.py <imagepath>
+    import rclconfig
+
+    cache = OCRCache(rclconfig.RclConfig())
+    path = sys.argv[1]
+    deb("Using %s" % path)
+
+    deb("== CACHE tests")
+    negation = " not" if not cache.pathincache(path) else ""
+    deb("path for %s%s in cache" % (path, negation))
+
+    # Data check, disabled (it needs to hash the file contents):
+    #ret = cache.dataincache(path)
+    #s = "" if ret else " not"
+    #deb("data for %s%s in cache" % (path, s))
+
+    # Set to True to exercise store() with some fixed text
+    dostore = False
+    if dostore:
+        deb("== STORE tests")
+        cache.store(path, b"my OCR'd text is one line\n", force=False)
+
+    deb("== GET tests")
+    incache, data = cache.get(path)
+    deb("Data from cache [%s]" % data if incache
+        else "Data was not found in cache")
+        
--- a/src/filters/rclocrtesseract.py
+++ b/src/filters/rclocrtesseract.py
@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+#################################
+# Copyright (C) 2020 J.F.Dockes
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the
+#   Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+########################################################
+
+import os
+import sys
+import atexit
+import tempfile
+import subprocess
+import glob
+
+import rclexecm
+
+# True when running on MS Windows
+_mswindows = sys.platform == "win32"
+
+# Name of the file which can be placed in a directory to define the
+# tesseract language for the files it holds. The name is currently the
+# same on all platforms.
+ocrlangfile = ".rclocrlang"
+
+# Extensions of the image files which are passed directly to tesseract
+_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
+
+def _deb(s):
+    """Debug trace hook. The output is currently disabled: does nothing."""
+    if _mswindows:
+        return
+    # Uncomment to get traces on stderr (never active on MS Windows):
+    #print("%s" % s, file=sys.stderr)
+
+def vacuumdir(dir):
+    """Delete the regular files found directly inside *dir*.
+
+    Subdirectories and other non-file entries are left alone. Nothing is
+    done if *dir* is empty or None. Always returns True: errors from
+    os.listdir() or os.unlink() are propagated as exceptions.
+    """
+    if not dir:
+        return True
+    candidates = (os.path.join(dir, name) for name in os.listdir(dir))
+    for candidate in candidates:
+        if os.path.isfile(candidate):
+            os.unlink(candidate)
+    return True
+
+# Temporary directory for the intermediate files. Created on first use
+# by _maybemaketmpdir()
+tmpdir = None
+def _maybemaketmpdir():
+    """Make sure that an empty temporary directory is available.
+
+    On first call, create the directory and record its path in the
+    global tmpdir. On later calls, delete the files it contains.
+    Returns True on success, False if emptying the directory failed
+    (success used to yield None, which a caller testing the result
+    could not tell from a failure).
+    """
+    global tmpdir
+    if tmpdir:
+        if not vacuumdir(tmpdir):
+            _deb("openfile: vacuumdir %s failed" % tmpdir)
+            return False
+    else:
+        tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
+    return True
+
+def finalcleanup():
+    """Exit hook: delete the temporary directory, if one was created.
+
+    The files inside are removed first, then the directory itself.
+    """
+    if not tmpdir:
+        return
+    vacuumdir(tmpdir)
+    os.rmdir(tmpdir)
+
+# Run the cleanup when the interpreter exits
+atexit.register(finalcleanup)
+
+# Return true if tesseract and the appropriate conversion program for
+# the file type (e.g. pdftoppm for pdf) appear to be available.
+# As a side effect, this sets the 'tesseract' global and, for a pdf
+# input, the 'pdftoppm' global, to the values returned by
+# rclexecm.which() for these commands.
+def ocrpossible(path):
+    global tesseract, pdftoppm
+
+    # No tesseract, no OCR, whatever the input
+    tesseract = rclexecm.which("tesseract")
+    if not tesseract:
+        return False
+
+    # Image formats are processed by tesseract alone
+    ext = os.path.splitext(path)[1].lower()
+    if ext in _okexts:
+        return True
+
+    # The only other supported input is pdf
+    if ext != '.pdf':
+        return False
+
+    # Check for pdftoppm. We could use pdftocairo, which can
+    # produce a multi-page pdf and make the rest simpler, but the
+    # legacy code used pdftoppm for some reason, and it appears
+    # that the newest builds from conda-forge do not include
+    # pdftocairo. So stay with pdftoppm.
+    pdftoppm = rclexecm.which("pdftoppm")
+    return bool(pdftoppm)
+
+
+# Try to guess tesseract language. This should depend on the input
+# file, but we have no general way to determine it. So use the
+# environment and hope for the best.
+# The sources are tried in order:
+#  - The contents of the language file (name in 'ocrlangfile') in the
+#    directory of the input file.
+#  - The "tesseractlang" configuration parameter, evaluated for the
+#    directory of the input file.
+#  - The LANG environment variable (en, de and fr only).
+#  - "eng" as the default.
+# config must supply setKeyDir(dir) and getConfParam(name).
+def _guesstesseractlang(config, path):
+    dirname = os.path.dirname(path)
+
+    # First look for a language def file in the file's directory
+    pdflangfile = os.path.join(dirname, ocrlangfile)
+    if os.path.isfile(pdflangfile):
+        with open(pdflangfile, "r") as f:
+            tesseractlang = f.read().strip()
+        if tesseractlang:
+            _deb("Tesseract lang from file: %s" % tesseractlang)
+            return tesseractlang
+
+    # Then look for a config file  option.
+    config.setKeyDir(dirname)
+    tesseractlang = config.getConfParam("tesseractlang")
+    if tesseractlang:
+        _deb("Tesseract lang from config: %s" % tesseractlang)
+        return tesseractlang
+
+    # Half-assed trial to guess from LANG then default to english.
+    # Only keep the language part: drop the territory ("_FR"), and also
+    # a codeset (".UTF-8") or modifier ("@euro") directly following it.
+    localelang = os.environ.get("LANG", "")
+    for separator in "_.@":
+        localelang = localelang.split(separator)[0]
+    knownlangs = {"en": "eng", "de": "deu", "fr": "fra"}
+    tesseractlang = knownlangs.get(localelang, "eng")
+    _deb("Tesseract lang (guessed): %s" % tesseractlang)
+    return tesseractlang
+
+# Process pdf file: use pdftoppm to split it into ppm pages, then run
+# tesseract on each and concatenate the result. It would probably be
+# possible instead to use pdftocairo to produce a tiff, but pdftocairo
+# is sometimes not available (windows).
+# Uses the module globals: tmpdir (work directory), pdftoppm and
+# tesseract (commands, set by ocrpossible()).
+# Returns the text as bytes: b"" if there is no temporary directory or
+# if pdftoppm fails. A tesseract failure on a page is only logged.
+def _pdftesseract(config, path):
+    if not tmpdir:
+        return b""
+
+    tesseractlang = _guesstesseractlang(config, path)
+
+    #tesserrorfile = os.path.join(tmpdir, "tesserrorfile")
+    # Common prefix for the page files created by pdftoppm
+    tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
+
+    # Split pdf pages
+    try:
+        vacuumdir(tmpdir)
+        subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
+    except Exception as e:
+        _deb("pdftoppm failed: %s" % e)
+        return b""
+
+    for f in sorted(glob.glob(tmpfile + "*")):
+        out = b''
+        try:
+            # The page file name is also used as output base name: the
+            # result files are collected below with a ".txt" suffix.
+            out = subprocess.check_output(
+                [tesseract, f, f, "-l", tesseractlang],
+                stderr=subprocess.STDOUT)
+        except Exception as e:
+            _deb("tesseract failed: %s" % e)
+
+        errlines = out.split(b'\n')
+        if len(errlines) > 2:
+            _deb("Tesseract error: %s" % out)
+
+    # Concatenate the result files. glob() returns the names in
+    # arbitrary order, so sort them to keep the text in page order.
+    # NOTE(review): this assumes that pdftoppm zero-pads the page
+    # numbers in the file names, so that name order is page order.
+    pages = []
+    for f in sorted(glob.glob(tmpfile + "*" + ".txt")):
+        with open(f, "rb") as pagefile:
+            pages.append(pagefile.read())
+
+    return b"".join(pages)
+
+
+# Run tesseract on an image file and return the text it prints on
+# stdout, as bytes. Uses the 'tesseract' global (command, set by
+# ocrpossible()). A tesseract failure is only logged, and b"" is returned.
+def _simpletesseract(config, path):
+    tesseractlang = _guesstesseractlang(config, path)
+
+    # Defined before the call, so that a failure yields empty data (as
+    # for pdf input) instead of an UnboundLocalError on return.
+    out = b""
+    try:
+        out = subprocess.check_output(
+            [tesseract, path, 'stdout', '-l', tesseractlang],
+            stderr=subprocess.DEVNULL)
+    except Exception as e:
+        _deb("tesseract failed: %s" % e)
+
+    return out
+
+
+# Run ocr on the input path and return the result data (bytes).
+# Files with an image extension go straight to tesseract, anything
+# else is processed as pdf.
+def runocr(config, path):
+    _maybemaketmpdir()
+    ext = os.path.splitext(path)[1].lower()
+    handler = _simpletesseract if ext in _okexts else _pdftesseract
+    return handler(config, path)
+
+   
+
+
+if __name__ == '__main__':
+    # Command line use: OCR the file given as first argument and write
+    # the text to stdout. Exit with status 1 if OCR is not possible.
+    import rclconfig
+    config = rclconfig.RclConfig()
+    path = sys.argv[1]
+    if not ocrpossible(path):
+        _deb("ocrpossible returned false")
+        sys.exit(1)
+    data = runocr(config, path)
+    sys.stdout.buffer.write(data)
+    
+
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -47,8 +47,14 @@ import glob
 import traceback

 _mswindows = (sys.platform == "win32")
+    
 tmpdir = None

+# HTML wrapper for the OCR'd text: the escaped data is inserted between
+# the prefix, which ends by opening a <pre> section, and the suffix
+# which closes it.
+_htmlprefix =b'''<html><head>
+<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
+</head><body><pre>'''
+_htmlsuffix = b'''</pre></body></html>'''
+
 def finalcleanup():
    if tmpdir:
        vacuumdir(tmpdir)
@ -120,18 +126,6 @@ class PDFExtractor:
        except:
            pass
        
-        # See if we'll try to perform OCR. Need the commands and the
-        # either the presence of a file in the config dir (historical)
-        # or a set config variable.
-        self.ocrpossible = False
-        self.tesseract = rclexecm.which("tesseract")
-        if self.tesseract:
-            self.pdftoppm = rclexecm.which("pdftoppm")
-            if self.pdftoppm:
-                self.ocrpossible = True
-                self.maybemaketmpdir()
-        # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
-
        # Pdftk is optionally used to extract attachments. This takes
        # a hit on performance even in the absence of any attachments,
        # so it can be disabled in the configuration.
@ -236,100 +230,6 @@ class PDFExtractor:
        return (True, docdata, ipath, eof)


-    # Try to guess tesseract language. This should depend on the input
-    # file, but we have no general way to determine it. So use the
-    # environment and hope for the best.
-    def guesstesseractlang(self):
-        tesseractlang = ""
-
-        # First look for a language def file in the file's directory 
-        pdflangfile = os.path.join(os.path.dirname(self.filename),
-                                   b".ocrpdflang")
-        if os.path.isfile(pdflangfile):
-            tesseractlang = open(pdflangfile, "r").read().strip()
-        if tesseractlang:
-            return tesseractlang
-
-        # Then look for a global option. The normal way now that we
-        # have config reading capability in the handlers is to use the
-        # config. Then, for backwards compat, environment variable and
-        # file inside the configuration directory
-        tesseractlang = self.config.getConfParam("pdfocrlang")
-        if tesseractlang:
-            return tesseractlang
-        tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
-        if tesseractlang:
-            return tesseractlang
-        pdflangfile = os.path.join(self.confdir, "ocrpdf")
-        if os.path.isfile(pdflangfile):
-            tesseractlang = open(pdflangfile, "r").read().strip()
-        if tesseractlang:
-            return tesseractlang
-
-        # Half-assed trial to guess from LANG then default to english
-        localelang = os.environ.get("LANG", "").split("_")[0]
-        if localelang == "en":
-            tesseractlang = "eng"
-        elif localelang == "de":
-            tesseractlang = "deu"
-        elif localelang == "fr":
-            tesseractlang = "fra"
-        if tesseractlang:
-            return tesseractlang
-
-        if not tesseractlang:
-            tesseractlang = "eng"
-        return tesseractlang
-
-    # PDF has no text content and tesseract is available. Give OCR a try
-    def ocrpdf(self):
-
-        global tmpdir
-        if not tmpdir:
-            return b""
-
-        tesseractlang = self.guesstesseractlang()
-        # self.em.rclog("tesseractlang %s" % tesseractlang)
-
-        tesserrorfile = os.path.join(tmpdir, "tesserrorfile")
-        tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
-
-        # Split pdf pages
-        try:
-            vacuumdir(tmpdir)
-            subprocess.check_call([self.pdftoppm, "-r", "300", self.filename,
-                                   tmpfile])
-        except Exception as e:
-            self.em.rclog("pdftoppm failed: %s" % e)
-            return b""
-
-        files = glob.glob(tmpfile + "*")
-        for f in files:
-            out = b''
-            try:
-                out = subprocess.check_output([self.tesseract, f, f, "-l",
-                                               tesseractlang],
-                                              stderr = subprocess.STDOUT)
-            except Exception as e:
-                self.em.rclog("tesseract failed: %s" % e)
-
-            errlines = out.split(b'\n')
-            if len(errlines) > 2:
-                self.em.rclog("Tesseract error: %s" % out)
-
-        # Concatenate the result files
-        files = glob.glob(tmpfile + "*" + ".txt")
-        data = b""
-        for f in files:
-            data += open(f, "rb").read()
-
-        return b'''<html><head>
-        <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
-        </head><body><pre>''' + \
-        self.em.htmlescape(data) + \
-        b'''</pre></body></html>'''
-
-
    # pdftotext (used to?) badly escape text inside the header
    # fields. We do it here. This is not an html parser, and depends a
    # lot on the actual format output by pdftotext.
@ -510,13 +410,11 @@ class PDFExtractor:
        html, isempty = self._fixhtml(html)
        #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))

-        if isempty and self.ocrpossible:
-            self.config.setKeyDir(os.path.dirname(self.filename))
-            s = self.config.getConfParam("pdfocr")
-            cf_doocr = rclexecm.configparamtrue(s)
-            file_doocr = os.path.isfile(os.path.join(self.confdir, "ocrpdf"))
-            if cf_doocr or file_doocr:
-                html = self.ocrpdf()
+        if isempty:
+            cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
+                   self.filename]
+            data = subprocess.check_output(cmd)
+            html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix

        if self.extrameta:
            try:
@ -592,6 +490,7 @@ class PDFExtractor:


 # Main program: create protocol handler and extractor and run them
+# Directory part of the path this script was started with (argv[0]).
+# The rclocr.py script is run from this same directory.
+_execdir = os.path.dirname(sys.argv[0])
 proto = rclexecm.RclExecM()
 extract = PDFExtractor(proto)
 rclexecm.main(proto, extract)