1st version of the cached ocr mechanism

2020-02-15 21:19:13 +01:00 · 2020-02-15 21:19:13 +01:00 · 38dfa5f841
commit 38dfa5f841
parent aa40531bbe
4 changed files with 517 additions and 113 deletions
--- a/src/filters/rclocr.py
+++ b/src/filters/rclocr.py
@ -0,0 +1,80 @@
 #!/usr/bin/env python3
 #################################
 # Copyright (C) 2020 J.F.Dockes
 #   This program is free software; you can redistribute it and/or modify
 #   it under the terms of the GNU General Public License as published by
 #   the Free Software Foundation; either version 2 of the License, or
 #   (at your option) any later version.
 #
 #   This program is distributed in the hope that it will be useful,
 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #   GNU General Public License for more details.
 #
 #   You should have received a copy of the GNU General Public License
 #   along with this program; if not, write to the
 #   Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ########################################################
 # Running OCR programs for Recoll
 import os
 import sys
 import rclconfig
 import rclocrcache
 import importlib.util
 def deb(s):
    """Write the message s, followed by a newline, to standard error."""
    message = "%s" % s
    sys.stderr.write(message + "\n")
 def Usage():
    """Print the command line synopsis on stderr, then exit with status 1."""
    deb("Usage: rclocr.py <imagefilename>")
    raise SystemExit(1)
 # Script body: print the OCR'd text for the image named on the command
 # line to stdout, using the cache when possible. Exit status is 0 on
 # success and 1 on usage error, missing configuration, or when no OCR
 # module is usable.
 if len(sys.argv) != 2:
    Usage()
 path = sys.argv[1]
 config = rclconfig.RclConfig()
 cache = rclocrcache.OCRCache(config)
 # Fast path: OCR is slow, so return cached data whenever we have it.
 incache, data = cache.get(path)
 if incache:
    sys.stdout.buffer.write(data)
    sys.exit(0)
 #### Data not in cache
 # Retrieve known ocr program names and try to load the corresponding module
 ocrprogs = config.getConfParam("ocrprogs")
 if not ocrprogs:
    deb("No ocrprogs variable")
    sys.exit(1)
 deb("ocrprogs: %s" % ocrprogs)
 # split() without argument ignores repeated, leading and trailing
 # whitespace. split(" ") would produce empty names, and an empty name
 # yields the module name "rclocr", i.e. this very script.
 proglist = ocrprogs.split()
 ok = False
 for ocrprog in proglist:
    # The module for program "xyz" is named rclocrxyz. Computed outside the
    # try block so that it is always defined for the error message.
    modulename = "rclocr" + ocrprog
    try:
        ocr = importlib.import_module(modulename)
        if ocr.ocrpossible(path):
            ok = True
            break
    except Exception as err:
        # A module which fails to load is not fatal: try the next one.
        deb("While loading %s: got: %s" % (modulename, err))
 if not ok:
    deb("No OCR module could be loaded")
    sys.exit(1)
 deb("Using ocr module %s" % modulename)
 data = ocr.runocr(config, path)
 # Store the result so that the next run for this image hits the cache.
 cache.store(path, data)
 sys.stdout.buffer.write(data)
 sys.exit(0)
--- a/src/filters/rclocrcache.py
+++ b/src/filters/rclocrcache.py
@ -0,0 +1,208 @@
 #!/usr/bin/env python3
 #################################
 # Copyright (C) 2020 J.F.Dockes
 #   This program is free software; you can redistribute it and/or modify
 #   it under the terms of the GNU General Public License as published by
 #   the Free Software Foundation; either version 2 of the License, or
 #   (at your option) any later version.
 #
 #   This program is distributed in the hope that it will be useful,
 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #   GNU General Public License for more details.
 #
 #   You should have received a copy of the GNU General Public License
 #   along with this program; if not, write to the
 #   Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ########################################################
 # Caching OCR'd data
 # OCR is extremely slow. The cache stores 2 kinds of objects:
 # - Path files are named from the hash of the image path and contain
 #   the image data hash and the modification time and size of the
 #   image at the time the OCR'd data was stored in the cache
 # - Data files are named with the hash of the image data and contain
 #   the OCR'd data
 # When retrieving data from the cache:
 #  - We first use the image file size and modification time: if an
 #    entry exists for the imagepath/mtime/size triplet, and is up to
 #    date, the corresponding data is obtained from the data file and
 #    returned.
 #  - Else we then use the image data: if an entry exists for the
 #    computed hashed value of the data, it is returned. This allows
 #    moving files around without needing to run OCR again, but of
 #    course, it is more expensive than the first step
 #
 #  If we need to use the second step, as a side effect, a path file is
 #  created or updated so that the data will be found with the first
 #  step next time around.
 import sys
 import os
 import hashlib
 def deb(s):
    """Write the message s, followed by a newline, to standard error."""
    message = "%s" % s
    sys.stderr.write(message + "\n")
 class OCRCache(object):
    # Cache for OCR'd data. All objects live under <cachedir>/objects, in
    # a subdirectory named from the first 2 characters of a sha1 hex
    # digest, in a file named from the remaining 38 characters:
    #  - path files are named from the hash of the image path and hold one
    #    line: "datadir datafile mtime size"
    #  - data files are named from the hash of the image contents and hold
    #    the OCR'd data.
    #
    # conf must supply getConfParam(name) and getConfDir(). The cache
    # directory is the value of the "ocrcachedir" parameter, or else
    # "ocrcache" inside the configuration directory.
    def __init__(self, conf):
        self.config = conf
        self.cachedir = conf.getConfParam("ocrcachedir")
        if not self.cachedir:
            self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
        self.objdir = os.path.join(self.cachedir, "objects")
        # exist_ok: no failure if the directory already exists or is
        # created by someone else between a check and the creation.
        os.makedirs(self.objdir, exist_ok=True)
    # Compute sha1 of path, as two parts of 2 and 38 chars. The path can
    # be given as str (hashed as its utf-8 encoding) or as bytes.
    def _hashpath(self, data):
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        # The digest must be computed for bytes input too, not only after
        # an encoding step.
        h = hashlib.sha1(data).hexdigest()
        return h[0:2], h[2:]
    # Compute sha1 of path data contents, as two parts of 2 and 38 chars
    def _hashdata(self, path):
        #deb("Hashing DATA")
        m = hashlib.sha1()
        with open(path, "rb") as f:
            while True:
                d = f.read(8192)
                if not d:
                    break
                m.update(d)
        # Computed after the loop so that an empty file also gets a hash.
        h = m.hexdigest()
        return h[0:2], h[2:]
    # Try to read the stored attributes for a given path: data hash,
    # modification time and size. If this fails, the path itself is
    # not cached (but the data still might be, maybe the file was moved)
    # Returns (found, datadir, datafile, mtime, size), the last 4 values
    # being None if found is False.
    def _cachedpathattrs(self, path):
        pd, pf = self._hashpath(path)
        o = os.path.join(self.objdir, pd, pf)
        if not os.path.exists(o):
            return False, None, None, None, None
        with open(o, "r") as f:
            line = f.read()
        try:
            dd, df, tm, sz = line.split()
            tm = int(tm)
            sz = int(sz)
        except ValueError:
            # Truncated or corrupt path file: treat as a cache miss, the
            # file will be rewritten by the next store() or get()
            return False, None, None, None, None
        return True, dd, df, tm, sz
    # Compute the path hash, and get the mtime and size for given
    # path, for updating the cache path file
    def _newpathattrs(self, path):
        pd, pf = self._hashpath(path)
        tm = int(os.path.getmtime(path))
        sz = int(os.path.getsize(path))
        return pd, pf, tm, sz
    # Check if the cache appears up to date for a given path, only
    # using the modification time and size. Return the data file path
    # elements if we get a hit: (True, datadir, datafile), else
    # (False, None, None)
    def _pathincache(self, path):
        ret, od, of, otm, osz = self._cachedpathattrs(path)
        if not ret:
            return False, None, None
        pd, pf, ntm, nsz = self._newpathattrs(path)
        #deb(" tm %d  sz %d" % (ntm, nsz))
        #deb("otm %d osz %d" % (otm, osz))
        if otm != ntm or osz != nsz:
            return False, None, None
        return True, od, of
    # Check if cache appears up to date for path (no data check),
    # return True/False
    def pathincache(self, path):
        ret, dd, df = self._pathincache(path)
        return ret
    # Compute the data file name for path. Expensive: we compute the data hash.
    # Return both the data file path and path elements (for storage in path file)
    def _datafilename(self, path):
        d, f = self._hashdata(path)
        return os.path.join(self.objdir, d, f), d, f
    # Check if the data for path is in cache: expensive, needs to
    # compute the hash for the path's data contents. Returns True/False
    def dataincache(self, path):
        return os.path.exists(self._datafilename(path)[0])
    # Create path file with given elements: pd/pf are the path hash
    # parts (location of the path file), dd/df the data hash parts, tm
    # and sz the image modification time and size.
    def _updatepathfile(self, pd, pf, dd, df, tm, sz):
        pdir = os.path.join(self.objdir, pd)
        os.makedirs(pdir, exist_ok=True)
        pfile = os.path.join(pdir, pf)
        with open(pfile, "w") as f:
            f.write("%s %s %d %d\n" % (dd, df, tm, sz))
    # Store data for path. Only rewrite an existing data file if told
    # to do so: this is only useful if we are forcing an OCR re-run.
    # datatostore must be bytes. Always returns True.
    def store(self, path, datatostore, force=False):
        dd, df = self._hashdata(path)
        pd, pf, tm, sz = self._newpathattrs(path)
        self._updatepathfile(pd, pf, dd, df, tm, sz)
        ddir = os.path.join(self.objdir, dd)
        os.makedirs(ddir, exist_ok=True)
        dfile = os.path.join(ddir, df)
        if force or not os.path.exists(dfile):
            #deb("Storing data")
            with open(dfile, "wb") as f:
                f.write(datatostore)
        return True
    # Retrieve cached OCR'd data for image path. Possibly update the
    # path file as a side effect (case where the image has moved, but
    # the data has not changed).
    # Returns (True, data) on a hit, (False, b"") on a miss.
    def get(self, path):
        pincache, dd, df = self._pathincache(path)
        if pincache:
            dfn = os.path.join(self.objdir, dd, df)
        else:
            # No valid path entry: fall back on the (expensive) data hash
            dfn, dd, df = self._datafilename(path)
        if not os.path.exists(dfn):
            return False, b""
        if not pincache:
            # File has moved. create/Update path file for next time
            pd, pf, tm, sz = self._newpathattrs(path)
            self._updatepathfile(pd, pf, dd, df, tm, sz)
        with open(dfn, "rb") as f:
            return True, f.read()
 if __name__ == '__main__':
    # Manual test driver: report the cache state for the image path given
    # as first argument. All output goes to stderr through deb().
    import rclconfig
    ocrcache = OCRCache(rclconfig.RclConfig())
    imgpath = sys.argv[1]
    deb("Using %s" % imgpath)
    deb("== CACHE tests")
    negation = "" if ocrcache.pathincache(imgpath) else " not"
    deb("path for %s%s in cache" % (imgpath, negation))
    # The (expensive) dataincache() check is not exercised here.
    # Set to True to also exercise store() with a dummy text.
    run_store_test = False
    if run_store_test:
        deb("== STORE tests")
        ocrcache.store(imgpath, b"my OCR'd text is one line\n", force=False)
    deb("== GET tests")
    found, text = ocrcache.get(imgpath)
    if not found:
        deb("Data was not found in cache")
    else:
        deb("Data from cache [%s]" % text)
--- a/src/filters/rclocrtesseract.py
+++ b/src/filters/rclocrtesseract.py
@ -0,0 +1,217 @@
 #!/usr/bin/env python3
 #################################
 # Copyright (C) 2020 J.F.Dockes
 #   This program is free software; you can redistribute it and/or modify
 #   it under the terms of the GNU General Public License as published by
 #   the Free Software Foundation; either version 2 of the License, or
 #   (at your option) any later version.
 #
 #   This program is distributed in the hope that it will be useful,
 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #   GNU General Public License for more details.
 #
 #   You should have received a copy of the GNU General Public License
 #   along with this program; if not, write to the
 #   Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ########################################################
 import os
 import sys
 import atexit
 import tempfile
 import subprocess
 import glob
 import rclexecm
 # True when running on MS Windows
 _mswindows = (sys.platform == "win32")
 # Name of the optional per-directory file holding the OCR language. The
 # same name is used on all platforms.
 ocrlangfile = ".rclocrlang"
 # Image file extensions (lowercase) which are handed directly to tesseract
 _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
 def _deb(s):
    """Debug message hook. Output is currently disabled: nothing is printed."""
    if _mswindows:
        return
    # Uncomment to get debug messages on stderr (non-Windows only):
    #print("%s" % s, file=sys.stderr)
 def vacuumdir(dir):
    """Delete the regular files directly inside dir, then return True.

    Subdirectories are left alone. Nothing is done if dir is empty or None.
    """
    if not dir:
        return True
    candidates = (os.path.join(dir, name) for name in os.listdir(dir))
    for entry in candidates:
        if os.path.isfile(entry):
            os.unlink(entry)
    return True
 tmpdir = None
 def _maybemaketmpdir():
    """Make sure that the module temporary directory exists and is empty.

    On first call, create the directory and record its path in the global
    tmpdir. On later calls, delete the files left inside it.

    Returns True on success, False if the existing directory could not be
    emptied.
    """
    global tmpdir
    if tmpdir:
        if not vacuumdir(tmpdir):
            _deb("openfile: vacuumdir %s failed" % tmpdir)
            return False
    else:
        tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
    # Explicit success value: falling off the end would return None, which
    # a caller testing the result could not tell from the False above.
    return True
 def finalcleanup():
    """Empty, then remove, the temporary directory if one was created."""
    if not tmpdir:
        return
    vacuumdir(tmpdir)
    os.rmdir(tmpdir)
 # Run the cleanup when the interpreter exits
 atexit.register(finalcleanup)
 # Return true if tesseract and the appropriate conversion program for
 # the file type (e.g. pdftoppm for pdf) appear to be available
 def ocrpossible(path):
    """Tell if path can be OCR'd with the programs found on this system.

    Side effect: the global tesseract, and for a pdf file the global
    pdftoppm, are set to the result of rclexecm.which() for the program.
    """
    global tesseract, pdftoppm
    # Nothing is possible without tesseract
    tesseract = rclexecm.which("tesseract")
    if not tesseract:
        return False
    # Check input format: image formats only need tesseract
    suffix = os.path.splitext(path)[1].lower()
    if suffix in _okexts:
        return True
    if suffix != '.pdf':
        return False
    # Check for pdftoppm. We could use pdftocairo, which can
    # produce a multi-page pdf and make the rest simpler, but the
    # legacy code used pdftoppm for some reason, and it appears
    # that the newest builds from conda-forge do not include
    # pdftocairo. So stay with pdftoppm.
    pdftoppm = rclexecm.which("pdftoppm")
    return bool(pdftoppm)
 # Try to guess tesseract language. This should depend on the input
 # file, but we have no general way to determine it. So use the
 # environment and hope for the best.
 def _guesstesseractlang(config, path):
    """Return the tesseract language name to use for the file at path.

    Tried in order: the contents of the language file (named by
    ocrlangfile) in the directory of path, the "tesseractlang"
    configuration parameter evaluated for that directory, a guess from
    the LANG environment variable, and finally "eng".

    Side effect: calls config.setKeyDir() with the directory of path,
    unless the language file supplied the value.
    """
    dirname = os.path.dirname(path)
    # First look for a language def file in the file's directory
    pdflangfile = os.path.join(dirname, ocrlangfile)
    if os.path.isfile(pdflangfile):
        # Use a context manager so that the file is closed at once
        with open(pdflangfile, "r") as f:
            tesseractlang = f.read().strip()
        if tesseractlang:
            _deb("Tesseract lang from file: %s" % tesseractlang)
            return tesseractlang
    # Then look for a config file  option.
    config.setKeyDir(dirname)
    tesseractlang = config.getConfParam("tesseractlang")
    if tesseractlang:
        _deb("Tesseract lang from config: %s" % tesseractlang)
        return tesseractlang
    # Half-assed trial to guess from LANG then default to english.
    # os.environ.get() with a default always returns a string, so that
    # no exception handling is needed here.
    localelang = os.environ.get("LANG", "").split("_")[0]
    known = {"en": "eng", "de": "deu", "fr": "fra"}
    tesseractlang = known.get(localelang, "eng")
    _deb("Tesseract lang (guessed): %s" % tesseractlang)
    return tesseractlang
 # Process pdf file: use pdftoppm to split it into ppm pages, then run
 # tesseract on each and concatenate the result. It would probably be
 # possible instead to use pdftocairo to produce a tiff, but pdftocairo
 # is sometimes not available (windows).
 def _pdftesseract(config, path):
    """Return the OCR'd text for the pdf file at path, as bytes.

    Needs the globals tmpdir, pdftoppm and tesseract to be set. Returns
    b"" if there is no temporary directory or if pdftoppm fails. A
    tesseract failure on a page is logged and the page is skipped.
    """
    if not tmpdir:
        return b""
    tesseractlang = _guesstesseractlang(config, path)
    tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
    # Split pdf pages
    try:
        vacuumdir(tmpdir)
        subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
    except Exception as e:
        _deb("pdftoppm failed: %s" % e)
        return b""
    # Run tesseract on each page image. The output base name is the image
    # name, the text files are collected below with a ".txt" suffix.
    for f in sorted(glob.glob(tmpfile + "*")):
        out = b''
        try:
            out = subprocess.check_output(
                [tesseract, f, f, "-l", tesseractlang],
                stderr=subprocess.STDOUT)
        except Exception as e:
            _deb("tesseract failed: %s" % e)
        # More than a couple of lines of messages is taken as an error
        errlines = out.split(b'\n')
        if len(errlines) > 2:
            _deb("Tesseract error: %s" % out)
    # Concatenate the result files. glob() returns names in arbitrary
    # order, so sort them: presumably the page numbers which pdftoppm
    # appends to the names then give the page order (to be confirmed
    # against the pdftoppm naming scheme).
    pages = []
    for f in sorted(glob.glob(tmpfile + "*" + ".txt")):
        with open(f, "rb") as pagefile:
            pages.append(pagefile.read())
    return b"".join(pages)
 def _simpletesseract(config, path):
    """Run tesseract on the image file at path, return the text as bytes.

    Needs the global tesseract to be set. Returns b"" if the command
    could not be run or failed, like _pdftesseract() does.
    """
    tesseractlang = _guesstesseractlang(config, path)
    try:
        # 'stdout' as output base makes tesseract print the text
        out = subprocess.check_output(
            [tesseract, path, 'stdout', '-l', tesseractlang],
            stderr=subprocess.DEVNULL)
    except Exception as e:
        _deb("tesseract failed: %s" % e)
        # out was never assigned in this case: returning it would raise
        # UnboundLocalError instead of reporting an empty result.
        return b""
    return out
 # Run OCR on the input path and return the resulting data.
 def runocr(config, path):
    """OCR the file at path: directly for images, page by page otherwise.

    Any extension which is not in _okexts is processed as pdf.
    """
    _maybemaketmpdir()
    suffix = os.path.splitext(path)[1].lower()
    handler = _simpletesseract if suffix in _okexts else _pdftesseract
    return handler(config, path)
 if __name__ == '__main__':
    # Command line use: OCR the file given as first argument and write
    # the text to stdout. Exit with status 1 if OCR is not possible.
    import rclconfig
    config = rclconfig.RclConfig()
    path = sys.argv[1]
    if not ocrpossible(path):
        _deb("ocrpossible returned false")
        sys.exit(1)
    data = runocr(config, path)
    sys.stdout.buffer.write(data)
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -47,8 +47,14 @@ import glob
 import traceback
 _mswindows = (sys.platform == "win32")
 tmpdir = None
 # HTML page header and trailer (bytes) placed around the escaped OCR
 # text to build the output document. The text goes inside a <pre>
 # element and the page is declared as UTF-8.
 _htmlprefix =b'''<html><head>
 <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
 </head><body><pre>'''
 _htmlsuffix = b'''</pre></body></html>'''
 def finalcleanup():
    if tmpdir:
        vacuumdir(tmpdir)
@ -120,18 +126,6 @@ class PDFExtractor:
        except:
            pass
        # See if we'll try to perform OCR. Need the commands and the
        # either the presence of a file in the config dir (historical)
        # or a set config variable.
        self.ocrpossible = False
        self.tesseract = rclexecm.which("tesseract")
        if self.tesseract:
            self.pdftoppm = rclexecm.which("pdftoppm")
            if self.pdftoppm:
                self.ocrpossible = True
                self.maybemaketmpdir()
        # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
        # Pdftk is optionally used to extract attachments. This takes
        # a hit on performance even in the absence of any attachments,
        # so it can be disabled in the configuration.
@ -236,100 +230,6 @@ class PDFExtractor:
        return (True, docdata, ipath, eof)
    # Try to guess tesseract language. This should depend on the input
    # file, but we have no general way to determine it. So use the
    # environment and hope for the best.
    def guesstesseractlang(self):
        """Return the tesseract language name to use for self.filename.

        Tried in order: a ".ocrpdflang" file in the directory of the
        document, the "pdfocrlang" configuration parameter, the
        RECOLL_TESSERACT_LANG environment variable, an "ocrpdf" file in
        the configuration directory, a guess from the LANG environment
        variable, and finally "eng".
        """
        def _readlang(fn):
            # Stripped contents of fn, or "" if fn is not a file. The
            # context manager closes the file at once.
            if not os.path.isfile(fn):
                return ""
            with open(fn, "r") as f:
                return f.read().strip()
        # First look for a language def file in the file's directory
        tesseractlang = _readlang(
            os.path.join(os.path.dirname(self.filename), b".ocrpdflang"))
        if tesseractlang:
            return tesseractlang
        # Then look for a global option. The normal way now that we
        # have config reading capability in the handlers is to use the
        # config. Then, for backwards compat, environment variable and
        # file inside the configuration directory
        tesseractlang = self.config.getConfParam("pdfocrlang")
        if tesseractlang:
            return tesseractlang
        tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "")
        if tesseractlang:
            return tesseractlang
        tesseractlang = _readlang(os.path.join(self.confdir, "ocrpdf"))
        if tesseractlang:
            return tesseractlang
        # Half-assed trial to guess from LANG then default to english
        localelang = os.environ.get("LANG", "").split("_")[0]
        known = {"en": "eng", "de": "deu", "fr": "fra"}
        return known.get(localelang, "eng")
    # PDF has no text content and tesseract is available. Give OCR a try
    def ocrpdf(self):
        """OCR the current pdf document, return the text as an HTML page.

        The pages are split into images with pdftoppm, then tesseract is
        run on each image and the resulting texts are concatenated.
        Returns b"" if there is no temporary directory or if pdftoppm
        fails. A tesseract failure on a page is logged and the page is
        skipped.
        """
        global tmpdir
        if not tmpdir:
            return b""
        tesseractlang = self.guesstesseractlang()
        # self.em.rclog("tesseractlang %s" % tesseractlang)
        tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
        # Split pdf pages
        try:
            vacuumdir(tmpdir)
            subprocess.check_call([self.pdftoppm, "-r", "300", self.filename,
                                   tmpfile])
        except Exception as e:
            self.em.rclog("pdftoppm failed: %s" % e)
            return b""
        # Run tesseract on each page image. The output base name is the
        # image name, the text files are collected below with a ".txt"
        # suffix.
        for f in sorted(glob.glob(tmpfile + "*")):
            out = b''
            try:
                out = subprocess.check_output([self.tesseract, f, f, "-l",
                                               tesseractlang],
                                              stderr = subprocess.STDOUT)
            except Exception as e:
                self.em.rclog("tesseract failed: %s" % e)
            # More than a couple of lines of messages is taken as an error
            errlines = out.split(b'\n')
            if len(errlines) > 2:
                self.em.rclog("Tesseract error: %s" % out)
        # Concatenate the result files. glob() returns names in arbitrary
        # order, so sort them: presumably the page numbers which pdftoppm
        # appends to the names then give the page order (to be confirmed
        # against the pdftoppm naming scheme).
        pages = []
        for f in sorted(glob.glob(tmpfile + "*" + ".txt")):
            with open(f, "rb") as pagefile:
                pages.append(pagefile.read())
        data = b"".join(pages)
        return b'''<html><head>
        <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
        </head><body><pre>''' + \
        self.em.htmlescape(data) + \
        b'''</pre></body></html>'''
    # pdftotext (used to?) badly escape text inside the header
    # fields. We do it here. This is not an html parser, and depends a
    # lot on the actual format output by pdftotext.
@ -510,13 +410,11 @@ class PDFExtractor:
        html, isempty = self._fixhtml(html)
        #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
-        if isempty and self.ocrpossible:
+        if isempty:
-            self.config.setKeyDir(os.path.dirname(self.filename))
+            cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
-            s = self.config.getConfParam("pdfocr")
+                   self.filename]
-            cf_doocr = rclexecm.configparamtrue(s)
+            data = subprocess.check_output(cmd)
-            file_doocr = os.path.isfile(os.path.join(self.confdir, "ocrpdf"))
+            html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
            if cf_doocr or file_doocr:
                html = self.ocrpdf()
        if self.extrameta:
            try:
@ -592,6 +490,7 @@ class PDFExtractor:
 # Main program: create protocol handler and extractor and run them
 # _execdir is the directory part of the path used to run this script. It
 # is used to build the path of the rclocr.py script for the OCR step.
 _execdir = os.path.dirname(sys.argv[0])
 proto = rclexecm.RclExecM()
 extract = PDFExtractor(proto)
 rclexecm.main(proto, extract)