# ---- File: src/filters/rclocr.py (new file, mode 100755) ----
#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
########################################################

# Running OCR programs for Recoll
#
# Usage: one argument, the path of the file to OCR. The OCR'd data is
# written to stdout, diagnostics go to stderr. Exit status is 0 when
# data was produced (from the cache or from an OCR module), 1 otherwise.

import os
import sys
import rclconfig
import rclocrcache
import importlib
import importlib.util


def deb(s):
    """Print a diagnostic message to stderr (stdout carries the OCR data)."""
    print("%s" % s, file=sys.stderr)


def Usage():
    """Print the command line synopsis to stderr and exit with status 1."""
    deb("Usage: rclocr.py ")
    sys.exit(1)


if len(sys.argv) != 2:
    Usage()

path = sys.argv[1]

config = rclconfig.RclConfig()
cache = rclocrcache.OCRCache(config)

# Fast path: data already in the cache is output directly.
incache, data = cache.get(path)
if incache:
    sys.stdout.buffer.write(data)
    sys.exit(0)

#### Data not in cache

# Retrieve known ocr program names and try to load the corresponding module
ocrprogs = config.getConfParam("ocrprogs")
if not ocrprogs:
    deb("No ocrprogs variable")
    sys.exit(1)
deb("ocrprogs: %s" % ocrprogs)
# split() without argument ignores repeated, leading and trailing
# whitespace. split(" ") would yield empty names for these, and we
# would then try to import a bogus "rclocr" module.
proglist = ocrprogs.split()
ok = False
for ocrprog in proglist:
    # Each OCR program "xxx" is handled by a module named rclocrxxx,
    # which must supply ocrpossible(path) and runocr(config, path).
    modulename = "rclocr" + ocrprog
    try:
        ocr = importlib.import_module(modulename)
        if ocr.ocrpossible(path):
            ok = True
            break
    except Exception as err:
        # Best effort: a module which can't be loaded or probed is
        # skipped, we try the next one in the list.
        deb("While loading %s: got: %s" % (modulename, err))

if not ok:
    deb("No OCR module could be loaded")
    sys.exit(1)

deb("Using ocr module %s" % modulename)

data = ocr.runocr(config, path)

# Store the result so that the next run for this file hits the cache.
cache.store(path, data)
sys.stdout.buffer.write(data)
sys.exit(0)


# ---- File: src/filters/rclocrcache.py (new file, mode 100755) ----
#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
########################################################

# Caching OCR'd data

# OCR is extremely slow. The cache stores 2 kinds of objects:
# - Path files are named from the hash of the image path and contain
#   the image data hash and the modification time and size of the
#   image at the time the OCR'd data was stored in the cache
# - Data files are named with the hash of the image data and contain
#   the OCR'd data
# When retrieving data from the cache:
# - We first use the image file size and modification time: if an
#   entry exists for the imagepath/mtime/size triplet, and is up to
#   date, the corresponding data is obtained from the data file and
#   returned.
# - Else we then use the image data: if an entry exists for the
#   computed hashed value of the data, it is returned. This allows
#   moving files around without needing to run OCR again, but of
#   course, it is more expensive than the first step
#
# If we need to use the second step, as a side effect, a path file is
# created or updated so that the data will be found with the first
# step next time around.

import sys
import os
import hashlib


def deb(s):
    """Print a diagnostic message to stderr."""
    print("%s" % s, file=sys.stderr)


class OCRCache(object):
    """On-disk cache for OCR'd data, indexed by image path and image data.

    All objects live under <cachedir>/objects/<2 hex chars>/<38 hex chars>,
    the two name parts being the split SHA1 hex digest of either:
      - the image path: the file is a "path file", holding one line
        "datadir datafile mtime size" (see _updatepathfile()),
      - the image contents: the file is a "data file" holding the OCR'd
        data bytes.
    """

    def __init__(self, conf):
        """Set up the cache directories.

        conf must supply getConfParam() and getConfDir(). The cache
        location is the "ocrcachedir" configuration value if set, else
        the "ocrcache" subdirectory of the configuration directory.
        """
        self.config = conf
        self.cachedir = conf.getConfParam("ocrcachedir")
        if not self.cachedir:
            self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
        self.objdir = os.path.join(self.cachedir, "objects")
        # exist_ok: no failure if another process creates the directory
        # between a test and the creation.
        os.makedirs(self.objdir, exist_ok=True)

    @staticmethod
    def _splitdigest(m):
        """Return the hex digest of hash object m as (2 chars, 38 chars)."""
        h = m.hexdigest()
        return h[0:2], h[2:]

    def _hashpath(self, data):
        """Compute sha1 of path, as two parts of 2 and 38 chars.

        data is the image path, as str or bytes. str values are encoded
        as UTF-8. surrogateescape gives the same bytes as a strict
        encoding for any valid string, and avoids an exception for file
        names with undecodable bytes (which Python represents with
        lone surrogates).
        """
        if isinstance(data, str):
            data = data.encode('utf-8', 'surrogateescape')
        m = hashlib.sha1()
        m.update(data)
        return self._splitdigest(m)

    def _hashdata(self, path):
        """Compute sha1 of the contents of the file at path, as two parts
        of 2 and 38 chars. The file is read by 8192 bytes chunks.

        Raises OSError if the file can't be read.
        """
        #deb("Hashing DATA")
        m = hashlib.sha1()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(8192), b""):
                m.update(chunk)
        return self._splitdigest(m)

    def _cachedpathattrs(self, path):
        """Try to read the stored attributes for a given path.

        Returns (True, datadir, datafile, mtime, size) if a valid path
        file exists, else (False, None, None, None, None). If this
        fails, the path itself is not cached (but the data still might
        be, maybe the file was moved). A truncated or otherwise
        malformed path file is treated as a miss: it will be rewritten
        when the data is found or stored.
        """
        miss = (False, None, None, None, None)
        pd, pf = self._hashpath(path)
        o = os.path.join(self.objdir, pd, pf)
        try:
            with open(o, "r") as f:
                fields = f.read().split()
        except FileNotFoundError:
            return miss
        try:
            dd, df, tm, sz = fields
            return True, dd, df, int(tm), int(sz)
        except ValueError:
            # Wrong number of fields or non-numeric time/size
            return miss

    def _newpathattrs(self, path):
        """Compute the path hash, and get the mtime and size for given
        path, for updating the cache path file.

        Returns (pathdir, pathfile, mtime, size). mtime is truncated to
        an integer number of seconds. Raises OSError if path can't be
        stat'ed.
        """
        pd, pf = self._hashpath(path)
        st = os.stat(path)
        return pd, pf, int(st.st_mtime), int(st.st_size)

    def _pathincache(self, path):
        """Check if the cache appears up to date for a given path, only
        using the modification time and size.

        Returns (True, datadir, datafile) (the data file path elements)
        if we get a hit, else (False, None, None).
        """
        ret, od, of, otm, osz = self._cachedpathattrs(path)
        if not ret:
            return False, None, None
        pd, pf, ntm, nsz = self._newpathattrs(path)
        #deb(" tm %d  sz %d" % (ntm, nsz))
        #deb("otm %d osz %d" % (otm, osz))
        if otm != ntm or osz != nsz:
            return False, None, None
        return True, od, of

    def pathincache(self, path):
        """Check if cache appears up to date for path (no data check),
        return True/False."""
        ret, dd, df = self._pathincache(path)
        return ret

    def _datafilename(self, path):
        """Compute the data file name for path. Expensive: we compute
        the data hash.

        Returns both the data file path and path elements (for storage
        in path file): (datafilepath, datadir, datafile).
        """
        d, f = self._hashdata(path)
        return os.path.join(self.objdir, d, f), d, f

    def dataincache(self, path):
        """Check if the data for path is in cache: expensive, needs to
        compute the hash for the path's data contents. Returns
        True/False."""
        return os.path.exists(self._datafilename(path)[0])

    def _updatepathfile(self, pd, pf, dd, df, tm, sz):
        """Create or overwrite the path file pd/pf with the given data
        file name elements (dd, df), modification time and size."""
        pdir = os.path.join(self.objdir, pd)
        os.makedirs(pdir, exist_ok=True)
        pfile = os.path.join(pdir, pf)
        with open(pfile, "w") as f:
            f.write("%s %s %d %d\n" % (dd, df, tm, sz))

    def store(self, path, datatostore, force=False):
        """Store data for path: create or update the path file, and
        write the data file.

        Only rewrite an existing data file if told to do so (force):
        this is only useful if we are forcing an OCR re-run.
        datatostore must be bytes. Always returns True. Raises OSError
        if the image can't be read or the cache can't be written.
        """
        dd, df = self._hashdata(path)
        pd, pf, tm, sz = self._newpathattrs(path)
        self._updatepathfile(pd, pf, dd, df, tm, sz)
        ddir = os.path.join(self.objdir, dd)
        os.makedirs(ddir, exist_ok=True)
        dfile = os.path.join(ddir, df)
        if force or not os.path.exists(dfile):
            #deb("Storing data")
            with open(dfile, "wb") as f:
                f.write(datatostore)
        return True

    def get(self, path):
        """Retrieve cached OCR'd data for image path.

        Returns (True, data) on a hit, (False, b"") on a miss. Possibly
        update the path file as a side effect (case where the image has
        moved, but the data has not changed). Raises OSError if the
        image itself can't be accessed.
        """
        pincache, dd, df = self._pathincache(path)
        if pincache:
            dfn = os.path.join(self.objdir, dd, df)
        else:
            # No up to date path entry: look up by data hash (expensive)
            dfn, dd, df = self._datafilename(path)

        try:
            with open(dfn, "rb") as f:
                data = f.read()
        except FileNotFoundError:
            return False, b""

        if not pincache:
            # File has moved. create/Update path file for next time
            pd, pf, tm, sz = self._newpathattrs(path)
            self._updatepathfile(pd, pf, dd, df, tm, sz)

        return True, data


if __name__ == '__main__':
    import rclconfig

    conf = rclconfig.RclConfig()
    cache = OCRCache(conf)
    path = sys.argv[1]
    deb("Using %s" % path)

    deb("== CACHE tests")
    ret = cache.pathincache(path)
    s = "" if ret else " not"
    deb("path for %s%s in cache" % (path, s))

    #ret = cache.dataincache(path)
    #s = "" if ret else " not"
    #deb("data for %s%s in cache" % (path, s))

    if False:
        deb("== STORE tests")
        cache.store(path, b"my OCR'd text is one line\n", force=False)

    deb("== GET tests")
    incache, data = cache.get(path)
    if incache:
        deb("Data from cache [%s]" % data)
    else:
        deb("Data was not found in cache")


# ---- File: src/filters/rclocrtesseract.py (new file, mode 100755) ----
#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
########################################################

# Tesseract OCR module for rclocr.py: supplies ocrpossible() and runocr()

import os
import sys
import atexit
import tempfile
import subprocess
import glob

import rclexecm

_mswindows = (sys.platform == "win32")

# Name of the optional file, located in the same directory as the
# input file, which holds the tesseract language to use. The name is
# the same on all platforms.
ocrlangfile = ".rclocrlang"

# File name extensions for the images which we pass directly to tesseract
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')

# Paths of the external commands. Set by ocrpossible(). Defined here
# so that using them before ocrpossible() is called does not raise
# NameError.
tesseract = None
pdftoppm = None


def _deb(s):
    """Debug message hook. Printing is currently disabled."""
    if not _mswindows:
        #print("%s" % s, file=sys.stderr)
        pass


def vacuumdir(dir):
    """Delete the regular files found directly inside dir.

    Subdirectories are left alone. Does nothing if dir is empty or
    None. Always returns True; failures are reported as OSError
    exceptions from os.listdir() or os.unlink().
    """
    if dir:
        for fn in os.listdir(dir):
            path = os.path.join(dir, fn)
            if os.path.isfile(path):
                os.unlink(path)
    return True


# Temporary directory used for the PDF page images and tesseract
# outputs. Created on first use by _maybemaketmpdir()
tmpdir = None


def _maybemaketmpdir():
    """Create the module temporary directory, or empty it if it
    already exists.

    Returns True on success, False if the existing directory could not
    be emptied.
    """
    global tmpdir
    if tmpdir:
        if not vacuumdir(tmpdir):
            _deb("openfile: vacuumdir %s failed" % tmpdir)
            return False
    else:
        tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
    return True


def finalcleanup():
    """Exit handler: empty and remove the temporary directory, if any."""
    if tmpdir:
        try:
            vacuumdir(tmpdir)
            os.rmdir(tmpdir)
        except OSError as e:
            # Nothing useful can be done at exit time. Avoid an
            # exception traceback from the atexit handler.
            _deb("finalcleanup: %s" % e)


atexit.register(finalcleanup)


# Return true if tesseract and the appropriate conversion program for
# the file type (e.g. pdftoppm for pdf) appear to be available
def ocrpossible(path):
    """Tell if we can run OCR on path.

    Records the tesseract and pdftoppm command paths in the module
    globals as a side effect. The file type is determined from the
    lowercased file name extension only. Returns True/False.
    """
    # Check for tesseract
    global tesseract
    tesseract = rclexecm.which("tesseract")
    if not tesseract:
        return False

    # Check input format
    base, ext = os.path.splitext(path)
    ext = ext.lower()
    if ext in _okexts:
        return True

    if ext == '.pdf':
        # Check for pdftoppm. We could use pdftocairo, which can
        # produce a multi-page pdf and make the rest simpler, but the
        # legacy code used pdftoppm for some reason, and it appears
        # that the newest builds from conda-forge do not include
        # pdftocairo. So stay with pdftoppm.
        global pdftoppm
        pdftoppm = rclexecm.which("pdftoppm")
        if pdftoppm:
            return True

    return False


# Try to guess tesseract language. This should depend on the input
# file, but we have no general way to determine it. So use the
# environment and hope for the best.
def _guesstesseractlang(config, path):
    """Determine the language to pass to tesseract (-l) for path.

    Sources, by order of priority:
      - The contents of the ocrlangfile file in the directory of path.
      - The "tesseractlang" configuration variable, evaluated for the
        directory of path (config.setKeyDir() is called as a side
        effect).
      - The language part of the LANG environment variable, for en,
        de, and fr only.
      - "eng".
    Returns the language name as a non-empty str.
    """
    dirname = os.path.dirname(path)

    # First look for a language def file in the file's directory
    pdflangfile = os.path.join(dirname, ocrlangfile)
    if os.path.isfile(pdflangfile):
        tesseractlang = ""
        try:
            with open(pdflangfile, "r") as f:
                tesseractlang = f.read().strip()
        except (OSError, UnicodeError) as e:
            # This is only a guess: an unreadable file should not
            # prevent the OCR, go on with the other sources.
            _deb("Could not read %s: %s" % (pdflangfile, e))
        if tesseractlang:
            _deb("Tesseract lang from file: %s" % tesseractlang)
            return tesseractlang

    # Then look for a config file option.
    config.setKeyDir(dirname)
    tesseractlang = config.getConfParam("tesseractlang")
    if tesseractlang:
        _deb("Tesseract lang from config: %s" % tesseractlang)
        return tesseractlang

    # Half-assed trial to guess from LANG then default to english.
    # LANG looks like ll_CC.codeset or ll.codeset: keep the ll part.
    localelang = os.environ.get("LANG", "").split("_")[0].split(".")[0]
    tesseractlang = {"en": "eng", "de": "deu", "fr": "fra"}.get(
        localelang, "eng")
    _deb("Tesseract lang (guessed): %s" % tesseractlang)
    return tesseractlang


# Process pdf file: use pdftoppm to split it into ppm pages, then run
# tesseract on each and concatenate the result. It would probably be
# possible instead to use pdftocairo to produce a tiff, but pdftocairo
# is sometimes not available (windows).
def _pdftesseract(config, path):
    """OCR the PDF file at path and return the text as bytes.

    pdftoppm is run to create one image file per page inside the
    module temporary directory, then tesseract is run on each image.
    The text outputs are concatenated. Returns b"" if the temporary
    directory was not created or if pdftoppm fails. A tesseract
    failure on a page is only logged.
    """
    if not tmpdir:
        return b""

    tesseractlang = _guesstesseractlang(config, path)

    #tesserrorfile = os.path.join(tmpdir, "tesserrorfile")
    tmpfile = os.path.join(tmpdir, "ocrXXXXXX")

    # Split pdf pages
    try:
        vacuumdir(tmpdir)
        subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
    except Exception as e:
        _deb("pdftoppm failed: %s" % e)
        return b""

    for f in sorted(glob.glob(tmpfile + "*")):
        out = b''
        try:
            # The image path is also used as output base name: the
            # text goes to a .txt file besides the image.
            out = subprocess.check_output(
                [tesseract, f, f, "-l", tesseractlang],
                stderr=subprocess.STDOUT)
        except Exception as e:
            _deb("tesseract failed: %s" % e)

        errlines = out.split(b'\n')
        if len(errlines) > 2:
            _deb("Tesseract error: %s" % out)

    # Concatenate the result files. glob.glob() returns the names in
    # arbitrary order: sort them so that the text comes out in page
    # order (NOTE(review): this relies on pdftoppm zero-padding the
    # page numbers in the file names - confirm for the versions in use).
    pages = []
    for f in sorted(glob.glob(tmpfile + "*" + ".txt")):
        with open(f, "rb") as txt:
            pages.append(txt.read())
    return b"".join(pages)


def _simpletesseract(config, path):
    """OCR the image file at path and return the text as bytes.

    tesseract writes the text to its standard output, its error
    messages are discarded. Returns b"" if tesseract could not be run
    or exited with an error status.
    """
    tesseractlang = _guesstesseractlang(config, path)

    try:
        return subprocess.check_output(
            [tesseract, path, 'stdout', '-l', tesseractlang],
            stderr=subprocess.DEVNULL)
    except Exception as e:
        _deb("tesseract failed: %s" % e)
        return b""


# run ocr on the input path and output the result data.
def runocr(config, path):
    """Run OCR on path and return the text as bytes.

    Files with one of the _okexts image extensions are passed directly
    to tesseract. Anything else is processed as a PDF.
    """
    _maybemaketmpdir()
    _, ext = os.path.splitext(path)
    if ext.lower() in _okexts:
        return _simpletesseract(config, path)
    return _pdftesseract(config, path)


if __name__ == '__main__':
    import rclconfig
    config = rclconfig.RclConfig()
    path = sys.argv[1]
    if ocrpossible(path):
        data = runocr(config, path)
    else:
        _deb("ocrpossible returned false")
        sys.exit(1)
    sys.stdout.buffer.write(data)


# ---- File: src/filters/rclpdf.py (modified file): diff hunks follow ----
'''
+_htmlsuffix = b'''
''' + def finalcleanup(): if tmpdir: vacuumdir(tmpdir) @@ -120,18 +126,6 @@ class PDFExtractor: except: pass - # See if we'll try to perform OCR. Need the commands and the - # either the presence of a file in the config dir (historical) - # or a set config variable. - self.ocrpossible = False - self.tesseract = rclexecm.which("tesseract") - if self.tesseract: - self.pdftoppm = rclexecm.which("pdftoppm") - if self.pdftoppm: - self.ocrpossible = True - self.maybemaketmpdir() - # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible) - # Pdftk is optionally used to extract attachments. This takes # a hit on performance even in the absence of any attachments, # so it can be disabled in the configuration. @@ -236,100 +230,6 @@ class PDFExtractor: return (True, docdata, ipath, eof) - # Try to guess tesseract language. This should depend on the input - # file, but we have no general way to determine it. So use the - # environment and hope for the best. - def guesstesseractlang(self): - tesseractlang = "" - - # First look for a language def file in the file's directory - pdflangfile = os.path.join(os.path.dirname(self.filename), - b".ocrpdflang") - if os.path.isfile(pdflangfile): - tesseractlang = open(pdflangfile, "r").read().strip() - if tesseractlang: - return tesseractlang - - # Then look for a global option. The normal way now that we - # have config reading capability in the handlers is to use the - # config. 
Then, for backwards compat, environment variable and - # file inside the configuration directory - tesseractlang = self.config.getConfParam("pdfocrlang") - if tesseractlang: - return tesseractlang - tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", ""); - if tesseractlang: - return tesseractlang - pdflangfile = os.path.join(self.confdir, "ocrpdf") - if os.path.isfile(pdflangfile): - tesseractlang = open(pdflangfile, "r").read().strip() - if tesseractlang: - return tesseractlang - - # Half-assed trial to guess from LANG then default to english - localelang = os.environ.get("LANG", "").split("_")[0] - if localelang == "en": - tesseractlang = "eng" - elif localelang == "de": - tesseractlang = "deu" - elif localelang == "fr": - tesseractlang = "fra" - if tesseractlang: - return tesseractlang - - if not tesseractlang: - tesseractlang = "eng" - return tesseractlang - - # PDF has no text content and tesseract is available. Give OCR a try - def ocrpdf(self): - - global tmpdir - if not tmpdir: - return b"" - - tesseractlang = self.guesstesseractlang() - # self.em.rclog("tesseractlang %s" % tesseractlang) - - tesserrorfile = os.path.join(tmpdir, "tesserrorfile") - tmpfile = os.path.join(tmpdir, "ocrXXXXXX") - - # Split pdf pages - try: - vacuumdir(tmpdir) - subprocess.check_call([self.pdftoppm, "-r", "300", self.filename, - tmpfile]) - except Exception as e: - self.em.rclog("pdftoppm failed: %s" % e) - return b"" - - files = glob.glob(tmpfile + "*") - for f in files: - out = b'' - try: - out = subprocess.check_output([self.tesseract, f, f, "-l", - tesseractlang], - stderr = subprocess.STDOUT) - except Exception as e: - self.em.rclog("tesseract failed: %s" % e) - - errlines = out.split(b'\n') - if len(errlines) > 2: - self.em.rclog("Tesseract error: %s" % out) - - # Concatenate the result files - files = glob.glob(tmpfile + "*" + ".txt") - data = b"" - for f in files: - data += open(f, "rb").read() - - return b''' - -
''' + \
-        self.em.htmlescape(data) + \
-        b'''
''' - - # pdftotext (used to?) badly escape text inside the header # fields. We do it here. This is not an html parser, and depends a # lot on the actual format output by pdftotext. @@ -510,13 +410,11 @@ class PDFExtractor: html, isempty = self._fixhtml(html) #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html)) - if isempty and self.ocrpossible: - self.config.setKeyDir(os.path.dirname(self.filename)) - s = self.config.getConfParam("pdfocr") - cf_doocr = rclexecm.configparamtrue(s) - file_doocr = os.path.isfile(os.path.join(self.confdir, "ocrpdf")) - if cf_doocr or file_doocr: - html = self.ocrpdf() + if isempty: + cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"), + self.filename] + data = subprocess.check_output(cmd) + html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix if self.extrameta: try: @@ -592,6 +490,7 @@ class PDFExtractor: # Main program: create protocol handler and extractor and run them +_execdir = os.path.dirname(sys.argv[0]) proto = rclexecm.RclExecM() extract = PDFExtractor(proto) rclexecm.main(proto, extract)