diff --git a/src/filters/rclocr.py b/src/filters/rclocr.py index 732160ec..dd30ecda 100755 --- a/src/filters/rclocr.py +++ b/src/filters/rclocr.py @@ -17,7 +17,12 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ######################################################## -# Running OCR programs for Recoll +# Running OCR programs for Recoll. This is executed from, +# e.g. rclpdf.py if pdftotext returns no data. +# +# The script tries to retrieve the data from the ocr cache, else it +# runs the configured OCR program and updates the cache. In both cases it writes +# the resulting text to stdout. import os import sys @@ -47,7 +52,8 @@ if incache: #### Data not in cache -# Retrieve known ocr program names and try to load the corresponding module +# Retrieve configured OCR program names and try to load the +# corresponding module ocrprogs = config.getConfParam("ocrprogs") if not ocrprogs: _deb("No ocrprogs variable in recoll configuration") @@ -59,7 +65,7 @@ for ocrprog in proglist: try: modulename = "rclocr" + ocrprog ocr = importlib.import_module(modulename) - if ocr.ocrpossible(path): + if ocr.ocrpossible(config, path): ok = True break except Exception as err: @@ -72,6 +78,8 @@ if not ok: #_deb("Using ocr module %s" % modulename) +# The OCR module will retrieve its specific parameters from the +# configuration status, data = ocr.runocr(config, path) if not status: diff --git a/src/filters/rclocrabbyy.py b/src/filters/rclocrabbyy.py new file mode 100755 index 00000000..e3450a09 --- /dev/null +++ b/src/filters/rclocrabbyy.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +################################# +# Copyright (C) 2020 J.F.Dockes +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+########################################################
+
+# Running abbyyocr for Recoll OCR (see rclocr.py)
+
+import os
+import sys
+import atexit
+import tempfile
+import subprocess
+import glob
+
+import rclexecm
+
+_mswindows = (sys.platform == "win32")
+if _mswindows:
+    ocrlangfile = "rclocrlang.txt"
+else:
+    ocrlangfile = ".rclocrlang"
+
+_okexts = ('.pdf', '.tif', '.tiff', '.jpg', '.png', '.jpeg')
+
+abbyyocrcmd = ""  # Path of the abbyy command, set by ocrpossible()
+abbyyocrdir = ""  # Directory of the command, used by runocr() for LD_LIBRARY_PATH
+
+def _deb(s):  # Print a debug message to stderr (nothing is printed on Windows)
+    if not _mswindows:
+        print("rclocrabbyy: %s" % s, file=sys.stderr)
+
+# Return True if abbyy appears to be available and the input file has
+# a supported extension. The command comes from the "abbyyocrcmd"
+# configuration parameter, else "abbyyocr11" is looked up with
+# rclexecm.which(). The command path and its directory are cached in
+# the module globals, so the lookup is only performed once.
+def ocrpossible(config, path):
+    global abbyyocrcmd, abbyyocrdir
+    if not abbyyocrcmd:
+        config.setKeyDir(os.path.dirname(path))
+        abbyyocrcmd = config.getConfParam("abbyyocrcmd")
+        if not abbyyocrcmd:
+            abbyyocrcmd = rclexecm.which("abbyyocr11")
+        if not abbyyocrcmd:
+            return False
+        abbyyocrdir = os.path.dirname(abbyyocrcmd)
+
+    # Check input format: only the (lowercased) file extension is examined
+    ext = os.path.splitext(path)[1].lower()
+    return ext in _okexts
+
+
+# Determine the abbyy OCR language. It should depend on the input file,
+# but there is no general way to find it: try a language file in the file's
+# directory, then the configuration, then guess from the environment.
+# Returns the abbyy language name (e.g. "English"), which runocr()
+# passes to the command as the value of the "-rl" option.
+def _guessocrlang(config, path):
+    dirname = os.path.dirname(path)
+
+    # First look for a language def file in the file's directory. Its
+    # content, stripped of surrounding whitespace, is the language
+    # name. An empty file is ignored.
+    langfile = os.path.join(dirname, ocrlangfile)
+    if os.path.isfile(langfile):
+        # Context manager: the file is closed as soon as it is read
+        with open(langfile, "r") as f:
+            ocrlang = f.read().strip()
+        if ocrlang:
+            _deb("OCR lang from file: %s" % ocrlang)
+            return ocrlang
+
+    # Then look for a config file option.
+    config.setKeyDir(dirname)
+    ocrlang = config.getConfParam("abbyylang")
+    if ocrlang:
+        _deb("OCR lang from config: %s" % ocrlang)
+        return ocrlang
+
+    # Half-assed trial to guess from LANG then default to english.
+    # LANG typically looks like "fr_FR.UTF-8": the part before the
+    # first "_" is used. Unset or unknown values yield "English".
+    localelang = os.environ.get("LANG", "").split("_")[0]
+    abbyynames = {
+        "en": "English",
+        "de": "German",
+        "fr": "French",
+    }
+    ocrlang = abbyynames.get(localelang, "English")
+    _deb("OCR lang (guessed): %s" % ocrlang)
+    return ocrlang
+
+
+# Run abbyy on the input path. Returns (True, data), data being the
+# command's standard output (bytes), or (False, "") if the command failed.
+def runocr(config, path):
+    ocrlang = _guessocrlang(config, path)
+    # The abbyy directory goes first in LD_LIBRARY_PATH, presumably so
+    # that the command finds its own shared libraries.
+    my_env = os.environ.copy()
+    eldpn = "LD_LIBRARY_PATH"
+    oldpath = ":" + my_env[eldpn] if eldpn in my_env else ""
+    my_env[eldpn] = abbyyocrdir + oldpath
+
+    try:
+        out = subprocess.check_output(
+            [abbyyocrcmd, "-lpp", "BookArchiving_Accuracy",
+             "-rl", ocrlang,
+             "-tet", "UTF8",
+             "-f", "TextUnicodeDefaults",
+             "-if", path,
+             "-c"],
+            env = my_env,
+            stderr=subprocess.DEVNULL)
+    except Exception as e:
+        _deb("%s failed: %s" % (abbyyocrcmd,e))
+        return False, ""
+    return True, out
+
+
+# Command line test driver: the file to process is the first argument
+if __name__ == '__main__':
+    import rclconfig
+    config = rclconfig.RclConfig()
+    path = sys.argv[1]
+    if not ocrpossible(config, path):
+        _deb("ocrpossible returned false")
+        sys.exit(1)
+    ok, data = runocr(config, path)
+    if ok:
+        sys.stdout.buffer.write(data)
+    else:
+        _deb("OCR program failed")
diff --git a/src/filters/rclocrtesseract.py b/src/filters/rclocrtesseract.py index 57689675..cc2734f6 100755 --- 
a/src/filters/rclocrtesseract.py +++ b/src/filters/rclocrtesseract.py @@ -17,6 +17,8 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ######################################################## +# Running tesseract for Recoll OCR (see rclocr.py) + import os import sys import atexit @@ -28,7 +30,7 @@ import rclexecm _mswindows = (sys.platform == "win32") if _mswindows: - ocrlangfile = ".rclocrlang" + ocrlangfile = "rclocrlang.txt" else: ocrlangfile = ".rclocrlang" @@ -65,7 +67,7 @@ atexit.register(finalcleanup) # Return true if tesseract and the appropriate conversion program for # the file type (e.g. pdftoppt for pdf) appear to be available -def ocrpossible(path): +def ocrpossible(config, path): # Check for tesseract global tesseract tesseract = rclexecm.which("tesseract") @@ -217,11 +219,12 @@ if __name__ == '__main__': import rclconfig config = rclconfig.RclConfig() path = sys.argv[1] - if ocrpossible(path): - data = runocr(config, sys.argv[1]) + if ocrpossible(config, path): + ok, data = runocr(config, sys.argv[1]) else: _deb("ocrpossible returned false") sys.exit(1) - sys.stdout.buffer.write(data) - - + if ok: + sys.stdout.buffer.write(data) + else: + _deb("OCR program failed")