added ocr module for abbyy

This commit is contained in:
Jean-Francois Dockes 2020-02-27 11:35:23 +01:00
parent 7bc70a30ae
commit abb7ef8803
3 changed files with 168 additions and 10 deletions

View File

@ -17,7 +17,12 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
######################################################## ########################################################
# Running OCR programs for Recoll # Running OCR programs for Recoll. This is executed from,
# e.g. rclpdf.py if pdftotext returns no data.
#
# The script tries to retrieve the data from the ocr cache, else it
# runs the configured OCR program and updates the cache. In both cases it writes
# the resulting text to stdout.
import os import os
import sys import sys
@ -47,7 +52,8 @@ if incache:
#### Data not in cache #### Data not in cache
# Retrieve known ocr program names and try to load the corresponding module # Retrieve configured OCR program names and try to load the
# corresponding module
ocrprogs = config.getConfParam("ocrprogs") ocrprogs = config.getConfParam("ocrprogs")
if not ocrprogs: if not ocrprogs:
_deb("No ocrprogs variable in recoll configuration") _deb("No ocrprogs variable in recoll configuration")
@ -59,7 +65,7 @@ for ocrprog in proglist:
try: try:
modulename = "rclocr" + ocrprog modulename = "rclocr" + ocrprog
ocr = importlib.import_module(modulename) ocr = importlib.import_module(modulename)
if ocr.ocrpossible(path): if ocr.ocrpossible(config, path):
ok = True ok = True
break break
except Exception as err: except Exception as err:
@ -72,6 +78,8 @@ if not ok:
#_deb("Using ocr module %s" % modulename) #_deb("Using ocr module %s" % modulename)
# The OCR module will retrieve its specific parameters from the
# configuration
status, data = ocr.runocr(config, path) status, data = ocr.runocr(config, path)
if not status: if not status:

147
src/filters/rclocrabbyy.py Executable file
View File

@ -0,0 +1,147 @@
#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################
# Running abbyyocr for Recoll OCR (see rclocr.py)
import os
import sys
import atexit
import tempfile
import subprocess
import glob
import rclexecm
# True when running on MS Windows: selects the language file name below and
# silences _deb().
_mswindows = (sys.platform == "win32")

# Name of the optional file, located in the same directory as the input
# file, which holds the OCR language name.
if _mswindows:
    ocrlangfile = "rclocrlang.txt"
else:
    ocrlangfile = ".rclocrlang"

# Input file extensions for which ocrpossible() answers True.
_okexts = ('.pdf', '.tif', '.tiff', '.jpg', '.png', '.jpeg')

# Path of the abbyy command and the directory which holds it. Both are
# filled in by ocrpossible(); runocr() puts the directory at the front of
# LD_LIBRARY_PATH. The directory variable must be spelled 'abbyyocrdir',
# as this is the name which ocrpossible() and runocr() use.
abbyyocrcmd = ""
abbyyocrdir = ""
# Write the debug message 's' to stderr, prefixed with the module name.
# Nothing is printed when running on MS Windows.
def _deb(s):
    if _mswindows:
        return
    print("rclocrabbyy: %s" % s, file=sys.stderr)
# Return True if abbyy appears to be available and the input file has a
# supported extension, else False.
#
# The command is determined only while the 'abbyyocrcmd' global is still
# empty: config.setKeyDir() is called with the directory of 'path', then
# the "abbyyocrcmd" configuration parameter is used if set, else the
# result of rclexecm.which("abbyyocr11"). The command path and its
# directory are stored in the 'abbyyocrcmd' and 'abbyyocrdir' globals.
def ocrpossible(config, path):
    global abbyyocrcmd, abbyyocrdir

    if not abbyyocrcmd:
        config.setKeyDir(os.path.dirname(path))
        abbyyocrcmd = (config.getConfParam("abbyyocrcmd")
                       or rclexecm.which("abbyyocr11"))
        if not abbyyocrcmd:
            return False
        abbyyocrdir = os.path.dirname(abbyyocrcmd)

    # Check input format: the extension comparison ignores case
    extension = os.path.splitext(path)[1].lower()
    return extension in _okexts
# Try to guess the abbyy recognition language. This should depend on the
# input file, but we have no general way to determine it. So use the
# environment and hope for the best.
#
# The sources are tried in this order:
#  - the contents of the 'ocrlangfile' file in the input file's directory,
#  - the "abbyylang" configuration parameter,
#  - the language part of the LANG environment variable (only en, de and
#    fr are known),
#  - "English".
# Returns the language name as a string.
def _guessocrlang(config, path):
    dirname = os.path.dirname(path)

    # First look for a language def file in the file's directory
    langfile = os.path.join(dirname, ocrlangfile)
    if os.path.isfile(langfile):
        ocrlang = ""
        try:
            # The 'with' statement closes the file as soon as it is read
            with open(langfile, "r") as f:
                ocrlang = f.read().strip()
        except (OSError, UnicodeError) as err:
            # An unreadable language file should not prevent the OCR from
            # running: go on with the other sources
            _deb("Could not read %s: %s" % (langfile, err))
        if ocrlang:
            _deb("OCR lang from file: %s" % ocrlang)
            return ocrlang

    # Then look for a config file option.
    config.setKeyDir(dirname)
    ocrlang = config.getConfParam("abbyylang")
    if ocrlang:
        _deb("OCR lang from config: %s" % ocrlang)
        return ocrlang

    # Half-assed trial to guess from LANG then default to english.
    # E.g. "fr_FR.UTF-8" yields "fr". Nothing here can raise, so no
    # exception handling is needed.
    localelang = os.environ.get("LANG", "").split("_")[0]
    knownlangs = {"en": "English", "de": "German", "fr": "French"}
    ocrlang = knownlangs.get(localelang, "English")
    _deb("OCR lang (guessed): %s" % ocrlang)
    return ocrlang
# Run the abbyy command on the file at 'path' and return its output.
#
# The command and its directory are taken from the 'abbyyocrcmd' and
# 'abbyyocrdir' globals, which ocrpossible() sets. The directory is put at
# the front of LD_LIBRARY_PATH in the environment of the command.
#
# Returns (True, data), where data is the standard output of the command,
# or (False, "") if running the command raised an exception.
def runocr(config, path):
    lang = _guessocrlang(config, path)

    # Environment for the command: a copy of ours, with the abbyy directory
    # first in the library search path
    libvar = "LD_LIBRARY_PATH"
    child_env = os.environ.copy()
    inherited = child_env.get(libvar)
    if inherited is None:
        child_env[libvar] = abbyyocrdir
    else:
        child_env[libvar] = abbyyocrdir + ":" + inherited

    cmd = [
        abbyyocrcmd,
        "-lpp", "BookArchiving_Accuracy",
        "-rl", lang,
        "-tet", "UTF8",
        "-f", "TextUnicodeDefaults",
        "-if", path,
        "-c",
    ]
    try:
        # The error output of the command is discarded
        output = subprocess.check_output(cmd, env=child_env,
                                         stderr=subprocess.DEVNULL)
    except Exception as e:
        _deb("%s failed: %s" % (abbyyocrcmd,e))
        return False, ""
    return True, output
# Command line use: run the OCR on the file named by the first argument and
# write the resulting data to stdout. The exit status is 1 if the file name
# is missing, if ocrpossible() returns false or if the OCR program fails.
if __name__ == '__main__':
    import rclconfig

    if len(sys.argv) < 2:
        # Avoid an IndexError traceback when called without a file name
        print("Usage: %s <path>" % sys.argv[0], file=sys.stderr)
        sys.exit(1)
    path = sys.argv[1]

    config = rclconfig.RclConfig()
    if not ocrpossible(config, path):
        _deb("ocrpossible returned false")
        sys.exit(1)

    ok, data = runocr(config, path)
    if not ok:
        _deb("OCR program failed")
        # Report the failure through the exit status, like the
        # ocrpossible() failure above does
        sys.exit(1)
    # The data is bytes: write it to the binary layer of stdout
    sys.stdout.buffer.write(data)

View File

@ -17,6 +17,8 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
######################################################## ########################################################
# Running tesseract for Recoll OCR (see rclocr.py)
import os import os
import sys import sys
import atexit import atexit
@ -28,7 +30,7 @@ import rclexecm
_mswindows = (sys.platform == "win32") _mswindows = (sys.platform == "win32")
if _mswindows: if _mswindows:
ocrlangfile = ".rclocrlang" ocrlangfile = "rclocrlang.txt"
else: else:
ocrlangfile = ".rclocrlang" ocrlangfile = ".rclocrlang"
@ -65,7 +67,7 @@ atexit.register(finalcleanup)
# Return true if tesseract and the appropriate conversion program for # Return true if tesseract and the appropriate conversion program for
# the file type (e.g. pdftoppm for pdf) appear to be available # the file type (e.g. pdftoppm for pdf) appear to be available
def ocrpossible(path): def ocrpossible(config, path):
# Check for tesseract # Check for tesseract
global tesseract global tesseract
tesseract = rclexecm.which("tesseract") tesseract = rclexecm.which("tesseract")
@ -217,11 +219,12 @@ if __name__ == '__main__':
import rclconfig import rclconfig
config = rclconfig.RclConfig() config = rclconfig.RclConfig()
path = sys.argv[1] path = sys.argv[1]
if ocrpossible(path): if ocrpossible(config, path):
data = runocr(config, sys.argv[1]) ok, data = runocr(config, sys.argv[1])
else: else:
_deb("ocrpossible returned false") _deb("ocrpossible returned false")
sys.exit(1) sys.exit(1)
sys.stdout.buffer.write(data) if ok:
sys.stdout.buffer.write(data)
else:
_deb("OCR program failed")