added ocr module for abbyy
This commit is contained in:
parent
7bc70a30ae
commit
abb7ef8803
@ -17,7 +17,12 @@
|
|||||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
########################################################
|
########################################################
|
||||||
|
|
||||||
# Running OCR programs for Recoll
|
# Running OCR programs for Recoll. This is executed from,
|
||||||
|
# e.g. rclpdf.py if pdftotext returns no data.
|
||||||
|
#
|
||||||
|
# The script tries to retrieve the data from the ocr cache, else it
|
||||||
|
# runs the configured OCR program and updates the cache. In both cases it writes
|
||||||
|
# the resulting text to stdout.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@ -47,7 +52,8 @@ if incache:
|
|||||||
|
|
||||||
#### Data not in cache
|
#### Data not in cache
|
||||||
|
|
||||||
# Retrieve known ocr program names and try to load the corresponding module
|
# Retrieve configured OCR program names and try to load the
|
||||||
|
# corresponding module
|
||||||
ocrprogs = config.getConfParam("ocrprogs")
|
ocrprogs = config.getConfParam("ocrprogs")
|
||||||
if not ocrprogs:
|
if not ocrprogs:
|
||||||
_deb("No ocrprogs variable in recoll configuration")
|
_deb("No ocrprogs variable in recoll configuration")
|
||||||
@ -59,7 +65,7 @@ for ocrprog in proglist:
|
|||||||
try:
|
try:
|
||||||
modulename = "rclocr" + ocrprog
|
modulename = "rclocr" + ocrprog
|
||||||
ocr = importlib.import_module(modulename)
|
ocr = importlib.import_module(modulename)
|
||||||
if ocr.ocrpossible(path):
|
if ocr.ocrpossible(config, path):
|
||||||
ok = True
|
ok = True
|
||||||
break
|
break
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
@ -72,6 +78,8 @@ if not ok:
|
|||||||
|
|
||||||
#_deb("Using ocr module %s" % modulename)
|
#_deb("Using ocr module %s" % modulename)
|
||||||
|
|
||||||
|
# The OCR module will retrieve its specific parameters from the
|
||||||
|
# configuration
|
||||||
status, data = ocr.runocr(config, path)
|
status, data = ocr.runocr(config, path)
|
||||||
|
|
||||||
if not status:
|
if not status:
|
||||||
|
|||||||
147
src/filters/rclocrabbyy.py
Executable file
147
src/filters/rclocrabbyy.py
Executable file
@ -0,0 +1,147 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#################################
|
||||||
|
# Copyright (C) 2020 J.F.Dockes
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
########################################################
|
||||||
|
|
||||||
|
# Running abbyyocr for Recoll OCR (see rclocr.py)
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import atexit
|
||||||
|
import tempfile
|
||||||
|
import subprocess
|
||||||
|
import glob
|
||||||
|
|
||||||
|
import rclexecm
|
||||||
|
|
||||||
|
# True when running on native Windows.
_mswindows = (sys.platform == "win32")

# Name of the optional per-directory file holding the OCR language to
# use for the documents stored in that directory.
ocrlangfile = "rclocrlang.txt" if _mswindows else ".rclocrlang"

# Lower-cased file extensions which ocrpossible() accepts as input.
_okexts = ('.pdf', '.tif', '.tiff', '.jpg', '.png', '.jpeg')

# Path of the abbyy command line program, and the directory which
# contains it. Both are filled in by ocrpossible(). The second name used
# to be misspelled "abbyocrdir" here, so that the global actually used
# by the functions (abbyyocrdir) had no default value.
abbyyocrcmd = ""
abbyyocrdir = ""
|
||||||
|
|
||||||
|
def _deb(s):
    """Write the debug message *s* to stderr, prefixed with the module name.

    Nothing is printed when running on Windows (_mswindows is true).
    """
    if _mswindows:
        return
    print("rclocrabbyy: %s" % s, file=sys.stderr)
|
||||||
|
|
||||||
|
def ocrpossible(config, path):
    """Return True if abbyy appears to be available and can process *path*.

    The command path is looked up once and remembered in the module-level
    abbyyocrcmd: first from the "abbyyocrcmd" configuration parameter
    (evaluated for the directory of *path*), then by searching for an
    "abbyyocr11" executable with rclexecm.which(). The directory of the
    command is stored in the module-level abbyyocrdir.

    config -- configuration object providing setKeyDir() and getConfParam()
    path   -- path of the candidate input file

    Returns False when no command is found or when the extension of
    *path* is not one of _okexts.
    """
    global abbyyocrcmd, abbyyocrdir

    if not abbyyocrcmd:
        config.setKeyDir(os.path.dirname(path))
        # Configured value first, else search for the default program name
        abbyyocrcmd = (config.getConfParam("abbyyocrcmd")
                       or rclexecm.which("abbyyocr11"))
        if not abbyyocrcmd:
            return False
    abbyyocrdir = os.path.dirname(abbyyocrcmd)

    # Check input format: the comparison is case-insensitive
    suffix = os.path.splitext(path)[1].lower()
    return suffix in _okexts
|
||||||
|
|
||||||
|
|
||||||
|
# Try to guess the abbyy recognition language. This should depend on the
# input file, but we have no general way to determine it. So use the
# environment and hope for the best.
def _guessocrlang(config, path):
    """Return the name of the language to use for running OCR on *path*.

    The following sources are tried in order, the first non-empty value
    wins:
      1. The contents of the language definition file (name given by
         ocrlangfile) in the directory of *path*.
      2. The "abbyylang" configuration parameter, evaluated for the
         directory of *path*.
      3. The language part of the LANG environment variable, for the few
         values which we know how to translate ("en", "de", "fr").
    If all fail, "English" is returned.

    config -- configuration object providing setKeyDir() and getConfParam()
    path   -- path of the file which is going to be processed
    """
    dirname = os.path.dirname(path)

    # First look for a language def file in the file's directory
    langfile = os.path.join(dirname, ocrlangfile)
    if os.path.isfile(langfile):
        try:
            # Use a context manager so that the file is always closed
            with open(langfile, "r") as f:
                ocrlang = f.read().strip()
        except (OSError, UnicodeError) as err:
            # An unreadable language file should not abort the OCR run:
            # fall back to the other sources.
            _deb("Could not read %s: %s" % (langfile, err))
            ocrlang = ""
        if ocrlang:
            _deb("OCR lang from file: %s" % ocrlang)
            return ocrlang

    # Then look for a config file option.
    config.setKeyDir(dirname)
    ocrlang = config.getConfParam("abbyylang")
    if ocrlang:
        _deb("OCR lang from config: %s" % ocrlang)
        return ocrlang

    # Last resort: try to guess from LANG (e.g. "fr_FR.UTF-8" -> "fr"),
    # then default to English.
    localelang = os.environ.get("LANG", "").split("_")[0]
    knownlangs = {"en": "English", "de": "German", "fr": "French"}
    ocrlang = knownlangs.get(localelang, "English")
    _deb("OCR lang (guessed): %s" % ocrlang)
    return ocrlang
|
||||||
|
|
||||||
|
|
||||||
|
# Run ocr on the input path and return the result data.
def runocr(config, path):
    """Run the abbyy command on *path* and return its output.

    ocrpossible() must have been called successfully before, because it
    determines the command to execute (module-level abbyyocrcmd).

    config -- configuration object, used to determine the OCR language
    path   -- path of the file to process

    Returns a (status, data) pair: (True, <bytes written by the command
    on its standard output>) on success, (False, "") if the command is
    not known, could not be executed, or exited with a non-zero status.
    """
    if not abbyyocrcmd:
        _deb("abbyy command not set (ocrpossible() not called or failed)")
        return False, ""

    ocrlang = _guessocrlang(config, path)

    # Prepend the directory of the abbyy command to LD_LIBRARY_PATH. The
    # directory is derived from the command path here instead of reading
    # a global which is undefined if ocrpossible() never set it. Empty
    # components are skipped: an empty entry in LD_LIBRARY_PATH would
    # designate the current directory.
    my_env = os.environ.copy()
    eldpn = "LD_LIBRARY_PATH"
    libdirs = [d for d in (os.path.dirname(abbyyocrcmd), my_env.get(eldpn))
               if d]
    if libdirs:
        my_env[eldpn] = ":".join(libdirs)

    try:
        # NOTE(review): the option meanings are those of the abbyy command
        # line tool and are not checked here -- presumably -rl is the
        # recognition language and -if the input file; confirm against
        # the abbyy documentation.
        out = subprocess.check_output(
            [abbyyocrcmd, "-lpp", "BookArchiving_Accuracy",
             "-rl", ocrlang,
             "-tet", "UTF8",
             "-f", "TextUnicodeDefaults",
             "-if", path,
             "-c"],
            env=my_env,
            stderr=subprocess.DEVNULL)
    except Exception as e:
        # Covers both a failure to start the command and a non-zero exit
        # status (subprocess.CalledProcessError)
        _deb("%s failed: %s" % (abbyyocrcmd, e))
        return False, ""
    return True, out
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command line driver: run OCR on the file named by the first
    # argument and write the resulting data to stdout. The exit status
    # is non-zero if the file can't be processed or if the OCR fails.
    import rclconfig

    if len(sys.argv) < 2:
        # Not using _deb() here: it prints nothing on Windows
        print("Usage: %s <path>" % sys.argv[0], file=sys.stderr)
        sys.exit(1)

    config = rclconfig.RclConfig()
    path = sys.argv[1]
    if not ocrpossible(config, path):
        _deb("ocrpossible returned false")
        sys.exit(1)

    ok, data = runocr(config, path)
    if not ok:
        _deb("OCR program failed")
        # Report the failure to the caller instead of exiting with 0
        sys.exit(1)
    # data is bytes: write it through the binary layer of stdout
    sys.stdout.buffer.write(data)
|
||||||
@ -17,6 +17,8 @@
|
|||||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
########################################################
|
########################################################
|
||||||
|
|
||||||
|
# Running tesseract for Recoll OCR (see rclocr.py)
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import atexit
|
import atexit
|
||||||
@ -28,7 +30,7 @@ import rclexecm
|
|||||||
|
|
||||||
_mswindows = (sys.platform == "win32")
|
_mswindows = (sys.platform == "win32")
|
||||||
if _mswindows:
|
if _mswindows:
|
||||||
ocrlangfile = ".rclocrlang"
|
ocrlangfile = "rclocrlang.txt"
|
||||||
else:
|
else:
|
||||||
ocrlangfile = ".rclocrlang"
|
ocrlangfile = ".rclocrlang"
|
||||||
|
|
||||||
@ -65,7 +67,7 @@ atexit.register(finalcleanup)
|
|||||||
|
|
||||||
# Return true if tesseract and the appropriate conversion program for
|
# Return true if tesseract and the appropriate conversion program for
|
||||||
# the file type (e.g. pdftoppm for pdf) appear to be available
|
# the file type (e.g. pdftoppm for pdf) appear to be available
|
||||||
def ocrpossible(path):
|
def ocrpossible(config, path):
|
||||||
# Check for tesseract
|
# Check for tesseract
|
||||||
global tesseract
|
global tesseract
|
||||||
tesseract = rclexecm.which("tesseract")
|
tesseract = rclexecm.which("tesseract")
|
||||||
@ -217,11 +219,12 @@ if __name__ == '__main__':
|
|||||||
import rclconfig
|
import rclconfig
|
||||||
config = rclconfig.RclConfig()
|
config = rclconfig.RclConfig()
|
||||||
path = sys.argv[1]
|
path = sys.argv[1]
|
||||||
if ocrpossible(path):
|
if ocrpossible(config, path):
|
||||||
data = runocr(config, sys.argv[1])
|
ok, data = runocr(config, sys.argv[1])
|
||||||
else:
|
else:
|
||||||
_deb("ocrpossible returned false")
|
_deb("ocrpossible returned false")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
sys.stdout.buffer.write(data)
|
if ok:
|
||||||
|
sys.stdout.buffer.write(data)
|
||||||
|
else:
|
||||||
|
_deb("OCR program failed")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user