recoll/src/filters/rclocrtesseract.py

262 lines
8.0 KiB
Python
Executable File

#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################
# Running tesseract for Recoll OCR (see rclocr.py)
import os
import sys
import tempfile
import subprocess
import glob
import rclexecm
# True when running on native Windows.
_mswindows = sys.platform == "win32"

# Name of the optional per-directory file which _guesstesseractlang() reads
# to get the tesseract language for the documents stored next to it.
ocrlangfile = "rclocrlang.txt" if _mswindows else ".rclocrlang"

# Lower-cased extensions of the image files handed directly to tesseract.
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')

# Paths of the external programs. They start unset and are filled in by
# ocrpossible() the first time they are needed.
tesseractcmd = None
pdftoppmcmd = None
pdftocairocmd = None
def _deb(s):
    """Send the debug message `s` to the rclexecm log, tagged with this module's name."""
    message = "rclocrtesseract: %s" % s
    rclexecm.logmsg(message)
# Module-wide rclexecm.SafeTmpDir object used for the intermediate page
# files. None until _maybemaketmpdir() creates it.
tmpdir = None


def _maybemaketmpdir():
    """Make sure the module-level temporary directory object is ready.

    If a directory object already exists, its vacuumdir() method is called
    (presumably emptying it -- see rclexecm.SafeTmpDir). Otherwise a new
    rclexecm.SafeTmpDir is created and stored in the global `tmpdir`.

    Returns:
        True on success, False if vacuumdir() reported a failure (the
        failure is logged).
    """
    global tmpdir
    if tmpdir:
        if not tmpdir.vacuumdir():
            _deb("openfile: vacuumdir %s failed" % tmpdir.getpath())
            return False
    else:
        tmpdir = rclexecm.SafeTmpDir("rclocrtesseract")
    # Report success explicitly: the function used to fall through with an
    # implicit None here, which is indistinguishable from a failure for a
    # caller testing the result for truth.
    return True
def cleanocr():
    """Release the module-level temporary directory object, if there is one.

    The reference held in the global `tmpdir` is dropped and the name is
    rebound to None. Nothing is done when no directory object is held.
    """
    global tmpdir
    if not tmpdir:
        return
    # Drop our reference to the directory object, then restore the
    # "no directory yet" marker.
    del tmpdir
    tmpdir = None
# Return true if tesseract and the appropriate conversion program for
# the file type (e.g. pdftoppm for pdf) appear to be available
def ocrpossible(config, path):
    """Tell whether the programs needed to OCR `path` appear to be available.

    The tesseract command is looked up once, from the "tesseractcmd"
    configuration parameter or else with rclexecm.which(), and remembered in
    the module-level `tesseractcmd`. Image files (extensions in `_okexts`)
    need nothing else. PDF files also need a page rendering program, whose
    path is remembered in `pdftocairocmd` or `pdftoppmcmd`.

    Args:
        config: configuration object providing setKeyDir() and
            getConfParam().
        path: path of the file to be processed.

    Returns:
        True if OCR can be attempted for this file type, else False.
    """
    global tesseractcmd, pdftoppmcmd, pdftocairocmd

    # Locate tesseract, unless a previous call already did.
    if not tesseractcmd:
        config.setKeyDir(os.path.dirname(path))
        configured = config.getConfParam("tesseractcmd")
        if configured:
            # It is very tempting to quote this value, esp. on Windows
            # where it will contain whitespace. There is no chance that an
            # actual command line would have quotes, so unquote it.
            tesseractcmd = configured.strip('"')
        else:
            tesseractcmd = rclexecm.which("tesseract")
        if not tesseractcmd:
            _deb("tesseractcmd not found")
            return False
    if not os.path.isfile(tesseractcmd):
        _deb("tesseractcmd parameter [%s] is not a file" % tesseractcmd)
        return False

    # Decide from the (case-insensitive) file name extension.
    suffix = os.path.splitext(path)[1].lower()
    if suffix in _okexts:
        return True
    if suffix != '.pdf':
        return False

    # PDF input: pdftocairo is preferred when it can be found. pdftoppm is
    # the fallback, as some builds (the original author mentions recent
    # conda-forge ones) do not include pdftocairo. Both are also searched
    # for inside a "poppler" subdirectory.
    if not (pdftoppmcmd or pdftocairocmd):
        pdftocairocmd = (rclexecm.which("pdftocairo")
                         or rclexecm.which("poppler/pdftocairo"))
        if not pdftocairocmd:
            pdftoppmcmd = (rclexecm.which("pdftoppm")
                           or rclexecm.which("poppler/pdftoppm"))
    return bool(pdftoppmcmd or pdftocairocmd)
# Try to guess tesseract language. This should depend on the input
# file, but we have no general way to determine it. So use the
# environment and hope for the best.
def _guesstesseractlang(config, path):
    """Choose the tesseract language specification to use for `path`.

    The following sources are tried in order, the first non-empty value
    wins:
      1. the contents of a file named `ocrlangfile` (".rclocrlang", or
         "rclocrlang.txt" on Windows) in the document's directory;
      2. the "tesseractlang" configuration parameter;
      3. the language part of the LANG environment variable, for the few
         languages mapped below;
      4. "eng".

    Args:
        config: configuration object providing setKeyDir() and
            getConfParam().
        path: path of the document to be processed.

    Returns:
        The language string to pass to tesseract's -l option.
    """
    dirname = os.path.dirname(path)

    # First look for a language definition file in the file's directory.
    pdflangfile = os.path.join(dirname, ocrlangfile)
    if os.path.isfile(pdflangfile):
        tesseractlang = ""
        try:
            # Context manager so that the file is closed right away.
            with open(pdflangfile, "r") as langf:
                tesseractlang = langf.read().strip()
        except (OSError, UnicodeError) as e:
            # An unreadable file should not abort OCR: fall back to the
            # other sources.
            _deb("Could not read %s: %s" % (pdflangfile, e))
        if tesseractlang:
            _deb("Tesseract lang from file: %s" % tesseractlang)
            return tesseractlang

    # Then look for a config file option.
    config.setKeyDir(dirname)
    tesseractlang = config.getConfParam("tesseractlang")
    if tesseractlang:
        _deb("Tesseract lang from config: %s" % tesseractlang)
        return tesseractlang

    # Last resort: guess from LANG, defaulting to english. LANG looks like
    # language[_territory][.codeset][@modifier]; keep the language only.
    localelang = os.environ.get("LANG", "")
    for sep in "_.@":
        localelang = localelang.split(sep)[0]
    known = {"en": "eng", "de": "deu", "fr": "fra"}
    tesseractlang = known.get(localelang, "eng")
    _deb("Tesseract lang (guessed): %s" % tesseractlang)
    return tesseractlang
# Process pdf file: split it into one image file per page (with
# pdftocairo if it is available, else with pdftoppm), then run
# tesseract on each page and concatenate the results. pdftocairo
# is sometimes not available (windows), hence the pdftoppm fallback.
def _pdftesseract(config, path):
    """OCR a PDF document page by page.

    The document is rendered at 300 dpi into one image file per page inside
    the module temporary directory, using pdftocairo (LZW-compressed TIFF)
    if `pdftocairocmd` is set and pdftoppm otherwise. Tesseract is then run
    on every page image and the resulting .txt files are concatenated in
    sorted file name order.

    Args:
        config: configuration object providing getConfParam() (and what
            _guesstesseractlang() needs).
        path: path of the PDF file.

    Returns:
        A (ok, data) pair: (True, text as bytes) on success, (False, b"")
        if the temporary directory is missing, the page rendering failed or
        produced empty files. A tesseract failure on a page is logged but
        does not fail the whole document.
    """
    if not tmpdir:
        # Always return a pair: callers unpack the result as (ok, data).
        return False, b""

    tesseractlang = _guesstesseractlang(config, path)
    # Common prefix of all the files created in the temporary directory.
    tmpfile = os.path.join(tmpdir.getpath(), "ocrXXXXXX")

    # Split the pdf into page images.
    if pdftocairocmd:
        cmd = [pdftocairocmd, "-tiff", "-tiffcompression", "lzw",
               "-r", "300", path, tmpfile]
    else:
        cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
    try:
        tmpdir.vacuumdir()
        subprocess.check_call(cmd)
    except Exception as e:
        # Name the program which was actually run.
        _deb("%s failed: %s" % (cmd[0], e))
        return False, b""

    # Note: unfortunately, pdftoppm silently fails if the temp file
    # system is full. There is no really good way to check for
    # this. We consider any empty file to signal an error.
    pages = sorted(glob.glob(tmpfile + "*"))
    if any(os.path.getsize(f) == 0 for f in pages):
        _deb("pdftoppm created empty files. "
             "Suspecting full file system, failing")
        return False, b""

    # Optionally limit the number of threads used by tesseract (OpenMP).
    nenv = os.environ.copy()
    cnthreads = config.getConfParam("tesseractnthreads")
    if cnthreads:
        try:
            nenv['OMP_THREAD_LIMIT'] = str(int(cnthreads))
        except (TypeError, ValueError):
            _deb("Ignoring bad tesseractnthreads value [%s]" % cnthreads)

    for f in pages:
        # The image name is also used as output base: tesseract writes its
        # result to <image name>.txt.
        try:
            out = subprocess.check_output(
                [tesseractcmd, f, f, "-l", tesseractlang],
                stderr=subprocess.STDOUT, env=nenv)
        except subprocess.CalledProcessError as e:
            _deb("%s failed: %s" % (tesseractcmd, e))
            # Keep what the failed command printed so that it can be logged.
            out = e.output or b''
        except Exception as e:
            _deb("%s failed: %s" % (tesseractcmd, e))
            out = b''

        # A few lines of output are normal, more probably means trouble.
        errlines = out.split(b'\n')
        if len(errlines) > 5:
            _deb("Tesseract error output: %d %s" % (len(errlines), out))

    # Concatenate the result files.
    chunks = []
    for f in sorted(glob.glob(tmpfile + "*.txt")):
        with open(f, "rb") as txtf:
            chunks.append(txtf.read())
    return True, b"".join(chunks)
def _simpletesseract(config, path):
    """Run tesseract directly on the image file `path`.

    Tesseract is asked to write the text to its standard output, which is
    captured. Its standard error output is discarded.

    Args:
        config: configuration object, passed to _guesstesseractlang().
        path: path of the image file.

    Returns:
        (True, captured output as bytes) on success, (False, "") if the
        command could not be run or failed (the error is logged).
    """
    lang = _guesstesseractlang(config, path)
    cmd = [tesseractcmd, path, 'stdout', '-l', lang]
    try:
        text = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
    except Exception as e:
        _deb("%s failed: %s" % (tesseractcmd,e))
        return False, ""
    else:
        return True, text
# run ocr on the input path and output the result data.
def runocr(config, path):
    """Run OCR on `path` with the helper matching its file type.

    Files whose lower-cased extension is in `_okexts` are processed by
    _simpletesseract(), everything else is handed to _pdftesseract(). The
    temporary directory is prepared first in all cases.

    Args:
        config: configuration object, passed on to the helper.
        path: path of the file to process.

    Returns:
        Whatever the selected helper returns, normally an (ok, data) pair.
    """
    _maybemaketmpdir()
    suffix = os.path.splitext(path)[1].lower()
    handler = _simpletesseract if suffix in _okexts else _pdftesseract
    return handler(config, path)
if __name__ == '__main__':
    # Stand-alone test driver: OCR the file named by the first command
    # line argument and write the extracted text to standard output.
    # Exits with status 1 on any failure.
    import rclconfig

    if len(sys.argv) < 2:
        # Fail with a usage message rather than an IndexError traceback.
        print("Usage: rclocrtesseract.py <path>", file=sys.stderr)
        sys.exit(1)
    path = sys.argv[1]

    config = rclconfig.RclConfig()
    if not ocrpossible(config, path):
        _deb("ocrpossible returned false")
        sys.exit(1)

    ok, data = runocr(config, path)
    if not ok:
        _deb("OCR program failed")
        # Same exit status as the other failure path above, so that a
        # calling script can detect the problem.
        sys.exit(1)
    sys.stdout.buffer.write(data)