recoll/src/filters/rclocr.py

125 lines
3.4 KiB
Python
Executable File

#!/usr/bin/env python3
#################################
# Copyright (C) 2020 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################
# Running OCR programs for Recoll. This is executed from,
# e.g. rclpdf.py if pdftotext returns no data.
#
# The script tries to retrieve the data from the ocr cache, else it
# runs the configured OCR program and updates the cache. In both cases it writes
# the resulting text to stdout.
import os
import sys
import importlib.util
import rclconfig
import rclocrcache
_mswindows = (sys.platform == "win32")
def _deb(s):
    """Print the diagnostic message s on stderr, prefixed with "rclocr: ".

    Nothing is printed when the module-level _mswindows flag is set
    (i.e. when sys.platform is "win32").
    """
    if _mswindows:
        return
    print("rclocr: %s" % s, file=sys.stderr)
def Usage():
    """Emit the command-line usage message through _deb, then exit.

    Never returns: terminates the process with exit status 1.
    """
    _deb("Usage: rclocr.py <imagefilename>")
    # Same effect as sys.exit(1), which raises SystemExit(1).
    raise SystemExit(1)
def breakwrite(f, data):
    """Write data to f in successive pieces of at most 4 KiB.

    f is any object with a write() method and data any sliceable
    sequence with a length (in this script: a binary stream and bytes).
    Nothing is written when data is empty.

    On Windows, writing big chunks can fail with a "not enough space"
    error. Seems a combined windows/python bug, depending on versions.
    See https://bugs.python.org/issue11395
    In any case, just break it up.
    """
    chunk_size = 4 * 1024
    for start in range(0, len(data), chunk_size):
        # Slicing clamps at the end of data, so the last piece may be short.
        f.write(data[start:start + chunk_size])
# Exactly one argument is expected: the path of the image file to process.
if len(sys.argv) != 2:
    Usage()
path = sys.argv[1]

# Build the Recoll configuration, keyed on the directory holding the
# image, and open the OCR cache on top of it.
config = rclconfig.RclConfig()
config.setKeyDir(os.path.dirname(path))
cache = rclocrcache.OCRCache(config)

# Cache hit: output the stored data and stop here, with status 0 on
# success and 1 if writing to stdout failed.
incache, data = cache.get(path)
if incache:
    try:
        breakwrite(sys.stdout.buffer, data)
    except Exception as e:
        _deb("RCLOCR error writing: %s" % e)
        sys.exit(1)
    else:
        sys.exit(0)
#### Data not in cache

# Retrieve configured OCR program names and try to load the
# corresponding module
ocrprogs = config.getConfParam("ocrprogs")
if ocrprogs is None:
    # Compat: the previous version has no ocrprogs variable, but would do
    # tesseract by default. Use "ocrprogs = " for a really empty list
    ocrprogs = "tesseract"

# Split on any run of whitespace. split(" ") would produce empty names for
# repeated, leading or trailing spaces, and an empty name yields the module
# name "rclocr", i.e. this very script, which would then get imported.
proglist = ocrprogs.split()
if not proglist:
    _deb("No ocrprogs variable in recoll configuration")
    sys.exit(0)
#_deb("ocrprogs: %s" % ocrprogs)

# Use the first module (named "rclocr" + program name) which both imports
# successfully and reports that it can process this file. Failures are
# logged and the next candidate is tried.
ok = False
for ocrprog in proglist:
    modulename = "rclocr" + ocrprog
    try:
        ocr = importlib.import_module(modulename)
        if ocr.ocrpossible(config, path):
            ok = True
            break
    except Exception as err:
        _deb("While loading %s: got: %s" % (modulename, err))

if not ok:
    _deb("No OCR module could be loaded")
    sys.exit(1)
#_deb("Using ocr module %s" % modulename)

# The OCR module will retrieve its specific parameters from the
# configuration
status, data = ocr.runocr(config, path)
if not status:
    _deb("runocr failed")
    sys.exit(1)

# Update the cache first, then output the text. The output goes through
# breakwrite(), like on the cache-hit path, so that big results do not
# trigger the Windows large-write failure described there.
cache.store(path, data)
try:
    breakwrite(sys.stdout.buffer, data)
except Exception as e:
    _deb("RCLOCR error writing: %s" % e)
    sys.exit(1)
sys.exit(0)