pdf/ocr scripts: no need to look for rclocr if pdfocr is not set. comments.

This commit is contained in:
Jean-Francois Dockes 2020-02-27 18:16:28 +01:00
parent bfecc9ed72
commit 8560467e4a
2 changed files with 18 additions and 19 deletions

View File

@ -26,9 +26,10 @@
import os
import sys
import importlib.util
import rclconfig
import rclocrcache
import importlib.util
def _deb(s):
print("rclocr: %s" % s, file=sys.stderr)
@ -43,6 +44,8 @@ if len(sys.argv) != 2:
path = sys.argv[1]
config = rclconfig.RclConfig()
config.setKeyDir(os.path.dirname(path))
cache = rclocrcache.OCRCache(config)
incache, data = cache.get(path)
@ -58,7 +61,9 @@ ocrprogs = config.getConfParam("ocrprogs")
if not ocrprogs:
_deb("No ocrprogs variable in recoll configuration")
sys.exit(1)
#_deb("ocrprogs: %s" % ocrprogs)
proglist = ocrprogs.split(" ")
ok = False
for ocrprog in proglist:

View File

@ -20,17 +20,8 @@
# pdftotext sometimes outputs unescaped text inside HTML text sections.
# We try to correct.
#
# If pdftotext produces no text and tesseract is available, we try to
# perform OCR. As this can be very slow and the result not always
# good, we only do this if this is required by the configuration
#
# We guess the OCR language in order of preference:
# - From the content of a ".ocrpdflang" file if it exists in the same
# directory as the PDF
# - Else from the pdfocrlang in recoll.conf
# - Else from a RECOLL_TESSERACT_LANG environment variable
# - Else from the content of $RECOLL_CONFDIR/ocrpdf
# - Default to "eng"
# If pdftotext produces no text and the configuration allows it, we may try to
# perform OCR.
from __future__ import print_function
@ -411,13 +402,16 @@ class PDFExtractor:
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
if isempty:
try:
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
self.filename]
data = subprocess.check_output(cmd)
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
except:
pass
self.config.setKeyDir(os.path.dirname(self.filename))
s = self.config.getConfParam("pdfocr")
if rclexecm.configparamtrue(s):
try:
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
self.filename]
data = subprocess.check_output(cmd)
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
except:
pass
if self.extrameta:
try: