pdf/ocr scripts: no need to look for rclocr if pdfocr is not set. comments.

2020-02-27 18:16:28 +01:00 · 2020-02-27 18:16:28 +01:00 · 8560467e4a
commit 8560467e4a
parent bfecc9ed72
2 changed files with 18 additions and 19 deletions
--- a/src/filters/rclocr.py
+++ b/src/filters/rclocr.py
@@ -26,9 +26,10 @@

 import os
 import sys
+import importlib.util
+
 import rclconfig
 import rclocrcache
-import importlib.util

 def _deb(s):
    print("rclocr: %s" % s, file=sys.stderr)
@@ -43,6 +44,8 @@ if len(sys.argv) != 2:
 path = sys.argv[1]

 config = rclconfig.RclConfig()
+config.setKeyDir(os.path.dirname(path))
+
 cache = rclocrcache.OCRCache(config)

 incache, data = cache.get(path)
@@ -58,7 +61,9 @@ ocrprogs = config.getConfParam("ocrprogs")
 if not ocrprogs:
    _deb("No ocrprogs variable in recoll configuration")
    sys.exit(1)
+
 #_deb("ocrprogs: %s" % ocrprogs)
+
 proglist = ocrprogs.split(" ")
 ok = False
 for ocrprog in proglist:
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@@ -20,17 +20,8 @@
 # pdftotext sometimes outputs unescaped text inside HTML text sections.
 # We try to correct.
 #
-# If pdftotext produces no text and tesseract is available, we try to
-# perform OCR. As this can be very slow and the result not always
-# good, we only do this if this is required by the configuration
-#
-# We guess the OCR language in order of preference:
-#  - From the content of a ".ocrpdflang" file if it exists in the same
-#    directory as the PDF
-#  - Else from the pdfocrlang in recoll.conf
-#  - Else from an RECOLL_TESSERACT_LANG environment variable
-#  - From the content of $RECOLL_CONFDIR/ocrpdf
-#  - Default to "eng"
+# If pdftotext produces no text and the configuration allows it, we may try to
+# perform OCR.

 from __future__ import print_function

@@ -411,13 +402,16 @@ class PDFExtractor:
        #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))

        if isempty:
-            try:
-                cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
-                       self.filename]
-                data = subprocess.check_output(cmd)
-                html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
-            except:
-                pass
+            self.config.setKeyDir(os.path.dirname(self.filename))
+            s = self.config.getConfParam("pdfocr")
+            if rclexecm.configparamtrue(s):
+                try:
+                    cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
+                           self.filename]
+                    data = subprocess.check_output(cmd)
+                    html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
+                except:
+                    pass

        if self.extrameta:
            try: