OCR: small adjustments for Windows. Works with Tesseract.

2020-02-27 14:10:55 +01:00 · 2020-02-27 14:10:55 +01:00 · e520176a2a
commit e520176a2a
parent abb7ef8803
3 changed files with 44 additions and 21 deletions
--- a/src/filters/rclocrtesseract.py
+++ b/src/filters/rclocrtesseract.py
@ -36,6 +36,9 @@ else:

 _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')

+tesseractcmd = None
+pdftoppmcmd = None
+
 def _deb(s):
    if not _mswindows:
        print("rclocrtesseract: %s" % s, file=sys.stderr)
@ -69,10 +72,15 @@ atexit.register(finalcleanup)
 # the file type (e.g. pdftoppm for pdf) appear to be available
 def ocrpossible(config, path):
    # Check for tesseract
-    global tesseract
-    tesseract = rclexecm.which("tesseract")
-    if not tesseract:
-        return False
+    global tesseractcmd
+    if not tesseractcmd:
+        config.setKeyDir(os.path.dirname(path))
+        tesseractcmd = config.getConfParam("tesseractcmd")
+        if not tesseractcmd:
+            tesseractcmd = rclexecm.which("tesseract")
+        if not tesseractcmd:
+            _deb("tesseractcmd not found")
+            return False

    # Check input format
    base,ext = os.path.splitext(path)
@ -86,9 +94,12 @@ def ocrpossible(config, path):
        # legacy code used pdftoppm for some reason, and it appears
        # that the newest builds from conda-forge do not include
        # pdftocairo. So stay with pdftoppm.
-        global pdftoppm
-        pdftoppm = rclexecm.which("pdftoppm")
-        if pdftoppm:
+        global pdftoppmcmd
+        if not pdftoppmcmd:
+            pdftoppmcmd = rclexecm.which("pdftoppm")
+            if not pdftoppmcmd:
+                pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
+        if pdftoppmcmd:
            return True

    return False
@ -150,9 +161,11 @@ def _pdftesseract(config, path):
    # Split pdf pages
    try:
        vacuumdir(tmpdir)
-        subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
+        cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
+        #_deb("Executing %s" % cmd)
+        subprocess.check_call(cmd)
    except Exception as e:
-        _deb("pdftoppm failed: %s" % e)
+        _deb("%s failed: %s" % (pdftoppmcmd,e))
        return b""

    # Note: unfortunately, pdftoppm silently fails if the temp file
@ -171,10 +184,10 @@ def _pdftesseract(config, path):
        out = b''
        try:
            out = subprocess.check_output(
-                [tesseract, f, f, "-l", tesseractlang],
+                [tesseractcmd, f, f, "-l", tesseractlang],
                stderr=subprocess.STDOUT)
        except Exception as e:
-            _deb("tesseract failed: %s" % e)
+            _deb("%s failed: %s" % (tesseractcmd,e))

        errlines = out.split(b'\n')
        if len(errlines) > 5:
@ -194,10 +207,10 @@ def _simpletesseract(config, path):

    try:
        out = subprocess.check_output(
-            [tesseract, path, 'stdout', '-l', tesseractlang],
+            [tesseractcmd, path, 'stdout', '-l', tesseractlang],
            stderr=subprocess.DEVNULL)
    except Exception as e:
-        _deb("tesseract failed: %s" % e)
+        _deb("%s failed: %s" % (tesseractcmd,e))
        return False, ""
    return True, out

--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -411,10 +411,13 @@ class PDFExtractor:
        #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))

        if isempty:
-            cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
-                   self.filename]
-            data = subprocess.check_output(cmd)
-            html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
+            try:
+                cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
+                       self.filename]
+                data = subprocess.check_output(cmd)
+                html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
+            except:
+                pass

        if self.extrameta:
            try:
--- a/src/windows/mkinstdir.sh
+++ b/src/windows/mkinstdir.sh
@ -40,7 +40,7 @@ MUTAGEN=${RCLDEPS}mutagen-1.32/
 EPUB=${RCLDEPS}epub-0.5.2
 FUTURE=${RCLDEPS}python2-future
 ZLIB=${RCLDEPS}zlib-1.2.8
-POPPLER=${RCLDEPS}poppler-0.36/
+POPPLER=${RCLDEPS}poppler-0.68.0/
 LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/
 LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/
 CHM=${RCLDEPS}pychm
@ -237,9 +237,16 @@ copypoppler()
 {
    test -d $FILTERS/poppler || mkdir $FILTERS/poppler || \
        fatal cant create poppler dir
-    for f in pdftotext.exe pdfinfo.exe libpoppler.dll freetype6.dll jpeg62.dll \
-             libpng16-16.dll zlib1.dll libtiff3.dll \
-             libgcc_s_dw2-1.dll libstdc++-6.dll; do
+    for f in pdftotext.exe pdfinfo.exe pdftoppm.exe \
+             freetype6.dll \
+             jpeg62.dll \
+             libgcc_s_dw2-1.dll \
+             libpng16-16.dll \
+             libpoppler*.dll \
+             libstdc++-6.dll \
+             libtiff3.dll \
+             zlib1.dll \
+             ; do
        chkcp $POPPLER/bin/$f $FILTERS/poppler
    done
 }