diff --git a/src/filters/rclocrtesseract.py b/src/filters/rclocrtesseract.py index cc2734f6..4c8d2c9d 100755 --- a/src/filters/rclocrtesseract.py +++ b/src/filters/rclocrtesseract.py @@ -36,6 +36,9 @@ else: _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg') +tesseractcmd = None +pdftoppmcmd = None + def _deb(s): if not _mswindows: print("rclocrtesseract: %s" % s, file=sys.stderr) @@ -69,10 +72,15 @@ atexit.register(finalcleanup) # the file type (e.g. pdftoppt for pdf) appear to be available def ocrpossible(config, path): # Check for tesseract - global tesseract - tesseract = rclexecm.which("tesseract") - if not tesseract: - return False + global tesseractcmd + if not tesseractcmd: + config.setKeyDir(os.path.dirname(path)) + tesseractcmd = config.getConfParam("tesseractcmd") + if not tesseractcmd: + tesseractcmd = rclexecm.which("tesseract") + if not tesseractcmd: + _deb("tesseractcmd not found") + return False # Check input format base,ext = os.path.splitext(path) @@ -86,9 +94,12 @@ def ocrpossible(config, path): # legacy code used pdftoppm for some reason, and it appears # that the newest builds from conda-forge do not include # pdftocairo. So stay with pdftoppm. - global pdftoppm - pdftoppm = rclexecm.which("pdftoppm") - if pdftoppm: + global pdftoppmcmd + if not pdftoppmcmd: + pdftoppmcmd = rclexecm.which("pdftoppm") + if not pdftoppmcmd: + pdftoppmcmd = rclexecm.which("poppler/pdftoppm") + if pdftoppmcmd: return True return False @@ -150,9 +161,11 @@ def _pdftesseract(config, path): # Split pdf pages try: vacuumdir(tmpdir) - subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile]) + cmd = [pdftoppmcmd, "-r", "300", path, tmpfile] + #_deb("Executing %s" % cmd) + subprocess.check_call(cmd) except Exception as e: - _deb("pdftoppm failed: %s" % e) + _deb("%s failed: %s" % (pdftoppmcmd,e)) return b"" # Note: unfortunately, pdftoppm silently fails if the temp file @@ -171,10 +184,10 @@ def _pdftesseract(config, path): out = b'' try: out = subprocess.check_output( - [tesseract, f, f, "-l", tesseractlang], + [tesseractcmd, f, f, "-l", tesseractlang], stderr=subprocess.STDOUT) except Exception as e: - _deb("tesseract failed: %s" % e) + _deb("%s failed: %s" % (tesseractcmd,e)) errlines = out.split(b'\n') if len(errlines) > 5: @@ -194,10 +207,10 @@ def _simpletesseract(config, path): try: out = subprocess.check_output( - [tesseract, path, 'stdout', '-l', tesseractlang], + [tesseractcmd, path, 'stdout', '-l', tesseractlang], stderr=subprocess.DEVNULL) except Exception as e: - _deb("tesseract failed: %s" % e) + _deb("%s failed: %s" % (tesseractcmd,e)) return False, "" return True, out diff --git a/src/filters/rclpdf.py b/src/filters/rclpdf.py index 63999c3a..41e7c8ed 100755 --- a/src/filters/rclpdf.py +++ b/src/filters/rclpdf.py @@ -411,10 +411,13 @@ class PDFExtractor: #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html)) if isempty: - cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"), - self.filename] - data = subprocess.check_output(cmd) - html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix + try: + cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"), + self.filename] + data = subprocess.check_output(cmd) + html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix + except: + pass if self.extrameta: try: diff --git a/src/windows/mkinstdir.sh b/src/windows/mkinstdir.sh index 618f2400..417560f1 100644 --- a/src/windows/mkinstdir.sh +++ b/src/windows/mkinstdir.sh @@ -40,7 +40,7 @@ MUTAGEN=${RCLDEPS}mutagen-1.32/ EPUB=${RCLDEPS}epub-0.5.2 FUTURE=${RCLDEPS}python2-future ZLIB=${RCLDEPS}zlib-1.2.8 -POPPLER=${RCLDEPS}poppler-0.36/ +POPPLER=${RCLDEPS}poppler-0.68.0/ LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/ LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/ CHM=${RCLDEPS}pychm @@ -237,9 +237,16 @@ copypoppler() { test -d $FILTERS/poppler || mkdir $FILTERS/poppler || \ fatal cant create poppler dir - for f in pdftotext.exe pdfinfo.exe libpoppler.dll freetype6.dll jpeg62.dll \ - libpng16-16.dll zlib1.dll libtiff3.dll \ - libgcc_s_dw2-1.dll libstdc++-6.dll; do + for f in pdftotext.exe pdfinfo.exe pdftoppm.exe \ + freetype6.dll \ + jpeg62.dll \ + libgcc_s_dw2-1.dll \ + libpng16-16.dll \ + libpoppler*.dll \ + libstdc++-6.dll \ + libtiff3.dll \ + zlib1.dll \ + ; do chkcp $POPPLER/bin/$f $FILTERS/poppler done }