OCR: small adjustments for Windows. Works with Tesseract.

This commit is contained in:
Jean-Francois Dockes 2020-02-27 14:10:55 +01:00
parent abb7ef8803
commit e520176a2a
3 changed files with 44 additions and 21 deletions

View File

@ -36,6 +36,9 @@ else:
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg') _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
tesseractcmd = None
pdftoppmcmd = None
def _deb(s): def _deb(s):
if not _mswindows: if not _mswindows:
print("rclocrtesseract: %s" % s, file=sys.stderr) print("rclocrtesseract: %s" % s, file=sys.stderr)
@ -69,10 +72,15 @@ atexit.register(finalcleanup)
# the file type (e.g. pdftoppm for pdf) appear to be available # the file type (e.g. pdftoppm for pdf) appear to be available
def ocrpossible(config, path): def ocrpossible(config, path):
# Check for tesseract # Check for tesseract
global tesseract global tesseractcmd
tesseract = rclexecm.which("tesseract") if not tesseractcmd:
if not tesseract: config.setKeyDir(os.path.dirname(path))
return False tesseractcmd = config.getConfParam("tesseractcmd")
if not tesseractcmd:
tesseractcmd = rclexecm.which("tesseract")
if not tesseractcmd:
_deb("tesseractcmd not found")
return False
# Check input format # Check input format
base,ext = os.path.splitext(path) base,ext = os.path.splitext(path)
@ -86,9 +94,12 @@ def ocrpossible(config, path):
# legacy code used pdftoppm for some reason, and it appears # legacy code used pdftoppm for some reason, and it appears
# that the newest builds from conda-forge do not include # that the newest builds from conda-forge do not include
# pdftocairo. So stay with pdftoppm. # pdftocairo. So stay with pdftoppm.
global pdftoppm global pdftoppmcmd
pdftoppm = rclexecm.which("pdftoppm") if not pdftoppmcmd:
if pdftoppm: pdftoppmcmd = rclexecm.which("pdftoppm")
if not pdftoppmcmd:
pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
if pdftoppmcmd:
return True return True
return False return False
@ -150,9 +161,11 @@ def _pdftesseract(config, path):
# Split pdf pages # Split pdf pages
try: try:
vacuumdir(tmpdir) vacuumdir(tmpdir)
subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile]) cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
#_deb("Executing %s" % cmd)
subprocess.check_call(cmd)
except Exception as e: except Exception as e:
_deb("pdftoppm failed: %s" % e) _deb("%s failed: %s" % (pdftoppmcmd,e))
return b"" return b""
# Note: unfortunately, pdftoppm silently fails if the temp file # Note: unfortunately, pdftoppm silently fails if the temp file
@ -171,10 +184,10 @@ def _pdftesseract(config, path):
out = b'' out = b''
try: try:
out = subprocess.check_output( out = subprocess.check_output(
[tesseract, f, f, "-l", tesseractlang], [tesseractcmd, f, f, "-l", tesseractlang],
stderr=subprocess.STDOUT) stderr=subprocess.STDOUT)
except Exception as e: except Exception as e:
_deb("tesseract failed: %s" % e) _deb("%s failed: %s" % (tesseractcmd,e))
errlines = out.split(b'\n') errlines = out.split(b'\n')
if len(errlines) > 5: if len(errlines) > 5:
@ -194,10 +207,10 @@ def _simpletesseract(config, path):
try: try:
out = subprocess.check_output( out = subprocess.check_output(
[tesseract, path, 'stdout', '-l', tesseractlang], [tesseractcmd, path, 'stdout', '-l', tesseractlang],
stderr=subprocess.DEVNULL) stderr=subprocess.DEVNULL)
except Exception as e: except Exception as e:
_deb("tesseract failed: %s" % e) _deb("%s failed: %s" % (tesseractcmd,e))
return False, "" return False, ""
return True, out return True, out

View File

@ -411,10 +411,13 @@ class PDFExtractor:
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html)) #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
if isempty: if isempty:
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"), try:
self.filename] cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
data = subprocess.check_output(cmd) self.filename]
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix data = subprocess.check_output(cmd)
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
except:
pass
if self.extrameta: if self.extrameta:
try: try:

View File

@ -40,7 +40,7 @@ MUTAGEN=${RCLDEPS}mutagen-1.32/
EPUB=${RCLDEPS}epub-0.5.2 EPUB=${RCLDEPS}epub-0.5.2
FUTURE=${RCLDEPS}python2-future FUTURE=${RCLDEPS}python2-future
ZLIB=${RCLDEPS}zlib-1.2.8 ZLIB=${RCLDEPS}zlib-1.2.8
POPPLER=${RCLDEPS}poppler-0.36/ POPPLER=${RCLDEPS}poppler-0.68.0/
LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/ LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/
LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/ LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/
CHM=${RCLDEPS}pychm CHM=${RCLDEPS}pychm
@ -237,9 +237,16 @@ copypoppler()
{ {
test -d $FILTERS/poppler || mkdir $FILTERS/poppler || \ test -d $FILTERS/poppler || mkdir $FILTERS/poppler || \
fatal cant create poppler dir fatal cant create poppler dir
for f in pdftotext.exe pdfinfo.exe libpoppler.dll freetype6.dll jpeg62.dll \ for f in pdftotext.exe pdfinfo.exe pdftoppm.exe \
libpng16-16.dll zlib1.dll libtiff3.dll \ freetype6.dll \
libgcc_s_dw2-1.dll libstdc++-6.dll; do jpeg62.dll \
libgcc_s_dw2-1.dll \
libpng16-16.dll \
libpoppler*.dll \
libstdc++-6.dll \
libtiff3.dll \
zlib1.dll \
; do
chkcp $POPPLER/bin/$f $FILTERS/poppler chkcp $POPPLER/bin/$f $FILTERS/poppler
done done
} }