OCR: small adjustments for Windows. Works with Tesseract.

This commit is contained in:
Jean-Francois Dockes 2020-02-27 14:10:55 +01:00
parent abb7ef8803
commit e520176a2a
3 changed files with 44 additions and 21 deletions

View File

@ -36,6 +36,9 @@ else:
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
tesseractcmd = None
pdftoppmcmd = None
def _deb(s):
if not _mswindows:
print("rclocrtesseract: %s" % s, file=sys.stderr)
@ -69,10 +72,15 @@ atexit.register(finalcleanup)
# the file type (e.g. pdftoppm for pdf) appear to be available
def ocrpossible(config, path):
# Check for tesseract
global tesseract
tesseract = rclexecm.which("tesseract")
if not tesseract:
return False
global tesseractcmd
if not tesseractcmd:
config.setKeyDir(os.path.dirname(path))
tesseractcmd = config.getConfParam("tesseractcmd")
if not tesseractcmd:
tesseractcmd = rclexecm.which("tesseract")
if not tesseractcmd:
_deb("tesseractcmd not found")
return False
# Check input format
base,ext = os.path.splitext(path)
@ -86,9 +94,12 @@ def ocrpossible(config, path):
# legacy code used pdftoppm for some reason, and it appears
# that the newest builds from conda-forge do not include
# pdftocairo. So stay with pdftoppm.
global pdftoppm
pdftoppm = rclexecm.which("pdftoppm")
if pdftoppm:
global pdftoppmcmd
if not pdftoppmcmd:
pdftoppmcmd = rclexecm.which("pdftoppm")
if not pdftoppmcmd:
pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
if pdftoppmcmd:
return True
return False
@ -150,9 +161,11 @@ def _pdftesseract(config, path):
# Split pdf pages
try:
vacuumdir(tmpdir)
subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
#_deb("Executing %s" % cmd)
subprocess.check_call(cmd)
except Exception as e:
_deb("pdftoppm failed: %s" % e)
_deb("%s failed: %s" % (pdftoppmcmd,e))
return b""
# Note: unfortunately, pdftoppm silently fails if the temp file
@ -171,10 +184,10 @@ def _pdftesseract(config, path):
out = b''
try:
out = subprocess.check_output(
[tesseract, f, f, "-l", tesseractlang],
[tesseractcmd, f, f, "-l", tesseractlang],
stderr=subprocess.STDOUT)
except Exception as e:
_deb("tesseract failed: %s" % e)
_deb("%s failed: %s" % (tesseractcmd,e))
errlines = out.split(b'\n')
if len(errlines) > 5:
@ -194,10 +207,10 @@ def _simpletesseract(config, path):
try:
out = subprocess.check_output(
[tesseract, path, 'stdout', '-l', tesseractlang],
[tesseractcmd, path, 'stdout', '-l', tesseractlang],
stderr=subprocess.DEVNULL)
except Exception as e:
_deb("tesseract failed: %s" % e)
_deb("%s failed: %s" % (tesseractcmd,e))
return False, ""
return True, out

View File

@ -411,10 +411,13 @@ class PDFExtractor:
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
if isempty:
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
self.filename]
data = subprocess.check_output(cmd)
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
try:
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
self.filename]
data = subprocess.check_output(cmd)
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
except:
pass
if self.extrameta:
try:

View File

@ -40,7 +40,7 @@ MUTAGEN=${RCLDEPS}mutagen-1.32/
EPUB=${RCLDEPS}epub-0.5.2
FUTURE=${RCLDEPS}python2-future
ZLIB=${RCLDEPS}zlib-1.2.8
POPPLER=${RCLDEPS}poppler-0.36/
POPPLER=${RCLDEPS}poppler-0.68.0/
LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/
LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/
CHM=${RCLDEPS}pychm
@ -237,9 +237,16 @@ copypoppler()
{
test -d $FILTERS/poppler || mkdir $FILTERS/poppler || \
fatal cant create poppler dir
for f in pdftotext.exe pdfinfo.exe libpoppler.dll freetype6.dll jpeg62.dll \
libpng16-16.dll zlib1.dll libtiff3.dll \
libgcc_s_dw2-1.dll libstdc++-6.dll; do
for f in pdftotext.exe pdfinfo.exe pdftoppm.exe \
freetype6.dll \
jpeg62.dll \
libgcc_s_dw2-1.dll \
libpng16-16.dll \
libpoppler*.dll \
libstdc++-6.dll \
libtiff3.dll \
zlib1.dll \
; do
chkcp $POPPLER/bin/$f $FILTERS/poppler
done
}