tesseract ocr: use compressed tif temp pages if pdftocairo is available (10x smaller than ppm)

This commit is contained in:
Jean-Francois Dockes 2021-12-04 09:35:10 +01:00
parent c37765524d
commit 5fcffb7654

View File

@ -37,7 +37,7 @@ _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
tesseractcmd = None
pdftoppmcmd = None
pdftocairocmd = None
# Debug helper: pass the message s to rclexecm.logmsg, prefixed with
# "rclocrtesseract: " so that the log line identifies this module.
def _deb(s):
rclexecm.logmsg("rclocrtesseract: %s" % s)
@ -95,12 +95,16 @@ def ocrpossible(config, path):
# Legacy code used pdftoppm for some reason, and it appears
# that the newest builds from conda-forge do not include
# pdftocairo. So look for pdftocairo first (it can write compressed
# TIFF pages, much smaller than PPM) and fall back to pdftoppm.
global pdftoppmcmd
if not pdftoppmcmd:
pdftoppmcmd = rclexecm.which("pdftoppm")
if not pdftoppmcmd:
pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
if pdftoppmcmd:
global pdftoppmcmd, pdftocairocmd
if not pdftoppmcmd and not pdftocairocmd:
pdftocairocmd = rclexecm.which("pdftocairo")
if not pdftocairocmd:
pdftocairocmd = rclexecm.which("poppler/pdftocairo")
if not pdftocairocmd:
pdftoppmcmd = rclexecm.which("pdftoppm")
if not pdftoppmcmd:
pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
if pdftoppmcmd or pdftocairocmd:
return True
return False
@ -163,8 +167,11 @@ def _pdftesseract(config, path):
# Split pdf pages
try:
tmpdir.vacuumdir()
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
#_deb("Executing %s" % cmd)
if pdftocairocmd:
cmd = [pdftocairocmd, "-tiff", "-tiffcompression", "lzw", "-r", "300", path, tmpfile]
else:
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
#_deb("Executing %s" % cmd)
subprocess.check_call(cmd)
except Exception as e:
_deb("%s failed: %s" % (pdftoppmcmd,e))
@ -174,8 +181,8 @@ def _pdftesseract(config, path):
# system is full. There is no really good way to check for
# this. We consider any empty file to signal an error
ppmfiles = glob.glob(tmpfile + "*")
for f in ppmfiles:
pages = glob.glob(tmpfile + "*")
for f in pages:
size = os.path.getsize(f)
if os.path.getsize(f) == 0:
_deb("pdftoppm created empty files. "
@ -191,7 +198,7 @@ def _pdftesseract(config, path):
except:
pass
for f in sorted(ppmfiles):
for f in sorted(pages):
out = b''
try:
out = subprocess.check_output(