tesseract OCR: use LZW-compressed TIFF temporary page images if pdftocairo is available (about 10x smaller than PPM)

This commit is contained in:
Jean-Francois Dockes 2021-12-04 09:35:10 +01:00
parent c37765524d
commit 5fcffb7654

View File

@ -37,7 +37,7 @@ _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
tesseractcmd = None tesseractcmd = None
pdftoppmcmd = None pdftoppmcmd = None
pdftocairocmd = None
def _deb(s): def _deb(s):
rclexecm.logmsg("rclocrtesseract: %s" % s) rclexecm.logmsg("rclocrtesseract: %s" % s)
@ -95,12 +95,16 @@ def ocrpossible(config, path):
# legacy code used pdftoppm for some reason, and it appears # legacy code used pdftoppm for some reason, and it appears
# that the newest builds from conda-forge do not include # that the newest builds from conda-forge do not include
# pdftocairo. So stay with pdftoppm. # pdftocairo. So stay with pdftoppm.
global pdftoppmcmd global pdftoppmcmd, pdftocairocmd
if not pdftoppmcmd: if not pdftoppmcmd and not pdftocairocmd:
pdftocairocmd = rclexecm.which("pdftocairo")
if not pdftocairocmd:
pdftocairocmd = rclexecm.which("poppler/pdftocairo")
if not pdftocairocmd:
pdftoppmcmd = rclexecm.which("pdftoppm") pdftoppmcmd = rclexecm.which("pdftoppm")
if not pdftoppmcmd: if not pdftoppmcmd:
pdftoppmcmd = rclexecm.which("poppler/pdftoppm") pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
if pdftoppmcmd: if pdftoppmcmd or pdftocairocmd:
return True return True
return False return False
@ -163,6 +167,9 @@ def _pdftesseract(config, path):
# Split pdf pages # Split pdf pages
try: try:
tmpdir.vacuumdir() tmpdir.vacuumdir()
if pdftocairocmd:
cmd = [pdftocairocmd, "-tiff", "-tiffcompression", "lzw", "-r", "300", path, tmpfile]
else:
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile] cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
#_deb("Executing %s" % cmd) #_deb("Executing %s" % cmd)
subprocess.check_call(cmd) subprocess.check_call(cmd)
@ -174,8 +181,8 @@ def _pdftesseract(config, path):
# system is full. There is no really good way to check for # system is full. There is no really good way to check for
# this. We consider any empty file to signal an error # this. We consider any empty file to signal an error
ppmfiles = glob.glob(tmpfile + "*") pages = glob.glob(tmpfile + "*")
for f in ppmfiles: for f in pages:
size = os.path.getsize(f) size = os.path.getsize(f)
if os.path.getsize(f) == 0: if os.path.getsize(f) == 0:
_deb("pdftoppm created empty files. " _deb("pdftoppm created empty files. "
@ -191,7 +198,7 @@ def _pdftesseract(config, path):
except: except:
pass pass
for f in sorted(ppmfiles): for f in sorted(pages):
out = b'' out = b''
try: try:
out = subprocess.check_output( out = subprocess.check_output(