tesseract ocr: use compressed tif temp pages if pdftocairo is available (10x smaller than ppm)
This commit is contained in:
parent
c37765524d
commit
5fcffb7654
@ -37,7 +37,7 @@ _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
|
|||||||
|
|
||||||
tesseractcmd = None
|
tesseractcmd = None
|
||||||
pdftoppmcmd = None
|
pdftoppmcmd = None
|
||||||
|
pdftocairocmd = None
|
||||||
|
|
||||||
def _deb(s):
|
def _deb(s):
|
||||||
rclexecm.logmsg("rclocrtesseract: %s" % s)
|
rclexecm.logmsg("rclocrtesseract: %s" % s)
|
||||||
@ -95,12 +95,16 @@ def ocrpossible(config, path):
|
|||||||
# legacy code used pdftoppm for some reason, and it appears
|
# legacy code used pdftoppm for some reason, and it appears
|
||||||
# that the newest builds from conda-forge do not include
|
# that the newest builds from conda-forge do not include
|
||||||
# pdftocairo. So stay with pdftoppm.
|
# pdftocairo. So prefer pdftocairo when it is found, and fall back to pdftoppm otherwise.
|
||||||
global pdftoppmcmd
|
global pdftoppmcmd, pdftocairocmd
|
||||||
if not pdftoppmcmd:
|
if not pdftoppmcmd and not pdftocairocmd:
|
||||||
|
pdftocairocmd = rclexecm.which("pdftocairo")
|
||||||
|
if not pdftocairocmd:
|
||||||
|
pdftocairocmd = rclexecm.which("poppler/pdftocairo")
|
||||||
|
if not pdftocairocmd:
|
||||||
pdftoppmcmd = rclexecm.which("pdftoppm")
|
pdftoppmcmd = rclexecm.which("pdftoppm")
|
||||||
if not pdftoppmcmd:
|
if not pdftoppmcmd:
|
||||||
pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
|
pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
|
||||||
if pdftoppmcmd:
|
if pdftoppmcmd or pdftocairocmd:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
@ -163,6 +167,9 @@ def _pdftesseract(config, path):
|
|||||||
# Split pdf pages
|
# Split pdf pages
|
||||||
try:
|
try:
|
||||||
tmpdir.vacuumdir()
|
tmpdir.vacuumdir()
|
||||||
|
if pdftocairocmd:
|
||||||
|
cmd = [pdftocairocmd, "-tiff", "-tiffcompression", "lzw", "-r", "300", path, tmpfile]
|
||||||
|
else:
|
||||||
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
|
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
|
||||||
#_deb("Executing %s" % cmd)
|
#_deb("Executing %s" % cmd)
|
||||||
subprocess.check_call(cmd)
|
subprocess.check_call(cmd)
|
||||||
@ -174,8 +181,8 @@ def _pdftesseract(config, path):
|
|||||||
# system is full. There is no really good way to check for
|
# system is full. There is no really good way to check for
|
||||||
# this. We consider any empty file to signal an error
|
# this. We consider any empty file to signal an error
|
||||||
|
|
||||||
ppmfiles = glob.glob(tmpfile + "*")
|
pages = glob.glob(tmpfile + "*")
|
||||||
for f in ppmfiles:
|
for f in pages:
|
||||||
size = os.path.getsize(f)
|
size = os.path.getsize(f)
|
||||||
if os.path.getsize(f) == 0:
|
if os.path.getsize(f) == 0:
|
||||||
_deb("pdftoppm created empty files. "
|
_deb("pdftoppm created empty files. "
|
||||||
@ -191,7 +198,7 @@ def _pdftesseract(config, path):
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
for f in sorted(ppmfiles):
|
for f in sorted(pages):
|
||||||
out = b''
|
out = b''
|
||||||
try:
|
try:
|
||||||
out = subprocess.check_output(
|
out = subprocess.check_output(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user