OCR: small adjustments for Windows. Works with Tesseract.
This commit is contained in:
parent
abb7ef8803
commit
e520176a2a
@ -36,6 +36,9 @@ else:
|
||||
|
||||
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
|
||||
|
||||
tesseractcmd = None
|
||||
pdftoppmcmd = None
|
||||
|
||||
def _deb(s):
|
||||
if not _mswindows:
|
||||
print("rclocrtesseract: %s" % s, file=sys.stderr)
|
||||
@ -69,10 +72,15 @@ atexit.register(finalcleanup)
|
||||
# the file type (e.g. pdftoppm for pdf) appear to be available
|
||||
def ocrpossible(config, path):
|
||||
# Check for tesseract
|
||||
global tesseract
|
||||
tesseract = rclexecm.which("tesseract")
|
||||
if not tesseract:
|
||||
return False
|
||||
global tesseractcmd
|
||||
if not tesseractcmd:
|
||||
config.setKeyDir(os.path.dirname(path))
|
||||
tesseractcmd = config.getConfParam("tesseractcmd")
|
||||
if not tesseractcmd:
|
||||
tesseractcmd = rclexecm.which("tesseract")
|
||||
if not tesseractcmd:
|
||||
_deb("tesseractcmd not found")
|
||||
return False
|
||||
|
||||
# Check input format
|
||||
base,ext = os.path.splitext(path)
|
||||
@ -86,9 +94,12 @@ def ocrpossible(config, path):
|
||||
# legacy code used pdftoppm for some reason, and it appears
|
||||
# that the newest builds from conda-forge do not include
|
||||
# pdftocairo. So stay with pdftoppm.
|
||||
global pdftoppm
|
||||
pdftoppm = rclexecm.which("pdftoppm")
|
||||
if pdftoppm:
|
||||
global pdftoppmcmd
|
||||
if not pdftoppmcmd:
|
||||
pdftoppmcmd = rclexecm.which("pdftoppm")
|
||||
if not pdftoppmcmd:
|
||||
pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
|
||||
if pdftoppmcmd:
|
||||
return True
|
||||
|
||||
return False
|
||||
@ -150,9 +161,11 @@ def _pdftesseract(config, path):
|
||||
# Split pdf pages
|
||||
try:
|
||||
vacuumdir(tmpdir)
|
||||
subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
|
||||
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
|
||||
#_deb("Executing %s" % cmd)
|
||||
subprocess.check_call(cmd)
|
||||
except Exception as e:
|
||||
_deb("pdftoppm failed: %s" % e)
|
||||
_deb("%s failed: %s" % (pdftoppmcmd,e))
|
||||
return b""
|
||||
|
||||
# Note: unfortunately, pdftoppm silently fails if the temp file
|
||||
@ -171,10 +184,10 @@ def _pdftesseract(config, path):
|
||||
out = b''
|
||||
try:
|
||||
out = subprocess.check_output(
|
||||
[tesseract, f, f, "-l", tesseractlang],
|
||||
[tesseractcmd, f, f, "-l", tesseractlang],
|
||||
stderr=subprocess.STDOUT)
|
||||
except Exception as e:
|
||||
_deb("tesseract failed: %s" % e)
|
||||
_deb("%s failed: %s" % (tesseractcmd,e))
|
||||
|
||||
errlines = out.split(b'\n')
|
||||
if len(errlines) > 5:
|
||||
@ -194,10 +207,10 @@ def _simpletesseract(config, path):
|
||||
|
||||
try:
|
||||
out = subprocess.check_output(
|
||||
[tesseract, path, 'stdout', '-l', tesseractlang],
|
||||
[tesseractcmd, path, 'stdout', '-l', tesseractlang],
|
||||
stderr=subprocess.DEVNULL)
|
||||
except Exception as e:
|
||||
_deb("tesseract failed: %s" % e)
|
||||
_deb("%s failed: %s" % (tesseractcmd,e))
|
||||
return False, ""
|
||||
return True, out
|
||||
|
||||
|
||||
@ -411,10 +411,13 @@ class PDFExtractor:
|
||||
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
||||
|
||||
if isempty:
|
||||
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
||||
self.filename]
|
||||
data = subprocess.check_output(cmd)
|
||||
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
|
||||
try:
|
||||
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
||||
self.filename]
|
||||
data = subprocess.check_output(cmd)
|
||||
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
|
||||
except:
|
||||
pass
|
||||
|
||||
if self.extrameta:
|
||||
try:
|
||||
|
||||
@ -40,7 +40,7 @@ MUTAGEN=${RCLDEPS}mutagen-1.32/
|
||||
EPUB=${RCLDEPS}epub-0.5.2
|
||||
FUTURE=${RCLDEPS}python2-future
|
||||
ZLIB=${RCLDEPS}zlib-1.2.8
|
||||
POPPLER=${RCLDEPS}poppler-0.36/
|
||||
POPPLER=${RCLDEPS}poppler-0.68.0/
|
||||
LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/
|
||||
LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/
|
||||
CHM=${RCLDEPS}pychm
|
||||
@ -237,9 +237,16 @@ copypoppler()
|
||||
{
|
||||
test -d $FILTERS/poppler || mkdir $FILTERS/poppler || \
|
||||
fatal cant create poppler dir
|
||||
for f in pdftotext.exe pdfinfo.exe libpoppler.dll freetype6.dll jpeg62.dll \
|
||||
libpng16-16.dll zlib1.dll libtiff3.dll \
|
||||
libgcc_s_dw2-1.dll libstdc++-6.dll; do
|
||||
for f in pdftotext.exe pdfinfo.exe pdftoppm.exe \
|
||||
freetype6.dll \
|
||||
jpeg62.dll \
|
||||
libgcc_s_dw2-1.dll \
|
||||
libpng16-16.dll \
|
||||
libpoppler*.dll \
|
||||
libstdc++-6.dll \
|
||||
libtiff3.dll \
|
||||
zlib1.dll \
|
||||
; do
|
||||
chkcp $POPPLER/bin/$f $FILTERS/poppler
|
||||
done
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user