OCR: small adjustments for Windows. Works with Tesseract.
This commit is contained in:
parent
abb7ef8803
commit
e520176a2a
@ -36,6 +36,9 @@ else:
|
|||||||
|
|
||||||
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
|
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
|
||||||
|
|
||||||
|
tesseractcmd = None
|
||||||
|
pdftoppmcmd = None
|
||||||
|
|
||||||
def _deb(s):
|
def _deb(s):
|
||||||
if not _mswindows:
|
if not _mswindows:
|
||||||
print("rclocrtesseract: %s" % s, file=sys.stderr)
|
print("rclocrtesseract: %s" % s, file=sys.stderr)
|
||||||
@ -69,10 +72,15 @@ atexit.register(finalcleanup)
|
|||||||
# the file type (e.g. pdftoppm for pdf) appear to be available
|
# the file type (e.g. pdftoppm for pdf) appear to be available
|
||||||
def ocrpossible(config, path):
|
def ocrpossible(config, path):
|
||||||
# Check for tesseract
|
# Check for tesseract
|
||||||
global tesseract
|
global tesseractcmd
|
||||||
tesseract = rclexecm.which("tesseract")
|
if not tesseractcmd:
|
||||||
if not tesseract:
|
config.setKeyDir(os.path.dirname(path))
|
||||||
return False
|
tesseractcmd = config.getConfParam("tesseractcmd")
|
||||||
|
if not tesseractcmd:
|
||||||
|
tesseractcmd = rclexecm.which("tesseract")
|
||||||
|
if not tesseractcmd:
|
||||||
|
_deb("tesseractcmd not found")
|
||||||
|
return False
|
||||||
|
|
||||||
# Check input format
|
# Check input format
|
||||||
base,ext = os.path.splitext(path)
|
base,ext = os.path.splitext(path)
|
||||||
@ -86,9 +94,12 @@ def ocrpossible(config, path):
|
|||||||
# legacy code used pdftoppm for some reason, and it appears
|
# legacy code used pdftoppm for some reason, and it appears
|
||||||
# that the newest builds from conda-forge do not include
|
# that the newest builds from conda-forge do not include
|
||||||
# pdftocairo. So stay with pdftoppm.
|
# pdftocairo. So stay with pdftoppm.
|
||||||
global pdftoppm
|
global pdftoppmcmd
|
||||||
pdftoppm = rclexecm.which("pdftoppm")
|
if not pdftoppmcmd:
|
||||||
if pdftoppm:
|
pdftoppmcmd = rclexecm.which("pdftoppm")
|
||||||
|
if not pdftoppmcmd:
|
||||||
|
pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
|
||||||
|
if pdftoppmcmd:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
@ -150,9 +161,11 @@ def _pdftesseract(config, path):
|
|||||||
# Split pdf pages
|
# Split pdf pages
|
||||||
try:
|
try:
|
||||||
vacuumdir(tmpdir)
|
vacuumdir(tmpdir)
|
||||||
subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
|
cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
|
||||||
|
#_deb("Executing %s" % cmd)
|
||||||
|
subprocess.check_call(cmd)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
_deb("pdftoppm failed: %s" % e)
|
_deb("%s failed: %s" % (pdftoppmcmd,e))
|
||||||
return b""
|
return b""
|
||||||
|
|
||||||
# Note: unfortunately, pdftoppm silently fails if the temp file
|
# Note: unfortunately, pdftoppm silently fails if the temp file
|
||||||
@ -171,10 +184,10 @@ def _pdftesseract(config, path):
|
|||||||
out = b''
|
out = b''
|
||||||
try:
|
try:
|
||||||
out = subprocess.check_output(
|
out = subprocess.check_output(
|
||||||
[tesseract, f, f, "-l", tesseractlang],
|
[tesseractcmd, f, f, "-l", tesseractlang],
|
||||||
stderr=subprocess.STDOUT)
|
stderr=subprocess.STDOUT)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
_deb("tesseract failed: %s" % e)
|
_deb("%s failed: %s" % (tesseractcmd,e))
|
||||||
|
|
||||||
errlines = out.split(b'\n')
|
errlines = out.split(b'\n')
|
||||||
if len(errlines) > 5:
|
if len(errlines) > 5:
|
||||||
@ -194,10 +207,10 @@ def _simpletesseract(config, path):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
out = subprocess.check_output(
|
out = subprocess.check_output(
|
||||||
[tesseract, path, 'stdout', '-l', tesseractlang],
|
[tesseractcmd, path, 'stdout', '-l', tesseractlang],
|
||||||
stderr=subprocess.DEVNULL)
|
stderr=subprocess.DEVNULL)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
_deb("tesseract failed: %s" % e)
|
_deb("%s failed: %s" % (tesseractcmd,e))
|
||||||
return False, ""
|
return False, ""
|
||||||
return True, out
|
return True, out
|
||||||
|
|
||||||
|
|||||||
@ -411,10 +411,13 @@ class PDFExtractor:
|
|||||||
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
||||||
|
|
||||||
if isempty:
|
if isempty:
|
||||||
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
try:
|
||||||
self.filename]
|
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
||||||
data = subprocess.check_output(cmd)
|
self.filename]
|
||||||
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
|
data = subprocess.check_output(cmd)
|
||||||
|
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
if self.extrameta:
|
if self.extrameta:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -40,7 +40,7 @@ MUTAGEN=${RCLDEPS}mutagen-1.32/
|
|||||||
EPUB=${RCLDEPS}epub-0.5.2
|
EPUB=${RCLDEPS}epub-0.5.2
|
||||||
FUTURE=${RCLDEPS}python2-future
|
FUTURE=${RCLDEPS}python2-future
|
||||||
ZLIB=${RCLDEPS}zlib-1.2.8
|
ZLIB=${RCLDEPS}zlib-1.2.8
|
||||||
POPPLER=${RCLDEPS}poppler-0.36/
|
POPPLER=${RCLDEPS}poppler-0.68.0/
|
||||||
LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/
|
LIBWPD=${RCLDEPS}libwpd/libwpd-0.10.0/
|
||||||
LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/
|
LIBREVENGE=${RCLDEPS}libwpd/librevenge-0.0.1.jfd/
|
||||||
CHM=${RCLDEPS}pychm
|
CHM=${RCLDEPS}pychm
|
||||||
@ -237,9 +237,16 @@ copypoppler()
|
|||||||
{
|
{
|
||||||
test -d $FILTERS/poppler || mkdir $FILTERS/poppler || \
|
test -d $FILTERS/poppler || mkdir $FILTERS/poppler || \
|
||||||
fatal cant create poppler dir
|
fatal cant create poppler dir
|
||||||
for f in pdftotext.exe pdfinfo.exe libpoppler.dll freetype6.dll jpeg62.dll \
|
for f in pdftotext.exe pdfinfo.exe pdftoppm.exe \
|
||||||
libpng16-16.dll zlib1.dll libtiff3.dll \
|
freetype6.dll \
|
||||||
libgcc_s_dw2-1.dll libstdc++-6.dll; do
|
jpeg62.dll \
|
||||||
|
libgcc_s_dw2-1.dll \
|
||||||
|
libpng16-16.dll \
|
||||||
|
libpoppler*.dll \
|
||||||
|
libstdc++-6.dll \
|
||||||
|
libtiff3.dll \
|
||||||
|
zlib1.dll \
|
||||||
|
; do
|
||||||
chkcp $POPPLER/bin/$f $FILTERS/poppler
|
chkcp $POPPLER/bin/$f $FILTERS/poppler
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user