1st version of the cached ocr mechanism
This commit is contained in:
parent
aa40531bbe
commit
38dfa5f841
80
src/filters/rclocr.py
Executable file
80
src/filters/rclocr.py
Executable file
@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
#################################
|
||||
# Copyright (C) 2020 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
########################################################
|
||||
|
||||
# Running OCR programs for Recoll
|
||||
|
||||
import os
|
||||
import sys
|
||||
import rclconfig
|
||||
import rclocrcache
|
||||
import importlib.util
|
||||
|
||||
def deb(s):
    """Write the message *s*, followed by a newline, to standard error."""
    sys.stderr.write("%s\n" % s)
|
||||
|
||||
def Usage():
    """Print the command line synopsis on stderr and exit with status 1."""
    deb("Usage: rclocr.py <imagefilename>")
    raise SystemExit(1)
|
||||
|
||||
# Command line: exactly one argument, the path of the file to OCR.
if len(sys.argv) != 2:
    Usage()

path = sys.argv[1]

config = rclconfig.RclConfig()
cache = rclocrcache.OCRCache(config)

# Fast path: the OCR'd text for this file is already in the cache.
incache, data = cache.get(path)
if incache:
    sys.stdout.buffer.write(data)
    sys.exit(0)

#### Data not in cache

# Retrieve known ocr program names and try to load the corresponding module.
# Each module is named "rclocr" + program name and must provide
# ocrpossible(path) and runocr(config, path).
ocrprogs = config.getConfParam("ocrprogs")
if not ocrprogs:
    deb("No ocrprogs variable")
    sys.exit(1)
deb("ocrprogs: %s" % ocrprogs)

# split() without argument: repeated or trailing blanks must not produce
# empty program names. An empty name would yield the module name "rclocr",
# i.e. this very script, which would then be re-imported and re-run.
proglist = ocrprogs.split()
ok = False
for ocrprog in proglist:
    modulename = "rclocr" + ocrprog
    try:
        ocr = importlib.import_module(modulename)
        if ocr.ocrpossible(path):
            ok = True
            break
    except Exception as err:
        # Best effort: a module which can't be loaded is just skipped.
        deb("While loading %s: got: %s" % (modulename, err))

if not ok:
    deb("No OCR module could be loaded")
    sys.exit(1)

deb("Using ocr module %s" % modulename)

# Run the OCR, remember the result for next time, and output it.
data = ocr.runocr(config, path)

cache.store(path, data)
sys.stdout.buffer.write(data)
sys.exit(0)
|
||||
|
||||
208
src/filters/rclocrcache.py
Executable file
208
src/filters/rclocrcache.py
Executable file
@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python3
|
||||
#################################
|
||||
# Copyright (C) 2020 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
########################################################
|
||||
|
||||
# Caching OCR'd data
|
||||
|
||||
# OCR is extremely slow. The cache stores 2 kinds of objects:
|
||||
# - Path files are named from the hash of the image path and contain
|
||||
# the image data hash and the modification time and size of the
|
||||
# image at the time the OCR'd data was stored in the cache
|
||||
# - Data files are named with the hash of the image data and contain
|
||||
# the OCR'd data
|
||||
# When retrieving data from the cache:
|
||||
# - We first use the image file size and modification time: if an
|
||||
# entry exists for the imagepath/mtime/size triplet, and is up to
|
||||
# date, the corresponding data is obtained from the data file and
|
||||
# returned.
|
||||
# - Else we then use the image data: if an entry exists for the
|
||||
# computed hashed value of the data, it is returned. This allows
|
||||
# moving files around without needing to run OCR again, but of
|
||||
# course, it is more expensive than the first step
|
||||
#
|
||||
# If we need to use the second step, as a side effect, a path file is
|
||||
# created or updated so that the data will be found with the first
|
||||
# step next time around.
|
||||
|
||||
import sys
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
def deb(s):
    """Write the message *s*, followed by a newline, to standard error."""
    sys.stderr.write("%s\n" % s)
|
||||
|
||||
class OCRCache(object):
    """Cache for OCR'd data, stored as files below <cachedir>/objects.

    Two kinds of files are stored, both named from a SHA1 hex digest split
    as a 2 characters directory name and a 38 characters file name:
     - Path files are named from the hash of the image path. They contain
       one line: data hash (2 parts), modification time and size of the
       image at the time the OCR'd data was stored.
     - Data files are named from the hash of the image contents and hold
       the OCR'd data.
    """

    def __init__(self, conf):
        """Set up the cache directories.

        conf must provide getConfParam(name) and getConfDir(). The cache
        location is the "ocrcachedir" parameter if set, else the "ocrcache"
        subdirectory of the configuration directory.
        """
        self.config = conf
        self.cachedir = conf.getConfParam("ocrcachedir")
        if not self.cachedir:
            self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
        self.objdir = os.path.join(self.cachedir, "objects")
        # exist_ok: no failure if another process creates the directory
        # between a test and the creation.
        os.makedirs(self.objdir, exist_ok=True)

    def _hashpath(self, data):
        """Compute sha1 of path, as two parts of 2 and 38 chars.

        data can be bytes or str (which is then encoded as UTF-8).
        """
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        h = hashlib.sha1(data).hexdigest()
        return h[0:2], h[2:]

    def _hashdata(self, path):
        """Compute sha1 of the contents of the file at path, as two parts
        of 2 and 38 chars. The file is read by chunks."""
        m = hashlib.sha1()
        with open(path, "rb") as f:
            while True:
                d = f.read(8192)
                if not d:
                    break
                m.update(d)
        h = m.hexdigest()
        return h[0:2], h[2:]

    def _cachedpathattrs(self, path):
        """Try to read the stored attributes for a given path.

        Returns (True, datadir, datafile, mtime, size) on success. If the
        path file is missing, unreadable or malformed, returns
        (False, None, None, None, None): the path itself is not cached
        (but the data still might be, maybe the file was moved).
        """
        pd, pf = self._hashpath(path)
        o = os.path.join(self.objdir, pd, pf)
        try:
            with open(o, "r") as f:
                dd, df, tm, sz = f.read().split()
            return True, dd, df, int(tm), int(sz)
        except (OSError, ValueError):
            # Missing file, or wrong field count / non-numeric fields
            # (e.g. truncated write): treat as a cache miss, the path
            # file will be rewritten by get() or store().
            return False, None, None, None, None

    def _newpathattrs(self, path):
        """Compute the path hash, and get the current mtime and size for
        path, for updating the cache path file.

        Returns (pathdir, pathfile, mtime, size), mtime and size as int.
        """
        pd, pf = self._hashpath(path)
        st = os.stat(path)
        return pd, pf, int(st.st_mtime), int(st.st_size)

    def _pathincache(self, path):
        """Check if the cache appears up to date for a given path, only
        using the modification time and size.

        Returns (True, datadir, datafile) for a hit, else
        (False, None, None).
        """
        ret, od, of, otm, osz = self._cachedpathattrs(path)
        if not ret:
            return False, None, None
        pd, pf, ntm, nsz = self._newpathattrs(path)
        if otm != ntm or osz != nsz:
            return False, None, None
        return True, od, of

    def pathincache(self, path):
        """Check if cache appears up to date for path (no data check),
        return True/False."""
        ret, dd, df = self._pathincache(path)
        return ret

    def _datafilename(self, path):
        """Compute the data file name for path. Expensive: we compute the
        data hash.

        Returns both the data file path and its two path elements (for
        storage in the path file).
        """
        d, f = self._hashdata(path)
        return os.path.join(self.objdir, d, f), d, f

    def dataincache(self, path):
        """Check if the data for path is in cache: expensive, needs to
        compute the hash for the path's data contents. Returns True/False."""
        return os.path.exists(self._datafilename(path)[0])

    def _updatepathfile(self, pd, pf, dd, df, tm, sz):
        """Create or overwrite the path file pd/pf with the data file
        elements dd/df and the image mtime tm and size sz."""
        pdir = os.path.join(self.objdir, pd)
        os.makedirs(pdir, exist_ok=True)
        pfile = os.path.join(pdir, pf)
        with open(pfile, "w") as f:
            f.write("%s %s %d %d\n" % (dd, df, tm, sz))

    def store(self, path, datatostore, force=False):
        """Store data (bytes) for path and update the path file.

        An existing data file is only rewritten if force is set: this is
        only useful if we are forcing an OCR re-run. Returns True.
        """
        dd, df = self._hashdata(path)
        pd, pf, tm, sz = self._newpathattrs(path)
        self._updatepathfile(pd, pf, dd, df, tm, sz)
        ddir = os.path.join(self.objdir, dd)
        os.makedirs(ddir, exist_ok=True)
        dfile = os.path.join(ddir, df)
        if force or not os.path.exists(dfile):
            with open(dfile, "wb") as f:
                f.write(datatostore)
        return True

    def get(self, path):
        """Retrieve cached OCR'd data for image path.

        Returns (True, data) if found, else (False, b""). Possibly updates
        the path file as a side effect (case where the image has moved,
        but the data has not changed).
        """
        pincache, dd, df = self._pathincache(path)
        if pincache:
            dfn = os.path.join(self.objdir, dd, df)
        else:
            # No usable path entry: fall back to hashing the image data.
            dfn, dd, df = self._datafilename(path)

        if not os.path.exists(dfn):
            return False, b""

        if not pincache:
            # File has moved. Create/update path file for next time
            pd, pf, tm, sz = self._newpathattrs(path)
            self._updatepathfile(pd, pf, dd, df, tm, sz)

        with open(dfn, "rb") as f:
            return True, f.read()
|
||||
|
||||
|
||||
|
||||
# Manual test driver: exercise the cache for the file given as argument.
if __name__ == '__main__':
    import rclconfig

    # Fail with a message rather than an IndexError traceback
    if len(sys.argv) < 2:
        deb("Usage: rclocrcache.py <imagefilename>")
        sys.exit(1)

    conf = rclconfig.RclConfig()
    cache = OCRCache(conf)
    path = sys.argv[1]
    deb("Using %s" % path)

    deb("== CACHE tests")
    ret = cache.pathincache(path)
    s = "" if ret else " not"
    deb("path for %s%s in cache" % (path, s))

    #ret = cache.dataincache(path)
    #s = "" if ret else " not"
    #deb("data for %s%s in cache" % (path, s))

    # Disabled by default: flip the condition to store test data for path
    if False:
        deb("== STORE tests")
        cache.store(path, b"my OCR'd text is one line\n", force=False)

    deb("== GET tests")
    incache, data = cache.get(path)
    if incache:
        deb("Data from cache [%s]" % data)
    else:
        deb("Data was not found in cache")
|
||||
|
||||
217
src/filters/rclocrtesseract.py
Executable file
217
src/filters/rclocrtesseract.py
Executable file
@ -0,0 +1,217 @@
|
||||
#!/usr/bin/env python3
|
||||
#################################
|
||||
# Copyright (C) 2020 J.F.Dockes
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the
|
||||
# Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
########################################################
|
||||
|
||||
import os
|
||||
import sys
|
||||
import atexit
|
||||
import tempfile
|
||||
import subprocess
|
||||
import glob
|
||||
|
||||
import rclexecm
|
||||
|
||||
_mswindows = (sys.platform == "win32")

# Name of the optional per-directory file holding the OCR language. The
# name is the same on all platforms.
ocrlangfile = ".rclocrlang"

# File name extensions (lowercase) of the image types which are handed
# directly to tesseract.
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
|
||||
|
||||
def _deb(s):
    """Debug message hook. Currently outputs nothing: the actual print to
    stderr is commented out, and would be skipped on MS Windows."""
    if _mswindows:
        return
    #print("%s" % s, file=sys.stderr)
|
||||
|
||||
def vacuumdir(dir):
    """Delete the regular files found directly inside dir.

    Subdirectories are left alone. Does nothing if dir is empty or None.
    Always returns True.
    """
    if not dir:
        return True
    candidates = (os.path.join(dir, name) for name in os.listdir(dir))
    for candidate in candidates:
        if os.path.isfile(candidate):
            os.unlink(candidate)
    return True
|
||||
|
||||
# Temporary directory for the intermediate files. Created on first use.
tmpdir = None

def _maybemaketmpdir():
    """Make sure that an empty temporary directory is available.

    Creates the directory on the first call and empties it on further
    ones. Returns True on success, False if emptying the directory failed.
    """
    global tmpdir
    if tmpdir:
        if not vacuumdir(tmpdir):
            _deb("_maybemaketmpdir: vacuumdir %s failed" % tmpdir)
            return False
    else:
        tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
    # Explicit success value, so that the result can be tested
    return True
|
||||
|
||||
def finalcleanup():
    """Exit handler: empty and remove the temporary directory, if any.

    Errors are reported through _deb() and otherwise ignored: there is
    nothing useful to do about them at exit, and an exception raised from
    an exit handler would only print a traceback.
    """
    if not tmpdir:
        return
    try:
        vacuumdir(tmpdir)
        # Fails if something other than regular files is left inside
        os.rmdir(tmpdir)
    except OSError as err:
        _deb("finalcleanup: could not remove %s: %s" % (tmpdir, err))

atexit.register(finalcleanup)
|
||||
|
||||
# Return true if tesseract and the appropriate conversion program for
|
||||
# the file type (e.g. pdftoppm for pdf) appear to be available
|
||||
def ocrpossible(path):
    """Tell if we can run OCR on path.

    Sets the module globals tesseract and, for a pdf input, pdftoppm, to
    the values returned by rclexecm.which(). Returns True if tesseract was
    found and the extension is one of _okexts, or is '.pdf' and pdftoppm
    was found. Returns False in all other cases.
    """
    global tesseract
    global pdftoppm

    # Check for tesseract
    tesseract = rclexecm.which("tesseract")
    if not tesseract:
        return False

    # Check input format
    suffix = os.path.splitext(path)[1].lower()
    if suffix in _okexts:
        return True
    if suffix != '.pdf':
        return False

    # Check for pdftoppm. We could use pdftocairo, which can
    # produce a multi-page pdf and make the rest simpler, but the
    # legacy code used pdftoppm for some reason, and it appears
    # that the newest builds from conda-forge do not include
    # pdftocairo. So stay with pdftoppm.
    pdftoppm = rclexecm.which("pdftoppm")
    return True if pdftoppm else False
|
||||
|
||||
|
||||
# Try to guess tesseract language. This should depend on the input
|
||||
# file, but we have no general way to determine it. So use the
|
||||
# environment and hope for the best.
|
||||
def _guesstesseractlang(config, path):
    """Determine the language name to pass to tesseract for path.

    The following sources are tried in order:
     - A file named by ocrlangfile in the directory of path.
     - The "tesseractlang" configuration parameter, looked up after
       setting the configuration key directory to the directory of path.
     - The language part of the LANG environment variable (en, de and fr
       only).
    The default is "eng".
    """
    dirname = os.path.dirname(path)

    # First look for a language def file in the file's directory
    pdflangfile = os.path.join(dirname, ocrlangfile)
    if os.path.isfile(pdflangfile):
        tesseractlang = ""
        try:
            with open(pdflangfile, "r") as f:
                tesseractlang = f.read().strip()
        except OSError as err:
            # Unreadable file: go on with the other sources
            _deb("Can't read %s: %s" % (pdflangfile, err))
        if tesseractlang:
            _deb("Tesseract lang from file: %s" % tesseractlang)
            return tesseractlang

    # Then look for a config file option.
    config.setKeyDir(dirname)
    tesseractlang = config.getConfParam("tesseractlang")
    if tesseractlang:
        _deb("Tesseract lang from config: %s" % tesseractlang)
        return tesseractlang

    # Half-assed trial to guess from LANG then default to english
    localelang = os.environ.get("LANG", "").split("_")[0]
    knownlangs = {"en": "eng", "de": "deu", "fr": "fra"}
    tesseractlang = knownlangs.get(localelang, "eng")
    _deb("Tesseract lang (guessed): %s" % tesseractlang)
    return tesseractlang
|
||||
|
||||
# Process pdf file: use pdftoppm to split it into ppm pages, then run
|
||||
# tesseract on each and concatenate the result. It would probably be
|
||||
# possible instead to use pdftocairo to produce a tiff, but pdftocairo
|
||||
# is sometimes not available (windows).
|
||||
def _pdftesseract(config, path):
    """Run OCR on the pdf file at path and return the text as bytes.

    pdftoppm is run to create one image file per page inside the temporary
    directory, then tesseract is run on each image, and the resulting text
    files are concatenated. Returns b"" if there is no temporary directory
    or if pdftoppm fails. A tesseract failure on a page is only logged.
    """
    if not tmpdir:
        return b""

    tesseractlang = _guesstesseractlang(config, path)

    # Common prefix for the page image files
    tmpfile = os.path.join(tmpdir, "ocrXXXXXX")

    # Split pdf pages
    try:
        vacuumdir(tmpdir)
        subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
    except Exception as e:
        _deb("pdftoppm failed: %s" % e)
        return b""

    # glob returns the names in arbitrary order: sort them so that the
    # pages are processed, then concatenated, in document order.
    # NOTE(review): this relies on pdftoppm zero-padding the page numbers
    # in the output file names - confirm with the versions in use.
    for f in sorted(glob.glob(tmpfile + "*")):
        out = b''
        try:
            # The image name is also used as output base: tesseract
            # creates the text file by appending .txt to it.
            out = subprocess.check_output(
                [tesseract, f, f, "-l", tesseractlang],
                stderr=subprocess.STDOUT)
        except Exception as e:
            _deb("tesseract failed: %s" % e)

        # More than a couple of output lines is taken as an error report
        errlines = out.split(b'\n')
        if len(errlines) > 2:
            _deb("Tesseract error: %s" % out)

    # Concatenate the result files
    pages = []
    for f in sorted(glob.glob(tmpfile + "*" + ".txt")):
        with open(f, "rb") as txt:
            pages.append(txt.read())

    return b"".join(pages)
|
||||
|
||||
|
||||
def _simpletesseract(config, path):
    """Run tesseract directly on the image file at path.

    Returns the text output by tesseract on its stdout, as bytes, or b""
    if running the command failed.
    """
    tesseractlang = _guesstesseractlang(config, path)

    try:
        return subprocess.check_output(
            [tesseract, path, 'stdout', '-l', tesseractlang],
            stderr=subprocess.DEVNULL)
    except Exception as e:
        _deb("tesseract failed: %s" % e)
        # Explicit empty result: the previous code returned an unset
        # variable here, which raised UnboundLocalError.
        return b""
|
||||
|
||||
|
||||
# run ocr on the input path and output the result data.
|
||||
def runocr(config, path):
    """Run OCR on the input path and return the result data.

    Files with an extension in _okexts are processed directly by
    tesseract, anything else goes through the pdf processing.
    """
    _maybemaketmpdir()
    suffix = os.path.splitext(path)[1].lower()
    if suffix not in _okexts:
        return _pdftesseract(config, path)
    return _simpletesseract(config, path)
|
||||
|
||||
|
||||
|
||||
|
||||
# Manual test driver: run OCR on the file given as argument and write the
# result to stdout.
if __name__ == '__main__':
    import rclconfig

    # Fail with a message rather than an IndexError traceback
    if len(sys.argv) < 2:
        print("Usage: rclocrtesseract.py <imagefilename>", file=sys.stderr)
        sys.exit(1)

    config = rclconfig.RclConfig()
    path = sys.argv[1]
    if not ocrpossible(path):
        _deb("ocrpossible returned false")
        sys.exit(1)
    data = runocr(config, path)
    sys.stdout.buffer.write(data)
|
||||
|
||||
|
||||
@ -47,8 +47,14 @@ import glob
|
||||
import traceback
|
||||
|
||||
_mswindows = (sys.platform == "win32")
|
||||
|
||||
tmpdir = None
|
||||
|
||||
_htmlprefix =b'''<html><head>
|
||||
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
||||
</head><body><pre>'''
|
||||
_htmlsuffix = b'''</pre></body></html>'''
|
||||
|
||||
def finalcleanup():
|
||||
if tmpdir:
|
||||
vacuumdir(tmpdir)
|
||||
@ -120,18 +126,6 @@ class PDFExtractor:
|
||||
except:
|
||||
pass
|
||||
|
||||
# See if we'll try to perform OCR. Need the commands and the
|
||||
# either the presence of a file in the config dir (historical)
|
||||
# or a set config variable.
|
||||
self.ocrpossible = False
|
||||
self.tesseract = rclexecm.which("tesseract")
|
||||
if self.tesseract:
|
||||
self.pdftoppm = rclexecm.which("pdftoppm")
|
||||
if self.pdftoppm:
|
||||
self.ocrpossible = True
|
||||
self.maybemaketmpdir()
|
||||
# self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
|
||||
|
||||
# Pdftk is optionally used to extract attachments. This takes
|
||||
# a hit on performance even in the absence of any attachments,
|
||||
# so it can be disabled in the configuration.
|
||||
@ -236,100 +230,6 @@ class PDFExtractor:
|
||||
return (True, docdata, ipath, eof)
|
||||
|
||||
|
||||
# Try to guess tesseract language. This should depend on the input
|
||||
# file, but we have no general way to determine it. So use the
|
||||
# environment and hope for the best.
|
||||
def guesstesseractlang(self):
|
||||
tesseractlang = ""
|
||||
|
||||
# First look for a language def file in the file's directory
|
||||
pdflangfile = os.path.join(os.path.dirname(self.filename),
|
||||
b".ocrpdflang")
|
||||
if os.path.isfile(pdflangfile):
|
||||
tesseractlang = open(pdflangfile, "r").read().strip()
|
||||
if tesseractlang:
|
||||
return tesseractlang
|
||||
|
||||
# Then look for a global option. The normal way now that we
|
||||
# have config reading capability in the handlers is to use the
|
||||
# config. Then, for backwards compat, environment variable and
|
||||
# file inside the configuration directory
|
||||
tesseractlang = self.config.getConfParam("pdfocrlang")
|
||||
if tesseractlang:
|
||||
return tesseractlang
|
||||
tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
|
||||
if tesseractlang:
|
||||
return tesseractlang
|
||||
pdflangfile = os.path.join(self.confdir, "ocrpdf")
|
||||
if os.path.isfile(pdflangfile):
|
||||
tesseractlang = open(pdflangfile, "r").read().strip()
|
||||
if tesseractlang:
|
||||
return tesseractlang
|
||||
|
||||
# Half-assed trial to guess from LANG then default to english
|
||||
localelang = os.environ.get("LANG", "").split("_")[0]
|
||||
if localelang == "en":
|
||||
tesseractlang = "eng"
|
||||
elif localelang == "de":
|
||||
tesseractlang = "deu"
|
||||
elif localelang == "fr":
|
||||
tesseractlang = "fra"
|
||||
if tesseractlang:
|
||||
return tesseractlang
|
||||
|
||||
if not tesseractlang:
|
||||
tesseractlang = "eng"
|
||||
return tesseractlang
|
||||
|
||||
# PDF has no text content and tesseract is available. Give OCR a try
|
||||
def ocrpdf(self):
|
||||
|
||||
global tmpdir
|
||||
if not tmpdir:
|
||||
return b""
|
||||
|
||||
tesseractlang = self.guesstesseractlang()
|
||||
# self.em.rclog("tesseractlang %s" % tesseractlang)
|
||||
|
||||
tesserrorfile = os.path.join(tmpdir, "tesserrorfile")
|
||||
tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
|
||||
|
||||
# Split pdf pages
|
||||
try:
|
||||
vacuumdir(tmpdir)
|
||||
subprocess.check_call([self.pdftoppm, "-r", "300", self.filename,
|
||||
tmpfile])
|
||||
except Exception as e:
|
||||
self.em.rclog("pdftoppm failed: %s" % e)
|
||||
return b""
|
||||
|
||||
files = glob.glob(tmpfile + "*")
|
||||
for f in files:
|
||||
out = b''
|
||||
try:
|
||||
out = subprocess.check_output([self.tesseract, f, f, "-l",
|
||||
tesseractlang],
|
||||
stderr = subprocess.STDOUT)
|
||||
except Exception as e:
|
||||
self.em.rclog("tesseract failed: %s" % e)
|
||||
|
||||
errlines = out.split(b'\n')
|
||||
if len(errlines) > 2:
|
||||
self.em.rclog("Tesseract error: %s" % out)
|
||||
|
||||
# Concatenate the result files
|
||||
files = glob.glob(tmpfile + "*" + ".txt")
|
||||
data = b""
|
||||
for f in files:
|
||||
data += open(f, "rb").read()
|
||||
|
||||
return b'''<html><head>
|
||||
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
||||
</head><body><pre>''' + \
|
||||
self.em.htmlescape(data) + \
|
||||
b'''</pre></body></html>'''
|
||||
|
||||
|
||||
# pdftotext (used to?) badly escape text inside the header
|
||||
# fields. We do it here. This is not an html parser, and depends a
|
||||
# lot on the actual format output by pdftotext.
|
||||
@ -510,13 +410,11 @@ class PDFExtractor:
|
||||
html, isempty = self._fixhtml(html)
|
||||
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
||||
|
||||
if isempty and self.ocrpossible:
|
||||
self.config.setKeyDir(os.path.dirname(self.filename))
|
||||
s = self.config.getConfParam("pdfocr")
|
||||
cf_doocr = rclexecm.configparamtrue(s)
|
||||
file_doocr = os.path.isfile(os.path.join(self.confdir, "ocrpdf"))
|
||||
if cf_doocr or file_doocr:
|
||||
html = self.ocrpdf()
|
||||
if isempty:
|
||||
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
||||
self.filename]
|
||||
data = subprocess.check_output(cmd)
|
||||
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
|
||||
|
||||
if self.extrameta:
|
||||
try:
|
||||
@ -592,6 +490,7 @@ class PDFExtractor:
|
||||
|
||||
|
||||
# Main program: create protocol handler and extractor and run them
|
||||
_execdir = os.path.dirname(sys.argv[0])
|
||||
proto = rclexecm.RclExecM()
|
||||
extract = PDFExtractor(proto)
|
||||
rclexecm.main(proto, extract)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user