1st version of the cached ocr mechanism
This commit is contained in:
parent
aa40531bbe
commit
38dfa5f841
80
src/filters/rclocr.py
Executable file
80
src/filters/rclocr.py
Executable file
@ -0,0 +1,80 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#################################
|
||||||
|
# Copyright (C) 2020 J.F.Dockes
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
########################################################
|
||||||
|
|
||||||
|
# Running OCR programs for Recoll
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import rclconfig
|
||||||
|
import rclocrcache
|
||||||
|
import importlib.util
|
||||||
|
|
||||||
|
def deb(s):
    # Emit one diagnostic line on stderr. stdout is reserved for the
    # OCR'd data which this script hands back to its caller.
    message = "%s" % s
    print(message, file=sys.stderr)
|
||||||
|
|
||||||
|
def Usage():
    # Print the command line synopsis on stderr and terminate the
    # process with exit status 1. Never returns.
    deb("Usage: rclocr.py <imagefilename>")
    raise SystemExit(1)
|
||||||
|
|
||||||
|
# Main script flow: return the OCR'd text for the image/pdf file named
# on the command line, from the cache when possible, else by running
# the first usable OCR module and caching its output. The text goes
# to stdout as bytes, diagnostics go to stderr. Exit status is 0 on
# success, 1 on usage/configuration errors.
if len(sys.argv) != 2:
    Usage()

path = sys.argv[1]

config = rclconfig.RclConfig()
cache = rclocrcache.OCRCache(config)

# Fast path: the cache already holds the data for this file.
incache, data = cache.get(path)
if incache:
    sys.stdout.buffer.write(data)
    sys.exit(0)

#### Data not in cache

# Retrieve known ocr program names and try to load the corresponding module
ocrprogs = config.getConfParam("ocrprogs")
if not ocrprogs:
    deb("No ocrprogs variable")
    sys.exit(1)
deb("ocrprogs: %s" % ocrprogs)
# split() with no argument ignores repeated, leading and trailing
# whitespace. split(" ") would produce empty names for these, and an
# empty name yields the module name "rclocr", i.e. this very script.
proglist = ocrprogs.split()
ok = False
for ocrprog in proglist:
    # Each OCR program "xxx" is driven by a module named rclocrxxx,
    # which must provide ocrpossible(path) and runocr(config, path).
    modulename = "rclocr" + ocrprog
    try:
        ocr = importlib.import_module(modulename)
        if ocr.ocrpossible(path):
            ok = True
            break
    except Exception as err:
        # Best effort: a module which is missing or broken just means
        # that we try the next one in the list.
        deb("While loading %s: got: %s" % (modulename, err))

if not ok:
    deb("No OCR module could be loaded")
    sys.exit(1)

deb("Using ocr module %s" % modulename)

data = ocr.runocr(config, path)

# Store before output, so that the next run for this file is a cache hit.
cache.store(path, data)
sys.stdout.buffer.write(data)
sys.exit(0)
|
||||||
|
|
||||||
208
src/filters/rclocrcache.py
Executable file
208
src/filters/rclocrcache.py
Executable file
@ -0,0 +1,208 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#################################
|
||||||
|
# Copyright (C) 2020 J.F.Dockes
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
########################################################
|
||||||
|
|
||||||
|
# Caching OCR'd data
|
||||||
|
|
||||||
|
# OCR is extremely slow. The cache stores 2 kinds of objects:
|
||||||
|
# - Path files are named from the hash of the image path and contain
|
||||||
|
# the image data hash and the modification time and size of the
|
||||||
|
# image at the time the OCR'd data was stored in the cache
|
||||||
|
# - Data files are named with the hash of the image data and contain
|
||||||
|
# the OCR'd data
|
||||||
|
# When retrieving data from the cache:
|
||||||
|
# - We first use the image file size and modification time: if an
|
||||||
|
# entry exists for the imagepath/mtime/size triplet, and is up to
|
||||||
|
# date, the corresponding data is obtained from the data file and
|
||||||
|
# returned.
|
||||||
|
# - Else we then use the image data: if an entry exists for the
|
||||||
|
# computed hashed value of the data, it is returned. This allows
|
||||||
|
# moving files around without needing to run OCR again, but of
|
||||||
|
# course, it is more expensive than the first step
|
||||||
|
#
|
||||||
|
# If we need to use the second step, as a side effect, a path file is
|
||||||
|
# created or updated so that the data will be found with the first
|
||||||
|
# step next time around.
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
def deb(s):
    # Write one diagnostic line to stderr.
    message = "%s" % s
    print(message, file=sys.stderr)
|
||||||
|
|
||||||
|
class OCRCache(object):
    # On-disk cache for OCR'd data. All objects live under
    # <cachedir>/objects/<2 hex chars>/<38 hex chars>, where the 40
    # hex chars are a sha1. Two kinds of objects are stored:
    #  - path files, named from the hash of the image path. They hold
    #    one text line: "<dd> <df> <mtime> <size>", dd/df being the
    #    two parts of the image data hash.
    #  - data files, named from the hash of the image data. They hold
    #    the OCR'd data.

    # conf must provide getConfParam(name) and getConfDir(). The
    # cache location is the "ocrcachedir" parameter if set, else
    # "ocrcache" inside the configuration directory.
    def __init__(self, conf):
        self.config = conf
        self.cachedir = conf.getConfParam("ocrcachedir")
        if not self.cachedir:
            self.cachedir = os.path.join(self.config.getConfDir(), "ocrcache")
        self.objdir = os.path.join(self.cachedir, "objects")
        # exist_ok: another process may create the directory between
        # an existence test and the creation.
        os.makedirs(self.objdir, exist_ok=True)

    # Compute sha1 of path (str or bytes), as two parts of 2 and 38 chars
    def _hashpath(self, data):
        if not isinstance(data, bytes):
            # surrogateescape gives the same bytes as strict utf-8
            # for any valid string, and additionally accepts file
            # names holding undecodable bytes, which Python represents
            # as lone surrogates.
            data = data.encode('utf-8', 'surrogateescape')
        h = hashlib.sha1(data).hexdigest()
        return h[0:2], h[2:]

    # Compute sha1 of path data contents, as two parts of 2 and 38
    # chars. The file is read by chunks so that big images are not
    # loaded whole in memory.
    def _hashdata(self, path):
        m = hashlib.sha1()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(8192), b""):
                m.update(chunk)
        h = m.hexdigest()
        return h[0:2], h[2:]

    # Try to read the stored attributes for a given path: data hash,
    # modification time and size. If this fails, the path itself is
    # not cached (but the data still might be, maybe the file was moved).
    # Returns (True, dd, df, mtime, size) or (False, None, None, None, None)
    def _cachedpathattrs(self, path):
        pd, pf = self._hashpath(path)
        pathfile = os.path.join(self.objdir, pd, pf)
        try:
            with open(pathfile, "r") as f:
                dd, df, tm, sz = f.read().split()
            return True, dd, df, int(tm), int(sz)
        except (OSError, ValueError):
            # Missing, unreadable or malformed path file (e.g.
            # truncated by an interrupted write): this is a cache
            # miss, not a fatal error. The entry will be rewritten.
            return False, None, None, None, None

    # Compute the path hash, and get the mtime and size for given
    # path, for updating the cache path file. The time is truncated
    # to whole seconds. Raises OSError if path can't be stat'd.
    def _newpathattrs(self, path):
        pd, pf = self._hashpath(path)
        st = os.stat(path)
        return pd, pf, int(st.st_mtime), int(st.st_size)

    # Check if the cache appears up to date for a given path, only
    # using the modification time and size. Return the data file path
    # elements if we get a hit: (True, dd, df), else (False, None, None)
    def _pathincache(self, path):
        ret, od, of, otm, osz = self._cachedpathattrs(path)
        if not ret:
            return False, None, None
        pd, pf, ntm, nsz = self._newpathattrs(path)
        if otm != ntm or osz != nsz:
            return False, None, None
        return True, od, of

    # Check if cache appears up to date for path (no data check),
    # return True/False
    def pathincache(self, path):
        ret, dd, df = self._pathincache(path)
        return ret

    # Compute the data file name for path. Expensive: we compute the data hash.
    # Return both the data file path and path elements (for storage in path file)
    def _datafilename(self, path):
        d, f = self._hashdata(path)
        return os.path.join(self.objdir, d, f), d, f

    # Check if the data for path is in cache: expensive, needs to
    # compute the hash for the path's data contents. Returns True/False
    def dataincache(self, path):
        return os.path.exists(self._datafilename(path)[0])

    # Create or overwrite the path file with given elements: pd/pf
    # are the two parts of the path hash, dd/df those of the data
    # hash, tm and sz the image modification time and size.
    def _updatepathfile(self, pd, pf, dd, df, tm, sz):
        pathdir = os.path.join(self.objdir, pd)
        os.makedirs(pathdir, exist_ok=True)
        pfile = os.path.join(pathdir, pf)
        with open(pfile, "w") as f:
            f.write("%s %s %d %d\n" % (dd, df, tm, sz))

    # Store data for path. Only rewrite an existing data file if told
    # to do so: this is only useful if we are forcing an OCR re-run.
    # datatostore must be bytes. Always returns True, I/O errors are
    # raised as OSError.
    def store(self, path, datatostore, force=False):
        dd, df = self._hashdata(path)
        pd, pf, tm, sz = self._newpathattrs(path)
        self._updatepathfile(pd, pf, dd, df, tm, sz)
        datadir = os.path.join(self.objdir, dd)
        os.makedirs(datadir, exist_ok=True)
        dfile = os.path.join(datadir, df)
        if force or not os.path.exists(dfile):
            with open(dfile, "wb") as f:
                f.write(datatostore)
        return True

    # Retrieve cached OCR'd data for image path. Possibly update the
    # path file as a side effect (case where the image has moved, but
    # the data has not changed).
    # Returns (True, data) on a hit, (False, b"") on a miss.
    def get(self, path):
        pincache, dd, df = self._pathincache(path)
        if pincache:
            dfn = os.path.join(self.objdir, dd, df)
        else:
            # No valid path entry: fall back to the expensive lookup
            # by data hash.
            dfn, dd, df = self._datafilename(path)

        if not os.path.exists(dfn):
            return False, b""

        if not pincache:
            # File has moved. create/Update path file for next time
            pd, pf, tm, sz = self._newpathattrs(path)
            self._updatepathfile(pd, pf, dd, df, tm, sz)

        with open(dfn, "rb") as f:
            return True, f.read()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual test driver. Usage: rclocrcache.py <imagefilename>
    # Reports on stderr whether the path is known to the cache, then
    # tries to fetch the cached data for it.
    import rclconfig

    cache = OCRCache(rclconfig.RclConfig())
    path = sys.argv[1]
    deb("Using %s" % path)

    deb("== CACHE tests")
    negation = "" if cache.pathincache(path) else " not"
    deb("path for %s%s in cache" % (path, negation))

    #ret = cache.dataincache(path)
    #s = "" if ret else " not"
    #deb("data for %s%s in cache" % (path, s))

    # Disabled by default: change to True to store a test entry.
    if False:
        deb("== STORE tests")
        cache.store(path, b"my OCR'd text is one line\n", force=False)

    deb("== GET tests")
    incache, data = cache.get(path)
    if not incache:
        deb("Data was not found in cache")
    else:
        deb("Data from cache [%s]" % data)
|
||||||
|
|
||||||
217
src/filters/rclocrtesseract.py
Executable file
217
src/filters/rclocrtesseract.py
Executable file
@ -0,0 +1,217 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#################################
|
||||||
|
# Copyright (C) 2020 J.F.Dockes
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the
|
||||||
|
# Free Software Foundation, Inc.,
|
||||||
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
########################################################
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import atexit
|
||||||
|
import tempfile
|
||||||
|
import subprocess
|
||||||
|
import glob
|
||||||
|
|
||||||
|
import rclexecm
|
||||||
|
|
||||||
|
_mswindows = (sys.platform == "win32")

# Name of the optional per-directory file holding the tesseract
# language for the documents in that directory. The same name is
# currently used on Windows and on the other platforms.
ocrlangfile = ".rclocrlang"

# Image file extensions which are handed directly to tesseract.
_okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')
|
||||||
|
|
||||||
|
def _deb(s):
    # Debug trace hook. Output is currently disabled on all
    # platforms: uncomment the print below to get messages on stderr
    # (nothing is ever printed on Windows).
    if _mswindows:
        return
    #print("%s" % s, file=sys.stderr)
    return
|
||||||
|
|
||||||
|
def vacuumdir(dir):
    # Delete the regular files found directly inside dir.
    # Subdirectories are left alone. An empty or None dir is a no-op.
    # Always returns True: errors from listing or unlinking are
    # propagated as exceptions.
    if not dir:
        return True
    entries = (os.path.join(dir, name) for name in os.listdir(dir))
    for entry in entries:
        if os.path.isfile(entry):
            os.unlink(entry)
    return True
|
||||||
|
|
||||||
|
# Scratch directory for the intermediate files (page images, text
# output). Created on first use by _maybemaketmpdir().
tmpdir = None

def _maybemaketmpdir():
    # Make sure that an empty scratch directory is available: create
    # it on the first call, empty it on the following ones.
    # Returns False if emptying is reported as failed, else None.
    global tmpdir
    if not tmpdir:
        tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
        return None
    if vacuumdir(tmpdir):
        return None
    _deb("openfile: vacuumdir %s failed" % tmpdir)
    return False
|
||||||
|
|
||||||
|
def finalcleanup():
    # Exit-time cleanup: empty then remove the scratch directory, if
    # one was created during this run.
    if not tmpdir:
        return
    vacuumdir(tmpdir)
    os.rmdir(tmpdir)
|
||||||
|
|
||||||
|
atexit.register(finalcleanup)
|
||||||
|
|
||||||
|
# Return true if tesseract and the appropriate conversion program for
|
||||||
|
# the file type (e.g. pdftoppm for pdf) appear to be available
|
||||||
|
def ocrpossible(path):
    # Tell if this module can process path: True if tesseract is
    # found and path is either an image with a known extension, or a
    # pdf and pdftoppm is found too.
    # Side effect: sets the module-level 'tesseract' variable, and
    # 'pdftoppm' for a pdf, to the values returned by
    # rclexecm.which(). These are used later by the OCR functions.
    global tesseract, pdftoppm

    # Check for tesseract
    tesseract = rclexecm.which("tesseract")
    if not tesseract:
        return False

    # Check input format
    extension = os.path.splitext(path)[1].lower()
    if extension in _okexts:
        return True
    if extension != '.pdf':
        return False

    # Check for pdftoppm. We could use pdftocairo, which can
    # produce a multi-page pdf and make the rest simpler, but the
    # legacy code used pdftoppm for some reason, and it appears
    # that the newest builds from conda-forge do not include
    # pdftocairo. So stay with pdftoppm.
    pdftoppm = rclexecm.which("pdftoppm")
    return True if pdftoppm else False
|
||||||
|
|
||||||
|
|
||||||
|
# Try to guess tesseract language. This should depend on the input
|
||||||
|
# file, but we have no general way to determine it. So use the
|
||||||
|
# environment and hope for the best.
|
||||||
|
def _guesstesseractlang(config, path):
    # Determine the tesseract language code (e.g. "eng") to use for
    # the file at path. Tried in order:
    #  1. the contents of the language file (see ocrlangfile) in the
    #     directory of path,
    #  2. the "tesseractlang" configuration parameter, looked up
    #     after calling config.setKeyDir() with that directory,
    #  3. a guess from the LANG environment variable,
    #  4. "eng".
    # config must provide setKeyDir(dir) and getConfParam(name).
    dirname = os.path.dirname(path)

    # First look for a language def file in the file's directory
    pdflangfile = os.path.join(dirname, ocrlangfile)
    if os.path.isfile(pdflangfile):
        # 'with' so that the file is closed right away instead of
        # whenever the file object gets collected.
        with open(pdflangfile, "r") as f:
            tesseractlang = f.read().strip()
        if tesseractlang:
            _deb("Tesseract lang from file: %s" % tesseractlang)
            return tesseractlang

    # Then look for a config file option.
    config.setKeyDir(dirname)
    tesseractlang = config.getConfParam("tesseractlang")
    if tesseractlang:
        _deb("Tesseract lang from config: %s" % tesseractlang)
        return tesseractlang

    # Crude attempt to guess from LANG, then default to english. Only
    # the part before the first "_" is used ("fr_FR.UTF-8" -> "fr").
    localetotesseract = {"en": "eng", "de": "deu", "fr": "fra"}
    localelang = os.environ.get("LANG", "").split("_")[0]
    tesseractlang = localetotesseract.get(localelang, "eng")
    _deb("Tesseract lang (guessed): %s" % tesseractlang)
    return tesseractlang
|
||||||
|
|
||||||
|
# Process pdf file: use pdftoppm to split it into ppm pages, then run
|
||||||
|
# tesseract on each and concatenate the result. It would probably be
|
||||||
|
# possible instead to use pdftocairo to produce a tiff, but pdftocairo
|
||||||
|
# is sometimes not available (windows).
|
||||||
|
def _pdftesseract(config, path):
    # OCR a pdf file: split it into one image file per page with
    # pdftoppm (300 dpi), run tesseract on each page image, and
    # return the concatenated text as bytes.
    # Returns b"" if there is no scratch directory or if pdftoppm
    # fails. A tesseract failure on a page is only logged: the page
    # is then absent from the output.
    # Relies on the module-level 'tmpdir', 'pdftoppm' and 'tesseract'
    # variables having been set beforehand.
    if not tmpdir:
        return b""

    tesseractlang = _guesstesseractlang(config, path)

    # Root name handed to pdftoppm for the page files. The "XXXXXX"
    # part is literal, this is not a mkstemp template.
    tmpfile = os.path.join(tmpdir, "ocrXXXXXX")

    # Split pdf pages
    try:
        vacuumdir(tmpdir)
        subprocess.check_call([pdftoppm, "-r", "300", path, tmpfile])
    except Exception as e:
        _deb("pdftoppm failed: %s" % e)
        return b""

    # tesseract adds ".txt" to the output base name, so that the text
    # for page image f ends up in f + ".txt"
    for f in sorted(glob.glob(tmpfile + "*")):
        out = b''
        try:
            out = subprocess.check_output(
                [tesseract, f, f, "-l", tesseractlang],
                stderr=subprocess.STDOUT)
        except Exception as e:
            _deb("tesseract failed: %s" % e)

        # More than a couple of lines of output is taken as an error
        # indication.
        errlines = out.split(b'\n')
        if len(errlines) > 2:
            _deb("Tesseract error: %s" % out)

    # Concatenate the result files. glob() returns names in arbitrary
    # order, so sort them to keep the pages in document order.
    # NOTE(review): this assumes that pdftoppm zero-pads the page
    # numbers in the file names, which poppler appears to do - confirm.
    pages = []
    for f in sorted(glob.glob(tmpfile + "*" + ".txt")):
        with open(f, "rb") as pagefile:
            pages.append(pagefile.read())

    return b"".join(pages)
|
||||||
|
|
||||||
|
|
||||||
|
def _simpletesseract(config, path):
    # OCR a single image file by running tesseract with its text
    # output sent to stdout. Returns the text as bytes, or b"" if
    # tesseract could not be run or exited with an error (same
    # convention as for the pdf case).
    # Relies on the module-level 'tesseract' variable having been set
    # beforehand.
    tesseractlang = _guesstesseractlang(config, path)

    try:
        return subprocess.check_output(
            [tesseract, path, 'stdout', '-l', tesseractlang],
            stderr=subprocess.DEVNULL)
    except Exception as e:
        _deb("tesseract failed: %s" % e)
        # There is no output to return in this case: returning here
        # avoids referencing a variable which was never assigned.
        return b""
|
||||||
|
|
||||||
|
|
||||||
|
# run ocr on the input path and output the result data.
|
||||||
|
def runocr(config, path):
    # Module entry point: run OCR on the file at path and return the
    # resulting text as bytes. Files with a known image extension are
    # given directly to tesseract, anything else goes through the pdf
    # processing. The scratch directory is created or emptied first.
    _maybemaketmpdir()
    extension = os.path.splitext(path)[1].lower()
    handler = _simpletesseract if extension in _okexts else _pdftesseract
    return handler(config, path)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual test driver. Usage: rclocrtesseract.py <filename>
    # Writes the OCR'd text to stdout, or exits with status 1 if OCR
    # is not possible for the file.
    import rclconfig

    config = rclconfig.RclConfig()
    path = sys.argv[1]
    if not ocrpossible(path):
        _deb("ocrpossible returned false")
        sys.exit(1)
    data = runocr(config, sys.argv[1])
    sys.stdout.buffer.write(data)
|
||||||
|
|
||||||
|
|
||||||
@ -47,8 +47,14 @@ import glob
|
|||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
_mswindows = (sys.platform == "win32")
|
_mswindows = (sys.platform == "win32")
|
||||||
|
|
||||||
tmpdir = None
|
tmpdir = None
|
||||||
|
|
||||||
|
_htmlprefix =b'''<html><head>
|
||||||
|
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
||||||
|
</head><body><pre>'''
|
||||||
|
_htmlsuffix = b'''</pre></body></html>'''
|
||||||
|
|
||||||
def finalcleanup():
|
def finalcleanup():
|
||||||
if tmpdir:
|
if tmpdir:
|
||||||
vacuumdir(tmpdir)
|
vacuumdir(tmpdir)
|
||||||
@ -120,18 +126,6 @@ class PDFExtractor:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# See if we'll try to perform OCR. Need the commands and the
|
|
||||||
# either the presence of a file in the config dir (historical)
|
|
||||||
# or a set config variable.
|
|
||||||
self.ocrpossible = False
|
|
||||||
self.tesseract = rclexecm.which("tesseract")
|
|
||||||
if self.tesseract:
|
|
||||||
self.pdftoppm = rclexecm.which("pdftoppm")
|
|
||||||
if self.pdftoppm:
|
|
||||||
self.ocrpossible = True
|
|
||||||
self.maybemaketmpdir()
|
|
||||||
# self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
|
|
||||||
|
|
||||||
# Pdftk is optionally used to extract attachments. This takes
|
# Pdftk is optionally used to extract attachments. This takes
|
||||||
# a hit on performance even in the absence of any attachments,
|
# a hit on performance even in the absence of any attachments,
|
||||||
# so it can be disabled in the configuration.
|
# so it can be disabled in the configuration.
|
||||||
@ -236,100 +230,6 @@ class PDFExtractor:
|
|||||||
return (True, docdata, ipath, eof)
|
return (True, docdata, ipath, eof)
|
||||||
|
|
||||||
|
|
||||||
# Try to guess tesseract language. This should depend on the input
|
|
||||||
# file, but we have no general way to determine it. So use the
|
|
||||||
# environment and hope for the best.
|
|
||||||
def guesstesseractlang(self):
|
|
||||||
tesseractlang = ""
|
|
||||||
|
|
||||||
# First look for a language def file in the file's directory
|
|
||||||
pdflangfile = os.path.join(os.path.dirname(self.filename),
|
|
||||||
b".ocrpdflang")
|
|
||||||
if os.path.isfile(pdflangfile):
|
|
||||||
tesseractlang = open(pdflangfile, "r").read().strip()
|
|
||||||
if tesseractlang:
|
|
||||||
return tesseractlang
|
|
||||||
|
|
||||||
# Then look for a global option. The normal way now that we
|
|
||||||
# have config reading capability in the handlers is to use the
|
|
||||||
# config. Then, for backwards compat, environment variable and
|
|
||||||
# file inside the configuration directory
|
|
||||||
tesseractlang = self.config.getConfParam("pdfocrlang")
|
|
||||||
if tesseractlang:
|
|
||||||
return tesseractlang
|
|
||||||
tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
|
|
||||||
if tesseractlang:
|
|
||||||
return tesseractlang
|
|
||||||
pdflangfile = os.path.join(self.confdir, "ocrpdf")
|
|
||||||
if os.path.isfile(pdflangfile):
|
|
||||||
tesseractlang = open(pdflangfile, "r").read().strip()
|
|
||||||
if tesseractlang:
|
|
||||||
return tesseractlang
|
|
||||||
|
|
||||||
# Half-assed trial to guess from LANG then default to english
|
|
||||||
localelang = os.environ.get("LANG", "").split("_")[0]
|
|
||||||
if localelang == "en":
|
|
||||||
tesseractlang = "eng"
|
|
||||||
elif localelang == "de":
|
|
||||||
tesseractlang = "deu"
|
|
||||||
elif localelang == "fr":
|
|
||||||
tesseractlang = "fra"
|
|
||||||
if tesseractlang:
|
|
||||||
return tesseractlang
|
|
||||||
|
|
||||||
if not tesseractlang:
|
|
||||||
tesseractlang = "eng"
|
|
||||||
return tesseractlang
|
|
||||||
|
|
||||||
# PDF has no text content and tesseract is available. Give OCR a try
|
|
||||||
def ocrpdf(self):
|
|
||||||
|
|
||||||
global tmpdir
|
|
||||||
if not tmpdir:
|
|
||||||
return b""
|
|
||||||
|
|
||||||
tesseractlang = self.guesstesseractlang()
|
|
||||||
# self.em.rclog("tesseractlang %s" % tesseractlang)
|
|
||||||
|
|
||||||
tesserrorfile = os.path.join(tmpdir, "tesserrorfile")
|
|
||||||
tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
|
|
||||||
|
|
||||||
# Split pdf pages
|
|
||||||
try:
|
|
||||||
vacuumdir(tmpdir)
|
|
||||||
subprocess.check_call([self.pdftoppm, "-r", "300", self.filename,
|
|
||||||
tmpfile])
|
|
||||||
except Exception as e:
|
|
||||||
self.em.rclog("pdftoppm failed: %s" % e)
|
|
||||||
return b""
|
|
||||||
|
|
||||||
files = glob.glob(tmpfile + "*")
|
|
||||||
for f in files:
|
|
||||||
out = b''
|
|
||||||
try:
|
|
||||||
out = subprocess.check_output([self.tesseract, f, f, "-l",
|
|
||||||
tesseractlang],
|
|
||||||
stderr = subprocess.STDOUT)
|
|
||||||
except Exception as e:
|
|
||||||
self.em.rclog("tesseract failed: %s" % e)
|
|
||||||
|
|
||||||
errlines = out.split(b'\n')
|
|
||||||
if len(errlines) > 2:
|
|
||||||
self.em.rclog("Tesseract error: %s" % out)
|
|
||||||
|
|
||||||
# Concatenate the result files
|
|
||||||
files = glob.glob(tmpfile + "*" + ".txt")
|
|
||||||
data = b""
|
|
||||||
for f in files:
|
|
||||||
data += open(f, "rb").read()
|
|
||||||
|
|
||||||
return b'''<html><head>
|
|
||||||
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
|
||||||
</head><body><pre>''' + \
|
|
||||||
self.em.htmlescape(data) + \
|
|
||||||
b'''</pre></body></html>'''
|
|
||||||
|
|
||||||
|
|
||||||
# pdftotext (used to?) badly escape text inside the header
|
# pdftotext (used to?) badly escape text inside the header
|
||||||
# fields. We do it here. This is not an html parser, and depends a
|
# fields. We do it here. This is not an html parser, and depends a
|
||||||
# lot on the actual format output by pdftotext.
|
# lot on the actual format output by pdftotext.
|
||||||
@ -510,13 +410,11 @@ class PDFExtractor:
|
|||||||
html, isempty = self._fixhtml(html)
|
html, isempty = self._fixhtml(html)
|
||||||
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
||||||
|
|
||||||
if isempty and self.ocrpossible:
|
if isempty:
|
||||||
self.config.setKeyDir(os.path.dirname(self.filename))
|
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
||||||
s = self.config.getConfParam("pdfocr")
|
self.filename]
|
||||||
cf_doocr = rclexecm.configparamtrue(s)
|
data = subprocess.check_output(cmd)
|
||||||
file_doocr = os.path.isfile(os.path.join(self.confdir, "ocrpdf"))
|
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
|
||||||
if cf_doocr or file_doocr:
|
|
||||||
html = self.ocrpdf()
|
|
||||||
|
|
||||||
if self.extrameta:
|
if self.extrameta:
|
||||||
try:
|
try:
|
||||||
@ -592,6 +490,7 @@ class PDFExtractor:
|
|||||||
|
|
||||||
|
|
||||||
# Main program: create protocol handler and extractor and run them
|
# Main program: create protocol handler and extractor and run them
|
||||||
|
_execdir = os.path.dirname(sys.argv[0])
|
||||||
proto = rclexecm.RclExecM()
|
proto = rclexecm.RclExecM()
|
||||||
extract = PDFExtractor(proto)
|
extract = PDFExtractor(proto)
|
||||||
rclexecm.main(proto, extract)
|
rclexecm.main(proto, extract)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user