pdf/ocr scripts: no need to look for rclocr if pdfocr is not set. Also updated comments.
This commit is contained in:
parent
bfecc9ed72
commit
8560467e4a
@ -26,9 +26,10 @@
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import importlib.util
|
||||||
|
|
||||||
import rclconfig
|
import rclconfig
|
||||||
import rclocrcache
|
import rclocrcache
|
||||||
import importlib.util
|
|
||||||
|
|
||||||
def _deb(s):
|
def _deb(s):
|
||||||
print("rclocr: %s" % s, file=sys.stderr)
|
print("rclocr: %s" % s, file=sys.stderr)
|
||||||
@ -43,6 +44,8 @@ if len(sys.argv) != 2:
|
|||||||
path = sys.argv[1]
|
path = sys.argv[1]
|
||||||
|
|
||||||
config = rclconfig.RclConfig()
|
config = rclconfig.RclConfig()
|
||||||
|
config.setKeyDir(os.path.dirname(path))
|
||||||
|
|
||||||
cache = rclocrcache.OCRCache(config)
|
cache = rclocrcache.OCRCache(config)
|
||||||
|
|
||||||
incache, data = cache.get(path)
|
incache, data = cache.get(path)
|
||||||
@ -58,7 +61,9 @@ ocrprogs = config.getConfParam("ocrprogs")
|
|||||||
if not ocrprogs:
|
if not ocrprogs:
|
||||||
_deb("No ocrprogs variable in recoll configuration")
|
_deb("No ocrprogs variable in recoll configuration")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
#_deb("ocrprogs: %s" % ocrprogs)
|
#_deb("ocrprogs: %s" % ocrprogs)
|
||||||
|
|
||||||
proglist = ocrprogs.split(" ")
|
proglist = ocrprogs.split(" ")
|
||||||
ok = False
|
ok = False
|
||||||
for ocrprog in proglist:
|
for ocrprog in proglist:
|
||||||
|
|||||||
@ -20,17 +20,8 @@
|
|||||||
# pdftotext sometimes outputs unescaped text inside HTML text sections.
|
# pdftotext sometimes outputs unescaped text inside HTML text sections.
|
||||||
# We try to correct.
|
# We try to correct.
|
||||||
#
|
#
|
||||||
# If pdftotext produces no text and tesseract is available, we try to
|
# If pdftotext produces no text and the configuration allows it, we may try to
|
||||||
# perform OCR. As this can be very slow and the result not always
|
# perform OCR.
|
||||||
# good, we only do this if this is required by the configuration
|
|
||||||
#
|
|
||||||
# We guess the OCR language in order of preference:
|
|
||||||
# - From the content of a ".ocrpdflang" file if it exists in the same
|
|
||||||
# directory as the PDF
|
|
||||||
# - Else from the pdfocrlang in recoll.conf
|
|
||||||
# - Else from a RECOLL_TESSERACT_LANG environment variable
|
|
||||||
# - From the content of $RECOLL_CONFDIR/ocrpdf
|
|
||||||
# - Default to "eng"
|
|
||||||
|
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
@ -411,13 +402,16 @@ class PDFExtractor:
|
|||||||
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
||||||
|
|
||||||
if isempty:
|
if isempty:
|
||||||
try:
|
self.config.setKeyDir(os.path.dirname(self.filename))
|
||||||
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
s = self.config.getConfParam("pdfocr")
|
||||||
self.filename]
|
if rclexecm.configparamtrue(s):
|
||||||
data = subprocess.check_output(cmd)
|
try:
|
||||||
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
|
cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
|
||||||
except:
|
self.filename]
|
||||||
pass
|
data = subprocess.check_output(cmd)
|
||||||
|
html = _htmlprefix + self.em.htmlescape(data) + _htmlsuffix
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
if self.extrameta:
|
if self.extrameta:
|
||||||
try:
|
try:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user