rclpdf ocr: fix python3 issue. Add pdfocrlang config variable

This commit is contained in:
Jean-Francois Dockes 2018-07-18 18:05:42 +02:00
parent 8a7fa51dc9
commit a457b6c68e

View File

@ -91,13 +91,12 @@ class PDFExtractor:
# error at once # error at once
return return
cf = rclconfig.RclConfig() self.config = rclconfig.RclConfig()
self.confdir = cf.getConfDir() self.confdir = self.config.getConfDir()
# The user can set a list of meta tags to be extracted from # The user can set a list of meta tags to be extracted from
# the XMP metadata packet. These are specified as # the XMP metadata packet. These are specified as
# (xmltag,rcltag) pairs # (xmltag,rcltag) pairs
self.extrameta = cf.getConfParam("pdfextrameta") self.extrameta = self.config.getConfParam("pdfextrameta")
if self.extrameta: if self.extrameta:
self._initextrameta() self._initextrameta()
@ -119,7 +118,7 @@ class PDFExtractor:
# either the presence of a file in the config dir (historical) # either the presence of a file in the config dir (historical)
# or a set config variable. # or a set config variable.
self.ocrpossible = False self.ocrpossible = False
cf_doocr = cf.getConfParam("pdfocr") cf_doocr = self.config.getConfParam("pdfocr")
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")): if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
self.tesseract = rclexecm.which("tesseract") self.tesseract = rclexecm.which("tesseract")
if self.tesseract: if self.tesseract:
@ -134,7 +133,7 @@ class PDFExtractor:
# so it can be disabled in the configuration. # so it can be disabled in the configuration.
self.attextractdone = False self.attextractdone = False
self.attachlist = [] self.attachlist = []
cf_attach = cf.getConfParam("pdfattach") cf_attach = self.config.getConfParam("pdfattach")
if cf_attach: if cf_attach:
self.pdftk = rclexecm.which("pdftk") self.pdftk = rclexecm.which("pdftk")
if self.pdftk: if self.pdftk:
@ -224,18 +223,28 @@ class PDFExtractor:
# environment and hope for the best. # environment and hope for the best.
def guesstesseractlang(self): def guesstesseractlang(self):
tesseractlang = "" tesseractlang = ""
pdflangfile = os.path.join(os.path.dirname(self.filename), ".ocrpdflang")
# First look for a language def file in the file's directory
pdflangfile = os.path.join(os.path.dirname(self.filename),
b".ocrpdflang")
if os.path.isfile(pdflangfile): if os.path.isfile(pdflangfile):
tesseractlang = open(pdflangfile, "r").read().strip() tesseractlang = open(pdflangfile, "r").read().strip()
if tesseractlang: if tesseractlang:
return tesseractlang return tesseractlang
# Then look for a global option. Now that the handlers can read
# the configuration, the normal way is the "pdfocrlang" config
# variable. For backwards compatibility, fall back to the
# RECOLL_TESSERACT_LANG environment variable, then to the
# "ocrpdf" file inside the configuration directory.
tesseractlang = self.config.getConfParam("pdfocrlang")
if tesseractlang:
return tesseractlang
tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", ""); tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
if tesseractlang: if tesseractlang:
return tesseractlang return tesseractlang
pdflangfile = os.path.join(self.confdir, b"ocrpdf")
tesseractlang = \ if os.path.isfile(pdflangfile):
open(os.path.join(self.confdir, "ocrpdf"), "r").read().strip() tesseractlang = open(pdflangfile, "r").read().strip()
if tesseractlang: if tesseractlang:
return tesseractlang return tesseractlang
@ -285,7 +294,7 @@ class PDFExtractor:
except Exception as e: except Exception as e:
self.em.rclog("tesseract failed: %s" % e) self.em.rclog("tesseract failed: %s" % e)
errlines = out.split('\n') errlines = out.split(b'\n')
if len(errlines) > 2: if len(errlines) > 2:
self.em.rclog("Tesseract error: %s" % out) self.em.rclog("Tesseract error: %s" % out)