pdf: ocr: small fixes, plus make pdfocr redefinable in subdirs

This commit is contained in:
Jean-Francois Dockes 2019-06-13 09:47:25 +02:00
parent 1991e132a7
commit 5ff1a92a51
9 changed files with 45 additions and 19 deletions

View File

@ -1,3 +1,9 @@
recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* Fix issues in pdf handler
-- Jean-Francois Dockes <jf@dockes.org> Thu, 13 Jun 2019 08:41:00 +0200
recoll (1.25.18-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* GUI: fixed webengine result list for newer qt versions (5.12 ok)

View File

@ -1,3 +1,9 @@
kio-recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* keep kio in sync
-- Jean-Francois Dockes <jf@dockes.org> Thu, 13 Jun 2019 08:41:00 +0200
kio-recoll (1.25.17-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
* New release 1.25.17

View File

@ -686,8 +686,8 @@ with possibly meaning-altering missing words.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR">
<term><varname>pdfocr</varname></term>
<listitem><para>Attempt OCR of PDF files with no text content if both tesseract and
pdftoppm are installed. The default is off because OCR is so
very slow.</para></listitem></varlistentry>
pdftoppm are installed. This can be defined in subdirectories. The default is off because
OCR is so very slow.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCRLANG">
<term><varname>pdfocrlang</varname></term>
<listitem><para>Language to assume for PDF OCR. This is very important for having a reasonable rate of errors

View File

@ -9551,7 +9551,8 @@ for i in range(nres):
"RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR"></a><span class="term"><code class="varname">pdfocr</code></span></dt>
<dd>
<p>Attempt OCR of PDF files with no text content
if both tesseract and pdftoppm are installed. The
if both tesseract and pdftoppm are installed.
This can be defined in subdirectories. The
default is off because OCR is so very slow.</p>
</dd>
<dt><a name=

View File

@ -22,12 +22,13 @@
#
# If pdftotext produces no text and tesseract is available, we try to
# perform OCR. As this can be very slow and the result not always
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
# good, we only do this if the configuration requests it
#
# We guess the OCR language in order of preference:
# - From the content of a ".ocrpdflang" file if it exists in the same
# directory as the PDF
# - From an RECOLL_TESSERACT_LANG environment variable
# - Else from the pdfocrlang in recoll.conf
- Else from a RECOLL_TESSERACT_LANG environment variable
# - Else from the content of $RECOLL_CONFDIR/ocrpdf
# - Default to "eng"
@ -119,14 +120,12 @@ class PDFExtractor:
# either the presence of a file in the config dir (historical)
# or a set config variable.
self.ocrpossible = False
cf_doocr = self.config.getConfParam("pdfocr")
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
self.tesseract = rclexecm.which("tesseract")
if self.tesseract:
self.pdftoppm = rclexecm.which("pdftoppm")
if self.pdftoppm:
self.ocrpossible = True
self.maybemaketmpdir()
self.tesseract = rclexecm.which("tesseract")
if self.tesseract:
self.pdftoppm = rclexecm.which("pdftoppm")
if self.pdftoppm:
self.ocrpossible = True
self.maybemaketmpdir()
# self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
# Pdftk is optionally used to extract attachments. This takes
@ -288,6 +287,7 @@ class PDFExtractor:
files = glob.glob(tmpfile + "*")
for f in files:
out = b''
try:
out = subprocess.check_output([self.tesseract, f, f, "-l",
tesseractlang],
@ -305,8 +305,6 @@ class PDFExtractor:
for f in files:
data += open(f, "rb").read()
if not data:
return b""
return b'''<html><head>
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
</head><body><pre>''' + \
@ -469,7 +467,10 @@ class PDFExtractor:
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
if isempty and self.ocrpossible:
html = self.ocrpdf()
self.config.setKeyDir(os.path.dirname(self.filename))
cf_doocr = self.config.getConfParam("pdfocr")
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
html = self.ocrpdf()
if self.extrameta:
try:

View File

@ -858,8 +858,9 @@ snippetMaxPosWalk = 1000000
# <var name="pdfocr" type="bool">
#
# <brief>Attempt OCR of PDF files with no text content if both tesseract and
# pdftoppm are installed.</brief><descr>The default is off because OCR is so
# very slow.</descr></var>
# pdftoppm are installed.</brief>
# <descr>This can be defined in subdirectories. The default is off because
# OCR is so very slow.</descr></var>
#pdfocr = 0
# <var name="pdfocrlang" type="string">

View File

@ -37,6 +37,9 @@ daemSkippedPaths = \
/home/dockes/projets/fulltext/testrecoll/config
pdfextrameta = pdf:Producer dc:identifier
[/home/dockes/projets/fulltext/testrecoll/pdf]
pdfocr = 1
pdfocrlang = eng
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl

View File

@ -12,6 +12,8 @@ initvariables $0
# url is prefixed for the dc:identifier search to work
recollq dc:identifier:10.12345/sampledoi
recollq 'pdf:Producer:"GPL Ghostscript 9.18"'
recollq '"bubbleupnp server to simulate openhome"'
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1

View File

@ -1,3 +1,9 @@
2 results
2 results
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/ThinkingInPostScript.pdf] [Thinking In PostScript] 846435 bytes
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/linux unicode.pdf] [developerWorks: Linux | Unicode : Linux Unicode programming] 72669 bytes
1 results
application/pdf [10.12345/sampledoi] [How to edit a recoll python script in order to extract data] 6426 bytes
1 results
application/pdf [10.12345/sampledoi] [How to edit a recoll python script in order to extract data] 6426 bytes
1 results
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/pdf-tesseract.pdf] [pdf-tesseract.pdf] 226331 bytes