pdf: ocr: small fixes, plus make pdfocr redefinable in subdirs
This commit is contained in:
parent
1991e132a7
commit
5ff1a92a51
@ -1,3 +1,9 @@
|
||||
recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* Fix issues in pdf handler
|
||||
|
||||
-- Jean-Francois Dockes <jf@dockes.org> Thu, 13 Jun 2019 08:41:00 +0200
|
||||
|
||||
recoll (1.25.18-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* GUI: fixed webengine result list for newer qt versions (5.12 ok)
|
||||
|
||||
@ -1,3 +1,9 @@
|
||||
kio-recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* keep kio in sync
|
||||
|
||||
-- Jean-Francois Dockes <jf@dockes.org> Thu, 13 Jun 2019 08:41:00 +0200
|
||||
|
||||
kio-recoll (1.25.17-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||
|
||||
* New release 1.25.17
|
||||
|
||||
@ -686,8 +686,8 @@ with possibly meaning-altering missing words.</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR">
|
||||
<term><varname>pdfocr</varname></term>
|
||||
<listitem><para>Attempt OCR of PDF files with no text content if both tesseract and
|
||||
pdftoppm are installed. The default is off because OCR is so
|
||||
very slow.</para></listitem></varlistentry>
|
||||
pdftoppm are installed. This can be defined in subdirectories. The default is off because
|
||||
OCR is so very slow.</para></listitem></varlistentry>
|
||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCRLANG">
|
||||
<term><varname>pdfocrlang</varname></term>
|
||||
<listitem><para>Language to assume for PDF OCR. This is very important for having a reasonable rate of errors
|
||||
|
||||
@ -9551,7 +9551,8 @@ for i in range(nres):
|
||||
"RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR"></a><span class="term"><code class="varname">pdfocr</code></span></dt>
|
||||
<dd>
|
||||
<p>Attempt OCR of PDF files with no text content
|
||||
if both tesseract and pdftoppm are installed. The
|
||||
if both tesseract and pdftoppm are installed.
|
||||
This can be defined in subdirectories. The
|
||||
default is off because OCR is so very slow.</p>
|
||||
</dd>
|
||||
<dt><a name=
|
||||
|
||||
@ -22,12 +22,13 @@
|
||||
#
|
||||
# If pdftotext produces no text and tesseract is available, we try to
|
||||
# perform OCR. As this can be very slow and the result not always
|
||||
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
|
||||
# good, we only do this if the configuration requests it
|
||||
#
|
||||
# We guess the OCR language in order of preference:
|
||||
# - From the content of a ".ocrpdflang" file if it exists in the same
|
||||
# directory as the PDF
|
||||
# - From an RECOLL_TESSERACT_LANG environment variable
|
||||
# - Else from the pdfocrlang in recoll.conf
|
||||
# - Else from a RECOLL_TESSERACT_LANG environment variable
|
||||
# - From the content of $RECOLL_CONFDIR/ocrpdf
|
||||
# - Default to "eng"
|
||||
|
||||
@ -119,14 +120,12 @@ class PDFExtractor:
|
||||
# either the presence of a file in the config dir (historical)
|
||||
# or a set config variable.
|
||||
self.ocrpossible = False
|
||||
cf_doocr = self.config.getConfParam("pdfocr")
|
||||
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
||||
self.tesseract = rclexecm.which("tesseract")
|
||||
if self.tesseract:
|
||||
self.pdftoppm = rclexecm.which("pdftoppm")
|
||||
if self.pdftoppm:
|
||||
self.ocrpossible = True
|
||||
self.maybemaketmpdir()
|
||||
self.tesseract = rclexecm.which("tesseract")
|
||||
if self.tesseract:
|
||||
self.pdftoppm = rclexecm.which("pdftoppm")
|
||||
if self.pdftoppm:
|
||||
self.ocrpossible = True
|
||||
self.maybemaketmpdir()
|
||||
# self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
|
||||
|
||||
# Pdftk is optionally used to extract attachments. This takes
|
||||
@ -288,6 +287,7 @@ class PDFExtractor:
|
||||
|
||||
files = glob.glob(tmpfile + "*")
|
||||
for f in files:
|
||||
out = b''
|
||||
try:
|
||||
out = subprocess.check_output([self.tesseract, f, f, "-l",
|
||||
tesseractlang],
|
||||
@ -305,8 +305,6 @@ class PDFExtractor:
|
||||
for f in files:
|
||||
data += open(f, "rb").read()
|
||||
|
||||
if not data:
|
||||
return b""
|
||||
return b'''<html><head>
|
||||
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
||||
</head><body><pre>''' + \
|
||||
@ -469,7 +467,10 @@ class PDFExtractor:
|
||||
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
||||
|
||||
if isempty and self.ocrpossible:
|
||||
html = self.ocrpdf()
|
||||
self.config.setKeyDir(os.path.dirname(self.filename))
|
||||
cf_doocr = self.config.getConfParam("pdfocr")
|
||||
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
||||
html = self.ocrpdf()
|
||||
|
||||
if self.extrameta:
|
||||
try:
|
||||
|
||||
@ -858,8 +858,9 @@ snippetMaxPosWalk = 1000000
|
||||
# <var name="pdfocr" type="bool">
|
||||
#
|
||||
# <brief>Attempt OCR of PDF files with no text content if both tesseract and
|
||||
# pdftoppm are installed.</brief><descr>The default is off because OCR is so
|
||||
# very slow.</descr></var>
|
||||
# pdftoppm are installed.</brief>
|
||||
# <descr>This can be defined in subdirectories. The default is off because
|
||||
# OCR is so very slow.</descr></var>
|
||||
#pdfocr = 0
|
||||
|
||||
# <var name="pdfocrlang" type="string">
|
||||
|
||||
@ -37,6 +37,9 @@ daemSkippedPaths = \
|
||||
/home/dockes/projets/fulltext/testrecoll/config
|
||||
|
||||
pdfextrameta = pdf:Producer dc:identifier
|
||||
[/home/dockes/projets/fulltext/testrecoll/pdf]
|
||||
pdfocr = 1
|
||||
pdfocrlang = eng
|
||||
|
||||
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
|
||||
|
||||
|
||||
@ -12,6 +12,8 @@ initvariables $0
|
||||
# url is prefixed for the dc:identifier search to work
|
||||
recollq dc:identifier:10.12345/sampledoi
|
||||
recollq 'pdf:Producer:"GPL Ghostscript 9.18"'
|
||||
recollq '"bubbleupnp server to simulate openhome"'
|
||||
|
||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||
|
||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||
|
||||
@ -1,3 +1,9 @@
|
||||
2 results
|
||||
2 results
|
||||
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/ThinkingInPostScript.pdf] [Thinking In PostScript] 846435 bytes
|
||||
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/linux unicode.pdf] [developerWorks: Linux | Unicode : Linux Unicode programming] 72669 bytes
|
||||
1 results
|
||||
application/pdf [10.12345/sampledoi] [How to edit a recoll python script in order to extract data] 6426 bytes
|
||||
1 results
|
||||
application/pdf [10.12345/sampledoi] [How to edit a recoll python script in order to extract data] 6426 bytes
|
||||
1 results
|
||||
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/pdf-tesseract.pdf] [pdf-tesseract.pdf] 226331 bytes
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user