pdf: ocr: small fixes, plus make pdfocr redefinable in subdirs
This commit is contained in:
parent
1991e132a7
commit
5ff1a92a51
@ -1,3 +1,9 @@
|
|||||||
|
recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||||
|
|
||||||
|
* Fix issues in pdf handler
|
||||||
|
|
||||||
|
-- Jean-Francois Dockes <jf@dockes.org> Thu, 13 Jun 2019 08:41:00 +0200
|
||||||
|
|
||||||
recoll (1.25.18-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
recoll (1.25.18-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||||
|
|
||||||
* GUI: fixed webengine result list for newer qt versions (5.12 ok)
|
* GUI: fixed webengine result list for newer qt versions (5.12 ok)
|
||||||
|
|||||||
@ -1,3 +1,9 @@
|
|||||||
|
kio-recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||||
|
|
||||||
|
* keep kio in sync
|
||||||
|
|
||||||
|
-- Jean-Francois Dockes <jf@dockes.org> Thu, 13 Jun 2019 08:41:00 +0200
|
||||||
|
|
||||||
kio-recoll (1.25.17-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
kio-recoll (1.25.17-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
|
||||||
|
|
||||||
* New release 1.25.17
|
* New release 1.25.17
|
||||||
|
|||||||
@ -686,8 +686,8 @@ with possibly meaning-altering missing words.</para></listitem></varlistentry>
|
|||||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR">
|
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR">
|
||||||
<term><varname>pdfocr</varname></term>
|
<term><varname>pdfocr</varname></term>
|
||||||
<listitem><para>Attempt OCR of PDF files with no text content if both tesseract and
|
<listitem><para>Attempt OCR of PDF files with no text content if both tesseract and
|
||||||
pdftoppm are installed. The default is off because OCR is so
|
pdftoppm are installed. This can be defined in subdirectories. The default is off because
|
||||||
very slow.</para></listitem></varlistentry>
|
OCR is so very slow.</para></listitem></varlistentry>
|
||||||
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCRLANG">
|
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCRLANG">
|
||||||
<term><varname>pdfocrlang</varname></term>
|
<term><varname>pdfocrlang</varname></term>
|
||||||
<listitem><para>Language to assume for PDF OCR. This is very important for having a reasonable rate of errors
|
<listitem><para>Language to assume for PDF OCR. This is very important for having a reasonable rate of errors
|
||||||
|
|||||||
@ -9551,7 +9551,8 @@ for i in range(nres):
|
|||||||
"RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR"></a><span class="term"><code class="varname">pdfocr</code></span></dt>
|
"RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR"></a><span class="term"><code class="varname">pdfocr</code></span></dt>
|
||||||
<dd>
|
<dd>
|
||||||
<p>Attempt OCR of PDF files with no text content
|
<p>Attempt OCR of PDF files with no text content
|
||||||
if both tesseract and pdftoppm are installed. The
|
if both tesseract and pdftoppm are installed.
|
||||||
|
This can be defined in subdirectories. The
|
||||||
default is off because OCR is so very slow.</p>
|
default is off because OCR is so very slow.</p>
|
||||||
</dd>
|
</dd>
|
||||||
<dt><a name=
|
<dt><a name=
|
||||||
|
|||||||
@ -22,12 +22,13 @@
|
|||||||
#
|
#
|
||||||
# If pdftotext produces no text and tesseract is available, we try to
|
# If pdftotext produces no text and tesseract is available, we try to
|
||||||
# perform OCR. As this can be very slow and the result not always
|
# perform OCR. As this can be very slow and the result not always
|
||||||
# good, we only do this if a file named $RECOLL_CONFDIR/ocrpdf exists
|
# good, we only do this if the configuration asks for it
|
||||||
#
|
#
|
||||||
# We guess the OCR language in order of preference:
|
# We guess the OCR language in order of preference:
|
||||||
# - From the content of a ".ocrpdflang" file if it exists in the same
|
# - From the content of a ".ocrpdflang" file if it exists in the same
|
||||||
# directory as the PDF
|
# directory as the PDF
|
||||||
# - From an RECOLL_TESSERACT_LANG environment variable
|
# - Else from the pdfocrlang in recoll.conf
|
||||||
|
# - Else from a RECOLL_TESSERACT_LANG environment variable
|
||||||
# - From the content of $RECOLL_CONFDIR/ocrpdf
|
# - Else from the content of $RECOLL_CONFDIR/ocrpdf
|
||||||
# - Default to "eng"
|
# - Default to "eng"
|
||||||
|
|
||||||
@ -119,14 +120,12 @@ class PDFExtractor:
|
|||||||
# either the presence of a file in the config dir (historical)
|
# either the presence of a file in the config dir (historical)
|
||||||
# or a set config variable.
|
# or a set config variable.
|
||||||
self.ocrpossible = False
|
self.ocrpossible = False
|
||||||
cf_doocr = self.config.getConfParam("pdfocr")
|
self.tesseract = rclexecm.which("tesseract")
|
||||||
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
if self.tesseract:
|
||||||
self.tesseract = rclexecm.which("tesseract")
|
self.pdftoppm = rclexecm.which("pdftoppm")
|
||||||
if self.tesseract:
|
if self.pdftoppm:
|
||||||
self.pdftoppm = rclexecm.which("pdftoppm")
|
self.ocrpossible = True
|
||||||
if self.pdftoppm:
|
self.maybemaketmpdir()
|
||||||
self.ocrpossible = True
|
|
||||||
self.maybemaketmpdir()
|
|
||||||
# self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
|
# self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
|
||||||
|
|
||||||
# Pdftk is optionally used to extract attachments. This takes
|
# Pdftk is optionally used to extract attachments. This takes
|
||||||
@ -288,6 +287,7 @@ class PDFExtractor:
|
|||||||
|
|
||||||
files = glob.glob(tmpfile + "*")
|
files = glob.glob(tmpfile + "*")
|
||||||
for f in files:
|
for f in files:
|
||||||
|
out = b''
|
||||||
try:
|
try:
|
||||||
out = subprocess.check_output([self.tesseract, f, f, "-l",
|
out = subprocess.check_output([self.tesseract, f, f, "-l",
|
||||||
tesseractlang],
|
tesseractlang],
|
||||||
@ -305,8 +305,6 @@ class PDFExtractor:
|
|||||||
for f in files:
|
for f in files:
|
||||||
data += open(f, "rb").read()
|
data += open(f, "rb").read()
|
||||||
|
|
||||||
if not data:
|
|
||||||
return b""
|
|
||||||
return b'''<html><head>
|
return b'''<html><head>
|
||||||
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
||||||
</head><body><pre>''' + \
|
</head><body><pre>''' + \
|
||||||
@ -469,7 +467,10 @@ class PDFExtractor:
|
|||||||
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
||||||
|
|
||||||
if isempty and self.ocrpossible:
|
if isempty and self.ocrpossible:
|
||||||
html = self.ocrpdf()
|
self.config.setKeyDir(os.path.dirname(self.filename))
|
||||||
|
cf_doocr = self.config.getConfParam("pdfocr")
|
||||||
|
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
||||||
|
html = self.ocrpdf()
|
||||||
|
|
||||||
if self.extrameta:
|
if self.extrameta:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -858,8 +858,9 @@ snippetMaxPosWalk = 1000000
|
|||||||
# <var name="pdfocr" type="bool">
|
# <var name="pdfocr" type="bool">
|
||||||
#
|
#
|
||||||
# <brief>Attempt OCR of PDF files with no text content if both tesseract and
|
# <brief>Attempt OCR of PDF files with no text content if both tesseract and
|
||||||
# pdftoppm are installed.</brief><descr>The default is off because OCR is so
|
# pdftoppm are installed.</brief>
|
||||||
# very slow.</descr></var>
|
# <descr>This can be defined in subdirectories. The default is off because
|
||||||
|
# OCR is so very slow.</descr></var>
|
||||||
#pdfocr = 0
|
#pdfocr = 0
|
||||||
|
|
||||||
# <var name="pdfocrlang" type="string">
|
# <var name="pdfocrlang" type="string">
|
||||||
|
|||||||
@ -37,6 +37,9 @@ daemSkippedPaths = \
|
|||||||
/home/dockes/projets/fulltext/testrecoll/config
|
/home/dockes/projets/fulltext/testrecoll/config
|
||||||
|
|
||||||
pdfextrameta = pdf:Producer dc:identifier
|
pdfextrameta = pdf:Producer dc:identifier
|
||||||
|
[/home/dockes/projets/fulltext/testrecoll/pdf]
|
||||||
|
pdfocr = 1
|
||||||
|
pdfocrlang = eng
|
||||||
|
|
||||||
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
|
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
|
||||||
|
|
||||||
|
|||||||
@ -12,6 +12,8 @@ initvariables $0
|
|||||||
# url is prefixed for the dc:identifier search to work
|
# url is prefixed for the dc:identifier search to work
|
||||||
recollq dc:identifier:10.12345/sampledoi
|
recollq dc:identifier:10.12345/sampledoi
|
||||||
recollq 'pdf:Producer:"GPL Ghostscript 9.18"'
|
recollq 'pdf:Producer:"GPL Ghostscript 9.18"'
|
||||||
|
recollq '"bubbleupnp server to simulate openhome"'
|
||||||
|
|
||||||
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
) 2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
|
||||||
|
|
||||||
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
|
||||||
|
|||||||
@ -1,3 +1,9 @@
|
|||||||
2 results
|
2 results
|
||||||
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/ThinkingInPostScript.pdf] [Thinking In PostScript] 846435 bytes
|
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/ThinkingInPostScript.pdf] [Thinking In PostScript] 846435 bytes
|
||||||
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/linux unicode.pdf] [developerWorks: Linux | Unicode : Linux Unicode programming] 72669 bytes
|
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/linux unicode.pdf] [developerWorks: Linux | Unicode : Linux Unicode programming] 72669 bytes
|
||||||
|
1 results
|
||||||
|
application/pdf [10.12345/sampledoi] [How to edit a recoll python script in order to extract data] 6426 bytes
|
||||||
|
1 results
|
||||||
|
application/pdf [10.12345/sampledoi] [How to edit a recoll python script in order to extract data] 6426 bytes
|
||||||
|
1 results
|
||||||
|
application/pdf [file:///home/dockes/projets/fulltext/testrecoll/pdf/pdf-tesseract.pdf] [pdf-tesseract.pdf] 226331 bytes
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user