diff --git a/packaging/debian/debian/changelog b/packaging/debian/debian/changelog index 2b2430fe..3f8e6e28 100644 --- a/packaging/debian/debian/changelog +++ b/packaging/debian/debian/changelog @@ -1,3 +1,9 @@ +recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low + + * Fix issues in pdf handler + + -- Jean-Francois Dockes Thu, 13 Jun 2019 08:41:00 +0200 + recoll (1.25.18-1~ppaPPAVERS~SERIES1) SERIES; urgency=low * GUI: fixed webengine result list for newer qt versions (5.12 ok) diff --git a/packaging/debian/debiankio/changelog b/packaging/debian/debiankio/changelog index 30a6245a..862b346c 100644 --- a/packaging/debian/debiankio/changelog +++ b/packaging/debian/debiankio/changelog @@ -1,3 +1,9 @@ +kio-recoll (1.25.19-1~ppaPPAVERS~SERIES1) SERIES; urgency=low + + * keep kio in sync + + -- Jean-Francois Dockes Thu, 13 Jun 2019 08:41:00 +0200 + kio-recoll (1.25.17-1~ppaPPAVERS~SERIES1) SERIES; urgency=low * New release 1.25.17 diff --git a/src/doc/user/recoll.conf.xml b/src/doc/user/recoll.conf.xml index 0d6d7557..f9dc94a4 100644 --- a/src/doc/user/recoll.conf.xml +++ b/src/doc/user/recoll.conf.xml @@ -686,8 +686,8 @@ with possibly meaning-altering missing words. pdfocr Attempt OCR of PDF files with no text content if both tesseract and -pdftoppm are installed. The default is off because OCR is so -very slow. +pdftoppm are installed. This can be defined in subdirectories. The default is off because +OCR is so very slow. pdfocrlang Language to assume for PDF OCR. This is very important for having a reasonable rate of errors diff --git a/src/doc/user/usermanual.html b/src/doc/user/usermanual.html index 4eec2043..60c0cb20 100644 --- a/src/doc/user/usermanual.html +++ b/src/doc/user/usermanual.html @@ -9551,7 +9551,8 @@ for i in range(nres): "RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR">pdfocr

Attempt OCR of PDF files with no text content - if both tesseract and pdftoppm are installed. The + if both tesseract and pdftoppm are installed. + This can be defined in subdirectories. The default is off because OCR is so very slow.

''' + \
@@ -469,7 +467,10 @@ class PDFExtractor:
         #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
 
         if isempty and self.ocrpossible:
-            html = self.ocrpdf()
+            self.config.setKeyDir(os.path.dirname(self.filename))
+            cf_doocr = self.config.getConfParam("pdfocr")
+            if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
+                html = self.ocrpdf()
 
         if self.extrameta:
             try:
diff --git a/src/sampleconf/recoll.conf b/src/sampleconf/recoll.conf
index 2aba3583..a2ff5e9e 100644
--- a/src/sampleconf/recoll.conf
+++ b/src/sampleconf/recoll.conf
@@ -858,8 +858,9 @@ snippetMaxPosWalk = 1000000
 # 
 #
 # Attempt OCR of PDF files with no text content if both tesseract and
-# pdftoppm are installed.The default is off because OCR is so
-# very slow.
+# pdftoppm are installed.
+# This can be set per directory, inside a [/some/dir] section. The
+# default is off because OCR is so very slow.
 #pdfocr = 0
 
 # 
diff --git a/tests/config/recoll.conf b/tests/config/recoll.conf
index 488bd09f..484fe83e 100644
--- a/tests/config/recoll.conf
+++ b/tests/config/recoll.conf
@@ -37,6 +37,9 @@ daemSkippedPaths =  \
     /home/dockes/projets/fulltext/testrecoll/config
 
 pdfextrameta = pdf:Producer dc:identifier
+[/home/dockes/projets/fulltext/testrecoll/pdf]
+pdfocr = 1
+pdfocrlang = eng
 
 unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
 
diff --git a/tests/pdf/pdf.sh b/tests/pdf/pdf.sh
index c59687a2..fe5a1ce6 100755
--- a/tests/pdf/pdf.sh
+++ b/tests/pdf/pdf.sh
@@ -12,6 +12,8 @@ initvariables $0
     # url is prefixed for the dc:identifier search to work
     recollq dc:identifier:10.12345/sampledoi
     recollq 'pdf:Producer:"GPL Ghostscript 9.18"'
+    recollq '"bubbleupnp server to simulate openhome"'
+    
 )  2> $mystderr | egrep -v '^Recoll query: ' > $mystdout
 
 diff -w ${myname}.txt $mystdout > $mydiffs 2>&1
diff --git a/tests/pdf/pdf.txt b/tests/pdf/pdf.txt
index a1e1697b..b45815d8 100644
--- a/tests/pdf/pdf.txt
+++ b/tests/pdf/pdf.txt
@@ -1,3 +1,9 @@
-2 results 
+2 results
 application/pdf	[file:///home/dockes/projets/fulltext/testrecoll/pdf/ThinkingInPostScript.pdf]	[Thinking In PostScript]	846435	bytes	
 application/pdf	[file:///home/dockes/projets/fulltext/testrecoll/pdf/linux unicode.pdf]	[developerWorks: Linux | Unicode : Linux Unicode programming]	72669	bytes	
+1 results
+application/pdf	[10.12345/sampledoi]	[How to edit a recoll python script in order to extract data]	6426	bytes	
+1 results
+application/pdf	[10.12345/sampledoi]	[How to edit a recoll python script in order to extract data]	6426	bytes	
+1 results
+application/pdf	[file:///home/dockes/projets/fulltext/testrecoll/pdf/pdf-tesseract.pdf]	[pdf-tesseract.pdf]	226331	bytes