Merge branch 'index_path' into dev

Use splitter
2021-07-02 19:35:50 +10:00 · 2021-07-02 19:35:33 +10:00 · 2021-07-02 18:32:17 +10:00 · 2021-07-02 18:31:12 +10:00 · 2021-07-02 18:21:07 +10:00
2 changed files with 38 additions and 1 deletions
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@@ -469,7 +469,40 @@ class PDFExtractor:
        if annotsfield:
            self.em.setfield("pdfannot", annotsfield)
        return html
-    
+
+    def _patch_meta(self, html):
+        '''This fixes https://gitlab.freedesktop.org/poppler/poppler/-/issues/136'''
+
+        if not _mswindows:
+            pdfinfo = rclexecm.which("pdfinfo")
+        if not pdfinfo:
+            pdfinfo = rclexecm.which("poppler/pdfinfo")
+        
+        if not pdfinfo:
+            return html
+
+        info = subprocess.check_output([pdfinfo, '-isodates', self.filename])
+
+        meta = {}
+        for line in info.split(b'\n'):
+            try:
+                key, value = line.strip().split(b':', 1)
+                meta[key.strip()] = value.strip().replace(b'"', b'\"')
+            except ValueError:
+                pass
+
+        if b'CreationDate' in meta:
+            meta[b'date'] = meta[b'CreationDate']
+
+        title = meta.get('Title')
+        head = [b'<title>%s</title>' % title] if title else []
+        head += [ b'<meta name="%s" content="%s"/>' % x for x in meta.items() ]
+
+        start = html.index(b'<head>') + 6
+        end = html.index(b'</head>')
+        return html[:start] + b"".join(head) + html[end:]
+
+
    def _selfdoc(self):
        '''Extract the text from the pdf doc (as opposed to attachment)'''
        self.em.setmimetype('text/html')
@@ -483,6 +516,8 @@ class PDFExtractor:
                                        "UTF-8", "-eol", "unix", "-q",
                                        self.filename, "-"])

+        html = self._patch_meta(html)
+
        html, isempty = self._fixhtml(html)
        #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1601,6 +1601,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
                                        splitter.basepos + splitter.curpos++);
            }
            splitter.basepos += splitter.curpos + 100;
+	    splitter.setTraits(FieldTraits());
+	    splitter.text_to_words(path);
        }

        // Index textual metadata.  These are all indexed as text with
Author	SHA1	Message	Date
Tris	53b4f4fcb2	Merge branch 'index_path' into dev	2021-07-02 19:35:50 +10:00
Tris	0f47653b53	Use splitter	2021-07-02 19:35:33 +10:00
Tris	404d0e418e	Merge branch 'index_path' into dev	2021-07-02 18:32:17 +10:00
Tris	551bc3df5e	Fixes issue with pdftohtml	2021-07-02 18:31:12 +10:00
Tris	0a761f269e	Add unprefixed term for path components	2021-07-02 18:21:07 +10:00