Compare commits

...

5 Commits
master ... dev

Author SHA1 Message Date
Tris
53b4f4fcb2 Merge branch 'index_path' into dev 2021-07-02 19:35:50 +10:00
Tris
0f47653b53 Use splitter 2021-07-02 19:35:33 +10:00
Tris
404d0e418e Merge branch 'index_path' into dev 2021-07-02 18:32:17 +10:00
Tris
551bc3df5e Fixes issue with pdftohtml 2021-07-02 18:31:12 +10:00
Tris
0a761f269e Add unprefixed term for path components 2021-07-02 18:21:07 +10:00
2 changed files with 38 additions and 1 deletions

View File

@ -469,7 +469,40 @@ class PDFExtractor:
if annotsfield:
self.em.setfield("pdfannot", annotsfield)
return html
def _patch_meta(self, html):
    '''Rebuild the HTML header from the metadata reported by pdfinfo.

    Works around
    https://gitlab.freedesktop.org/poppler/poppler/-/issues/136

    The content between <head> and </head> in ``html`` is replaced by a
    <title> element (when pdfinfo reports a Title) followed by one
    <meta name=... content=.../> element per "Key: value" line printed
    by ``pdfinfo -isodates``. CreationDate is additionally exposed under
    the name "date".

    This is best effort: ``html`` is returned unchanged when pdfinfo
    cannot be located, when running it fails, or when the document has
    no literal <head>...</head> section.

    html: the HTML document, as bytes.
    Returns: the patched document, as bytes.
    '''
    def _escape(data):
        # '&' has to be replaced first, else the entities produced by
        # the following replacements would get escaped a second time.
        return (data.replace(b'&', b'&amp;')
                .replace(b'<', b'&lt;')
                .replace(b'>', b'&gt;')
                .replace(b'"', b'&quot;'))

    # Always bind the name, so that skipping the first lookup on
    # Windows can not leave it undefined.
    pdfinfo = None
    if not _mswindows:
        pdfinfo = rclexecm.which("pdfinfo")
    if not pdfinfo:
        pdfinfo = rclexecm.which("poppler/pdfinfo")
    if not pdfinfo:
        return html

    try:
        info = subprocess.check_output(
            [pdfinfo, '-isodates', self.filename])
    except (OSError, subprocess.CalledProcessError) as exc:
        # Missing metadata should not prevent processing the text.
        self.em.rclog("pdfinfo failed for %s: %s" % (self.filename, exc))
        return html

    meta = {}
    for line in info.split(b'\n'):
        # Only split on the first colon: values such as ISO dates
        # contain colons themselves.
        key, sep, value = line.partition(b':')
        key = key.strip()
        if not sep or not key:
            continue
        # A backslash does not escape anything in HTML: character
        # references are needed to keep the generated markup valid.
        # NOTE(review): confirm that _fixhtml() does not escape the
        # header values a second time.
        meta[_escape(key)] = _escape(value.strip())

    if b'CreationDate' in meta:
        meta[b'date'] = meta[b'CreationDate']

    # All keys are bytes: a str key would never match.
    title = meta.get(b'Title')
    head = [b'<title>%s</title>' % title] if title else []
    head += [b'<meta name="%s" content="%s"/>' % item
             for item in meta.items()]

    opentag = b'<head>'
    start = html.find(opentag)
    end = html.find(b'</head>')
    if start < 0 or end < start + len(opentag):
        # No usable header section: leave the document alone.
        return html
    start += len(opentag)
    return html[:start] + b"".join(head) + html[end:]
def _selfdoc(self):
'''Extract the text from the pdf doc (as opposed to attachment)'''
self.em.setmimetype('text/html')
@ -483,6 +516,8 @@ class PDFExtractor:
"UTF-8", "-eol", "unix", "-q",
self.filename, "-"])
html = self._patch_meta(html)
html, isempty = self._fixhtml(html)
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))

View File

@ -1601,6 +1601,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
splitter.basepos + splitter.curpos++);
}
splitter.basepos += splitter.curpos + 100;
splitter.setTraits(FieldTraits());
splitter.text_to_words(path);
}
// Index textual metadata. These are all indexed as text with