Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
53b4f4fcb2 | ||
|
|
0f47653b53 | ||
|
|
404d0e418e | ||
|
|
551bc3df5e | ||
|
|
0a761f269e |
@ -469,7 +469,40 @@ class PDFExtractor:
|
|||||||
if annotsfield:
|
if annotsfield:
|
||||||
self.em.setfield("pdfannot", annotsfield)
|
self.em.setfield("pdfannot", annotsfield)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
def _patch_meta(self, html):
    '''Replace the <head> content of the HTML with metadata read from pdfinfo.

    This fixes https://gitlab.freedesktop.org/poppler/poppler/-/issues/136

    Runs "pdfinfo -isodates" on self.filename, parses its "Key: value"
    output lines, and substitutes everything between <head> and </head>
    in html with a <title> element (when pdfinfo reports a Title) followed
    by one <meta name=... content=.../> element per parsed key. The
    CreationDate value, when present, is additionally exposed as "date".

    html -- bytes, the HTML document to patch.

    Returns the patched bytes. The input is returned unchanged when
    pdfinfo is not available (it is never looked up on Windows), when
    running it fails, or when html has no <head>...</head> section:
    this step is a best-effort enhancement and must not make an
    otherwise successful extraction fail.
    '''

    # Bind the name up front: the lookup below is skipped on Windows and
    # the availability test that follows must not hit an unbound local.
    pdfinfo = None
    if not _mswindows:
        pdfinfo = rclexecm.which("pdfinfo")
        if not pdfinfo:
            pdfinfo = rclexecm.which("poppler/pdfinfo")

    if not pdfinfo:
        return html

    try:
        info = subprocess.check_output([pdfinfo, '-isodates', self.filename])
    except (OSError, subprocess.CalledProcessError):
        # Keep the unpatched HTML rather than losing the document.
        return html

    def _escape(data):
        # Names and values end up inside double-quoted attributes and
        # inside <title>, so markup characters must become entities.
        # '&' goes first so the entities produced below are not re-escaped.
        return (data.replace(b'&', b'&amp;')
                    .replace(b'<', b'&lt;')
                    .replace(b'>', b'&gt;')
                    .replace(b'"', b'&quot;'))

    meta = {}
    for line in info.split(b'\n'):
        # Split on the first colon only: values (e.g. ISO dates) may
        # contain colons themselves. Lines without a colon are ignored.
        key, colon, value = line.strip().partition(b':')
        if not colon:
            continue
        meta[_escape(key.strip())] = _escape(value.strip())

    if b'CreationDate' in meta:
        meta[b'date'] = meta[b'CreationDate']

    # Keys are bytes, so the lookup key must be bytes too.
    title = meta.get(b'Title')
    head = [b'<title>%s</title>' % title] if title else []
    head += [b'<meta name="%s" content="%s"/>' % item for item in meta.items()]

    opening = html.find(b'<head>')
    end = html.find(b'</head>')
    if opening == -1 or end == -1:
        return html
    start = opening + len(b'<head>')
    if end < start:
        # A closing tag located before the opening one: leave as is.
        return html

    return html[:start] + b"".join(head) + html[end:]
|
||||||
|
|
||||||
|
|
||||||
def _selfdoc(self):
|
def _selfdoc(self):
|
||||||
'''Extract the text from the pdf doc (as opposed to attachment)'''
|
'''Extract the text from the pdf doc (as opposed to attachment)'''
|
||||||
self.em.setmimetype('text/html')
|
self.em.setmimetype('text/html')
|
||||||
@ -483,6 +516,8 @@ class PDFExtractor:
|
|||||||
"UTF-8", "-eol", "unix", "-q",
|
"UTF-8", "-eol", "unix", "-q",
|
||||||
self.filename, "-"])
|
self.filename, "-"])
|
||||||
|
|
||||||
|
html = self._patch_meta(html)
|
||||||
|
|
||||||
html, isempty = self._fixhtml(html)
|
html, isempty = self._fixhtml(html)
|
||||||
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
#self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))
|
||||||
|
|
||||||
|
|||||||
@ -1601,6 +1601,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi, Doc &doc)
|
|||||||
splitter.basepos + splitter.curpos++);
|
splitter.basepos + splitter.curpos++);
|
||||||
}
|
}
|
||||||
splitter.basepos += splitter.curpos + 100;
|
splitter.basepos += splitter.curpos + 100;
|
||||||
|
splitter.setTraits(FieldTraits());
|
||||||
|
splitter.text_to_words(path);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Index textual metadata. These are all indexed as text with
|
// Index textual metadata. These are all indexed as text with
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user