1205 lines
33 KiB
HTML
1205 lines
33 KiB
HTML
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
|
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
|
<head>
|
|
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8" />
|
|
<meta name="generator" content="AsciiDoc 8.6.9" />
|
|
<title>Indexing PDF XMP-metadata with Recoll</title>
|
|
<style type="text/css">
|
|
/* Shared CSS for AsciiDoc xhtml11 and html5 backends */
|
|
|
|
/* Default font. */
|
|
body {
|
|
font-family: Georgia,serif;
|
|
}
|
|
|
|
/* Title font. */
|
|
h1, h2, h3, h4, h5, h6,
|
|
div.title, caption.title,
|
|
thead, p.table.header,
|
|
#toctitle,
|
|
#author, #revnumber, #revdate, #revremark,
|
|
#footer {
|
|
font-family: Arial,Helvetica,sans-serif;
|
|
}
|
|
|
|
body {
|
|
margin: 1em 5% 1em 5%;
|
|
}
|
|
|
|
a {
|
|
color: blue;
|
|
text-decoration: underline;
|
|
}
|
|
a:visited {
|
|
color: fuchsia;
|
|
}
|
|
|
|
em {
|
|
font-style: italic;
|
|
color: navy;
|
|
}
|
|
|
|
strong {
|
|
font-weight: bold;
|
|
color: #083194;
|
|
}
|
|
|
|
h1, h2, h3, h4, h5, h6 {
|
|
color: #527bbd;
|
|
margin-top: 1.2em;
|
|
margin-bottom: 0.5em;
|
|
line-height: 1.3;
|
|
}
|
|
|
|
h1, h2, h3 {
|
|
border-bottom: 2px solid silver;
|
|
}
|
|
h2 {
|
|
padding-top: 0.5em;
|
|
}
|
|
h3 {
|
|
float: left;
|
|
}
|
|
h3 + * {
|
|
clear: left;
|
|
}
|
|
h5 {
|
|
font-size: 1.0em;
|
|
}
|
|
|
|
div.sectionbody {
|
|
margin-left: 0;
|
|
}
|
|
|
|
hr {
|
|
border: 1px solid silver;
|
|
}
|
|
|
|
p {
|
|
margin-top: 0.5em;
|
|
margin-bottom: 0.5em;
|
|
}
|
|
|
|
ul, ol, li > p {
|
|
margin-top: 0;
|
|
}
|
|
ul > li { color: #aaa; }
|
|
ul > li > * { color: black; }
|
|
|
|
.monospaced, code, pre {
|
|
font-family: "Courier New", Courier, monospace;
|
|
font-size: inherit;
|
|
color: navy;
|
|
padding: 0;
|
|
margin: 0;
|
|
}
|
|
pre {
|
|
white-space: pre-wrap;
|
|
}
|
|
|
|
#author {
|
|
color: #527bbd;
|
|
font-weight: bold;
|
|
font-size: 1.1em;
|
|
}
|
|
#email {
|
|
}
|
|
#revnumber, #revdate, #revremark {
|
|
}
|
|
|
|
#footer {
|
|
font-size: small;
|
|
border-top: 2px solid silver;
|
|
padding-top: 0.5em;
|
|
margin-top: 4.0em;
|
|
}
|
|
#footer-text {
|
|
float: left;
|
|
padding-bottom: 0.5em;
|
|
}
|
|
#footer-badges {
|
|
float: right;
|
|
padding-bottom: 0.5em;
|
|
}
|
|
|
|
#preamble {
|
|
margin-top: 1.5em;
|
|
margin-bottom: 1.5em;
|
|
}
|
|
div.imageblock, div.exampleblock, div.verseblock,
|
|
div.quoteblock, div.literalblock, div.listingblock, div.sidebarblock,
|
|
div.admonitionblock {
|
|
margin-top: 1.0em;
|
|
margin-bottom: 1.5em;
|
|
}
|
|
div.admonitionblock {
|
|
margin-top: 2.0em;
|
|
margin-bottom: 2.0em;
|
|
margin-right: 10%;
|
|
color: #606060;
|
|
}
|
|
|
|
div.content { /* Block element content. */
|
|
padding: 0;
|
|
}
|
|
|
|
/* Block element titles. */
|
|
div.title, caption.title {
|
|
color: #527bbd;
|
|
font-weight: bold;
|
|
text-align: left;
|
|
margin-top: 1.0em;
|
|
margin-bottom: 0.5em;
|
|
}
|
|
div.title + * {
|
|
margin-top: 0;
|
|
}
|
|
|
|
td div.title:first-child {
|
|
margin-top: 0.0em;
|
|
}
|
|
div.content div.title:first-child {
|
|
margin-top: 0.0em;
|
|
}
|
|
div.content + div.title {
|
|
margin-top: 0.0em;
|
|
}
|
|
|
|
div.sidebarblock > div.content {
|
|
background: #ffffee;
|
|
border: 1px solid #dddddd;
|
|
border-left: 4px solid #f0f0f0;
|
|
padding: 0.5em;
|
|
}
|
|
|
|
div.listingblock > div.content {
|
|
border: 1px solid #dddddd;
|
|
border-left: 5px solid #f0f0f0;
|
|
background: #f8f8f8;
|
|
padding: 0.5em;
|
|
}
|
|
|
|
div.quoteblock, div.verseblock {
|
|
padding-left: 1.0em;
|
|
margin-left: 1.0em;
|
|
margin-right: 10%;
|
|
border-left: 5px solid #f0f0f0;
|
|
color: #888;
|
|
}
|
|
|
|
div.quoteblock > div.attribution {
|
|
padding-top: 0.5em;
|
|
text-align: right;
|
|
}
|
|
|
|
div.verseblock > pre.content {
|
|
font-family: inherit;
|
|
font-size: inherit;
|
|
}
|
|
div.verseblock > div.attribution {
|
|
padding-top: 0.75em;
|
|
text-align: left;
|
|
}
|
|
/* DEPRECATED: Pre version 8.2.7 verse style literal block. */
|
|
div.verseblock + div.attribution {
|
|
text-align: left;
|
|
}
|
|
|
|
div.admonitionblock .icon {
|
|
vertical-align: top;
|
|
font-size: 1.1em;
|
|
font-weight: bold;
|
|
text-decoration: underline;
|
|
color: #527bbd;
|
|
padding-right: 0.5em;
|
|
}
|
|
div.admonitionblock td.content {
|
|
padding-left: 0.5em;
|
|
border-left: 3px solid #dddddd;
|
|
}
|
|
|
|
div.exampleblock > div.content {
|
|
border-left: 3px solid #dddddd;
|
|
padding-left: 0.5em;
|
|
}
|
|
|
|
div.imageblock div.content { padding-left: 0; }
|
|
span.image img { border-style: none; vertical-align: text-bottom; }
|
|
a.image:visited { color: white; }
|
|
|
|
dl {
|
|
margin-top: 0.8em;
|
|
margin-bottom: 0.8em;
|
|
}
|
|
dt {
|
|
margin-top: 0.5em;
|
|
margin-bottom: 0;
|
|
font-style: normal;
|
|
color: navy;
|
|
}
|
|
dd > *:first-child {
|
|
margin-top: 0.1em;
|
|
}
|
|
|
|
ul, ol {
|
|
list-style-position: outside;
|
|
}
|
|
ol.arabic {
|
|
list-style-type: decimal;
|
|
}
|
|
ol.loweralpha {
|
|
list-style-type: lower-alpha;
|
|
}
|
|
ol.upperalpha {
|
|
list-style-type: upper-alpha;
|
|
}
|
|
ol.lowerroman {
|
|
list-style-type: lower-roman;
|
|
}
|
|
ol.upperroman {
|
|
list-style-type: upper-roman;
|
|
}
|
|
|
|
div.compact ul, div.compact ol,
|
|
div.compact p, div.compact p,
|
|
div.compact div, div.compact div {
|
|
margin-top: 0.1em;
|
|
margin-bottom: 0.1em;
|
|
}
|
|
|
|
tfoot {
|
|
font-weight: bold;
|
|
}
|
|
td > div.verse {
|
|
white-space: pre;
|
|
}
|
|
|
|
div.hdlist {
|
|
margin-top: 0.8em;
|
|
margin-bottom: 0.8em;
|
|
}
|
|
div.hdlist tr {
|
|
padding-bottom: 15px;
|
|
}
|
|
dt.hdlist1.strong, td.hdlist1.strong {
|
|
font-weight: bold;
|
|
}
|
|
td.hdlist1 {
|
|
vertical-align: top;
|
|
font-style: normal;
|
|
padding-right: 0.8em;
|
|
color: navy;
|
|
}
|
|
td.hdlist2 {
|
|
vertical-align: top;
|
|
}
|
|
div.hdlist.compact tr {
|
|
margin: 0;
|
|
padding-bottom: 0;
|
|
}
|
|
|
|
.comment {
|
|
background: yellow;
|
|
}
|
|
|
|
.footnote, .footnoteref {
|
|
font-size: 0.8em;
|
|
}
|
|
|
|
span.footnote, span.footnoteref {
|
|
vertical-align: super;
|
|
}
|
|
|
|
#footnotes {
|
|
margin: 20px 0 20px 0;
|
|
padding: 7px 0 0 0;
|
|
}
|
|
|
|
#footnotes div.footnote {
|
|
margin: 0 0 5px 0;
|
|
}
|
|
|
|
#footnotes hr {
|
|
border: none;
|
|
border-top: 1px solid silver;
|
|
height: 1px;
|
|
text-align: left;
|
|
margin-left: 0;
|
|
width: 20%;
|
|
min-width: 100px;
|
|
}
|
|
|
|
div.colist td {
|
|
padding-right: 0.5em;
|
|
padding-bottom: 0.3em;
|
|
vertical-align: top;
|
|
}
|
|
div.colist td img {
|
|
margin-top: 0.3em;
|
|
}
|
|
|
|
@media print {
|
|
#footer-badges { display: none; }
|
|
}
|
|
|
|
#toc {
|
|
margin-bottom: 2.5em;
|
|
}
|
|
|
|
#toctitle {
|
|
color: #527bbd;
|
|
font-size: 1.1em;
|
|
font-weight: bold;
|
|
margin-top: 1.0em;
|
|
margin-bottom: 0.1em;
|
|
}
|
|
|
|
div.toclevel0, div.toclevel1, div.toclevel2, div.toclevel3, div.toclevel4 {
|
|
margin-top: 0;
|
|
margin-bottom: 0;
|
|
}
|
|
div.toclevel2 {
|
|
margin-left: 2em;
|
|
font-size: 0.9em;
|
|
}
|
|
div.toclevel3 {
|
|
margin-left: 4em;
|
|
font-size: 0.9em;
|
|
}
|
|
div.toclevel4 {
|
|
margin-left: 6em;
|
|
font-size: 0.9em;
|
|
}
|
|
|
|
span.aqua { color: aqua; }
|
|
span.black { color: black; }
|
|
span.blue { color: blue; }
|
|
span.fuchsia { color: fuchsia; }
|
|
span.gray { color: gray; }
|
|
span.green { color: green; }
|
|
span.lime { color: lime; }
|
|
span.maroon { color: maroon; }
|
|
span.navy { color: navy; }
|
|
span.olive { color: olive; }
|
|
span.purple { color: purple; }
|
|
span.red { color: red; }
|
|
span.silver { color: silver; }
|
|
span.teal { color: teal; }
|
|
span.white { color: white; }
|
|
span.yellow { color: yellow; }
|
|
|
|
span.aqua-background { background: aqua; }
|
|
span.black-background { background: black; }
|
|
span.blue-background { background: blue; }
|
|
span.fuchsia-background { background: fuchsia; }
|
|
span.gray-background { background: gray; }
|
|
span.green-background { background: green; }
|
|
span.lime-background { background: lime; }
|
|
span.maroon-background { background: maroon; }
|
|
span.navy-background { background: navy; }
|
|
span.olive-background { background: olive; }
|
|
span.purple-background { background: purple; }
|
|
span.red-background { background: red; }
|
|
span.silver-background { background: silver; }
|
|
span.teal-background { background: teal; }
|
|
span.white-background { background: white; }
|
|
span.yellow-background { background: yellow; }
|
|
|
|
span.big { font-size: 2em; }
|
|
span.small { font-size: 0.6em; }
|
|
|
|
span.underline { text-decoration: underline; }
|
|
span.overline { text-decoration: overline; }
|
|
span.line-through { text-decoration: line-through; }
|
|
|
|
div.unbreakable { page-break-inside: avoid; }
|
|
|
|
|
|
/*
|
|
* xhtml11 specific
|
|
*
|
|
* */
|
|
|
|
div.tableblock {
|
|
margin-top: 1.0em;
|
|
margin-bottom: 1.5em;
|
|
}
|
|
div.tableblock > table {
|
|
border: 3px solid #527bbd;
|
|
}
|
|
thead, p.table.header {
|
|
font-weight: bold;
|
|
color: #527bbd;
|
|
}
|
|
p.table {
|
|
margin-top: 0;
|
|
}
|
|
/* Because the table frame attribute is overridden by CSS in most browsers. */
|
|
div.tableblock > table[frame="void"] {
|
|
border-style: none;
|
|
}
|
|
div.tableblock > table[frame="hsides"] {
|
|
border-left-style: none;
|
|
border-right-style: none;
|
|
}
|
|
div.tableblock > table[frame="vsides"] {
|
|
border-top-style: none;
|
|
border-bottom-style: none;
|
|
}
|
|
|
|
|
|
/*
|
|
* html5 specific
|
|
*
|
|
* */
|
|
|
|
table.tableblock {
|
|
margin-top: 1.0em;
|
|
margin-bottom: 1.5em;
|
|
}
|
|
thead, p.tableblock.header {
|
|
font-weight: bold;
|
|
color: #527bbd;
|
|
}
|
|
p.tableblock {
|
|
margin-top: 0;
|
|
}
|
|
table.tableblock {
|
|
border-width: 3px;
|
|
border-spacing: 0px;
|
|
border-style: solid;
|
|
border-color: #527bbd;
|
|
border-collapse: collapse;
|
|
}
|
|
th.tableblock, td.tableblock {
|
|
border-width: 1px;
|
|
padding: 4px;
|
|
border-style: solid;
|
|
border-color: #527bbd;
|
|
}
|
|
|
|
table.tableblock.frame-topbot {
|
|
border-left-style: hidden;
|
|
border-right-style: hidden;
|
|
}
|
|
table.tableblock.frame-sides {
|
|
border-top-style: hidden;
|
|
border-bottom-style: hidden;
|
|
}
|
|
table.tableblock.frame-none {
|
|
border-style: hidden;
|
|
}
|
|
|
|
th.tableblock.halign-left, td.tableblock.halign-left {
|
|
text-align: left;
|
|
}
|
|
th.tableblock.halign-center, td.tableblock.halign-center {
|
|
text-align: center;
|
|
}
|
|
th.tableblock.halign-right, td.tableblock.halign-right {
|
|
text-align: right;
|
|
}
|
|
|
|
th.tableblock.valign-top, td.tableblock.valign-top {
|
|
vertical-align: top;
|
|
}
|
|
th.tableblock.valign-middle, td.tableblock.valign-middle {
|
|
vertical-align: middle;
|
|
}
|
|
th.tableblock.valign-bottom, td.tableblock.valign-bottom {
|
|
vertical-align: bottom;
|
|
}
|
|
|
|
|
|
/*
|
|
* manpage specific
|
|
*
|
|
* */
|
|
|
|
body.manpage h1 {
|
|
padding-top: 0.5em;
|
|
padding-bottom: 0.5em;
|
|
border-top: 2px solid silver;
|
|
border-bottom: 2px solid silver;
|
|
}
|
|
body.manpage h2 {
|
|
border-style: none;
|
|
}
|
|
body.manpage div.sectionbody {
|
|
margin-left: 3em;
|
|
}
|
|
|
|
@media print {
|
|
body.manpage div#toc { display: none; }
|
|
}
|
|
|
|
|
|
</style>
|
|
<script type="text/javascript">
|
|
/*<+'])');
|
|
// Function that scans the DOM tree for header elements (the DOM2
|
|
// nodeIterator API would be a better technique but not supported by all
|
|
// browsers).
|
|
var iterate = function (el) {
|
|
for (var i = el.firstChild; i != null; i = i.nextSibling) {
|
|
if (i.nodeType == 1 /* Node.ELEMENT_NODE */) {
|
|
var mo = re.exec(i.tagName);
|
|
if (mo && (i.getAttribute("class") || i.getAttribute("className")) != "float") {
|
|
result[result.length] = new TocEntry(i, getText(i), mo[1]-1);
|
|
}
|
|
iterate(i);
|
|
}
|
|
}
|
|
}
|
|
iterate(el);
|
|
return result;
|
|
}
|
|
|
|
var toc = document.getElementById("toc");
|
|
if (!toc) {
|
|
return;
|
|
}
|
|
|
|
// Delete existing TOC entries in case we're reloading the TOC.
|
|
var tocEntriesToRemove = [];
|
|
var i;
|
|
for (i = 0; i < toc.childNodes.length; i++) {
|
|
var entry = toc.childNodes[i];
|
|
if (entry.nodeName.toLowerCase() == 'div'
|
|
&& entry.getAttribute("class")
|
|
&& entry.getAttribute("class").match(/^toclevel/))
|
|
tocEntriesToRemove.push(entry);
|
|
}
|
|
for (i = 0; i < tocEntriesToRemove.length; i++) {
|
|
toc.removeChild(tocEntriesToRemove[i]);
|
|
}
|
|
|
|
// Rebuild TOC entries.
|
|
var entries = tocEntries(document.getElementById("content"), toclevels);
|
|
for (var i = 0; i < entries.length; ++i) {
|
|
var entry = entries[i];
|
|
if (entry.element.id == "")
|
|
entry.element.id = "_toc_" + i;
|
|
var a = document.createElement("a");
|
|
a.href = "#" + entry.element.id;
|
|
a.appendChild(document.createTextNode(entry.text));
|
|
var div = document.createElement("div");
|
|
div.appendChild(a);
|
|
div.className = "toclevel" + entry.toclevel;
|
|
toc.appendChild(div);
|
|
}
|
|
if (entries.length == 0)
|
|
toc.parentNode.removeChild(toc);
|
|
},
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////
|
|
// Footnotes generator
|
|
/////////////////////////////////////////////////////////////////////
|
|
|
|
/* Based on footnote generation code from:
|
|
* http://www.brandspankingnew.net/archive/2005/07/format_footnote.html
|
|
*/
|
|
|
|
footnotes: function () {
// Rebuilds the footnote list inside the element with id "footnotes" from the
// span elements under the element with id "content" whose className is
// exactly "footnote" or "footnoteref". Takes no arguments and returns
// nothing; does nothing at all when the "footnotes" element is absent.
|
|
// Delete existing footnote entries in case we're reloading the footnotes.
|
|
var i;
|
|
var noteholder = document.getElementById("footnotes");
|
|
if (!noteholder) {
|
|
return;
|
|
}
|
|
// Matching entries are collected first and removed in a second pass,
// presumably so that removal does not disturb the index-based walk over
// noteholder.childNodes.
var entriesToRemove = [];
|
|
for (i = 0; i < noteholder.childNodes.length; i++) {
|
|
var entry = noteholder.childNodes[i];
|
|
if (entry.nodeName.toLowerCase() == 'div' && entry.getAttribute("class") == "footnote")
|
|
entriesToRemove.push(entry);
|
|
}
|
|
for (i = 0; i < entriesToRemove.length; i++) {
|
|
noteholder.removeChild(entriesToRemove[i]);
|
|
}
|
|
|
|
// Rebuild footnote entries.
|
|
// NOTE(review): cont is used without a null check, so this assumes an
// element with id "content" exists.
var cont = document.getElementById("content");
|
|
var spans = cont.getElementsByTagName("span");
|
|
// refs maps "#" + span id to the footnote number assigned to that span.
var refs = {};
|
|
// n counts the footnote spans seen so far (the first footnote gets 1).
var n = 0;
|
|
for (i=0; i<spans.length; i++) {
|
|
if (spans[i].className == "footnote") {
|
|
n++;
|
|
// data-note holds the note text saved by an earlier run of this function;
// when it is missing, the text is still inline in the span.
var note = spans[i].getAttribute("data-note");
|
|
if (!note) {
|
|
// Use [\s\S] in place of . so multi-line matches work.
|
|
// Because JavaScript has no s (dotall) regex flag.
|
|
// The capture is greedy: it runs from the first "[" to the last "]".
// NOTE(review): the match() result is indexed without a null check.
note = spans[i].innerHTML.match(/\s*\[([\s\S]*)]\s*/)[1];
|
|
// Replace the inline note text with a numbered link to the footnote entry.
spans[i].innerHTML =
|
|
"[<a id='_footnoteref_" + n + "' href='#_footnote_" + n +
|
|
"' title='View footnote' class='footnote'>" + n + "</a>]";
|
|
spans[i].setAttribute("data-note", note);
|
|
}
|
|
// Append this footnote's entry (with a back-link to the reference) to the holder.
noteholder.innerHTML +=
|
|
"<div class='footnote' id='_footnote_" + n + "'>" +
|
|
"<a href='#_footnoteref_" + n + "' title='Return to text'>" +
|
|
n + "</a>. " + note + "</div>";
|
|
var id =spans[i].getAttribute("id");
|
|
// Remember the number for spans that carry an id, so the footnoteref
// spans processed below can look it up by href.
if (id != null) refs["#"+id] = n;
|
|
}
|
|
}
|
|
// No footnote spans found: remove the holder element itself. Otherwise
// rewrite every footnoteref span to point at its footnote's number.
if (n == 0)
|
|
noteholder.parentNode.removeChild(noteholder);
|
|
else {
|
|
// Process footnoterefs.
|
|
for (i=0; i<spans.length; i++) {
|
|
if (spans[i].className == "footnoteref") {
|
|
// NOTE(review): assumes the span contains at least one <a> whose href
// includes a "#"; neither lookup is null-checked.
var href = spans[i].getElementsByTagName("a")[0].getAttribute("href");
|
|
href = href.match(/#.*/)[0]; // Because IE returns the full URL.
|
|
// n is reused here as the referenced footnote's number; it is undefined
// when the href was never recorded in refs.
n = refs[href];
|
|
spans[i].innerHTML =
|
|
"[<a href='#_footnote_" + n +
|
|
"' title='View footnote' class='footnote'>" + n + "</a>]";
|
|
}
|
|
}
|
|
}
|
|
},
|
|
|
|
install: function(toclevels) {
|
|
var timerId;
|
|
|
|
function reinstall() {
|
|
asciidoc.footnotes();
|
|
if (toclevels) {
|
|
asciidoc.toc(toclevels);
|
|
}
|
|
}
|
|
|
|
function reinstallAndRemoveTimer() {
|
|
clearInterval(timerId);
|
|
reinstall();
|
|
}
|
|
|
|
timerId = setInterval(reinstall, 500);
|
|
if (document.addEventListener)
|
|
document.addEventListener("DOMContentLoaded", reinstallAndRemoveTimer, false);
|
|
else
|
|
window.onload = reinstallAndRemoveTimer;
|
|
}
|
|
|
|
}
|
|
asciidoc.install();
|
|
/*]]>*/
|
|
</script>
|
|
</head>
|
|
<body class="article">
|
|
<div id="header">
|
|
<h1>Indexing PDF XMP-metadata with Recoll</h1>
|
|
</div>
|
|
<div id="content">
|
|
<div id="preamble">
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>The original document describing XMP metadata usage with Recoll was
|
|
written by Jeffrey Dick and is <a href="original-text.html">still available
|
|
here</a>. However it described using the old shell-based PDF Recoll input
|
|
handler, which differs a lot from doing something equivalent with the
|
|
current Python-based one (for which XMP capability is available from
|
|
Recoll 1.23.2, but the new handler can be used with previous Recoll
|
|
versions).</p></div>
|
|
<div class="paragraph"><p>I based this page on the text by Jeffrey Dick, using input from Johannes
|
|
Menzel for all examples about the new features. The discussion which led to
|
|
the updated handler is a
|
|
<a href="https://bitbucket.org/medoc/recoll/issues/300/extracting-xmp-metadata-and-tmsu-tags">Bitbucket
|
|
Recoll issue</a>.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_introduction">Introduction</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>Organizing and searching a large collection of PDFs as part of a
|
|
research project can be a demanding task.
|
|
<a href="http://en.wikipedia.org/wiki/Extensible_Metadata_Platform">XMP
|
|
metadata</a> stored in a PDF, such as journal title, publication year,
|
|
and user-added keywords, are often useful when searching for a
|
|
publication.</p></div>
|
|
<div class="paragraph"><p>Here, we describe how to customize Recoll to retrieve and store this metadata,
|
|
and how to define a result paragraph format to display it. See also a related
|
|
wiki entry,
|
|
<a href="https://bitbucket.org/medoc/recoll/wiki/HandleCustomField.wiki">Generating
|
|
a custom field and using it to sort results</a>, for sorting results on PDF
|
|
page count.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_saving_metadata_to_pdfs">Saving metadata to PDFs</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>Bibliographic metadata can be saved in the PDF file itself. In
|
|
the <a href="http://jabref.sourceforge.net">JabRef</a> bibliography
|
|
manager, this is done with the "Write XMP-metadata to PDFs" menu
|
|
item. Note the presence of the keywords in the screenshot below; this
|
|
field is a good place to tag the PDF with any words of your choosing
|
|
to describe genre, topic, etc.</p></div>
|
|
<div class="imageblock">
|
|
<div class="content">
|
|
<img src="jabref_metadata.png" alt="Editing metadata with jabref" />
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_custom_indexing_short_example_fields_file">Custom indexing short example (fields file)</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>The following example (extract from a complete configuration shown later)
|
|
creates two fields named "refjournal" and "refpages", which are both stored
|
|
(so they can be displayed in result list entries), and indexed (you can
|
|
specifically search them).</p></div>
|
|
<div class="paragraph"><p>Some other types of metadata, such as title, author and keywords, are
|
|
already indexed by Recoll (the default rclpdf finds them using the
|
|
<strong>pdftotext</strong> command) so there is no need to add those to the [prefixes]
|
|
section.</p></div>
|
|
<div class="paragraph"><p>This is taken from the <code>fields</code> file inside the configuration
|
|
(e.g. <em>~/.recoll/fields</em>).</p></div>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>[prefixes]
|
|
refjournal=RFJOURNAL
|
|
refpages=RFPAGES
|
|
|
|
[stored]
|
|
refjournal =
|
|
refpages =
|
|
|
|
[aliases]
|
|
refjournal = bibtex:journal bibtex:journaltitle
|
|
refpages = bibtex:pages</code></pre>
|
|
</div></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_telling_the_handler_what_fields_to_extract">Telling the handler what fields to extract</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>As of Recoll 1.23.2, the PDF handler has the capability to use <strong>pdfinfo</strong>
|
|
for extracting XMP metadata. The switch for executing <strong>pdfinfo</strong> is the
|
|
<em>pdfextrameta</em> configuration parameter, and the value of the parameter is a
|
|
list of XMP tags to extract, with optional conversion to Recoll field names
|
|
(the XMP qualified tag name is kept by default, the translation is
|
|
separated by a <em>|</em> character). Example (without translations):</p></div>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>pdfextrameta = bibtex:year bibtex:journal bibtex:journaltitle</code></pre>
|
|
</div></div>
|
|
<div class="paragraph"><p>Note that it is quite equivalent to translate a field name inside
|
|
<em>pdfextrameta</em> or to use aliases inside the <em>fields</em> file.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_editing_the_field_values">Editing the field values</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>Shortly after the 1.23.2 release, the new rclpdf.py was modified to
|
|
enable calling external Python code for editing the values of the XMP
|
|
metadata fields. The name of the external script is defined by the
|
|
<em>pdfextrametafix</em> configuration variable, and it should define a
|
|
<em>MetaFixer</em> class, with a <em>metafix()</em> method.</p></div>
|
|
<div class="paragraph"><p>In practice, add the following to recoll.conf:</p></div>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>pdfextrametafix = /path/to/my/script.py</code></pre>
|
|
</div></div>
|
|
<div class="paragraph"><p>The Python script could look like the following:</p></div>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>import sys
|
|
import re
|
|
|
|
# This can be used for local XMP field editing.
|
|
#
|
|
# A new instance is created for each PDF document (so the object could
|
|
# keep state to avoid, e.g. duplicate values)
|
|
#
|
|
# The metafix method receives an (original) field name, and the text
|
|
# value, and should return the possibly modified text.
|
|
class MetaFixer(object):
|
|
def __init__(self):
|
|
pass
|
|
|
|
def metafix(self, nm, txt):
|
|
if nm == 'bibtex:pages':
|
|
txt = re.sub(r'--', '-', txt)
|
|
elif nm == 'someothername':
|
|
# do something else
|
|
pass
|
|
elif nm == 'stillanother':
|
|
# etc.
|
|
pass
|
|
|
|
return txt</code></pre>
|
|
</div></div>
|
|
<div class="paragraph"><p>The metadata-editing script can be modified to fill in the "journal" field for
|
|
BibTeX entries that aren’t journal articles (e.g. bibtex:booktitle
|
|
for "InCollection" entries), by defining a <em>wrapup()</em> method which will
|
|
be called with the whole metadata array (an array of <em>(nm,value)</em>
|
|
pairs) for global editing/removing/addition.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_indexing">Indexing</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>Then index away!</p></div>
|
|
<div class="paragraph"><p>Note that you can also run the rclpdf.py script manually,
|
|
e.g. <code>rclpdf.py -d /path/to/some.pdf</code>, to inspect the
|
|
output. If things are working correctly, the &lt;head&gt; consists of the
|
|
HTML meta elements, and the &lt;body&gt; contains the text of the PDF.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_result_paragraph_format">Result paragraph format</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>The result paragraph format defines what fields are displayed inside the Recoll
|
|
result list, and how they are formatted.</p></div>
|
|
<div class="paragraph"><p>Edit this using the Recoll GUI: Preferences > GUI configuration >
|
|
Result List > Edit result paragraph format string.</p></div>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>&lt;table class="respar" style="padding-bottom: 10px;" cellspacing="5" cellpadding="5"&gt;
|
|
|
|
&lt;thead style="vertical-align: top;"&gt;
|
|
&lt;tr&gt;
|
|
&lt;td colspan="3" style="border-bottom: 1pt dotted #004070; font-size: smaller;"&gt;&lt;a href="E%N"&gt;%u&lt;/a&gt; | %S | Relevanz: %R&lt;/td&gt;
|
|
&lt;/tr&gt;
|
|
&lt;/thead&gt;
|
|
|
|
&lt;tbody style="vertical-align: top;"&gt;
|
|
&lt;tr&gt;
|
|
&lt;td&gt;&lt;a href="P%N"&gt;&lt;img src="%I" alt="" width="64" height="auto" /&gt;&lt;/a&gt;&lt;/td&gt;
|
|
&lt;td style="width: 250px;"&gt;&lt;span style="color: #004070;"&gt;
|
|
&lt;div style="font-style: italic;"&gt;%(author)&lt;/div&gt;
|
|
&lt;div style="font-weight: bold;"&gt;&lt;a href="E%N"&gt;&amp;raquo;%T&amp;laquo;&lt;/a&gt;&lt;/div&gt;
|
|
&lt;div style="text-transform: uppercase; margin-top: 5pt"&gt;%(reftype)&lt;/div&gt;&lt;/td&gt;
|
|
&lt;td&gt;
|
|
&lt;div style="font-size: smaller;"&gt;
|
|
%(refauthor)%(refchapter) %(reftitle)%(refeditor)%(refbooktitle)%(refjournal)%(refvolume)%(refnumber)%(refaddress)%(reflocation)%(refpublisher)%(refyear)%(refpages).&lt;/div&gt;
|
|
&lt;div style="text-align: justify; font-family: serif; margin-top: 5pt; margin-bottom: 5pt"&gt;&amp;raquo;&lt;a href="A%N"&gt;%A&lt;/a&gt;&amp;laquo;&lt;/div&gt;
|
|
&lt;div&gt;%(refkeywords)&lt;/div&gt;
|
|
&lt;div style="font-size: smaller;"&gt;&lt;a href="%(refurl)"&gt;%(refurl)&lt;/a&gt;&lt;/div&gt;
|
|
&lt;div style="font-size: smaller"&gt; %(refkey) %(refisbn) %(refissn) %(refdoi)&lt;/div&gt;&lt;/td&gt;
|
|
&lt;/tr&gt;
|
|
&lt;/tbody&gt;
|
|
|
|
&lt;/table&gt;</code></pre>
|
|
</div></div>
|
|
<div class="paragraph"><p>And the result list header (Preferences > GUI configuration >
|
|
Result List > Result Page HTML header insert):</p></div>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>&lt;!-- Custom Header --&gt;
|
|
|
|
&lt;script type="text/javascript"&gt;
|
|
function altRows() {
|
|
var rows = document.getElementsByClassName("rclresult");
|
|
for (i = 0; i &lt; rows.length; i++) {
|
|
if (i % 2 == 0) {
|
|
rows[i].style.backgroundColor = "#f0f0f0";
|
|
}
|
|
}
|
|
}
|
|
window.onload = function() {
|
|
altRows();
|
|
}
|
|
&lt;/script&gt;
|
|
|
|
&lt;style type="text/css"&gt;
|
|
a:link {
|
|
color: #004070;
|
|
text-decoration: none;
|
|
}
|
|
a:visited {
|
|
color: #004070;
|
|
text-decoration: none;
|
|
}
|
|
a:hover {
|
|
color: #0050a0;
|
|
text-decoration: none;
|
|
}
|
|
a:active {
|
|
color: #005080;
|
|
text-decoration: none;
|
|
}
|
|
&lt;/style&gt;
|
|
&lt;!-- End of Custom Header --&gt;</code></pre>
|
|
</div></div>
|
|
<div class="paragraph"><p>There are
|
|
<a href="https://bitbucket.org/medoc/recoll/wiki/ResultsThumbnails">various
|
|
methods for creating the thumbnails</a>; the ones here were made by opening
|
|
the directory containing the PDFs in the Dolphin file manager (part of KDE)
|
|
and selecting the Preview option.</p></div>
|
|
<div class="paragraph"><p>And the result:</p></div>
|
|
<div class="imageblock">
|
|
<div class="content">
|
|
<img src="recoll_query.png" alt="Result list display" />
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_more_possibilities">More possibilities</h2>
|
|
<div class="sectionbody">
|
|
<div class="ulist"><ul>
|
|
<li>
|
|
<p>
|
|
The sort buttons (up- and down-arrows) in Recoll sort the
|
|
results by the modified date on the file at the time of indexing. If
|
|
you want this sorting to reflect the publication year, then the
|
|
timestamp should be set accordingly. If names of the PDFs contain
|
|
the year (e.g. BZS2007.pdf, CKE+2011.pdf), the following one-liner
|
|
would set the modified date to January 1st of the year:
|
|
</p>
|
|
</li>
|
|
</ul></div>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>for i in `ls *.pdf`; do touch -d `echo $i | sed 's/[^0-9]*//g'`-01-01 $i; done</code></pre>
|
|
</div></div>
|
|
<div class="paragraph"><p>Note that the publication year could then be shown in
|
|
the result list using the stored date of the file (using "%D" in the
|
|
result paragraph format, and date format "%Y") instead of having to
|
|
add the year to the index as shown above.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_complete_example">Complete example</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>This was designed by Johannes Menzel, who kindly provided the data when we
|
|
worked on improving PDF XMP data extraction. The originals are listed in
|
|
this
|
|
<a href="https://bitbucket.org/medoc/recoll/issues/300/extracting-xmp-metadata-and-tmsu-tags">Bitbucket issue</a>.</p></div>
|
|
<div class="paragraph"><p>The paragraph format is listed above.</p></div>
|
|
<div class="sect2">
|
|
<h3 id="_em_recoll_conf_em_additions"><em>recoll.conf</em> additions:</h3>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>pdfextrameta = bibtex:journal bibtex:journaltitle bibtex:pages \
|
|
bibtex:volume bibtex:number bibtex:booktitle bibtex:year bibtex:author \
|
|
bibtex:title bibtex:isbn bibtex:issn bibtex:editor bibtex:address \
|
|
bibtex:location bibtex:doi bibtex:chapter bibtex:url bibtex:entrytype \
|
|
bibtex:bibtexkey bibtex:abstract bibtex:date bibtex:keywords \
|
|
bibtex:comment bibtex:language bibtex:edition bibtex:totalpages \
|
|
dc:creator dc:relation dc:publisher dc:title dc:type dc:identifier
|
|
|
|
defaultcharset = UTF-8//
|
|
|
|
pdfextrametafix = /home/hannes/.recoll/metafix.py</code></pre>
|
|
</div></div>
|
|
</div>
|
|
<div class="sect2">
|
|
<h3 id="_em_metafix_py_em_script"><em>metafix.py</em> script:</h3>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>import sys
|
|
import re
|
|
|
|
# This can be used for local XMP field editing.
|
|
#
|
|
# A new instance is created for each PDF document (so the object could
|
|
# keep state to avoid, e.g. duplicate values)
|
|
#
|
|
# The metafix method receives an (original) field name, and the text
|
|
# value, and should return the possibly modified text.
|
|
class MetaFixer(object):
|
|
def __init__(self):
|
|
pass
|
|
|
|
def metafix(self, nm, txt):
|
|
if nm == 'bibtex:pages':
|
|
txt = re.sub(r'--', '-', txt)
|
|
txt = re.sub(r'^', ', p. ', txt)
|
|
elif nm == 'bibtex:author':
|
|
txt = re.sub(r'$', ':\ ', txt)
|
|
pass
|
|
elif nm == 'bibtex:chapter':
|
|
txt = re.sub(r'^', ', in: id.: ', txt)
|
|
pass
|
|
elif nm == 'bibtex:editor':
|
|
txt = re.sub(r'^', ', in: ', txt)
|
|
txt = re.sub(r'$', ' (ed.):\ ', txt)
|
|
pass
|
|
elif nm == 'bibtex:year':
|
|
txt = re.sub(r'^', ', ', txt)
|
|
pass
|
|
elif nm == 'bibtex:date':
|
|
txt = re.sub(r'^', ', ', txt)
|
|
pass
|
|
elif nm == 'bibtex:volume':
|
|
txt = re.sub(r'^', ', vol. ', txt)
|
|
pass
|
|
elif nm == 'bibtex:number':
|
|
txt = re.sub(r'^', ', no. ', txt)
|
|
pass
|
|
elif nm == 'bibtex:journaltitle':
|
|
txt = re.sub(r'^', ', in: ', txt)
|
|
pass
|
|
elif nm == 'bibtex:journal':
|
|
txt = re.sub(r'^', ', in: ', txt)
|
|
pass
|
|
elif nm == 'bibtex:title':
|
|
txt = re.sub(r'^', '"', txt)
|
|
txt = re.sub(r'$', '"', txt)
|
|
pass
|
|
elif nm == 'bibtex:location':
|
|
txt = re.sub(r'^', ', ', txt)
|
|
txt = re.sub(r'$', ':\ ', txt)
|
|
pass
|
|
elif nm == 'bibtex:address':
|
|
txt = re.sub(r'^', ', ', txt)
|
|
txt = re.sub(r'$', ':\ ', txt)
|
|
pass
|
|
elif nm == 'bibtex:isbn':
|
|
txt = re.sub(r'^', 'ISBN: ', txt)
|
|
pass
|
|
elif nm == 'bibtex:issn':
|
|
txt = re.sub(r'^', 'ISSN: ', txt)
|
|
pass
|
|
elif nm == 'bibtex:doi':
|
|
txt = re.sub(r'^', 'DOI: ', txt)
|
|
pass
|
|
elif nm == 'bibtex:bibtexkey':
|
|
txt = re.sub(r'^', 'Key: ', txt)
|
|
pass
|
|
|
|
return txt</code></pre>
|
|
</div></div>
|
|
</div>
|
|
<div class="sect2">
|
|
<h3 id="_em_fields_em_file"><em>fields</em> file:</h3>
|
|
<div class="listingblock">
|
|
<div class="content">
|
|
<pre><code>[prefixes]
|
|
|
|
refjournal=RFJOURNAL
|
|
refpages=RFPAGES
|
|
reftitle=RFTTITLE
|
|
refvolume=RFVOLUME
|
|
refauthor=RFAUTHOR
|
|
refyear=RFYYEAR
|
|
refisbn=RFISBN
|
|
refissn=RFISSN
|
|
refdoi=RFDOI
|
|
refeditor=RFEDITOR
|
|
refpublisher=RFPUBLISHER
|
|
refaddress=RFADDRESS
|
|
reflocation=RFLOCATION
|
|
refbooktitle=RFBOOKTITLE
|
|
refurl=RFURL
|
|
reftype=RFTYPE
|
|
refkey=RFKEY
|
|
refabstract=RFABSTRACT
|
|
refkeywords=RFKEYWORDS
|
|
refcomment=RFCOMMENT
|
|
refedition=RFEDITION
|
|
reflanguage=RFLANGUAGE
|
|
|
|
[stored]
|
|
|
|
refjournal=
|
|
refpages=
|
|
reftitle=
|
|
refvolume=
|
|
refauthor=
|
|
refyear=
|
|
refisbn=
|
|
refissn=
|
|
refdoi=
|
|
refeditor=
|
|
refpublisher=
|
|
refaddress=
|
|
reflocation=
|
|
refbooktitle=
|
|
refurl=
|
|
reftype=
|
|
refkey=
|
|
refabstract=
|
|
refkeywords=
|
|
refcomment=
|
|
refedition=
|
|
reflanguage=
|
|
refid=
|
|
|
|
[aliases]
|
|
|
|
refjournal = bibtex:journal bibtex:journaltitle
|
|
refpages = bibtex:pages
|
|
reftitle = bibtex:title
|
|
refvolume = bibtex:volume
|
|
refauthor = bibtex:author
|
|
refyear = bibtex:year bibtex:date
|
|
refid = dc:identifier bibtex:isbn bibtex:issn
|
|
refisbn = bibtex:isbn
|
|
refissn = bibtex:issn
|
|
refdoi = bibtex:doi
|
|
refeditor = bibtex:editor
|
|
refpublisher = bibtex:publisher
|
|
refaddress = bibtex:address
|
|
reflocation = bibtex:location
|
|
refbooktitle = bibtex:booktitle
|
|
refurl = bibtex:url
|
|
reftype = bibtex:entrytype bibtex:type
|
|
refkey = bibtex:bibtexkey
|
|
refabstract = bibtex:abstract
|
|
refkeywords = bibtex:keywords
|
|
refcomment = bibtex:comment
|
|
refedition = bibtex:edition
|
|
reflanguage = bibtex:language
|
|
author = xesam:author</code></pre>
|
|
</div></div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div id="footnotes"><hr /></div>
|
|
<div id="footer">
|
|
<div id="footer-text">
|
|
Last updated
|
|
2017-05-24 11:10:15 CEST
|
|
</div>
|
|
</div>
|
|
</body>
|
|
</html>
|