Recoll is a document indexing application: it allows you to find documents by specifying search terms.
The documents need to be indexed for searches to be fast. In a nutshell, we convert the different document formats to text, then split the text into terms and remember where those occur. This is a time-consuming operation.
Up to version 1.18, Recoll indexing is single-threaded: a set of routines which call each other sequentially.

In most personal indexer contexts, it is also CPU-bound. A lot of conversion work is necessary to turn those PDF (or other) files into appropriately cleaned-up pure text, then split it into terms and update the index. Given the relatively modest amount of data, and the speed of storage, I/O issues are secondary.
Watching top report 75% CPU idle on my quad-core machine while waiting for the indexing to finish was frustrating, and I was tempted to find a way to keep the other cores warm and shorten the wait.

For some usages, the best way to accomplish this may be to just partition the index and independently run indexing on different configurations, using multiple processes to better utilize the available processing power.

This is not a universal solution though, as it is complicated to set up, generally not optimal for indexing performance, and not always optimal at query time either.
The most natural way to improve indexing times is to increase CPU utilization by using multiple threads inside an indexing process.
Something similar had been done in earlier versions of the Recoll GUI, which had an internal indexing thread. This had been a frequent source of trouble though, and linking the GUI and indexing process lifetimes was a bad idea, so, in recent versions, the indexing is always performed by an external process. Still, this experience had brought to light most of the problem areas, and prepared the code for further work.
It should be noted that, as recollindex is both nice'd and ionice'd as a lowest-priority process, it will only use free computing power on the machine, and will step down as soon as anything else wants to work.
In general, increasing the machine's utilization with recollindex just does not change its responsiveness. My PC has an Intel Core i5 750 (4 cores, no hyperthreading), which is far from being a high-performance CPU (nowadays…), and I often forget that I am running indexing tests; it is just not noticeable. The machine does have a lot of memory though (12GB).
The Recoll indexing processing flow
There are 4 main steps in the recollindex processing pipeline:

1. Find the file
2. Convert it to text
3. Process the text (split, strip etc.) and create a Xapian document
4. Update the index
The first step, walking the file system (or some other data source), is usually much faster than the others, and we just leave it alone to be performed by the main thread. It outputs file names (and the associated POSIX stat data).
The last step, Xapian index updating, can only be single-threaded.
The first idea is to change the indexing pipeline so that each step is performed by an independent worker thread, passing its output to the next thread, in assembly-line fashion.

In order to achieve this, we need to decouple the different phases. They are normally linked by procedure calls, which we replace with a job control object: the WorkQueue.
The WorkQueue
The WorkQueue object is implemented by a reasonably simple class, which manages an input queue on which clients append jobs, and a set of worker threads, which retrieve and perform the jobs, and whose lifetimes are managed by the WorkQueue object. The implementation is straightforward with POSIX threads synchronization functions and C++ STL data structures.
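To make this concrete, here is a minimal sketch of what such a class can look like, using POSIX threads synchronization and an STL queue as just described. This is an illustration under simplifying assumptions (interface, names, omitted error handling and destruction), not the actual Recoll WorkQueue:

```cpp
// Minimal WorkQueue sketch: clients block in put() when the queue is full,
// workers block in take() when it is empty. Illustrative only.
#include <pthread.h>
#include <queue>
#include <vector>

template <class T> class WorkQueue {
public:
    explicit WorkQueue(size_t maxlen) : m_maxlen(maxlen), m_closed(false) {
        pthread_mutex_init(&m_mutex, 0);
        pthread_cond_init(&m_hasjobs, 0);  // signaled when a job is added
        pthread_cond_init(&m_hasroom, 0);  // signaled when a job is taken
    }

    // Start nworkers threads executing func(arg). func() is expected to
    // loop on take() until it returns false.
    bool start(int nworkers, void *(*func)(void *), void *arg) {
        for (int i = 0; i < nworkers; i++) {
            pthread_t thr;
            if (pthread_create(&thr, 0, func, arg) != 0)
                return false;
            m_workers.push_back(thr);
        }
        return true;
    }

    // Client side: append a job, blocking while the queue is full.
    bool put(const T &job) {
        pthread_mutex_lock(&m_mutex);
        while (!m_closed && m_queue.size() >= m_maxlen)
            pthread_cond_wait(&m_hasroom, &m_mutex);
        if (m_closed) {
            pthread_mutex_unlock(&m_mutex);
            return false;
        }
        m_queue.push(job);
        pthread_cond_signal(&m_hasjobs);
        pthread_mutex_unlock(&m_mutex);
        return true;
    }

    // Worker side: retrieve a job, blocking while the queue is empty.
    // Returns false when the queue is closed and drained: time to exit.
    bool take(T *jobp) {
        pthread_mutex_lock(&m_mutex);
        while (!m_closed && m_queue.empty())
            pthread_cond_wait(&m_hasjobs, &m_mutex);
        if (m_queue.empty()) {
            pthread_mutex_unlock(&m_mutex);
            return false;
        }
        *jobp = m_queue.front();
        m_queue.pop();
        pthread_cond_signal(&m_hasroom);
        pthread_mutex_unlock(&m_mutex);
        return true;
    }

    // Refuse new jobs, let the workers drain the queue, then join them.
    void close() {
        pthread_mutex_lock(&m_mutex);
        m_closed = true;
        pthread_cond_broadcast(&m_hasjobs);
        pthread_cond_broadcast(&m_hasroom);
        pthread_mutex_unlock(&m_mutex);
        for (size_t i = 0; i < m_workers.size(); i++)
            pthread_join(m_workers[i], 0);
    }

private:
    size_t m_maxlen;
    bool m_closed;
    std::queue<T> m_queue;
    std::vector<pthread_t> m_workers;
    pthread_mutex_t m_mutex;
    pthread_cond_t m_hasjobs, m_hasroom;
};
```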
In practice, it proved quite simple to modify the existing code to create a job object and put it on the queue, instead of calling the downstream routine with the job parameters, while keeping the capacity to call the downstream routine directly. The type of coupling is determined either by compilation flags (for globally disabling/enabling multithreading), or according to configuration data, which allows experimenting with different thread arrangements just by changing parameters in a file, without recompiling.
Each WorkQueue accepts two parameters: the length of the input queue (before a client will block when trying to add a job), and the number of worker threads. Both parameters can be set in the Recoll configuration file for each of the three queues used in the indexing pipeline. Setting the queue length to -1 will disable the corresponding queue (using a direct call instead).
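Continuing the sketch above, a call site can then either queue the job or call the downstream routine directly, which is how the -1 setting falls back to single-threaded operation. All names here are hypothetical, invented for illustration:

```cpp
// Hypothetical wiring: a worker loop draining the queue, and a dispatch
// routine which falls back to a direct call when the queue is disabled.
#include <string>

static WorkQueue<std::string> *convertQueue;  // null when disabled (-1 in config)

void convertFile(const std::string &filename);  // the downstream routine

void *convertWorker(void *)
{
    std::string filename;
    while (convertQueue->take(&filename))  // false once closed and drained
        convertFile(filename);
    return 0;
}

void dispatchFile(const std::string &filename)
{
    if (convertQueue)
        convertQueue->put(filename);  // multithreaded path: hand off and return
    else
        convertFile(filename);        // direct call, as in the original code
}
```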
The Assembly Line
So the first idea is to create 3 explicit threads to manage the file conversion, the term generation, and the Xapian index update. The first thread prepares a file, passes it on to the term generation thread, and immediately goes back to work on the next file, etc.
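Expressed with the configuration parameters detailed later in this article, such a pure assembly line corresponds to one worker thread per stage; these values are illustrative:

```
thrQSizes = 2 2 2
thrTCounts = 1 1 1
```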
The presumed advantage of this method is that the different stages, which perform disjoint processing, should share little data, so that we can hope to minimize the changes necessitated by the thread interactions.

Some changes to the code were nonetheless needed to make this work (and a few bugs were missed, which only became apparent at later stages, confirming that the low-interaction idea was not completely false).
Converting to multithreading: what to look for
I am probably stating the obvious here, but when preparing a program for multithreading, problems can only arise where non-constant data is accessed by different threads.

Once you have solved the core problems posed by the data that obviously needs to be shared, you are left to deal with the less obvious, hidden interactions inside the program.

Classically, this concerns global or static data, but in a C++ program, class members are also a concern whenever a single object can be accessed by several threads.
Hunting for static data inside a program of non-trivial size is not always easy. Two approaches can be used: searching for the static keyword in the source code, or looking at global and static data symbols in nm output.
Once found, static/global data mostly falls into three categories:

- Things that need to be eliminated: for example, routines can be made reentrant by letting the caller supply a storage buffer instead of using an internal static one (which was a bad idea in the first place anyway).
- Things that need to be protected: sometimes, the best approach is just to protect the access with a mutex lock. It is trivial to encapsulate the locks in C++ objects to use the "Resource Acquisition Is Initialization" idiom, easily making sure that locks are freed when exiting the critical section. A very basic example of an implementation can be found in the Recoll source code; a minimal sketch follows this list.
- Things which can stay: this is mostly initialization data such as value tables which are computed once, and then stay logically constant during program execution. To be sure of correct single-threaded initialization, it is best to explicitly initialize the modules or functions that use this kind of data in the main thread, when the program starts.
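Here is a minimal sketch of the kind of RAII lock wrapper mentioned in the second item above. The names are illustrative, not necessarily those used in Recoll:

```cpp
#include <pthread.h>

// RAII lock: acquired on construction, released on destruction, so the
// mutex is freed on every exit path from the critical section.
class MutexLocker {
public:
    explicit MutexLocker(pthread_mutex_t &m) : m_mutex(m) {
        pthread_mutex_lock(&m_mutex);
    }
    ~MutexLocker() {
        pthread_mutex_unlock(&m_mutex);
    }
private:
    pthread_mutex_t &m_mutex;
};

// Usage: the lock is released when 'locker' goes out of scope, even on an
// early return or an exception.
static pthread_mutex_t cache_mutex = PTHREAD_MUTEX_INITIALIZER;

void updateSharedCache()
{
    MutexLocker locker(cache_mutex);
    // ... access the shared data ...
}
```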
Assembly line approach: the results
Unfortunately, the assembly line approach yields very modest improvements when used inside Recoll indexing. The reason is that this method needs stages of equivalent complexity to be efficient: if one of the stages dominates the others, its thread will be the only one active at any time, and little will be gained.

What is especially problematic is that the balance between tasks must exist not only on average, but also for the majority of individual jobs.

For Recoll indexing, even if the data preparation and index update steps are often of the same order of magnitude on average, their balance depends a lot on the kind of data being processed, so that things are usually unbalanced at any given time: the index update thread is mostly idle while processing PDF files, and the data preparation has little to do when working on HTML or plain text.

In practice, only modest indexing time improvements, from 5% to 15%, were achieved with this method.
The next step: multi-stage parallelism
Given the limitations of the assembly line approach, the next step in the transformation of Recoll indexing was to enable full parallelism wherever possible.

Of the four processing steps (see figures), two are not candidates for parallelization:

- File system walking is so fast compared to the other steps that using several threads would make no sense (it would also quite probably become I/O bound if we tried anyway).
- The Xapian library index updating code is not designed for multi-threading and must stay protected from multiple accesses.
The two other steps are good candidates.
Most of the work needed to make the Recoll code reentrant had been performed for the previous transformation. Going fully parallel only implied protecting the data structures that need to be shared by the threads performing a given processing step.
Just for the anecdotal value, here is a list of the elements that needed mutexes:
- Filter subprocesses cache: some file conversion subprocesses may be expensive to start (launching a Python process is no piece of cake), so they are cached for reuse after they are done translating a file. The shared cache needs protection.
- Status updates: an object used to write the current file name and indexing status to a shared file.
- Missing store: the list of missing helper programs.
- The read-only Xapian database object: a Xapian::Database object which is used for checking the validity of current index data against a file's last modification date.
- Document existence map: a bit array used to store an existence bit for every document, and to purge the ones that have disappeared at the end of the indexing pass. This is accessed both from the file conversion and database update code, so it also needed protection in the previous assembly-line approach.
- Mbox offsets cache: used to store the offsets of individual messages inside mbox files.
- iconv control blocks: these are cached for reuse in several places, and need protection. Actually, in a multithreading context, it might be better to just suppress the reuse and the locking. Rough tests seem to indicate that the impact on overall performance is small, but this might change with higher parallelism (or not…).
The Recoll configuration also used to be managed by a single shared object. This object is mutable, as values may depend on what area of the file system we are exploring, so it is stateful and updated as we change directories. The choice made here was to duplicate the object where needed (each indexing thread gets its own copy). This gave rise to the sneakiest bug of the whole transformation (see further down).
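As a rough sketch of the approach (the wiring and names here are invented for illustration, not the actual Recoll code):

```cpp
// Each indexing worker starts by making a private copy of the shared
// configuration, so the stateful updates performed while changing
// directories stay local to the thread. Names are illustrative.
void *indexingWorker(void *arg)
{
    const RclConfig *shared = static_cast<const RclConfig *>(arg);
    RclConfig myconfig(*shared);  // this thread's own mutable copy

    // ... run the indexing loop using myconfig exclusively ...
    return 0;
}
```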
Having a dynamic way to define the thread configuration makes it easy to experiment. For example, the following data defines the configuration that was finally found to be best overall on my hardware:
```
thrQSizes = 2 2 2
thrTCounts = 4 2 1
```
This is using 3 queues of depth 2, with 4 threads working on file conversion, 2 on text splitting and other document processing, and 1 on Xapian updating (no choice here).
Bench results
So, the big question after all this work: was it worth it? I could only get a real answer when the program stopped crashing, so this took some time and a little faith…

The answer is mostly yes, as far as I'm concerned. Indexing tests running almost twice as fast are good for my blood pressure, and I don't need a faster PC; I'll buy more red wine instead (good for my health too, or maybe not). And it was a fun project anyway.
| Area | Seconds before | Seconds after | Percent Improvement | Speed Factor |
|---|---|---|---|---|
| home | 12742 | 6942 | 46% | 1.8 |
| mail | 2700 | 1563 | 42% | 1.7 |
| projets | 5022 | 1970 | 61% | 2.5 |
| pdf | 2164 | 770 | 64% | 2.8 |
| otherhtml | 5593 | 4014 | 28% | 1.4 |
| Area | Files (MB) | Files | DB (MB) | Documents |
|---|---|---|---|---|
| home | 64106 | 44897 | 1197 | 104797 |
| mail | 813 | 232 | 663 | 47267 |
| projets | 2056 | 34504 | 549 | 40281 |
| pdf | 1123 | 1139 | 111 | 1139 |
| otherhtml | 3442 | 223007 | 2080 | 221890 |
home is my home directory. The high megabyte value is due to a number of very big VirtualBox images, which are not indexed. Otherwise, it's a wide mix of source files, email, miscellaneous documents and ebooks.
mail is my mail directory, full of mbox files.
projets mostly holds source files, and a number of documents.
pdf holds random PDF files harvested on the internets. The performance improvement is quite spectacular, because most of the processing time goes to converting the files to text, and this is done in parallel. It could probably be made a bit faster with more cores, until we hit the Xapian update speed limit.
otherhtml holds a myriad of small HTML files, mostly from Wikipedia. The improvement is not as great here because a lot of time is spent in the single-threaded Xapian index update.

The tests were made with queue depths of 2 on all queues, 4 threads working on the file conversion step, and 2 on term generation.
A variation: linear parallelism
Once past the assembly-line idea, another possible transformation would be to get rid of the two downstream queues, and just create a job for each file and let it run to the end (using a mutex to protect accesses to the writable Xapian database).

With the current Recoll code, this can be defined by the following parameters (one could also use a deeper front queue; this changes little):
```
thrQSizes = 2 -1 -1
thrTCounts = 4 0 0
```
In practice, the performance is close to that of the multistage version.

If we were to hard-code this approach, it would be a simpler modification, necessitating fewer changes to the code, but it has a slight drawback: when working on a single big multi-document file, no parallelism at all can be obtained. In this situation, the multi-stage approach falls back to the assembly-line behaviour, so the improvements are not great, but they do exist.
Miscellany
The big gotcha: my stack dump staring days

Overall, debugging the modified program was reasonably straightforward. Data access synchronization issues mostly provoke dynamic data corruption, which can be beastly to debug. I was lucky enough that most crashes occurred in code actually related to the corrupted data, not in some randomly located and unrelated dynamic memory user, so the issues were reasonably easy to find.
One issue, though, kept me working for a few days. The indexing process kept crashing randomly, every few thousand documents, segfaulting on a bad pointer. An access to the configuration data structure seemed to be involved but, as each thread was supposed to have its own copy, I was out of ideas.

After reviewing all the uses of the configuration data (there are quite a few), the problem was finally revealed to lie with the filter process cache. Each filter structure stored in the cache keeps a pointer to a configuration structure, which belonged to the thread that initially created the filter. But the filter would often be reused by a different thread, with the consequence that the configuration object was now accessed and modified by two unsynchronized threads… Resetting the config pointer at the time of filter reuse was the ridiculously simple, single-line fix to this elusive problem.
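In hypothetical code (all names invented; the actual Recoll code differs), the fix looks like this:

```cpp
// When taking a filter from the shared cache, it may have been created by
// another thread and still point at that thread's configuration copy.
// Re-pointing it at the current thread's own copy is the one-line fix.
Filter *fetchFilter(FilterCache &cache, const std::string &mimetype,
                    RclConfig *myconfig)
{
    Filter *filter = cache.take(mimetype);
    if (filter)
        filter->setConfig(myconfig);  // the ridiculously simple fix
    return filter;
}
```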
Looking at multi-threaded stack dumps is mostly fun for people with several heads, which is unfortunately not my case, so I was quite elated when this was over.
Fork performance issues
On a quite unrelated note, something I discovered while evaluating the program's performance is that forking a big process like recollindex can be quite expensive. Even if the memory space of the forked process is not copied (it's copy-on-write, and we write very little before the following exec), just duplicating the memory maps can be slow when the process uses a few hundred megabytes.

I modified the single-threaded version of recollindex to use vfork instead of fork, but this can't be used with multiple threads (no modification of the process memory space is allowed in the child between vfork and exec, so we would first need a way to suspend all the threads).
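For reference, here is the shape of that single-threaded optimization, as a sketch rather than the actual recollindex code. vfork() shares the parent's address space until the exec, so the expensive memory-map duplication is skipped, but almost nothing is allowed in the child before exec, which is why this is unusable once other threads are running:

```cpp
#include <unistd.h>

// Spawn an external helper without duplicating the parent's memory maps.
// Only exec or _exit are legal in the child between vfork() and exec(),
// hence the bare execvp()/_exit() pair.
pid_t spawnHelper(const char *cmd, char *const argv[])
{
    pid_t pid = vfork();
    if (pid == 0) {
        execvp(cmd, argv);  // replaces the process image on success
        _exit(127);         // exec failed; _exit(), never exit(), after vfork
    }
    return pid;  // parent: resumes once the child has exec'd or exited
}
```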
I did not implement a solution to this issue, and I don't think that a simple one exists. The workaround is to use modest Xapian flush values to prevent the process from becoming too big.
A longer-term solution would be to implement a small slave process to execute the ephemeral external commands.