doc

oops forgotten -lmagic in Makefile
doc
2022-10-02 09:05:33 +02:00 · 2022-09-25 20:28:35 +02:00 · 2022-09-25 19:10:57 +02:00 · 2022-09-25 17:20:12 +02:00 · 2022-09-24 15:16:52 +02:00 · 2022-09-24 09:14:51 +02:00
392 changed files with 86560 additions and 37989 deletions
--- a/.gitignore
+++ b/.gitignore
@ -24,8 +24,9 @@ build-*-Debug
 build-*-Release
 libtool
 ptrans
+**/Makefile.in
 src/Makefile
-src/Makefile.in
+src/rclgrep/Makefile
 src/TAGS
 src/aclocal.m4
 src/autom4te.cache
@ -77,7 +78,6 @@ src/recollq
 src/sampleconf/rclmon.sh
 src/sampleconf/recoll.conf
 src/testmains/Makefile
-src/testmains/Makefile.in
 src/xadump
 stamp-h1
 tests/casediac/aspdict.en.rws
@ -103,6 +103,7 @@ tests/indexedmimetypes/missing
 tests/indexedmimetypes/recoll.conf
 tests/indexedmimetypes/xapiandb
 tests/xattr/mimeview
+unac/autom4te.cache
 website/faqsandhowtos/*.html
 website/idxthreads/forkingRecoll.html
 website/idxthreads/xapDocCopyCrash.html
--- a/packaging/FreeBSD/recoll/pkg-plist
+++ b/packaging/FreeBSD/recoll/pkg-plist
@ -14,8 +14,8 @@ share/pixmaps/recoll.png
 %%DATADIR%%/filters/hotrecoll.py	
 %%DATADIR%%/filters/rclabw
 %%DATADIR%%/filters/rclaptosidman
-%%DATADIR%%/filters/rclaudio
-%%DATADIR%%/filters/rclchm
+%%DATADIR%%/filters/rclaudio.py
+%%DATADIR%%/filters/rclchm.py
 %%DATADIR%%/filters/rcldjvu
 %%DATADIR%%/filters/rcldoc
 %%DATADIR%%/filters/rcldvi
@ -23,11 +23,11 @@ share/pixmaps/recoll.png
 %%DATADIR%%/filters/rclfb2
 %%DATADIR%%/filters/rclflac
 %%DATADIR%%/filters/rclgaim
-%%DATADIR%%/filters/rclics
+%%DATADIR%%/filters/rclics.py
 %%DATADIR%%/filters/rclid3
 %%DATADIR%%/filters/rclimg
-%%DATADIR%%/filters/rclinfo
-%%DATADIR%%/filters/rclkar
+%%DATADIR%%/filters/rclinfo.py
+%%DATADIR%%/filters/rclkar.py
 %%DATADIR%%/filters/rclkwd
 %%DATADIR%%/filters/rcllatinclass.py
 %%DATADIR%%/filters/rcllatinstops.zip
@ -41,7 +41,7 @@ share/pixmaps/recoll.png
 %%DATADIR%%/filters/rclps
 %%DATADIR%%/filters/rclpurple
 %%DATADIR%%/filters/rclpython
-%%DATADIR%%/filters/rclrar
+%%DATADIR%%/filters/rclrar.py
 %%DATADIR%%/filters/rclrtf
 %%DATADIR%%/filters/rclscribus
 %%DATADIR%%/filters/rclshowinfo
@ -51,11 +51,11 @@ share/pixmaps/recoll.png
 %%DATADIR%%/filters/rcltex
 %%DATADIR%%/filters/rcltext
 %%DATADIR%%/filters/rcluncomp
-%%DATADIR%%/filters/rclwar
+%%DATADIR%%/filters/rclwar.py
 %%DATADIR%%/filters/rclwpd
 %%DATADIR%%/filters/rclxls
-%%DATADIR%%/filters/rclzip
-%%DATADIR%%/filters/rcl7z
+%%DATADIR%%/filters/rclzip.py
+%%DATADIR%%/filters/rcl7z.py
 %%DATADIR%%/filters/xdg-open
 %%DATADIR%%/images/aptosid-book.png
 %%DATADIR%%/images/aptosid-manual.png
--- a/packaging/debian/buildppa.sh
+++ b/packaging/debian/buildppa.sh
@ -5,30 +5,30 @@
 # sudo apt-get install pkg-kde-tools  cdbs

 # Active series:
-# 16.04LTS xenial 2021-04
 # 18.04LTS bionic 2023-04
 # 20.04LTS focal  2025-04
-# 20.10    groovy 2021-07
-# 21.04    hirsute 2022-01
+# 22.04LTS jammy  2027-04
+SERIES="bionic focal jammy kinetic"

 PPA_KEYID=7808CE96D38B9201

-RCLVERS=1.31.0
+RCLVERS=1.33.1
 SCOPEVERS=1.20.2.4
-GSSPVERS=1.1.0
+GSSPVERS=1.1.1
 PPAVERS=1

-# 
-RCLSRC=/y/home/dockes/projets/fulltext/recoll/src
-SCOPESRC=/y/home/dockes/projets/fulltext/unity-scope-recoll
-GSSPSRC=/y/home/dockes/projets/fulltext/gssp-recoll
-RCLDOWNLOAD=/y/home/dockes/projets/lesbonscomptes/recoll
+#
+#Y=/y
+Y=
+RCLSRC=${Y}/home/dockes/projets/fulltext/recoll/src
+SCOPESRC=${Y}/home/dockes/projets/fulltext/unity-scope-recoll
+GSSPSRC=${Y}/home/dockes/projets/fulltext/gssp-recoll
+RCLDOWNLOAD=${Y}/home/dockes/projets/lesbonscomptes/recoll
+
+PPANAME=recoll15-ppa
+PPANAME=recollexp1-ppa
+#PPANAME=recoll-webengine-ppa

-case $RCLVERS in
-    [23]*) PPANAME=recollexp-ppa;;
-    *)     PPANAME=recoll15-ppa;;
-esac
-#PPANAME=recollexp-ppa
 echo "PPA: $PPANAME. Type CR if Ok, else ^C"
 read rep

@ -49,8 +49,8 @@ check_recoll_orig()

 ####### QT
 debdir=debian
-series="bionic focal groovy hirsute"
-series=
+series=$SERIES
+#series=bionic

 if test "X$series" != X ; then
    check_recoll_orig
@ -77,7 +77,7 @@ for series in $series ; do
      -e s/PPAVERS/${PPAVERS}/g \
      < ${debdir}/changelog > recoll-${RCLVERS}/debian/changelog

-  (cd recoll-${RCLVERS};debuild -k$PPA_KEYID -S -sa)  || break
+  (cd recoll-${RCLVERS};debuild -d -k$PPA_KEYID -S -sa)  || break

  dput $PPANAME recoll_${RCLVERS}-1~ppa${PPAVERS}~${series}1_source.changes
 done
@ -85,8 +85,8 @@ done


 ### KIO.
-series="bionic focal groovy hirsute"
-#series=
+series=$SERIES
+series=

 debdir=debiankio
 topdir=kio-recoll-${RCLVERS}
@ -125,7 +125,7 @@ for svers in $series ; do
 done

 ### GSSP
-series="bionic focal groovy hirsute"
+series=$SERIES
 series=

 debdir=debiangssp
--- a/packaging/debian/debian/changelog
+++ b/packaging/debian/debian/changelog
@ -1,3 +1,169 @@
+recoll (1.33.1-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Small updates to the build files to accommodate the new rclgrep utility.
+  * New textunknownasplain configuration variable to index all files with no known association
+    but identified as text/xxx by the "file" or "xdg-mime" command.
+  * Make sure that a single double-quoted word is not stem-expanded (act as if it was
+    capitalized). Expanding a quoted term is unexpected.
+  * Apply stemming to terms containing a single dash. These were not expanded before.
+  * Linux real time: fix monitoring under topdirs members which are symbolic links.
+  * Fix the GUI simple search which was broken in 1.33.0 when switching filters on/off
+  * Exclude Tamil characters from unac processing (experimental for now).
+  * Windows GUI directory side filters: the computed paths were wrong on Windows.
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Sun, 25 Sep 2022 19:19:00 +0200
+
+recoll (1.33.0-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Query processing: simplify queries a bit more before sending to Xapian, allows using OP_FILTER
+    for path filtering. -> Medium version bump.
+  * GUI: allow specifying a fixed geometry for the results list viewport by setting
+    RECOLL_RESULTS_GEOMETRY=widthxheight . For people with fixed-width result formats CSS.
+  * recollq: add option to extract a result document into a file.
+  * Replace application/x-flac with audio/flac for FLAC audio files.
+  * Fix web queue processing for non-default configuration directories.
+  * Fix encoding issue in pdf attachment extraction.
+  * GUI: result list: fix issue with webengine builds not displaying Icons. Paging still not working
+    right with webengine (QTBUG-105842). Main builds revert/remain to webkit.
+  * Misc. small adjustments.
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Mon, 30 Aug 2022 10:59:00 +0200
+
+recoll (1.32.8-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Add environment variable RECOLL_RESULTS_GEOMETRY for forcing a fixed geometry to the results
+    list viewport.
+  * Fix result list Qt Webengine icon display issue.
+  * Improve result list paging behaviour. Only fully works with Qt Webkit.
+  * recollq: add option to extract result document to a file.
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Sun, 21 Aug 2022 07:59:00 +0200
+
+recoll (1.32.7-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * kio_recoll: updated to work with newer kf5 versions (it seems that 5.96 broke it at least on
+    arch linux).
+  * rclaudio: fix extracting comment fields from flac files.
+  * Python code preview:  get rid of spurious encoding value output.
+  * Fix glitch in Qt GUI when switching between list and table display.
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Sun, 07 Aug 2022 17:42:00 +0200
+
+recoll (1.32.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * GUI: switch to using Qt-Webengine instead of Qt-Webkit because of CSS support issues in Webkit.
+  * GUI: result list paragraph format. Preserve unquoted % characters if there is no matching
+    translation. User manual: document the need to quote % as %% anyway.
+  * GUI: result list devel/debug. Add parameter to dump the HTML sent to the engine.
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Tue, 05 Jul 2022 09:56:00 +0200
+
+recoll (1.32.4-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Try to improve font size consistency by using px sizes everywhere.
+  * Fix Increase/Decrease font size menu options.
+  * Allow displaying line numbers in snippets.
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Wed, 29 Jun 2022 09:36:00 +0200
+
+recoll (1.32.3-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Fix issues when opening a file with %F (parent of subdocument): avoid creating a temporary file
+    when this can be avoided.
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Tue, 21 Jun 2022 20:51:00 +0200
+
+recoll (1.32.2-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Restore result list popup menu function when using webengine. This was broken in 1.32.1.
+  * Show progress dialog when result list abstracts generation takes too long
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Tue, 14 Jun 2022 07:51:00 +0200
+
+recoll (1.32.1-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * GUI side panel filters: make sure the filter is applied even if set before
+    the query.
+  * GUI side panel directory filter: compute the tree from the index, not the
+    file system, to allow filtering data from external indexes. Update the tree
+    when an indexing completes.
+  * Implement whole UI scaling factor (fonts only, no icons).
+  * Orgmode: add orgmodesubdocs configuration variable to decide if we index
+    whole files or create subdocuments for nodes. Also index text before the
+    first heading.
+  * GUI: fix path translation for importing an index from Windows.
+
+ -- Jean-Francois Dockes <jf@dockes.org>  Fri, 20 May 2022 10:55:00 +0200
+
+recoll (1.32.0-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * GUI: added a collapsible side pane for quick filtering on dates or
+    directories.
+  * Showing duplicates now uses a spreadsheet like the result table for
+    easy access to the duplicate files.
+  * Fixed the temporary copy open dialog (again!).
+  * The default mimeview and mimeconf configuration files were
+    separated into generic and system-specific parts to avoid update
+    errors (no consequences for users).
+  * Renamed all Python input handlers with a .py extension. This is
+    relied upon on Windows, rather than listing an explicit Python
+    interpreter.
+  * Added %l specification to viewer definitions for opening at a
+    specific line.
+    
+ -- Jean-Francois Dockes <jf@dockes.org>  Fri, 11 Mar 2022 18:17:00 +0100
+
+recoll (1.31.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Almost no change: translation files update.
+    
+ -- Jean-Francois Dockes <jf@dockes.org>  Sat, 20 Dec 2021 09:25:00 +0100
+
+recoll (1.31.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Fix annoying bug in tesseract OCR temporary files cleanup.
+    
+ -- Jean-Francois Dockes <jf@dockes.org>  Sat, 04 Dec 2021 10:05:00 +0100
+
+recoll (1.31.4-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Linux/Mac: Bug in threads management could result in index corruption or crash
+    after signal interrupt.
+    
+ -- Jean-Francois Dockes <jf@dockes.org>  Thu, 25 Nov 2021 16:30:00 +0100
+
+recoll (1.31.3-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Highlighting for group (phrase/near): eliminate some spurious matches.
+  * Fix page number string detection which could sometimes prevent correct
+    highlighting in snippets.
+  * Avoid query completer consuming excessive resources on unstripped
+    indexes.
+  * Fix some cases where different instances of the indexer could use
+    different pid/lock files.
+  * Fix processing on some unicode dash and apos character variations.
+  * PST: fix indexing in marginal cases. Extract message dates.
+
+ -- Jean-Francois Dockes <jf@dockes.org>  Sat, 13 Nov 2021 16:30:00 +0100
+
+recoll (1.31.2-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Add support for .ipynb iPython/Jupyter notebook format.
+  * Implement Alt+/ shortcut to search the menu entries and possibly execute the result.
+  * Fix configuration GUI button margins on Mac OS.
+  * Add *.pyc __pycache__ .pytest_cache .tox  and .direnv to the default skipped names list.
+  * Add /opt/homebrew/bin to the helper search path when built under Mac Homebrew.
+  * Linux: let recollindex adjust its OOM killer "badness" on startup.
+  * simple search: add Ctrl+H as keyboard shortcut for "show history".
+  * Renamed the fragment buttons configuration file from fragbuts.xml to fragment-buttons.xml.
+  * Zip archives: set the modification date attribute for members.
+  * ost/pst filter: fix not fetching the message dates.
+  * Anchored searches: remove unwarranted slack increase. The anchor term should behave like a
+    normal one for slack computations.
+  * Fix djvu issues on Windows.
+
+ -- Jean-Francois Dockes <jf@dockes.org>  Mon, 11 Oct 2021 10:51:00 +0200
+
 recoll (1.31.0-1~ppaPPAVERS~SERIES1) SERIES; urgency=low

  * GUI: modified shortcuts were not read from the preferences !
--- a/packaging/debian/debian/control
+++ b/packaging/debian/debian/control
@ -3,18 +3,19 @@ Section: x11
 Priority: optional
 Maintainer: Jean-Francois Dockes <jfd@recoll.org>
 Build-Depends: bison,
-               debhelper (>= 9),
+               debhelper (>= 10),
               dh-python,
               dpkg-dev (>= 1.16.1~),
               libaspell-dev,
               libchm-dev,
+#               qtwebengine5-dev,
               libqt5webkit5-dev,
               libx11-dev,
               libxapian-dev (>= 1.2.0),
               libxslt1-dev,
               libz-dev,
               pkg-config,
-               python-all-dev (>= 2.6.6-3~),
+               python2-dev (>= 2.6.6-3~),
               python-setuptools,
               python3-all-dev,
               python3-setuptools,
--- a/packaging/debian/debian/control-bionic
+++ b/packaging/debian/debian/control-bionic
@ -0,0 +1,115 @@
+Source: recoll
+Section: x11
+Priority: optional
+Maintainer: Jean-Francois Dockes <jfd@recoll.org>
+Build-Depends: bison,
+               debhelper (>= 9),
+               dh-python,
+               dpkg-dev (>= 1.16.1~),
+               libaspell-dev,
+               libchm-dev,
+               libqt5webkit5-dev,
+#               qtwebengine5-dev,
+               libx11-dev,
+               libxapian-dev (>= 1.2.0),
+               libxslt1-dev,
+               libz-dev,
+               pkg-config,
+               python-all-dev (>= 2.6.6-3~),
+               python-setuptools,
+               python3-all-dev,
+               python3-setuptools,
+               qtbase5-dev
+X-Python3-Version: >= 3.4
+Vcs-Git: https://salsa.debian.org/debian/recoll.git
+Vcs-Browser: https://salsa.debian.org/debian/recoll
+Homepage: https://www.lesbonscomptes.com/recoll
+Standards-Version: 4.2.1
+
+Package: recoll
+Architecture: all
+Depends: recollcmd, recollgui, ${misc:Depends}
+Description: Personal full text search package
+ This package is a personal full text search package based on a very strong
+ backend (Xapian), for which it provides an easy to use and feature-rich
+ interface.
+ .
+ Features:
+  * Qt-based GUI.
+  * Supports the following document types (and their compressed versions)
+   - Natively: text, html, OpenOffice files,  excel, ppt, maildir and
+     mailbox (Mozilla and IceDove mail) with attachments, pidgin log files
+   - With external helpers:  pdf (pdftotext), postscript (ghostscript), msword
+     (antiword), rtf (unrtf). And others...
+  * Powerful query facilities, with boolean searches, phrases, filter on file
+    types and directory tree.
+  * Support for multiple charsets, Internal processing and storage uses Unicode
+    UTF-8.
+  * Stemming performed at query time (can switch stemming language after
+    indexing).
+  * Easy installation. No database daemon, web server or exotic language
+    necessary.
+  * The indexer can run either continuously or in batch.
+
+Package: recollcmd
+Architecture: any
+Breaks: recoll (<< 1.23.7-2)
+Replaces: recoll (<< 1.23.7-2)
+Depends: python3, ${misc:Depends}, ${shlibs:Depends}
+Recommends: antiword,
+            aspell,
+            groff,
+            libimage-exiftool-perl,
+            poppler-utils,
+            python3-lxml,
+            python3-recoll,
+            python3-six,
+            python3-mutagen,
+            python3-rarfile,
+            unrtf,
+            unzip,
+            xdg-utils
+Suggests: ghostscript,
+          libinotifytools0,
+          untex,
+          wv
+Description: Command line programs for recoll
+ This package supports indexing and command line querying.
+ 
+Package: recollgui
+Architecture: any
+Breaks: recoll (<< 1.23.7-2)
+Replaces: recoll (<< 1.23.7-2)
+Depends: recollcmd (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends}
+Description: GUI program and elements for recoll
+ Main recoll GUI for configuring, controlling and querying recoll indexes.
+
+Package: python-recoll
+Architecture: any
+Section: python
+Depends: python2,
+         recollcmd (= ${binary:Version}),
+         ${misc:Depends},
+         ${python:Depends},
+         ${shlibs:Depends}
+Description: Python extension for recoll
+ Personal full text search package which is based on a very strong backend
+ (Xapian), for which it provides an easy to use and feature-rich interface.
+ .
+ This package provides a Python extension module for recoll which can be used to
+ extend recoll such as an Ubuntu Unity Lens.
+
+Package: python3-recoll
+Architecture: any
+Section: python
+Depends: python3,
+         recollcmd (= ${binary:Version}),
+         ${misc:Depends},
+         ${python3:Depends},
+         ${shlibs:Depends}
+Description: Python extension for recoll (Python3)
+ Personal full text search package which is based on a very strong backend
+ (Xapian), for which it provides an easy to use and feature-rich interface.
+ .
+ This package provides a Python3 extension module for recoll which can be used to
+ extend recoll such as an Ubuntu Unity Lens.
--- a/packaging/debian/debian/patches/mbox-use-streamptr-for-jessie.diff
+++ b/packaging/debian/debian/patches/mbox-use-streamptr-for-jessie.diff
@ -1,5 +1,5 @@
-diff --git a/internfile/mh_mbox.cpp b/srcinternfile/mh_mbox.cpp
-index 2a0918cf..92ad7e23 100644
+diff --git a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp
+index c77d42c8..ccd6a613 100644
 --- a/internfile/mh_mbox.cpp
 +++ b/internfile/mh_mbox.cpp
@@ -27,6 +27,7 @@
@ -19,19 +19,25 @@ index 2a0918cf..92ad7e23 100644
     int        msgnum{0}; // Current message number in folder. Starts at 1
     int64_t    lineno{0}; // debug 
     int64_t    fsize{0};
-@@ -321,7 +322,6 @@ void MimeHandlerMbox::clear_impl()
- {
+@@ -322,13 +323,6 @@ void MimeHandlerMbox::clear_impl()
     m->fn.erase();
     m->ipath.erase();
-    m->instream = ifstream();
+ 
+-    // We used to use m->instream = ifstream() which fails with some compilers, as the copy
+-    // constructor is marked deleted in standard c++ (works with many compilers though).
+-    if (m->instream.is_open()) {
+-        m->instream.close();
+-    }
+-    m->instream.clear();
+-
     m->msgnum = 0;
     m->lineno = 0;
     m->fsize = 0;
-@@ -339,8 +339,9 @@ bool MimeHandlerMbox::set_document_file_impl(const string&, const string &fn)
+@@ -346,8 +340,9 @@ bool MimeHandlerMbox::set_document_file_impl(const string&, const string &fn)
     LOGDEB("MimeHandlerMbox::set_document_file(" << fn << ")\n");
     clear_impl();
     m->fn = fn;
-    m->instream = ifstream(fn.c_str(), std::ifstream::binary);
+-    m->instream.open(fn.c_str(), std::ifstream::binary);
 -    if (!m->instream.good()) {
 +    m->instream = std::unique_ptr<ifstream>(
 +        new ifstream(fn.c_str(), std::ifstream::binary));
@ -39,7 +45,7 @@ index 2a0918cf..92ad7e23 100644
         LOGSYSERR("MimeHandlerMail::set_document_file", "ifstream", fn);
         return false;
     }
-@@ -389,13 +390,13 @@ bool MimeHandlerMbox::Internal::tryUseCache(int mtarg)
+@@ -396,13 +391,13 @@ bool MimeHandlerMbox::Internal::tryUseCache(int mtarg)
                                    fsize)) < 0) {
         goto out;
     }
@ -57,7 +63,7 @@ index 2a0918cf..92ad7e23 100644
         LOGSYSERR("tryUseCache", "getline", "");
         goto out;
     }
-@@ -404,7 +405,7 @@ bool MimeHandlerMbox::Internal::tryUseCache(int mtarg)
+@@ -411,7 +406,7 @@ bool MimeHandlerMbox::Internal::tryUseCache(int mtarg)
     if ((fromregex(line) ||
          ((quirks & MBOXQUIRK_TBIRD) && minifromregex(line)))  ) {
         LOGDEB0("MimeHandlerMbox: Cache: From_ Ok\n");
@ -66,7 +72,7 @@ index 2a0918cf..92ad7e23 100644
         msgnum = mtarg -1;
         cachefound = true;
     } else {
-@@ -414,7 +415,7 @@ bool MimeHandlerMbox::Internal::tryUseCache(int mtarg)
+@@ -421,7 +416,7 @@ bool MimeHandlerMbox::Internal::tryUseCache(int mtarg)
 out:
     if (!cachefound) {
         // No cached result: scan.
@ -75,7 +81,7 @@ index 2a0918cf..92ad7e23 100644
         msgnum = 0;
     }
     return cachefound;
-@@ -422,7 +423,7 @@ out:
+@@ -429,7 +424,7 @@ out:
 
 bool MimeHandlerMbox::next_document()
 {
@ -84,7 +90,7 @@ index 2a0918cf..92ad7e23 100644
         LOGERR("MimeHandlerMbox::next_document: not open\n");
         return false;
     }
-@@ -458,10 +459,10 @@ bool MimeHandlerMbox::next_document()
+@@ -465,10 +460,10 @@ bool MimeHandlerMbox::next_document()
     msgtxt.erase();
     string line;
     for (;;) {
--- a/packaging/debian/debian/rules
+++ b/packaging/debian/debian/rules
@ -21,10 +21,11 @@ endif

 # main packaging script based on dh7 syntax
 %:
-	dh $@ --parallel --with python2 --with python3
+	dh $@ --with python2 --with python3

 override_dh_auto_configure:
-	dh_auto_configure -- --enable-recollq --enable-xadump
+	dh_auto_configure -- --enable-recollq --enable-xadump --enable-webkit
+#	dh_auto_configure -- --enable-recollq --enable-xadump --enable-webengine

 build3vers := $(shell py3versions -sv)

--- a/packaging/debian/debiankio/changelog
+++ b/packaging/debian/debiankio/changelog
@ -1,3 +1,76 @@
+kio-recoll (1.33.0-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Mon, 30 Aug 2022 10:59:00 +0200
+
+kio-recoll (1.32.7-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Updated kio_recoll to work with newer kf5 versions (it seems that 5.96 broke it at least on
+    arch).
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Sun, 07 Aug 2022 17:42:00 +0200
+
+kio-recoll (1.32.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Tue, 05 Jul 2022 09:56:00 +0200
+
+kio-recoll (1.32.4-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Wed, 29 Jun 2022 09:36:00 +0200
+
+kio-recoll (1.32.2-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Tue, 14 Jun 2022 07:51:00 +0200
+
+kio-recoll (1.32.1-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow
+
+ -- Jean-Francois Dockes <jf@dockes.org>  Sun, 15 May 2022 08:07:00 +0200
+
+kio-recoll (1.32.0-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow
+    
+ -- Jean-Francois Dockes <jf@dockes.org>  Fri, 11 Mar 2022 18:17:00 +0100
+
+kio-recoll (1.31.6-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow
+    
+ -- Jean-Francois Dockes <jf@dockes.org>  Sat, 20 Dec 2021 09:25:00 +0100
+
+kio-recoll (1.31.5-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow recoll version
+    
+ -- Jean-Francois Dockes <jf@dockes.org>  Sat, 04 Dec 2021 10:05:00 +0100
+
+kio-recoll (1.31.4-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow recoll version
+    
+ -- Jean-Francois Dockes <jf@dockes.org>  Thu, 25 Nov 2021 16:30:00 +0100
+
+kio-recoll (1.31.3-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow recoll version
+
+ -- Jean-Francois Dockes <jf@dockes.org>  Sat, 13 Nov 2021 16:30:00 +0200
+
+kio-recoll (1.31.2-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * Follow recoll version
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Mon, 11 Oct 2021 10:55:00 +0200
+
 kio-recoll (1.31.0-1~ppaPPAVERS~SERIES1) SERIES; urgency=low

  * Follow recoll version
--- a/packaging/debian/debiankio/patches/dirif-fix-kio-version-include.patch
+++ b/packaging/debian/debiankio/patches/dirif-fix-kio-version-include.patch
@ -0,0 +1,13 @@
+diff --git a/src/kde/kioslave/kio_recoll/dirif.cpp b/src/kde/kioslave/kio_recoll/dirif.cpp
+index 4438a1e7..48284ece 100644
+--- a/kde/kioslave/kio_recoll/dirif.cpp
+++ b/kde/kioslave/kio_recoll/dirif.cpp
+@@ -35,7 +35,7 @@
+ #include <QDebug>
+ #include <QUrl>
+ #include <QStandardPaths>
+-#include <KF5/kio_version.h>
+#include <kio_version.h>
+ 
+ #include "kio_recoll.h"
+ #include "pathut.h"
--- a/packaging/debian/rclgrep/debian/README.Debian
+++ b/packaging/debian/rclgrep/debian/README.Debian
@ -0,0 +1,8 @@
+README for Debian
+-----------------
+
+ The rclgrep package is a partial installation of the recollcmd package, with no
+ Xapian dependency. It conflicts with recollcmd, which also provides the rclgrep
+ command.
+
+ -- Jean-Francois Dockes <jf@dockes.org>  Tue, 20 Sep 2022 08:32:00 +0200
--- a/packaging/debian/rclgrep/debian/changelog
+++ b/packaging/debian/rclgrep/debian/changelog
@ -0,0 +1,7 @@
+rclgrep (1.33.1-1~ppaPPAVERS~SERIES1) SERIES; urgency=low
+
+  * 1st version of rclgrep: a non-indexed search program using recoll
+    data extraction modules to effect grep-like function.
+  
+ -- Jean-Francois Dockes <jf@dockes.org>  Sun, 11 Sep 2022 10:59:00 +0200
+
--- a/packaging/debian/rclgrep/debian/compat
+++ b/packaging/debian/rclgrep/debian/compat
@ -0,0 +1 @@
+11
--- a/packaging/debian/rclgrep/debian/control
+++ b/packaging/debian/rclgrep/debian/control
@ -0,0 +1,44 @@
+Source: rclgrep
+Section: x11
+Priority: optional
+Maintainer: Jean-Francois Dockes <jfd@recoll.org>
+Build-Depends: debhelper (>= 10),
+               dh-python,
+               dpkg-dev (>= 1.16.1~),
+               libchm-dev,
+               libmagic-dev,
+               libxslt1-dev,
+               libz-dev,
+               pkg-config,
+               python3-all-dev,
+               python3-setuptools
+X-Python3-Version: >= 3.6
+Homepage: https://www.lesbonscomptes.com/recoll
+Standards-Version: 4.2.1
+
+Package: rclgrep
+Architecture: any
+Depends: python3, ${misc:Depends}, ${shlibs:Depends}
+Conflicts: recollcmd
+Recommends: antiword,
+            groff,
+            libimage-exiftool-perl,
+            poppler-utils,
+            python3-lxml,
+            python3-six,
+            python3-mutagen,
+            python3-rarfile,
+            unrtf,
+            unzip,
+            xdg-utils
+Suggests: ghostscript,
+          untex,
+          wv
+Description: grep-like program based on recoll data extraction modules.
+ The program supports most grep options and aims at supplying a very similar
+ output format. It will search all formats supported by Recoll, including
+ compound documents and nested archives (mbox, zip, ....) with full
+ regexp support (unlike recoll). It does not create
+ an index and the package has no dependency on Xapian. In consequence, 
+ searching is vastly slower than when using recoll.
+ 
--- a/packaging/debian/rclgrep/debian/copyright
+++ b/packaging/debian/rclgrep/debian/copyright
@ -0,0 +1,141 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: recoll
+Upstream-Contact: Jean-Francois Dockes <jfd@recoll.org>
+Source: https://www.lesbonscomptes.com/recoll/
+
+Files: *
+Copyright: 2005-2014, Jean-Francois Dockes <jfd@recoll.org>
+License: GPL-2+
+
+Files: bincimapmime/*
+Copyright: 2002-2005, Andreas Aardal Hanssen <andreas-binc@bincimap.org>
+License: GPL-2+
+
+Files: filters/rcl* internfile/htmlparse.cpp
+Copyright: 2000-2004, Mikio Hirabayashi
+License: GPL-2+
+
+Files: filters/rclpython
+Copyright: J\xfcrgen Hermann, Mike Brown, Christopher Arndt
+ <http://chrisarndt.de/en/software/python/colorize.html>
+License: GPL-2+
+
+Files: internfile/htmlparse.cpp internfile/mh_html.cpp
+Copyright: 1999-2001, BrightStation PLC,
+           2001, Ananova Ltd,
+           2002-2004, Olly Betts.
+License: GPL-2+
+
+Files: unac/*
+Copyright: 2000-2002, Loic Dachary <loic@senga.org>
+License: GPL-2+
+
+Files: common/*
+Copyright: 2004-2005, J.F.Dockes
+License: GPL-2+
+
+Files: debian/*
+Copyright: 2007-2014, Kartik Mistry <kartik@debian.org>
+License: GPL-2+
+
+License: GPL-2+
+ This package is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any later
+ version.
+ .
+ This package is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ .
+ You should have received a copy of the GNU General Public License along with
+ this package; if not, write to the Free Software Foundation, Inc., 51 Franklin
+ St, Fifth Floor, Boston, MA 02110-1301 USA
+ .
+ On Debian systems, the complete text of the GNU General Public License can be
+ found in `/usr/share/common-licenses/GPL-2' and
+ `/usr/share/common-licenses/GPL-3'.
+
+Files: aspell/*
+Copyright: 2001-2002, by Kevin Atkinson
+License: LGPL-2+
+
+License: LGPL-2+
+ This package is free software; you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your option) any
+ later version.
+ .
+ This package is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ details.
+ .
+ You should have received a copy of the GNU Lesser General Public License along
+ with this package; if not, write to the Free Software Foundation, Inc., 51
+ Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ .
+ On Debian systems, the complete text of the GNU Lesser General Public License
+ can be found in `/usr/share/common-licenses/LGPL-2' and
+ `/usr/share/common-licenses/LGPL-2.1' and `/usr/share/common-licenses/LGPL-3'.
+
+Files: common/uproplist.h
+Copyright: 1991-2006, Unicode, Inc.
+License: Unicode
+
+License: Unicode
+ All rights reserved. Distributed under the Terms of Use in
+ https://www.unicode.org/copyright.html
+ .
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of the Unicode data files and any associated documentation (the "Data Files")
+ or Unicode software and any associated documentation (the "Software") to deal
+ in the Data Files or Software without restriction, including without
+ limitation the rights to use, copy, modify, merge, publish, distribute, and/or
+ sell copies of the Data Files or Software, and to permit persons to whom the
+ Data Files or Software are furnished to do so, provided that (a) the above
+ copyright notice(s) and this permission notice appear with all copies of the
+ Data Files or Software, (b) both the above copyright notice(s) and this
+ permission notice appear in associated documentation, and (c) there is clear
+ notice in each modified Data File or in the Software as well as in the
+ documentation associated with the Data File(s) or Software that the data or
+ software has been modified.
+ .
+ THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD
+ PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+ DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE
+ DATA FILES OR SOFTWARE.
+ .
+ Except as contained in this notice, the name of a copyright holder shall not
+ be used in advertising or otherwise to promote the sale, use or other dealings
+ in these Data Files or Software without prior written authorization of the
+ copyright holder.
+
+Files: utils/md5.*
+Copyright: 1991-1992, RSA Data Security, Inc. All rights reserved.
+License: RSA
+
+License: RSA
+ MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
+ .
+ License to copy and use this software is granted provided that it is
+ identified as the "RSA Data Security, Inc. MD5 Message-Digest Algorithm" in
+ all material mentioning or referencing this software or this function.
+ .
+ License is also granted to make and use derivative works provided that such
+ works are identified as "derived from the RSA Data Security, Inc. MD5
+ Message-Digest Algorithm" in all material mentioning or referencing the
+ derived work.
+ .
+ RSA Data Security, Inc. makes no representations concerning either the
+ merchantability of this software or the suitability of this software for any
+ particular purpose. It is provided "as is" without express or implied warranty
+ of any kind.
+ .
+ These notices must be retained in any copies of any part of this documentation
+ and/or software.
--- a/packaging/debian/rclgrep/debian/rclgrep.install
+++ b/packaging/debian/rclgrep/debian/rclgrep.install
@ -0,0 +1,2 @@
+usr/lib/python*/*-packages/recollchm/*
+usr/lib/python*/*-packages/recollchm-*/*
--- a/packaging/debian/rclgrep/debian/rules
+++ b/packaging/debian/rclgrep/debian/rules
@ -0,0 +1,44 @@
+#!/usr/bin/make -f
+
+# Uncomment this to turn on verbose mode.
+#export DH_VERBOSE=1
+
+export DEB_BUILD_MAINT_OPTIONS = hardening=+all
+DPKG_EXPORT_BUILDFLAGS = 1
+include /usr/share/dpkg/buildflags.mk
+
+DEB_HOST_GNU_TYPE   ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
+DEB_BUILD_GNU_TYPE  ?= $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE)
+
+build3vers := $(shell py3versions -sv)
+
+#build qt5 UI
+export QT_SELECT := qt5
+
+ifneq (,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))
+	NJOBS := -j $(patsubst parallel=%,%,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))
+endif
+
+# main packaging script based on dh7 syntax
+%:
+	dh $@ --with python3
+
+override_dh_auto_configure:
+	dh_auto_configure -- --enable-rclgrep --disable-python-module --disable-indexer \
+				--disable-qtgui --disable-recollq --disable-testmains \
+				--disable-xadump
+
+build3vers := $(shell py3versions -sv)
+
+override_dh_auto_install:
+	dh_auto_install
+	set -e && for i in $(build3vers); do \
+	(cd python/pychm; python$$i ./setup.py install \
+	                                --install-layout=deb \
+					--prefix=/usr \
+	                                --root=$(CURDIR)/debian/tmp/ ) ; \
+	done
+	find $(CURDIR) -type f -name '*.la' -exec rm -f '{}' \;
+	find $(CURDIR) -type f -name '*.pyc' -exec rm -f '{}' \;
+	rm -rf $(CURDIR)/debian/rclgrep/usr/lib/python*/*/*/__pycache__
+	rm -rf $(CURDIR)/debian/tmp/usr/lib/python*/*/*/__pycache__
--- a/packaging/debian/rclgrep/debian/source/format
+++ b/packaging/debian/rclgrep/debian/source/format
@ -0,0 +1 @@
+3.0 (quilt)
--- a/packaging/debian/rclgrep/debian/upstream/signing-key.asc
+++ b/packaging/debian/rclgrep/debian/upstream/signing-key.asc
@ -0,0 +1,41 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQINBFbJ6UABEADLsFg8qXTrNrYUnNS5UXlAWUH7/ZHNRgr/EIkhKAbdlzVAywTM
+fX6wo9crKzlqT3IcEOFe0RVJoh0FSNEQQlUhyJAFNlbcocsDYNqk7pDjxnUBUMM2
+U3ikLEPzRxWDhVepAVQPeloD1i8b4MJrSHnLb49PMmXg+6MHA+dzOS59onE5QDcz
+kw1RF0N0gl7693rOMP/ATefA2KPQyKCIweKB/3NbOcv4/T1XDyag0G7xYkT4stEl
+TN2P8c6HSyhWDxp2slZ04kdf17TuoeOqMO9gKE+eEC17lllLuhSrbBdfYTYt05pN
+Y1eRup+6oamoMc3ITD2U2GtY+65AHw5MxjGigpZ3kj5DwF/f2IgtDBSoXjm8aaRb
+iYMvt3kXnb3Ai/oVvSlkIQMlDDpdAQmzB0FO0MCzzykq5mQVbl3Uw3i2q5vg1IIL
+fGOB1USa0JOVRSq8C66ncijYO6Jafx3uYCGVdIypoLs332kGsyQaIatoJRbPkKT/
+Wu/DGE8kHOaCo5795HbRk0O/Up5wQP3N/OXGmrQPtbafRz9bkjXOKGtq660VJ67K
+ttgY9L1fD7jb+zDoUaY33K8Trfqaxm5aGkI6Pj3VvQSF2CAaJuEnh/c0r9UdGn0e
+e1L0yP1kUj80Qv99QFEoH2UtBrfLsXAiRvcr/PfyGTp/+Q7wkCHsHC84TwARAQAB
+tCRKZWFuLUZyYW5jb2lzIERvY2tlcyA8amZAZG9ja2VzLm9yZz6JAkAEEwEKACoC
+GwMFCwkIBwMFFQoJCAsFFgIDAQACHgECF4ACGQEFAlitGxQFCQWlmM4ACgkQeAjO
+ltOLkgEE4xAAqGOSt6U+CGdI333Yx7KaCA+XgJPsiaqfG2AIuv4Y0/LW8467uy4u
+DdbgJ3GQ6kWUZD0b/nrp74Ly5ZM9DCIZzOX9FQ3R9FBhbBS1fVfqFD2yZQv4lze9
+Bjj7EMRieRGUtVIb7BiUrmJOyIbiGktEOuqqTj7RehN/2sflv5jH2NW33+i3t/x7
+YWTAPHxieiOYO0Z0JtXe+ZXJ92LNaR+5DOsGItTSeJKzbh1oUtAcbt5DDDQKMJwb
+tIRg+9Mjj3IUqCsiFkKOfq34TXDu2paKWkdGuOJ8u2DqvgUYkqFfY4JOpWrax+Mt
+hsS6VSDIxL7H9UqaJpXWMMhUN2gFM+wy/y1OeNo5bKM4KiLbaugEvOb1RCQm2R6h
+HNcDO52KSFJMZSCzO/jjN2qJjDcLu2DAsQzWI+bzZgP+tpr3yWvW2OCCY+PdT4ZA
+5lwnd85P5x1wYhb/eoXi9QyWburu3vaNGdDWUljUkBB29l09hoDbAYPwWujLDGyT
+0j6+NWh27dLe8bnwe5YEBPHcwvuLnSBVVXY+UM/0toEWwpRdTvnxZUKKxtN+yiCA
+k82qRNXaUGaWpaL0xYPfanZSXi1dSNNEertS/BrF5PpmNdQsK1+sibNF1KKhR5ge
+2QSrjoNzL4kBgJq4ojJBcBd75p9HzheXCHdG1EHQBTeetDqiwEPbO/W0KUplYW4t
+RnJhbmNvaXMgRG9ja2VzIDxqZmRvY2tlc0BnbWFpbC5jb20+iQI9BBMBCgAnAhsD
+BQsJCAcDBRUKCQgLBRYCAwEAAh4BAheABQJYrRsbBQkFpZjOAAoJEHgIzpbTi5IB
+GKYP/09nsWnDCqv+3OKzmbHlMKCLvGU8IGU1q909sUelKmyjSFXmh3BsgR4DrfXu
+hGWtmu/mPYzCWzbK8TWYsU1O6em7YRY8lt/q/8gciSahl+xFT/G5GJHf7KFgtsSn
+QcbA18dzXKpxmTOTMEmWLh4zZlaUbaH2gmpXBQvH4smu/FV6rq5YYYDG9A3PDujr
+QmOyguD7wNvb6ahrgpTbMawsj6zLIT1pkC6t1Orz/gmYsuk47EJFfvaO3+YIUQ0D
+pFN9EkDjhcIa3vSsd+EBMbmweFB6y9gs7LmolqwiddUlYv5wGOLgiE1EJEI4bcvr
+vm8RWHziWytmpTPjzFpETaBVMC8xTt6tiNWNeTUkjbBX0Fek9GEvzAJIpe18LnM8
+raFREUriHuUwIGrrFrhj7rBAX51DiiJUguDi+842SjlzVE2SCwyjXVlglDItBPKO
+Y284KpI+wLhJCggtwtzZOQcAc5l8j3JpXjhm1tjSKggEONdBu2l7mWZRAJCBziMK
+mnUPL8q44l2hc/sDu4cCpsHW+pssGDQqtR+t/fPMGFuXd+WnfYskhyQVms44yAYJ
+Y/cx43tgYLHDx2TraTQZqh1qgmrXesS2DiT+5pCjQh0ChwTEBjGCz41WcQkD9nTL
+k3E6amPE6WAPS07bX9zkLHYYIOu8wd8nRoPKlVjhMpBvz8LE
+=2J/9
+-----END PGP PUBLIC KEY BLOCK-----
--- a/packaging/debian/rclgrep/debian/watch
+++ b/packaging/debian/rclgrep/debian/watch
@ -0,0 +1,4 @@
+version=4
+opts=pgpmode=auto \
+https://www.lesbonscomptes.com/recoll/download.html \
+(?:|.*/)recoll(?:[_\-]v?|)(\d[^\s/]*)\.(?:tar\.xz|txz|tar\.bz2|tbz2|tar\.gz|tgz)
--- a/packaging/homebrew/recoll.rb
+++ b/packaging/homebrew/recoll.rb
@ -60,40 +60,40 @@ index f41a9f39..dc3085a4 100755
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
-diff --git filters/rcl7z filters/rcl7z
+diff --git filters/rcl7z.py filters/rcl7z.py
 index c68c8bcb..ac50c4ec 100755
--- filters/rcl7z
-+++ filters/rcl7z
+--- filters/rcl7z.py
+++ filters/rcl7z.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
 
 # 7-Zip file filter for Recoll
 
-diff --git filters/rclaudio filters/rclaudio
+diff --git filters/rclaudio.py filters/rclaudio.py
 index 94ca0be7..08d6375a 100755
--- filters/rclaudio
-+++ filters/rclaudio
+--- filters/rclaudio.py
+++ filters/rclaudio.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
 
 # Audio tag filter for Recoll, using mutagen
 
-diff --git filters/rclchm filters/rclchm
+diff --git filters/rclchm.py filters/rclchm.py
 index f9811c37..3bc9b16d 100755
--- filters/rclchm
-+++ filters/rclchm
+--- filters/rclchm.py
+++ filters/rclchm.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
 """Extract Html files from a Microsoft Compiled Html Help file (.chm)
 Needs at least python 2.2 for HTMLParser (chmlib needs 2.2 too)"""
 
-diff --git filters/rcldia filters/rcldia
+diff --git filters/rcldia.py filters/rcldia.py
 index 282148eb..a480294b 100755
--- filters/rcldia
-+++ filters/rcldia
+--- filters/rcldia.py
+++ filters/rcldia.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
@ -120,30 +120,30 @@ index e8fa1831..b92b185d 100755
 from __future__ import print_function
 
 import rclexecm
-diff --git filters/rclepub filters/rclepub
+diff --git filters/rclepub.py filters/rclepub.py
 index 8042d7f9..51786af1 100755
--- filters/rclepub
-+++ filters/rclepub
+--- filters/rclepub.py
+++ filters/rclepub.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
 """Extract Html content from an EPUB file (.epub)"""
 from __future__ import print_function
 
-diff --git filters/rclepub1 filters/rclepub1
+diff --git filters/rclepub1.py filters/rclepub1.py
 index bd44f635..a7ea6c06 100755
--- filters/rclepub1
-+++ filters/rclepub1
+--- filters/rclepub1.py
+++ filters/rclepub1.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
 """Extract Html content from an EPUB file (.chm), concatenating all sections"""
 from __future__ import print_function
 
-diff --git filters/rclics filters/rclics
+diff --git filters/rclics.py filters/rclics.py
 index 0ef04f2d..de177024 100755
--- filters/rclics
-+++ filters/rclics
+--- filters/rclics.py
+++ filters/rclics.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
@ -160,20 +160,20 @@ index 7eb1da91..4eb6c9b0 100755
 
 # Python-based Image Tag extractor for Recoll. This is less thorough
 # than the Perl-based rclimg script, but useful if you don't want to
-diff --git filters/rclinfo filters/rclinfo
+diff --git filters/rclinfo.py filters/rclinfo.py
 index f353d19e..36cf34e0 100755
--- filters/rclinfo
-+++ filters/rclinfo
+--- filters/rclinfo.py
+++ filters/rclinfo.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
 
 # Read a file in GNU info format and output its nodes as subdocs,
 # interfacing with recoll execm
-diff --git filters/rclkar filters/rclkar
+diff --git filters/rclkar.py filters/rclkar.py
 index d6570dd5..34b8d2a2 100755
--- filters/rclkar
-+++ filters/rclkar
+--- filters/rclkar.py
+++ filters/rclkar.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
@ -230,10 +230,10 @@ index 615455b3..1e411890 100755
 # -*- coding: iso-8859-1 -*-
 """
     MoinMoin - Python source parser and colorizer
-diff --git filters/rclrar filters/rclrar
+diff --git filters/rclrar.py filters/rclrar.py
 index 8f723fa5..5f6adfb0 100755
--- filters/rclrar
-+++ filters/rclrar
+--- filters/rclrar.py
+++ filters/rclrar.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
@ -280,10 +280,10 @@ index 8c1b8aea..cee17324 100755
 # Copyright (C) 2014 J.F.Dockes
 #   This program is free software; you can redistribute it and/or modify
 #   it under the terms of the GNU General Public License as published by
-diff --git filters/rcltar filters/rcltar
+diff --git filters/rcltar.py filters/rcltar.py
 index d8bf100d..ab4b306e 100755
--- filters/rcltar
-+++ filters/rcltar
+--- filters/rcltar.py
+++ filters/rcltar.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
@ -320,10 +320,10 @@ index 32a11c1a..eab3b257 100644
 from __future__ import print_function
 
 import rclexecm
-diff --git filters/rclwar filters/rclwar
+diff --git filters/rclwar.py filters/rclwar.py
 index b654f3b3..301e28e9 100755
--- filters/rclwar
-+++ filters/rclwar
+--- filters/rclwar.py
+++ filters/rclwar.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
@ -360,10 +360,10 @@ index 158e1222..602769af 100755
 # Copyright (C) 2016 J.F.Dockes
 #   This program is free software; you can redistribute it and/or modify
 #   it under the terms of the GNU General Public License as published by
-diff --git filters/rclzip filters/rclzip
+diff --git filters/rclzip.py filters/rclzip.py
 index 35739625..0c597fbd 100755
--- filters/rclzip
-+++ filters/rclzip
+--- filters/rclzip.py
+++ filters/rclzip.py
@@ -1,4 +1,4 @@
 -#!/usr/bin/env python2
 +#!/usr/bin/env python2.7
--- a/packaging/mac/make-recoll-dmg.sh
+++ b/packaging/mac/make-recoll-dmg.sh
@ -12,27 +12,31 @@ usage()

 # Adjustable things
 top=~/Recoll
-qtversion=5.14.2
-# Will probably need adjustment on M1
-path_clang=clang_64
-deploy=~/Qt/${qtversion}/${path_clang}/bin/macdeployqt
+# The possibly bogus version we have in paths (may be hardcoded in the .pro)
+# qcbuildloc=Desktop_Qt_5_15_2_clang_64bit
+qcbuildloc=Qt_6_2_4_for_macOS

+# qtversion=5.15.2
+qtversion=6.2.4

-qt_ver_sion=`echo $qtversion | sed -e 's/\./_/g'`
+#deploy=~/Qt/${qtversion}/macos/clang_64bit/macdeployqt
+deploy=~/Qt/${qtversion}/macos/bin/macdeployqt

 toprecoll=$top/recoll/src
-appdir=$toprecoll/build-recoll-win-Desktop_Qt_${qt_ver_sion}_${path_clang}bit-Release/recoll.app
-rclindexdir=$toprecoll/windows/build-recollindex-Desktop_Qt_${qt_ver_sion}_${path_clang}bit-Release
+appdir=$toprecoll/build-recoll-win-${qcbuildloc}-Release/recoll.app
+rclindexdir=$toprecoll/windows/build-recollindex-${qcbuildloc}-Release
+rclqdir=$toprecoll/windows/build-recollq-${qcbuildloc}-Release
 bindir=$appdir/Contents/MacOS
 datadir=$appdir/Contents/Resources

 dmg=$appdir/../recoll.dmg

-version=`cat $toprecoll/VERSION`
+version=`cat $toprecoll/RECOLL-VERSION.txt`

 test -d $appdir || fatal Must first have built recoll in $appdir

 cp $rclindexdir/recollindex $bindir || exit 1
+cp $rclqdir/recollq $bindir || exit 1

 cp $top/antiword/antiword $bindir || exit 1
 mkdir -p $datadir/antiword || exit 1
@ -45,7 +49,7 @@ $deploy $appdir -dmg || exit 1


 hash=`(cd $top/recoll;git log -n 1  | head -1  | awk '{print $2}' |cut -b 1-8)`
-
-mv $dmg ~/Documents/recoll-$version-$hash.dmg || exit 1
+dte=`date +%Y%m%d`
+mv $dmg ~/Documents/recoll-$version-$dte-$hash.dmg || exit 1
 ls -l ~/Documents/recoll-$version-*.dmg

--- a/packaging/rpm/recoll.spec
+++ b/packaging/rpm/recoll.spec
@ -3,7 +3,7 @@

 Summary:        Desktop full text search tool with Qt GUI
 Name:           recoll
-Version:        1.29.2
+Version:        1.32.7
 Release:        2%{?dist}
 Group:          Applications/Databases
 License:        GPLv2+
@ -13,15 +13,26 @@ Source10:       qmake-qt5.sh
 BuildRequires:  aspell-devel
 BuildRequires:  bison
 BuildRequires:  desktop-file-utils
-# kio
-BuildRequires:  kdelibs4-devel
+
+#BuildRequires:  kdelibs4-devel
+
+# Fedora
 BuildRequires:  qt5-qtbase-devel
 BuildRequires:  qt5-qtwebkit-devel
-BuildRequires:  extra-cmake-modules
-BuildRequires:  kf5-kio-devel
-BuildRequires:  python2-devel
+#BuildRequires: qt5-qtwebengine-devel
 BuildRequires:  python3-devel
 BuildRequires:  xapian-core-devel
+BuildRequires:  kf5-kio-devel
+
+# Opensuse
+#BuildRequires:  libQt5Gui-devel
+#BuildRequires:  libqt5-qtwebengine-devel
+#BuildRequires:  python310-devel
+#BuildRequires:  libxapian-devel
+#BuildRequires:  kio-devel
+
+BuildRequires:  extra-cmake-modules
+BuildRequires:  python2-devel
 BuildRequires:  zlib-devel
 BuildRequires:  chmlib-devel
 BuildRequires:  libxslt-devel
@ -55,7 +66,7 @@ LDFLAGS="%{?__global_ldflags}"; export LDFLAGS
 install -m755 -D %{SOURCE10} qmake-qt5.sh
 export QMAKE=qmake-qt5

-%configure
+%configure --enable-webengine
 make %{?_smp_mflags}

 %install
@ -70,7 +81,7 @@ rm -f %{buildroot}/usr/share/recoll/filters/xdg-open

 # kio_recoll -kde5
 (
-mkdir kde/kioslave/kio_recoll/build && pushd kde/kioslave/kio_recoll/build
+#mkdir kde/kioslave/kio_recoll/build && pushd kde/kioslave/kio_recoll/build
 %cmake ..
 make %{?_smp_mflags} VERBOSE=1
 make install DESTDIR=%{buildroot}
@ -137,12 +148,12 @@ exit 0
 %{_datadir}/icons/hicolor/48x48/apps/%{name}.png
 %{_datadir}/pixmaps/%{name}.png
 %{_libdir}/recoll
-%{python_sitearch}/recoll
-%{python_sitearch}/Recoll*.egg-info
+%{python2_sitearch}/recoll
+%{python2_sitearch}/Recoll*.egg-info
 %{python3_sitearch}/recoll
 %{python3_sitearch}/Recoll*.egg-info
-%{python_sitearch}/recollchm
-%{python_sitearch}/recollchm*.egg-info
+%{python2_sitearch}/recollchm
+%{python2_sitearch}/recollchm*.egg-info
 %{python3_sitearch}/recollchm
 %{python3_sitearch}/recollchm*.egg-info
 %{_mandir}/man1/%{name}.1*
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -4,12 +4,16 @@
 if COND_TESTMAINS
  MAYBE_TESTMAINS = testmains
 endif
-SUBDIRS = . $(MAYBE_TESTMAINS)
+if COND_RCLGREP
+  MAYBE_RCLGREP = rclgrep
+endif
+SUBDIRS = . $(MAYBE_TESTMAINS) $(MAYBE_RCLGREP)
+
 DIST_SUBDIRS = .

 CXXFLAGS ?= @CXXFLAGS@
-LIBXAPIAN=@LIBXAPIAN@
-XAPIANCXXFLAGS=@XAPIANCXXFLAGS@
+XAPIAN_LIBS=@XAPIAN_LIBS@
+XAPIAN_CFLAGS=@XAPIAN_CFLAGS@
 XSLT_CFLAGS=@XSLT_CFLAGS@
 XSLT_LIBS=@XSLT_LIBS@
 LIBICONV=@LIBICONV@
@ -38,7 +42,7 @@ COMMONCPPFLAGS = -I. \
 AM_CPPFLAGS = -Wall -Wno-unused -std=c++11 \
    $(COMMONCPPFLAGS) \
    $(INCICONV) \
-    $(XAPIANCXXFLAGS) \
+    $(XAPIAN_CFLAGS) \
    $(XSLT_CFLAGS) \
    $(X_CFLAGS) \
    -DRECOLL_DATADIR=\"${pkgdatadir}\" \
@ -55,7 +59,10 @@ else
 endif

 librcldir = $(libdir)/recoll
-librcl_LTLIBRARIES = librecoll.la
+librcl_LTLIBRARIES =
+if MAKE_RECOLL_LIB
+librcl_LTLIBRARIES += librecoll.la
+endif

 librecoll_la_SOURCES = \
 aspell/rclaspell.cpp \
@ -287,7 +294,7 @@ AM_YFLAGS = -d
 # need it
 librecoll_la_LDFLAGS = -release $(VERSION) -no-undefined @NO_UNDEF_LINK_FLAG@

-librecoll_la_LIBADD = $(XSLT_LIBS) $(LIBXAPIAN) $(LIBICONV) $(X_LIBX11) $(LIBTHREADS)
+librecoll_la_LIBADD = $(XSLT_LIBS) $(XAPIAN_LIBS) $(LIBICONV) $(X_LIBX11) $(LIBTHREADS)

 # There is probably a better way to do this. The KIO needs to be linked
 # with librecoll, but librecoll is installed into a non-standard place
@ -306,7 +313,10 @@ PicStatic: $(librecoll_la_OBJECTS)
 	$(LIBTOOL) --tag=LD --mode=link gcc -g -O -o librecoll.la \
 		$(librecoll_la_OBJECTS)

-bin_PROGRAMS = recollindex
+bin_PROGRAMS =
+if MAKEINDEXER
+    bin_PROGRAMS += recollindex
+endif
 if MAKECMDLINE
    bin_PROGRAMS += recollq
 endif
@ -334,10 +344,8 @@ recollq_SOURCES = query/recollqmain.cpp
 recollq_LDADD = librecoll.la

 xadump_SOURCES = query/xadump.cpp
-xadump_LDADD = librecoll.la $(LIBXAPIAN) $(LIBICONV)
+xadump_LDADD = librecoll.la $(XAPIAN_LIBS) $(LIBICONV)

-# Note: I'd prefer the generated query parser files not to be distributed
-# at all, but failed to achieve this
 EXTRA_DIST = \
 bincimapmime/00README.recoll bincimapmime/AUTHORS bincimapmime/COPYING \
 \
@ -357,8 +365,8 @@ doc/user/custom.xsl doc/user/usermanual.xml \
 filters/injectcommon.sh filters/recfiltcommon filters/rcltxtlines.py \
 \
 index/rclmon.sh \
-index/recollindex-system.service \
-index/recollindex-user.service \
+index/recollindex@.service \
+index/recollindex.service \
 \
 kde/kioslave/kio_recoll/00README.txt \
 kde/kioslave/kio_recoll/CMakeLists.txt \
@ -369,9 +377,10 @@ kde/kioslave/kio_recoll/dirif.cpp \
 kde/kioslave/kio_recoll/htmlif.cpp \
 kde/kioslave/kio_recoll/kio_recoll.cpp \
 kde/kioslave/kio_recoll/kio_recoll.h \
+kde/kioslave/kio_recoll/recoll.json \
+kde/kioslave/kio_recoll/recoll.protocol \
 kde/kioslave/kio_recoll/recollf.protocol \
 kde/kioslave/kio_recoll/recollnolist.protocol \
-kde/kioslave/kio_recoll/recoll.protocol \
 \
 kde/kioslave/kio_recoll-kde4/00README.txt \
 kde/kioslave/kio_recoll-kde4/CMakeLists.txt \
@ -388,6 +397,9 @@ kde/kioslave/kio_recoll-kde4/recoll.protocol \
 \
 query/location.hh  query/position.hh  query/stack.hh \
 \
+qtgui/actsearch.ui \
+qtgui/actsearch_w.cpp \
+qtgui/actsearch_w.h \
 qtgui/advsearch.ui \
 qtgui/advsearch_w.cpp \
 qtgui/advsearch_w.h \
@ -407,6 +419,8 @@ qtgui/fragbuts.h \
 qtgui/guiutils.cpp \
 qtgui/guiutils.h \
 qtgui/i18n/*.qm qtgui/i18n/*.ts \
+qtgui/idxmodel.cpp \
+qtgui/idxmodel.h \
 qtgui/idxsched.h \
 qtgui/idxsched.ui \
 qtgui/images/asearch.png \
@ -475,6 +489,7 @@ qtgui/rclm_idx.cpp \
 qtgui/rclm_menus.cpp \
 qtgui/rclm_preview.cpp \
 qtgui/rclm_saveload.cpp \
+qtgui/rclm_sidefilters.cpp \
 qtgui/rclm_view.cpp \
 qtgui/rclm_wins.cpp \
 qtgui/rclmain.ui \
@ -567,16 +582,20 @@ python/samples/recollgui/rclmain.ui \
 python/samples/recollq.py \
 python/samples/recollqsd.py \
 \
-sampleconf/fields sampleconf/fragbuts.xml sampleconf/mimeconf \
-sampleconf/mimemap sampleconf/mimeview sampleconf/mimeview.mac \
+rclgrep/Makefile.am \
+rclgrep/rclgrep.cpp \
+\
+sampleconf/fields sampleconf/fragment-buttons.xml sampleconf/mimeconf \
+sampleconf/mimemap sampleconf/mimeview sampleconf/macos/mimeview \
 sampleconf/recoll.conf sampleconf/recoll.qss \
+sampleconf/recoll-common.css sampleconf/recoll-common.qss \
 sampleconf/recoll-dark.qss sampleconf/recoll-dark.css \
 \
 testmains/Makefile.am \
 \
 unac/AUTHORS unac/COPYING unac/README unac/README.recoll unac/unac.c \
 \
-VERSION
+RECOLL-VERSION.txt

 # EXTRA_DIST: The Php Code does not build anymore. No need to ship it until
 # someone fixes it:
@ -618,13 +637,13 @@ install-exec-local:: rclpychm-install
 clean-local:: rclpychm-clean
 rclpychm:
 	(cd python/pychm; set -x; \
-        for v in 2 3;do \
+        for v in 3;do \
        test -n "`which python$${v}`" && python$${v} setup.py build;\
        done \
        )
 rclpychm-install:
 	(cd python/pychm; set -x; \
-        for v in 2 3;do test -n "`which python$${v}`" && \
+        for v in 3;do test -n "`which python$${v}`" && \
        python$${v} setup.py install \
         --prefix=${prefix} --root=$${DESTDIR:-/} $(OPTSFORPYTHON); \
        done \
@ -654,17 +673,19 @@ defconfdir = $(pkgdatadir)/examples
 defconf_DATA = \
 desktop/recollindex.desktop \
 index/rclmon.sh \
-index/recollindex-system.service \
-index/recollindex-user.service \
-sampleconf/fragbuts.xml \
+index/recollindex.service \
+index/recollindex@.service \
 sampleconf/fields \
-sampleconf/recoll.conf \
+sampleconf/fragment-buttons.xml \
 sampleconf/mimeconf \
-sampleconf/recoll.qss \
-sampleconf/recoll-dark.qss \
-sampleconf/recoll-dark.css \
 sampleconf/mimemap \
-sampleconf/mimeview 
+sampleconf/mimeview \
+sampleconf/recoll-common.css \
+sampleconf/recoll-common.qss \
+sampleconf/recoll-dark.css \
+sampleconf/recoll-dark.qss \
+sampleconf/recoll.conf \
+sampleconf/recoll.qss

 filterdir = $(pkgdatadir)/filters
 dist_filter_DATA = \
@ -683,30 +704,31 @@ filters/openxml-xls-body.xsl \
 filters/openxml-word-body.xsl \
 filters/openxml-meta.xsl \
 filters/ppt-dump.py \
-filters/rcl7z \
+filters/rcl7z.py \
 filters/rclaptosidman \
-filters/rclaudio \
+filters/rclaudio.py \
 filters/rclbasehandler.py \
 filters/rclbibtex.sh \
 filters/rclcheckneedretry.sh \
-filters/rclchm \
-filters/rcldia \
+filters/rclchm.py \
+filters/rcldia.py \
 filters/rcldjvu.py \
 filters/rcldoc.py \
 filters/rcldvi \
-filters/rclepub \
-filters/rclepub1 \
+filters/rclepub.py \
+filters/rclepub1.py \
 filters/rclexec1.py \
 filters/rclexecm.py \
 filters/rclfb2.py \
 filters/rclgaim \
 filters/rclgenxslt.py \
 filters/rclhwp.py \
-filters/rclics \
+filters/rclics.py \
 filters/rclimg \
 filters/rclimg.py \
-filters/rclinfo \
-filters/rclkar \
+filters/rclinfo.py \
+filters/rclipynb.py \
+filters/rclkar.py \
 filters/rclkwd \
 filters/rcllatinclass.py \
 filters/rcllatinstops.zip \
@ -725,21 +747,21 @@ filters/rclps \
 filters/rclpst.py \
 filters/rclpurple \
 filters/rclpython.py \
-filters/rclrar \
+filters/rclrar.py \
 filters/rclrtf.py \
 filters/rclscribus \
 filters/rclshowinfo \
-filters/rcltar \
+filters/rcltar.py \
 filters/rcltex \
 filters/rcltext.py \
 filters/rcluncomp \
 filters/rcluncomp.py \
-filters/rclwar \
+filters/rclwar.py \
 filters/rclxls.py \
 filters/rclxml.py \
 filters/rclxmp.py \
 filters/rclxslt.py \
-filters/rclzip \
+filters/rclzip.py \
 filters/recoll-we-move-files.py \
 filters/recollepub.zip \
 filters/svg.xsl \
@ -749,6 +771,13 @@ filters/xml.xsl \
 python/recoll/recoll/conftree.py \
 python/recoll/recoll/rclconfig.py 

+if INSTALL_SYSTEMD_UNITS
+systemd_system_unitdir = @SYSTEMD_SYSTEM_UNIT_DIR@
+systemd_user_unitdir = @SYSTEMD_USER_UNIT_DIR@
+systemd_system_unit_DATA = index/recollindex@.service
+systemd_user_unit_DATA = index/recollindex.service
+endif
+
 install-data-hook: 
 	(cd $(DESTDIR)/$(filterdir); \
 	chmod a+x rcl* ppt-dump.py xls-dump.py xlsxmltocsv.py hotrecoll.py; \
@ -767,14 +796,14 @@ doc/user/usermanual.html: doc/user/usermanual.xml
 endif

 dist_man1_MANS = doc/man/recoll.1 doc/man/recollq.1 \
-               doc/man/recollindex.1 doc/man/xadump.1
+               doc/man/recollindex.1 doc/man/xadump.1 doc/man/rclgrep.1
 dist_man5_MANS = doc/man/recoll.conf.5

 dist-hook:
 	(cd $(top_srcdir); find . \
            \( -name '*.pyc' -o -name '#*' -o -name '*~' \) -delete)
 	if test -z "$(NOTAG)";then \
-	  test -z "`git status -s|grep -v recoll-$(VERSION)`"||exit 1; \
+	  test -z "`git status -s|grep -v recoll-$(VERSION)`"||exit 1; \
 	  vers=`echo $(VERSION) | sed -e 's/~/_/g'`;\
 	  git tag -a RECOLL-$$vers -m "Release $$vers tagged"; \
 	fi
--- a/src/README
+++ b/src/README
@ -2763,8 +2763,8 @@ Chapter 4. Programming interface

   If you can program and want to write an execm handler, it should not be
   too difficult to make sense of one of the existing modules. For example,
-   look at rclzip which uses Zip file paths as identifiers (ipath), and
-   rclics, which uses an integer index. Also have a look at the comments
+   look at rclzip.py which uses Zip file paths as identifiers (ipath), and
+   rclics.py, which uses an integer index. Also have a look at the comments
   inside the internfile/mh_execm.h file and possibly at the corresponding
   module.

@ -2819,7 +2819,7 @@ Chapter 4. Programming interface

 text/rtf = exec unrtf --nopict --html; charset=iso-8859-1; mimetype=text/html

- application/x-chm = execm rclchm
+ application/x-chm = execm rclchm.py

   The fragment specifies that:

--- a/src/RECOLL-VERSION.txt
+++ b/src/RECOLL-VERSION.txt
@ -0,0 +1 @@
+1.33.1
--- a/src/VERSION
+++ b/src/VERSION
@ -1 +0,0 @@
-1.31.0
--- a/src/aspell/rclaspell.h
+++ b/src/aspell/rclaspell.h
@ -41,6 +41,8 @@ class Aspell {
 public:
    Aspell(const RclConfig *cnf);
    ~Aspell();
+    Aspell(const Aspell &) = delete;
+    Aspell& operator=(const Aspell &) = delete;

    /** Check health */
    bool ok() const;
--- a/src/autogen.sh
+++ b/src/autogen.sh
@ -4,15 +4,18 @@ set -x

 aclocal

-if test X"$HOMEBREW_ENV" != X; then
-    glt=`which glibtoolize`
-fi
-if test X"$glt" != X; then
-    $glt --copy
+# detect libtoolize on linux or glibtoolize in some systems
+if (libtoolize --version) < /dev/null > /dev/null 2>&1; then
+  LIBTOOLIZE=libtoolize
+elif (glibtoolize --version) < /dev/null > /dev/null 2>&1; then
+  LIBTOOLIZE=glibtoolize
 else
-    libtoolize --copy
+  echo "libtoolize or glibtoolize was not found! Please install libtool." 1>&2
+  exit 1
 fi

+$LIBTOOLIZE --copy
+
 automake --add-missing --force-missing --copy
 autoconf
 # Our ylwrap gets clobbered by the above.
--- a/src/common/autoconfig-mac.h
+++ b/src/common/autoconfig-mac.h
@ -11,7 +11,7 @@
 /* #undef AC_APPLE_UNIVERSAL_BUILD */

 /* Path to the aspell program */
-#define ASPELL_PROG "/opt/local/bin/aspell"
+#undef ASPELL_PROG

 /* No X11 session monitoring support */
 #define DISABLE_X11MON 1
@ -125,7 +125,7 @@
 #define PACKAGE_NAME "Recoll"

 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "Recoll 1.30.2"
+#define PACKAGE_STRING "Recoll 1.33.0"

 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "recoll"
@ -134,7 +134,7 @@
 #define PACKAGE_URL ""

 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.30.2"
+#define PACKAGE_VERSION "1.33.0"

 /* putenv parameter is const */
 /* #undef PUTENV_ARG_CONST */
--- a/src/common/autoconfig-win.h
+++ b/src/common/autoconfig-win.h
@ -118,7 +118,7 @@
 #define PACKAGE_NAME "Recoll"

 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "Recoll 1.30.2"
+#define PACKAGE_STRING "Recoll 1.33.0"

 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "recoll"
@ -127,13 +127,13 @@
 #define PACKAGE_URL ""

 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.30.2"
+#define PACKAGE_VERSION "1.33.0"

 /* putenv parameter is const */
 /* #undef PUTENV_ARG_CONST */

 /* Real time monitoring option */
-#undef RCL_MONITOR
+#define RCL_MONITOR 1

 /* Split camelCase words */
 /* #undef RCL_SPLIT_CAMELCASE */
--- a/src/common/conf_post.h
+++ b/src/common/conf_post.h
@ -67,4 +67,18 @@ typedef int ssize_t;
 #  define PRETEND_USE(expr) ((void)(expr))
 #endif /* PRETEND_USE */

+// It's complicated to really detect gnu gcc because other compilers define __GNUC__
+// See stackoverflow questions/38499462/how-to-tell-clang-to-stop-pretending-to-be-other-compilers
+#if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+#define REAL_GCC   __GNUC__ // probably
+#endif
+
+#ifdef REAL_GCC
+// Older gcc versions pretended to supply std::regex, but the resulting programs mostly crashed.
+// Test the major version directly: __GNUC_PREREQ would need the glibc-only <features.h>.
+#if __GNUC__ < 6
+#define NO_STD_REGEX 1
+#endif
+#endif
+
 #endif /* INCLUDED */
--- a/src/common/rclconfig.cpp
+++ b/src/common/rclconfig.cpp
@ -1,4 +1,4 @@
-/* Copyright (C) 2004 J.F.Dockes 
+/* Copyright (C) 2004-2022 J.F.Dockes 
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@ -33,14 +33,11 @@
 #endif

 #include <algorithm>
-#include <list>
 #include <iostream>
 #include <sstream>
-#include <fstream>
 #include <cstdlib>
 #include <cstring>
 #include <unordered_map>
-#include <iterator>

 #include "cstr.h"
 #include "pathut.h"
@ -58,6 +55,16 @@

 using namespace std;

+// Name of the subdirectory holding platform-specific default config files, which override the
+// top-level ones. E.g. /usr/share/recoll/examples/windows. Empty when there is no such directory.
+#ifdef _WIN32
+static const string confsysdir{"windows"};
+#elif defined(__APPLE__)
+static const string confsysdir{"macos"};
+#else
+static const string confsysdir;
+#endif
+
 // Static, logically const, RclConfig members or module static
 // variables are initialized once from the first object build during
 // process initialization.
@ -90,9 +97,8 @@ void RclConfig::setPlusMinus(const string& sbase, const set<string>& upd,
    stringToStrings(sbase, base);

    vector<string> diff;
-    auto it =
-        set_difference(base.begin(), base.end(), upd.begin(), upd.end(),
-                       std::inserter(diff, diff.begin()));
+    auto it = set_difference(base.begin(), base.end(), upd.begin(), upd.end(),
+                             std::inserter(diff, diff.begin()));
    sminus = stringsToString(diff);

    diff.clear();
@ -139,7 +145,7 @@ bool ParamStale::needrecompute()
            string newvalue;
            conffile->get(paramnames[i], newvalue, parent->m_keydir);
            LOGDEB1("ParamStale::needrecompute: " << paramnames[i] << " -> " <<
-                    newvalue << " keydir " << parent->m_keydir << endl);
+                    newvalue << " keydir " << parent->m_keydir << "\n");
            if (newvalue.compare(savedvalues[i])) {
                savedvalues[i] = newvalue;
                needrecomp = true;
@ -176,8 +182,7 @@ void ParamStale::init(ConfNull *cnf)

 bool RclConfig::isDefaultConfig() const
 {
-    string defaultconf = path_cat(path_homedata(),
-                                  path_defaultrecollconfsubdir());
+    string defaultconf = path_cat(path_homedata(), path_defaultrecollconfsubdir());
    path_catslash(defaultconf);
    string specifiedconf = path_canon(m_confdir);
    path_catslash(specifiedconf);
@ -187,8 +192,7 @@ bool RclConfig::isDefaultConfig() const

 RclConfig::RclConfig(const RclConfig &r) 
    : m_oldstpsuffstate(this, "recoll_noindex"),
-      m_stpsuffstate(this, {"noContentSuffixes", "noContentSuffixes+",
-                  "noContentSuffixes-"}),
+      m_stpsuffstate(this, {"noContentSuffixes", "noContentSuffixes+", "noContentSuffixes-"}),
      m_skpnstate(this, {"skippedNames", "skippedNames+", "skippedNames-"}),
      m_onlnstate(this, "onlyNames"),
      m_rmtstate(this, "indexedmimetypes"),
@ -200,8 +204,7 @@ RclConfig::RclConfig(const RclConfig &r)

 RclConfig::RclConfig(const string *argcnf)
    : m_oldstpsuffstate(this, "recoll_noindex"),
-      m_stpsuffstate(this, {"noContentSuffixes", "noContentSuffixes+",
-                  "noContentSuffixes-"}),
+      m_stpsuffstate(this, {"noContentSuffixes", "noContentSuffixes+", "noContentSuffixes-"}),
      m_skpnstate(this, {"skippedNames", "skippedNames+", "skippedNames-"}),
      m_onlnstate(this, "onlyNames"),
      m_rmtstate(this, "indexedmimetypes"),
@ -230,8 +233,7 @@ RclConfig::RclConfig(const string *argcnf)
    if (argcnf && !argcnf->empty()) {
        m_confdir = path_absolute(*argcnf);
        if (m_confdir.empty()) {
-            m_reason = 
-                string("Cant turn [") + *argcnf + "] into absolute path";
+            m_reason = string("Cant turn [") + *argcnf + "] into absolute path";
            return;
        }
    } else {
@ -249,9 +251,8 @@ RclConfig::RclConfig(const string *argcnf)
    // this is the default conf
    if (!autoconfdir && !isDefaultConfig()) {
        if (!path_exists(m_confdir)) {
-            m_reason = "Explicitly specified configuration "
-                "directory must exist"
-                " (won't be automatically created). Use mkdir first";
+            m_reason = std::string("Explicitly specified configuration [") + m_confdir +
+                "] directory must exist (won't be automatically created). Use mkdir first";
            return;
        }
    }
@ -291,8 +292,7 @@ RclConfig::RclConfig(const string *argcnf)
            o_localecharset = string(cstr_cp1252);
        }
 #endif
-        LOGDEB1("RclConfig::getDefCharset: localecharset ["  <<
-                o_localecharset << "]\n");
+        LOGDEB1("RclConfig::getDefCharset: localecharset ["  << o_localecharset << "]\n");
    }

    const char *cp;
@ -310,8 +310,15 @@ RclConfig::RclConfig(const string *argcnf)
        m_cdirs.push_back(cp);
    } 

-    // Base/installation config
-    m_cdirs.push_back(path_cat(m_datadir, "examples"));
+    // Base/installation config, and its platform-specific overrides
+    std::string defaultsdir = path_cat(m_datadir, "examples");
+    if (!confsysdir.empty()) {
+        std::string sdir = path_cat(defaultsdir, confsysdir);
+        if (path_isdir(sdir)) {
+            m_cdirs.push_back(sdir);
+        }
+    }
+    m_cdirs.push_back(defaultsdir);

    string cnferrloc;
    for (const auto& dir : m_cdirs) {
@ -339,17 +346,14 @@ RclConfig::RclConfig(const string *argcnf)
    // there are several. This only uses the distributed file, not any
    // local customization (too complicated).
    if (mime_suffixes.empty()) {
-        ConfSimple mm(
-            path_cat(path_cat(m_datadir, "examples"), "mimemap").c_str());
+        ConfSimple mm(path_cat(path_cat(m_datadir, "examples"), "mimemap").c_str());
        vector<ConfLine> order = mm.getlines();
        for (const auto& entry: order) {
            if (entry.m_kind == ConfLine::CFL_VAR) {
-                LOGDEB1("CONFIG: " << entry.m_data << " -> " << entry.m_value <<
-                        endl);
+                LOGDEB1("CONFIG: " << entry.m_data << " -> " << entry.m_value << "\n");
                // Remember: insert() only does anything for new keys,
                // so we only have the first value in the map
-                mime_suffixes.insert(
-                    pair<string,string>(entry.m_value, entry.m_data));
+                mime_suffixes.insert(pair<string,string>(entry.m_value, entry.m_data));
            }
        }
    }
@ -384,9 +388,9 @@ RclConfig::RclConfig(const string *argcnf)

 bool RclConfig::updateMainConfig()
 {
-    ConfStack<ConfTree> *newconf = 
-        new ConfStack<ConfTree>("recoll.conf", m_cdirs, true);
+    ConfStack<ConfTree> *newconf = new ConfStack<ConfTree>("recoll.conf", m_cdirs, true);
    if (newconf == 0 || !newconf->ok()) {
+        std::cerr << "updateMainConfig: new Confstack not ok\n";
        if (m_conf)
            return false;
        m_ok = false;
@ -516,8 +520,7 @@ bool RclConfig::getConfParam(const string &name, vector<int> *vip,
        char *ep;
        vip->push_back(strtol(vs[i].c_str(), &ep, 0));
        if (ep == vs[i].c_str()) {
-            LOGDEB("RclConfig::getConfParam: bad int value in [" << name <<
-                   "]\n");
+            LOGDEB("RclConfig::getConfParam: bad int value in [" << name << "]\n");
            return false;
        }
    }
@ -586,12 +589,10 @@ void RclConfig::initThrConf()
 out:
    ostringstream sconf;
    for (unsigned int i = 0; i < 3; i++) {
-        sconf << "(" << m_thrConf[i].first << ", " << m_thrConf[i].second <<
-            ") ";
+        sconf << "(" << m_thrConf[i].first << ", " << m_thrConf[i].second << ") ";
    }

-    LOGDEB("RclConfig::initThrConf: chosen config (ql,nt): " << sconf.str() <<
-           "\n");
+    LOGDEB("RclConfig::initThrConf: chosen config (ql,nt): " << sconf.str() << "\n");
 }

 pair<int,int> RclConfig::getThrConf(ThrStage who) const
@ -684,7 +685,7 @@ public:
 class SuffCmp {
 public:
    int operator()(const SfString& s1, const SfString& s2) const {
-        //cout << "Comparing " << s1.m_str << " and " << s2.m_str << endl;
+        //cout << "Comparing " << s1.m_str << " and " << s2.m_str << "\n";
        string::const_reverse_iterator 
            r1 = s1.m_str.rbegin(), re1 = s1.m_str.rend(),
            r2 = s2.m_str.rbegin(), re2 = s2.m_str.rend();
@ -734,8 +735,7 @@ vector<string>& RclConfig::getStopSuffixes()
                m_maxsufflen = int(entry.length());
        }
    }
-    LOGDEB1("RclConfig::getStopSuffixes: ->" <<
-            stringsToString(m_stopsuffvec) << endl);
+    LOGDEB1("RclConfig::getStopSuffixes: ->" << stringsToString(m_stopsuffvec) << "\n");
    return m_stopsuffvec;
 }

@ -845,16 +845,22 @@ string RclConfig::getMimeHandlerDef(const string &mtype, bool filtertypes, const
        if (!m_excludeMTypes.empty() && m_excludeMTypes.count(stringtolower(mtype))) {
            IdxDiags::theDiags().record(IdxDiags::ExcludedMime, fn, mtype);
            LOGDEB1("RclConfig::getMimeHandlerDef: " << mtype << " in excluded mime list (fn " <<
-                   fn << ")\n");
+                    fn << ")\n");
            return hs;
        }
    }

    if (!mimeconf->get(mtype, hs, "index")) {
+        if (mtype.find("text/") == 0) {
+            bool alltext{false};
+            getConfParam("textunknownasplain", &alltext);
+            if (alltext && mimeconf->get("text/plain", hs, "index")) {
+                return hs;
+            }
+        }
        if (mtype != "inode/directory") {
            IdxDiags::theDiags().record(IdxDiags::NoHandler, fn, mtype);
-            LOGDEB1("getMimeHandlerDef: no handler for '" << mtype << "' (fn " <<
-                    fn << ")\n");
+            LOGDEB1("getMimeHandlerDef: no handler for '" << mtype << "' (fn " << fn << ")\n");
        }
    }
    return hs;
@ -873,12 +879,11 @@ const vector<MDReaper>& RclConfig::getMDReapers()
        ConfSimple attrs;
        valueSplitAttributes(sreapers, value, attrs);
        vector<string> nmlst = attrs.getNames(cstr_null);
-        for (vector<string>::const_iterator it = nmlst.begin();
-             it != nmlst.end(); it++) {
+        for (const auto& nm : nmlst) {
            MDReaper reaper;
-            reaper.fieldname = fieldCanon(*it);
+            reaper.fieldname = fieldCanon(nm);
            string s;
-            attrs.get(*it, s);
+            attrs.get(nm, s);
            stringToStrings(s, reaper.cmdv);
            m_mdreapers.push_back(reaper);
        }
@ -904,11 +909,17 @@ bool RclConfig::getGuiFilter(const string& catfiltername, string& frag) const
    return true;
 }

-bool RclConfig::valueSplitAttributes(const string& whole, string& value, 
-                                     ConfSimple& attrs)
+bool RclConfig::valueSplitAttributes(const string& whole, string& value, ConfSimple& attrs)
 {
-    /* There is currently no way to escape a semi-colon */
-    string::size_type semicol0 = whole.find_first_of(";");
+    bool inquote{false};
+    string::size_type semicol0;    
+    for (semicol0 = 0; semicol0 < whole.size(); semicol0++) {
+        if (whole[semicol0] == '"') {
+            inquote = !inquote;
+        } else if (whole[semicol0] == ';' && !inquote) {
+            break;
+        }
+    }
    value = whole.substr(0, semicol0);
    trimstring(value);
    string attrstr;
@ -1014,15 +1025,14 @@ bool RclConfig::readFieldsConfig(const string& cnferrloc)
                valuetype = FieldTraits::INT;
            } else {
                LOGERR("readFieldsConfig: bad type for value for " <<
-                       fieldname << " : " << tval << endl);
+                       fieldname << " : " << tval << "\n");
                return 0;
            }
        }
        int valuelen = (int)attrs.getInt("len", 0);
        // Find or insert traits entry
        const auto pit =
-            m_fldtotraits.insert(
-                pair<string, FieldTraits>(canonic, FieldTraits())).first;
+            m_fldtotraits.insert(pair<string, FieldTraits>(canonic, FieldTraits())).first;
        pit->second.valueslot = valueslot;
        pit->second.valuetype = valuetype;
        pit->second.valuelen = valuelen;
@ -1099,8 +1109,7 @@ bool RclConfig::getFieldTraits(const string& _fld, const FieldTraits **ftpp,
                pit->second.pfx << "]\n");
        return true;
    } else {
-        LOGDEB1("RclConfig::getFieldTraits: no prefix for field [" << fld <<
-                "]\n");
+        LOGDEB1("RclConfig::getFieldTraits: no prefix for field [" << fld << "]\n");
        *ftpp = 0;
        return false;
    }
@ -1122,8 +1131,7 @@ string RclConfig::fieldCanon(const string& f) const
    string fld = stringtolower(f);
    const auto it = m_aliastocanon.find(fld);
    if (it != m_aliastocanon.end()) {
-        LOGDEB1("RclConfig::fieldCanon: [" << f << "] -> [" << it->second <<
-                "]\n");
+        LOGDEB1("RclConfig::fieldCanon: [" << f << "] -> [" << it->second << "]\n");
        return it->second;
    }
    LOGDEB1("RclConfig::fieldCanon: [" << f << "] -> [" << fld << "]\n");
@ -1134,8 +1142,7 @@ string RclConfig::fieldQCanon(const string& f) const
 {
    const auto it = m_aliastoqcanon.find(stringtolower(f));
    if (it != m_aliastoqcanon.end()) {
-        LOGDEB1("RclConfig::fieldQCanon: [" << f << "] -> ["  << it->second <<
-                "]\n");
+        LOGDEB1("RclConfig::fieldQCanon: [" << f << "] -> ["  << it->second << "]\n");
        return it->second;
    }
    return fieldCanon(f);
@ -1165,15 +1172,14 @@ set<string> RclConfig::getMimeViewerAllEx() const

    string base, plus, minus;
    mimeview->get("xallexcepts", base, "");
-    LOGDEB1("RclConfig::getMimeViewerAllEx(): base: " << base << endl);
+    LOGDEB1("RclConfig::getMimeViewerAllEx(): base: " << base << "\n");
    mimeview->get("xallexcepts+", plus, "");
-    LOGDEB1("RclConfig::getMimeViewerAllEx(): plus: " << plus << endl);
+    LOGDEB1("RclConfig::getMimeViewerAllEx(): plus: " << plus << "\n");
    mimeview->get("xallexcepts-", minus, "");
-    LOGDEB1("RclConfig::getMimeViewerAllEx(): minus: " << minus << endl);
+    LOGDEB1("RclConfig::getMimeViewerAllEx(): minus: " << minus << "\n");

    computeBasePlusMinus(res, base, plus, minus);
-    LOGDEB1("RclConfig::getMimeViewerAllEx(): res: " << stringsToString(res)
-            << endl);
+    LOGDEB1("RclConfig::getMimeViewerAllEx(): res: " << stringsToString(res) << "\n");
    return res;
 }

@ -1200,11 +1206,9 @@ bool RclConfig::setMimeViewerAllEx(const set<string>& allex)
    return true;
 }

-string RclConfig::getMimeViewerDef(const string &mtype, const string& apptag,
-                                   bool useall) const
+string RclConfig::getMimeViewerDef(const string &mtype, const string& apptag, bool useall) const
 {
-    LOGDEB2("RclConfig::getMimeViewerDef: mtype [" << mtype << "] apptag ["
-            << apptag << "]\n");
+    LOGDEB2("RclConfig::getMimeViewerDef: mtype [" << mtype << "] apptag [" << apptag << "]\n");
    string hs;
    if (mimeview == 0)
        return hs;
@ -1231,9 +1235,18 @@ string RclConfig::getMimeViewerDef(const string &mtype, const string& apptag,
        // Fallthrough to normal case.
    }

-    if (apptag.empty() || !mimeview->get(mtype + string("|") + apptag,
-                                         hs, "view"))
+    if (apptag.empty() || !mimeview->get(mtype + string("|") + apptag, hs, "view"))
        mimeview->get(mtype, hs, "view");
+
+    // Last try for text/xxx if alltext is set
+    if (hs.empty() && mtype.find("text/") == 0 && mtype != "text/plain") {
+        bool alltext{false};
+        getConfParam("textunknownasplain", &alltext);
+        if (alltext) {
+            return getMimeViewerDef("text/plain", apptag, useall);
+        }
+    }
+        
    return hs;
 }

@ -1242,9 +1255,8 @@ bool RclConfig::getMimeViewerDefs(vector<pair<string, string> >& defs) const
    if (mimeview == 0)
        return false;
    vector<string>tps = mimeview->getNames("view");
-    for (vector<string>::const_iterator it = tps.begin(); 
-         it != tps.end();it++) {
-        defs.push_back(pair<string, string>(*it, getMimeViewerDef(*it, "", 0)));
+    for (const auto& tp : tps) {
+        defs.push_back(pair<string, string>(tp, getMimeViewerDef(tp, "", 0)));
    }
    return true;
 }
@ -1398,17 +1410,39 @@ string RclConfig::getIdxStatusFile() const
 // Thanks to user Madhu for this fix.
 string RclConfig::getPidfile() const
 {
-    const char *p = getenv("XDG_RUNTIME_DIR");
-    if (p) {
-        string base = path_canon(p);
-        string digest, hex;
-        string cfdir = path_canon(getConfDir());
-        path_catslash(cfdir);
-        MD5String(cfdir, digest);
-        MD5HexPrint(digest, hex);
-        return path_cat(base, "/recoll-" + hex + "-index.pid");
-    } 
-    return path_cat(getCacheDir(), "index.pid");
+    static string fn;
+    if (fn.empty()) {
+#ifndef _WIN32
+        const char *p = getenv("XDG_RUNTIME_DIR");
+        string rundir;
+        if (nullptr == p) {
+            // Problem is, we may have been launched outside the desktop, maybe by cron. Basing
+            // everything on XDG_RUNTIME_DIR was a mistake, sometimes resulting in different pidfiles
+            // being used by recollindex instances. So explicitly test for /run/user/$uid, still
+            // leaving open the remote possibility that XDG_RUNTIME_DIR would be set to something
+            // else...
+            rundir = path_cat("/run/user", lltodecstr(getuid()));
+            if (path_isdir(rundir)) {
+                p = rundir.c_str();
+            }
+        }
+        if (p) {
+            string base = path_canon(p);
+            string digest, hex;
+            string cfdir = path_canon(getConfDir());
+            path_catslash(cfdir);
+            MD5String(cfdir, digest);
+            MD5HexPrint(digest, hex);
+            fn =  path_cat(base, "recoll-" + hex + "-index.pid");
+            goto out;
+        }
+#endif // ! _WIN32
+    
+        fn = path_cat(getCacheDir(), "index.pid");
+    out:
+        LOGINF("RclConfig: pid/lock file: " << fn << "\n");
+    }
+    return fn;
 }


@ -1441,7 +1475,7 @@ static string path_diffstems(const string& p1, const string& p2,
            break;
        }
    }
-    //cerr << "Common length = " << cl << endl;
+    //cerr << "Common length = " << cl << "\n";
    if (cl == 0) {
        reason = "Input paths are empty or have no common part";
        return reason;
@ -1473,13 +1507,12 @@ void RclConfig::urlrewrite(const string& dbdir, string& url) const
            cur_confdir = m_confdir;
        }
        LOGDEB1("RclConfig::urlrewrite: orgidxconfdir: " << orig_confdir <<
-                " cur_confdir " << cur_confdir << endl);
-        string reason = path_diffstems(orig_confdir, cur_confdir,
-                                       confstemorg, confstemrep);
+                " cur_confdir " << cur_confdir << "\n");
+        string reason = path_diffstems(orig_confdir, cur_confdir, confstemorg, confstemrep);
        if (!reason.empty()) {
            LOGERR("urlrewrite: path_diffstems failed: " << reason <<
                   " : orig_confdir [" << orig_confdir <<
-                   "] cur_confdir [" << cur_confdir << endl);
+                   "] cur_confdir [" << cur_confdir << "\n");
            confstemorg = confstemrep = "";
        }
    }
@ -1487,8 +1520,7 @@ void RclConfig::urlrewrite(const string& dbdir, string& url) const
    // Do path translations exist for this index ?
    bool needptrans = true;
    if (m_ptrans == 0 || !m_ptrans->hasSubKey(dbdir)) {
-        LOGDEB2("RclConfig::urlrewrite: no paths translations (m_ptrans " <<
-                m_ptrans << ")\n");
+        LOGDEB2("RclConfig::urlrewrite: no paths translations (m_ptrans " << m_ptrans << ")\n");
        needptrans = false;
    }

@ -1639,6 +1671,7 @@ vector<string> RclConfig::getDaemSkippedPaths() const
 // and filtersdir from the config file to the PATH, then use execmd::which()
 string RclConfig::findFilter(const string &icmd) const
 {
+    LOGDEB2("findFilter: " << icmd << "\n");
    // If the path is absolute, this is it
    if (path_isabsolute(icmd))
        return icmd;
@ -1686,13 +1719,19 @@ bool RclConfig::processFilterCmd(std::vector<std::string>& cmd) const
    LOGDEB0("processFilterCmd: in: " << stringsToString(cmd) << "\n");
    auto it = cmd.begin();

-    // Special-case python and perl on windows: we need to also locate the
-    // first argument which is the script name "python somescript.py". 
-    // On Unix, thanks to #!, we usually just run "somescript.py", but need
-    // the same change if we ever want to use the same cmd line as windows
-    bool hasinterp = !stringlowercmp("python", *it) ||
-        !stringlowercmp("perl", *it);
-
+#ifdef _WIN32
+    // Special-case interpreters on windows: we used to have an additional 1st argument "python" in
+    // mimeconf, but we now rely on the .py extension for better sharing of mimeconf.
+    std::string ext = path_suffix(*it);
+    if ("py" == ext) {
+        it = cmd.insert(it, findFilter("python"));
+        it++;
+    } else if ("pl" == ext) {
+        it = cmd.insert(it, findFilter("perl"));
+        it++;
+    }
+#endif
+    
    // Note that, if the cmd vector size is 1, post-incrementing the
    // iterator in the following statement, which works on x86, leads
    // to a crash on ARM with gcc 6 and 8 (at least), which does not
@ -1700,25 +1739,15 @@ bool RclConfig::processFilterCmd(std::vector<std::string>& cmd) const
    // whatever... We do it later then.
    *it = findFilter(*it);

-    if (hasinterp) {
-        if (cmd.size() < 2) {
-            LOGERR("processFilterCmd: python/perl cmd: no script?. [" <<
-                   stringsToString(cmd) << "]\n");
-            return false;
-        } else {
-            ++it;
-            *it = findFilter(*it);
-        }
-    }
    LOGDEB0("processFilterCmd: out: " << stringsToString(cmd) << "\n");
    return true;
 }

-bool RclConfig::pythonCmd(const std::string& scriptname,
-                          std::vector<std::string>& cmd) const
+// This now does nothing more than processFilterCmd (after we changed to relying on the py extension)
+bool RclConfig::pythonCmd(const std::string& scriptname, std::vector<std::string>& cmd) const
 {
 #ifdef _WIN32
-    cmd = {"python", scriptname};
+    cmd = {scriptname};
 #else
    cmd = {scriptname};
 #endif
--- a/src/common/rclconfig.h
+++ b/src/common/rclconfig.h
@ -103,6 +103,14 @@ public:
        freeAll();
    }

+    RclConfig& operator=(const RclConfig &r) {
+        if (this != &r) {
+            freeAll();
+            initFrom(r);
+        }
+        return *this;
+    }
+
    // Return a writable clone of the main config. This belongs to the
    // caller (must delete it when done)
    ConfNull *cloneMainConfig();
@ -251,12 +259,18 @@ public:
    string getMimeHandlerDef(const string &mimetype, bool filtertypes=false,
                             const std::string& fn = std::string());

-    /** For lines like: "name = some value; attr1 = value1; attr2 = val2"
+    /** For lines like: [name = some value; attr1 = value1; attr2 = val2]
     * Separate the value and store the attributes in a ConfSimple 
-     * @param whole the raw value. No way to escape a semi-colon in there.
+     *
+     * In the value part, semi-colons inside double quotes are ignored, and double quotes are
+     * preserved. In the common case where the string is then processed by stringToStrings() to
+     * build a command line, this allows having semi-colons inside arguments. However, no backslash
+     * escaping is possible, so that, for example "bla\"1;2\"" would not work (the value part
+     * would stop at the semi-colon).
+     *
+     * @param whole the raw value.
     */
-    static bool valueSplitAttributes(const string& whole, string& value, 
-                                     ConfSimple& attrs) ;
+    static bool valueSplitAttributes(const string& whole, string& value, ConfSimple& attrs) ;

    /** Compute difference between 'base' and 'changed', as elements to be
     * added and subtracted from base. Input and output strings are in
@ -362,14 +376,6 @@ public:
        return o_origcwd;
    }

-    RclConfig& operator=(const RclConfig &r) {
-        if (this != &r) {
-            freeAll();
-            initFrom(r);
-        }
-        return *this;
-    }
-
    friend class ParamStale;

 private:
--- a/src/common/rclinit.cpp
+++ b/src/common/rclinit.cpp
@ -312,7 +312,7 @@ RclConfig *recollinit(int flags,
 #if defined(MACPORTS)
    PATH = string("/opt/local/bin/") + ":" + PATH;
 #elif defined(HOMEBREW)
-    PATH = string("/usr/local/bin/") + ":" + PATH;
+    PATH = string("/opt/homebrew/bin:/usr/local/bin/") + ":" + PATH;
 #else
    // Native qt build. Add our own directory to the path so that
    // recoll finds recollindex pkgdatadir:
--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@ -81,6 +81,8 @@ unsigned int  TextSplit::o_CJKNgramLen{2};
 bool          TextSplit::o_noNumbers{false};
 bool          TextSplit::o_deHyphenate{false};
 int           TextSplit::o_maxWordLength{40};
+int           TextSplit::o_maxWordsInSpan{6};
+
 static const int o_CJKMaxNgramLen{5};
 bool o_exthangultagger{false};

@ -90,6 +92,7 @@ static char underscoreatend = '_';
 void TextSplit::staticConfInit(RclConfig *config)
 {
    config->getConfParam("maxtermlength", &o_maxWordLength);
+    config->getConfParam("maxwordsinspan", &o_maxWordsInSpan);

    bool bvalue{false};
    if (config->getConfParam("nocjk", &bvalue) && bvalue == true) {
@ -206,32 +209,26 @@ public:
 };
 static const CharClassInit charClassInitInstance;

-static inline int whatcc(unsigned int c, char *asciirep = nullptr)
+static inline bool isvisiblewhite(int c)
+{
+    return visiblewhite.find(c) != visiblewhite.end();
+}
+
+static inline int whatcc(unsigned int c)
 {
    if (c <= 127) {
        return charclasses[c]; 
    } else {
-        if (c == 0x2010) {
-            // Special treatment for hyphen: handle as ascii minus. See
-            // doc/notes/minus-hyphen-dash.txt
-            if (asciirep)
-                *asciirep = '-';
-            return c;
-        } else if (c == 0x2019 || c == 0x275c || c == 0x02bc) {
-            // Things sometimes replacing a single quote. Use single
-            // quote so that span processing works ok
-            if (asciirep)
-                *asciirep = '\'';
+        if (c == 0x2010 || c == 0x2019 || c == 0x275c || c == 0x02bc) {
            return c;
        } else if (sskip.find(c) != sskip.end()) {
            return SKIP;
        } else if (spunc.find(c) != spunc.end()) {
            return SPACE;
        } else {
-            vector<unsigned int>::iterator it = 
-                lower_bound(vpuncblocks.begin(), vpuncblocks.end(), c);
-                if (it == vpuncblocks.end())
-                        return LETTER;
+            auto it = lower_bound(vpuncblocks.begin(), vpuncblocks.end(), c);
+            if (it == vpuncblocks.end())
+                return LETTER;
            if (c == *it)
                return SPACE;
            if ((it - vpuncblocks.begin()) % 2 == 1) {
@ -245,16 +242,16 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)

 // testing whatcc...
 #if 0
-  unsigned int testvalues[] = {'a', '0', 0x80, 0xbf, 0xc0, 0x05c3, 0x1000, 
-                               0x2000, 0x2001, 0x206e, 0x206f, 0x20d0, 0x2399, 
-                               0x2400, 0x2401, 0x243f, 0x2440, 0xff65};
-  int ntest = sizeof(testvalues) / sizeof(int);
-  for (int i = 0; i < ntest; i++) {
-      int ret = whatcc(testvalues[i]);
-      printf("Tested value 0x%x, returned value %d %s\n",
-             testvalues[i], ret, ret == LETTER ? "LETTER" : 
-             ret == SPACE ? "SPACE" : "OTHER");
-  }
+unsigned int testvalues[] = {'a', '0', 0x80, 0xbf, 0xc0, 0x05c3, 0x1000, 
+                             0x2000, 0x2001, 0x206e, 0x206f, 0x20d0, 0x2399, 
+                             0x2400, 0x2401, 0x243f, 0x2440, 0xff65};
+int ntest = sizeof(testvalues) / sizeof(int);
+for (int i = 0; i < ntest; i++) {
+    int ret = whatcc(testvalues[i]);
+    printf("Tested value 0x%x, returned value %d %s\n",
+           testvalues[i], ret, ret == LETTER ? "LETTER" : 
+           ret == SPACE ? "SPACE" : "OTHER");
+}
 #endif

 // CJK Unicode character detection. CJK text is indexed using an n-gram
@ -287,16 +284,16 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 // FF00..FFEF; Halfwidth and Fullwidth Forms
 // 20000..2A6DF; CJK Unified Ideographs Extension B
 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
-#define UNICODE_IS_CJK(p)                                               \
-    (((p) >= 0x1100 && (p) <= 0x11FF) ||                                \
-     ((p) >= 0x2E80 && (p) <= 0x2EFF) ||                                \
-     ((p) >= 0x3000 && (p) <= 0x9FFF) ||                                \
-     ((p) >= 0xA700 && (p) <= 0xA71F) ||                                \
-     ((p) >= 0xAC00 && (p) <= 0xD7AF) ||                                \
-     ((p) >= 0xF900 && (p) <= 0xFAFF) ||                                \
-     ((p) >= 0xFE30 && (p) <= 0xFE4F) ||                                \
-     ((p) >= 0xFF00 && (p) <= 0xFFEF) ||                                \
-     ((p) >= 0x20000 && (p) <= 0x2A6DF) ||                              \
+#define UNICODE_IS_CJK(p)                       \
+    (((p) >= 0x1100 && (p) <= 0x11FF) ||        \
+     ((p) >= 0x2E80 && (p) <= 0x2EFF) ||        \
+     ((p) >= 0x3000 && (p) <= 0x9FFF) ||        \
+     ((p) >= 0xA700 && (p) <= 0xA71F) ||        \
+     ((p) >= 0xAC00 && (p) <= 0xD7AF) ||        \
+     ((p) >= 0xF900 && (p) <= 0xFAFF) ||        \
+     ((p) >= 0xFE30 && (p) <= 0xFE4F) ||        \
+     ((p) >= 0xFF00 && (p) <= 0xFFEF) ||        \
+     ((p) >= 0x20000 && (p) <= 0x2A6DF) ||      \
     ((p) >= 0x2F800 && (p) <= 0x2FA1F))

 // We should probably map 'fullwidth ascii variants' and 'halfwidth
@ -304,9 +301,9 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 // filter, KuromojiNormalizeFilter.java
 // 309F is Hiragana.
 #ifdef KATAKANA_AS_WORDS
-#define UNICODE_IS_KATAKANA(p)                                          \
-    ((p) != 0x309F &&                                                   \
-     (((p) >= 0x3099 && (p) <= 0x30FF) ||                               \
+#define UNICODE_IS_KATAKANA(p)                  \
+    ((p) != 0x309F &&                           \
+     (((p) >= 0x3099 && (p) <= 0x30FF) ||       \
      ((p) >= 0x31F0 && (p) <= 0x31FF)))
 #else
 #define UNICODE_IS_KATAKANA(p) false
@ -315,14 +312,14 @@ static inline int whatcc(unsigned int c, char *asciirep = nullptr)
 #ifdef HANGUL_AS_WORDS
 // If no external tagger is configured, we process HANGUL as generic
 // cjk (n-grams)
-#define UNICODE_IS_HANGUL(p) (                 \
-        o_exthangultagger &&                   \
-        (((p) >= 0x1100 && (p) <= 0x11FF) ||   \
-         ((p) >= 0x3130 && (p) <= 0x318F) ||   \
-         ((p) >= 0x3200 && (p) <= 0x321e) ||   \
-         ((p) >= 0x3248 && (p) <= 0x327F) ||   \
-         ((p) >= 0x3281 && (p) <= 0x32BF) ||   \
-         ((p) >= 0xAC00 && (p) <= 0xD7AF))     \
+#define UNICODE_IS_HANGUL(p) (                  \
+        o_exthangultagger &&                    \
+        (((p) >= 0x1100 && (p) <= 0x11FF) ||    \
+         ((p) >= 0x3130 && (p) <= 0x318F) ||    \
+         ((p) >= 0x3200 && (p) <= 0x321e) ||    \
+         ((p) >= 0x3248 && (p) <= 0x327F) ||    \
+         ((p) >= 0x3281 && (p) <= 0x32BF) ||    \
+         ((p) >= 0xAC00 && (p) <= 0xD7AF))      \
        )
 #else
 #define UNICODE_IS_HANGUL(p) false
@ -351,19 +348,16 @@ bool TextSplit::isNGRAMMED(int c)
 }


-// This is used to detect katakana/other transitions, which must
-// trigger a word split (there is not always a separator, and katakana
-// is otherwise treated like other, in the same routine, unless cjk
+// This is used to detect katakana/other transitions, which must trigger a word split (there is not
+// always a separator, and katakana is otherwise treated like other, in the same routine, unless cjk
 // which has its span reader causing a word break)
 enum CharSpanClass {CSC_HANGUL, CSC_CJK, CSC_KATAKANA, CSC_OTHER};
-std::vector<CharFlags> csc_names {CHARFLAGENTRY(CSC_HANGUL),
-        CHARFLAGENTRY(CSC_CJK), CHARFLAGENTRY(CSC_KATAKANA),
-        CHARFLAGENTRY(CSC_OTHER)};
+std::vector<CharFlags> csc_names {CHARFLAGENTRY(CSC_HANGUL), CHARFLAGENTRY(CSC_CJK),
+                                  CHARFLAGENTRY(CSC_KATAKANA), CHARFLAGENTRY(CSC_OTHER)};

-// Final term checkpoint: do some checking (the kind which is simpler
-// to do here than in the main loop), then send term to our client.
-inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
-                                size_t btstart, size_t btend)
+// Final term checkpoint: do some checking (the kind which is simpler to do here than in the main
+// loop), then send term to our client.
+inline bool TextSplit::emitterm(bool isspan, string &w, int pos, size_t btstart, size_t btend)
 {
    LOGDEB2("TextSplit::emitterm: [" << w << "] pos " << pos << "\n");

@ -378,39 +372,38 @@ inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
    PRETEND_USE(isspan);
 #endif

-    if (l > 0 && l <= o_maxWordLength) {
-        // 1 byte word: we index single ascii letters and digits, but
-        // nothing else. We might want to turn this into a test for a
-        // single utf8 character instead ?
-        if (l == 1) {
-            unsigned int c = ((unsigned int)w[0]) & 0xff;
-            if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER && 
-                charclasses[c] != DIGIT &&
-                (!(m_flags & TXTS_KEEPWILD) || charclasses[c] != WILD)
-                ) {
-                //cerr << "ERASING single letter term " << c << endl;
-                return true;
-            }
-        }
-        if (pos != m_prevpos || l != m_prevlen) {
-            bool ret = takeword(w, pos, int(btstart), int(btend));
-            m_prevpos = pos;
-            m_prevlen = int(w.length());
-            return ret;
-        }
-        LOGDEB2("TextSplit::emitterm:dup: [" << w << "] pos " << pos << "\n");
+    if (l == 0 || l > o_maxWordLength) {
+        return true;
    }
+    if (l == 1) {
+        // 1 byte word: we index single ascii letters and digits, but nothing else. We might want to
+        // turn this into a test for a single utf8 character instead ?
+        unsigned int c = ((unsigned int)w[0]) & 0xff;
+        if (charclasses[c] != A_ULETTER && charclasses[c] != A_LLETTER && 
+            charclasses[c] != DIGIT &&
+            (!(m_flags & TXTS_KEEPWILD) || charclasses[c] != WILD)
+            ) {
+            //cerr << "ERASING single letter term " << c << endl;
+            return true;
+        }
+    }
+    if (pos != m_prevpos || l != m_prevlen) {
+        bool ret = takeword(w, pos, int(btstart), int(btend));
+        m_prevpos = pos;
+        m_prevlen = int(w.length());
+        return ret;
+    }
+    LOGDEB2("TextSplit::emitterm:dup: [" << w << "] pos " << pos << "\n");
    return true;
 }

-// Check for an acronym/abbreviation ie I.B.M. This only works with
-// ascii (no non-ascii utf-8 acronym are possible)
+// Check for an acronym/abbreviation, e.g. I.B.M. This only works with ascii (we do not detect
+// non-ascii utf-8 acronyms)
 bool TextSplit::span_is_acronym(string *acronym)
 {
    bool acron = false;

-    if (m_wordLen != m_span.length() && 
-        m_span.length() > 2 && m_span.length() <= 20) {
+    if (m_wordLen != m_span.length() && m_span.length() > 2 && m_span.length() <= 20) {
        acron = true;
        // Check odd chars are '.'
        for (unsigned int i = 1 ; i < m_span.length(); i += 2) {
@ -439,27 +432,23 @@ bool TextSplit::span_is_acronym(string *acronym)
 }


-// Generate terms from span. Have to take into account the
-// flags: ONLYSPANS, NOSPANS, noNumbers
+// Generate terms from span. Have to take into account the flags: ONLYSPANS, NOSPANS, noNumbers
 bool TextSplit::words_from_span(size_t bp)
 {
 #if 0
-    cerr << "Span: [" << m_span << "] " << " w_i_s size: " << 
-        m_words_in_span.size() <<  " : ";
+    cerr << "Span: [" << m_span << "] " << " bp " << bp <<
+        " w_i_s size: " << m_words_in_span.size() <<  " : ";
    for (unsigned int i = 0; i < m_words_in_span.size(); i++) {
-        cerr << " [" << m_words_in_span[i].first << " " <<
-            m_words_in_span[i].second << "] ";
+        cerr << " [" << m_words_in_span[i].first << " " << m_words_in_span[i].second << "] ";
                
    }
    cerr << endl;
 #endif
    int spanwords = int(m_words_in_span.size());
-    // It seems that something like: tv_combo-sample_util.Po@am_quote
-    // can get the splitter to call doemit with a span of '@' and
-    // words_in_span==0, which then causes a crash when accessing
-    // words_in_span[0] if the stl assertions are active (e.g. Fedora
-    // RPM build). Not too sure what the right fix would be, but for
-    // now, just defend against it
+    // It seems that something like: tv_combo-sample_util.Po@am_quote can get the splitter to call
+    // doemit with a span of '@' and words_in_span==0, which then causes a crash when accessing
+    // words_in_span[0] if the stl assertions are active (e.g. Fedora RPM build). Not too sure what
+    // the right fix would be, but for now, just defend against it
    if (spanwords == 0) {
        return true;
    }
@ -467,21 +456,17 @@ bool TextSplit::words_from_span(size_t bp)
    // Byte position of the span start
    size_t spboffs = bp - m_span.size();

-    if (o_deHyphenate && spanwords == 2 && 
-        m_span[m_words_in_span[0].second] == '-') {
+    if (o_deHyphenate && spanwords == 2 && m_span[m_words_in_span[0].second] == '-') {
        unsigned int s0 = m_words_in_span[0].first;
        unsigned int l0 = m_words_in_span[0].second - m_words_in_span[0].first;
        unsigned int s1 = m_words_in_span[1].first;
        unsigned int l1 = m_words_in_span[1].second - m_words_in_span[1].first;
        string word = m_span.substr(s0, l0) + m_span.substr(s1, l1);
        if (l0 && l1) 
-            emitterm(false, word,
-                     m_spanpos, spboffs, spboffs + m_words_in_span[1].second);
+            emitterm(false, word, m_spanpos, spboffs, spboffs + m_words_in_span[1].second);
    }

-    for (int i = 0; 
-         i < ((m_flags&TXTS_ONLYSPANS) ? 1 : spanwords); 
-         i++) {
+    for (int i = 0; i < ((m_flags&TXTS_ONLYSPANS) ? 1 : spanwords); i++) {

        int deb = m_words_in_span[i].first;
        bool noposinc = m_words_in_span[i].second == deb;
@ -490,8 +475,7 @@ bool TextSplit::words_from_span(size_t bp)
             j++) {

            int fin = m_words_in_span[j].second;
-            //cerr << "i " << i << " j " << j << " deb " << deb << 
-            //" fin " << fin << endl;
+            //cerr << "i " << i << " j " << j << " deb " << deb << " fin " << fin << endl;
            if (fin - deb > int(m_span.size()))
                break;
            string word(m_span.substr(deb, fin-deb));
@ -519,7 +503,7 @@ bool TextSplit::words_from_span(size_t bp)
 * 
 * @return true if ok, false for error. Splitting should stop in this case.
 * @param spanerase Set if the current span is at its end. Process it.
- * @param bp        The current BYTE position in the stream
+ * @param bp        The current BYTE position in the stream (it's beyond the current span data).
 */
 inline bool TextSplit::doemit(bool spanerase, size_t _bp)
 {
@ -532,7 +516,7 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
    if (m_wordLen) {
        // We have a current word. Remember it

-        if (m_words_in_span.size() >= 6) {
+        if (int(m_words_in_span.size()) >= o_maxWordsInSpan) {
            // Limit max span word count
            spanerase = true;
        }
@ -550,38 +534,13 @@ inline bool TextSplit::doemit(bool spanerase, size_t _bp)
        return true;
    }

-
-    // Span is done (too long or span-terminating character). Produce
-    // terms and reset it.
+    // Span is done (too long or span-terminating character). Produce terms and reset it.
    string acronym;
    if (span_is_acronym(&acronym)) {
        if (!emitterm(false, acronym, m_spanpos, bp - m_span.length(), bp))
            return false;
    }

-    // Maybe trim at end. These are chars that we might keep
-    // inside a span, but not at the end.
-    string::size_type trimsz{0};
-    while (trimsz < m_span.length()) {
-        auto c = m_span[m_span.length() - 1 - trimsz];
-        if (c == '.' || c == '-' || c == ',' || c == '@' || c == '\'' ||
-            c == underscoreatend) {
-            trimsz++;
-            if (m_words_in_span.size() &&
-                m_words_in_span.back().second > int(m_span.size())) {
-                m_words_in_span.back().second = int(m_span.size());
-            }
-            if (--bp < 0) {
-                bp = 0;
-            }
-        } else {
-            break;
-        }
-    }
-    if (trimsz > 0) {
-        m_span.resize(m_span.length() - trimsz);
-    }
-
    if (!words_from_span(bp)) {
        return false;
    }
@ -640,6 +599,7 @@ bool TextSplit::text_to_words(const string &in)
    clearsplitstate();
    
    bool pagepending = false;
+    bool nlpending = false;
    bool softhyphenpending = false;

    // Running count of non-alphanum chars. Reset when we see one;
@ -713,8 +673,7 @@ bool TextSplit::text_to_words(const string &in)
        prev_csc = csc;
 #endif

-        char asciirep = 0;
-        int cc = whatcc(c, &asciirep);
+        int cc = whatcc(c);

        switch (cc) {
        case SKIP:
@ -750,6 +709,10 @@ bool TextSplit::text_to_words(const string &in)
                pagepending = false;
                newpage(m_wordpos);
            }
+            if (nlpending) {
+                nlpending = false;
+                newline(m_wordpos);
+            }
            break;

        case WILD:
@ -773,7 +736,7 @@ bool TextSplit::text_to_words(const string &in)
                } 
            } else if (m_inNumber) {
                if ((m_span[m_span.length() - 1] == 'e' ||
-                                      m_span[m_span.length() - 1] == 'E')) {
+                     m_span[m_span.length() - 1] == 'E')) {
                    if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
                        m_wordLen += it.appendchartostring(m_span);
                        STATS_INC_WORDCHARS;
@ -781,17 +744,24 @@ bool TextSplit::text_to_words(const string &in)
                    }
                }
            } else {
+                int nextc = it[it.getCpos()+1];
                if (cc == '+') {
-                    int nextc = it[it.getCpos()+1];
-                    if (nextc == '+' || nextc == -1 || visiblewhite.find(nextc) 
-                        != visiblewhite.end()) {
+                    if (nextc == '+' || nextc == -1 || isvisiblewhite(nextc)) {
                        // someword++[+...] !
                        m_wordLen += it.appendchartostring(m_span);
                        STATS_INC_WORDCHARS;
                        break;
                    }
                } else {
-                    // Treat '-' inside span as glue char
+                    // Note about dangling hyphens: we always strip '-' found before whitespace,
+                    // even before a newline, then generate two terms, before and after the line
+                    // break. We have no way to know if '-' is there because a word was broken by
+                    // justification or if it was part of an actual compound word (would need a
+                    // dictionary to check). A soft-hyphen *should* be used if the '-' is not part
+                    // of the text, so we prefer to properly process real compound words.
+                    if (nextc == -1 || isvisiblewhite(nextc)) {
+                        goto SPACE;
+                    }
                    if (!doemit(false, it.getBpos()))
                        return false;
                    m_inNumber = false;
@ -827,8 +797,7 @@ bool TextSplit::text_to_words(const string &in)
                        m_inNumber = true;
                        m_wordLen += it.appendchartostring(m_span);
                    } else {
-                        m_words_in_span.
-                            push_back(pair<int,int>(m_wordStart, m_wordStart));
+                        m_words_in_span.push_back(pair<int,int>(m_wordStart, m_wordStart));
                        m_wordStart += it.appendchartostring(m_span);
                    }
                    STATS_INC_WORDCHARS;
@ -845,38 +814,28 @@ bool TextSplit::text_to_words(const string &in)
        }
        break;

-        case 0x2010:
-        case 0x2019:
+        case 0x2010: // hyphen
+        case 0x2019: // variations on single quote
        case 0x275c:
        case 0x02bc:
-            // Unicode chars which we replace with ascii for
-            // processing (2010 -> -,others -> '). It happens that
-            // they all work as glue chars and use the same code, but
-            // there might be cases needing different processing.
-            // Hyphen is replaced with ascii minus
-            if (m_wordLen) {
-                // Inside span: glue char
-                if (!doemit(false, it.getBpos()))
-                    return false;
-                m_inNumber = false;
-                m_span += asciirep;
-                m_wordStart++;
-                break;
-            }
-            goto SPACE;
-
        case '@':
        case '_': // If underscoreasletter is set, we'll never get this
        case '\'':
+        {
            // If in word, potential span: o'brien, jf@dockes.org,
            // else just ignore
+            int nextc = it[it.getCpos()+1];
+            if (nextc == -1 || isvisiblewhite(nextc)) {
+                goto SPACE;
+            }
            if (m_wordLen) {
                if (!doemit(false, it.getBpos()))
                    return false;
                m_inNumber = false;
                m_wordStart += it.appendchartostring(m_span);
            }
-            break;
+        }
+        break;

        case '#':  {
            int w = whatcc(it[it.getCpos()+1]);
@ -899,19 +858,10 @@ bool TextSplit::text_to_words(const string &in)
            break;

        case '\n':
+            nlpending = true;
+            /* FALLTHROUGH */
        case '\r':
-            if (m_span.length() && *m_span.rbegin() == '-') {
-                // if '-' is the last char before end of line, we
-                // strip it.  We have no way to know if this is added
-                // because of the line split or if it was part of an
-                // actual compound word (would need a dictionary to
-                // check).  As soft-hyphen *should* be used if the '-'
-                // is not part of the text, it is better to properly
-                // process a real compound word, and produce wrong
-                // output from wrong text. The word-emitting routine
-                // will strip the trailing '-'.
-                goto SPACE;
-            } else if (softhyphenpending) {
+            if (softhyphenpending) {
                // Don't reset soft-hyphen
                continue;
            } else {
@ -1107,7 +1057,7 @@ bool TextSplit::cjk_to_words(Utf8Iter& it, unsigned int *cp)

 // Specialization for countWords 
 class TextSplitCW : public TextSplit {
- public:
+public:
    int wcnt;
    TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {}
    bool takeword(const string &, int, int, int) {
@ -1132,7 +1082,7 @@ bool TextSplit::hasVisibleWhite(const string &in)
            LOGERR("hasVisibleWhite: error while scanning UTF-8 string\n");
            return false;
        }
-        if (visiblewhite.find(c) != visiblewhite.end())
+        if (isvisiblewhite(c))
            return true;
    }
    return false;
@ -1157,7 +1107,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
        }

        switch (c) {
-            case '"': 
+        case '"': 
            switch(state) {
            case SPACE: state = INQUOTE; continue;
            case TOKEN: goto push_char;
@ -1166,7 +1116,7 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
                state = SPACE; continue;
            }
            break;
-            case '\\': 
+        case '\\': 
            switch(state) {
            case SPACE: 
            case TOKEN: state=TOKEN; goto push_char;
@ -1175,25 +1125,25 @@ template <class T> bool u8stringToStrings(const string &s, T &tokens)
            }
            break;

-            case ' ': 
-            case '\t': 
-            case '\n': 
-            case '\r': 
+        case ' ': 
+        case '\t': 
+        case '\n': 
+        case '\r': 
            switch(state) {
-              case SPACE: continue;
-              case TOKEN: tokens.push_back(current); current.clear();
+            case SPACE: continue;
+            case TOKEN: tokens.push_back(current); current.clear();
                state = SPACE; continue; 
            case INQUOTE: 
            case ESCAPE: goto push_char;
            }
            break;

-            default:
+        default:
            switch(state) {
-              case ESCAPE: state = INQUOTE; break;
-              case SPACE:  state = TOKEN;  break;
-              case TOKEN: 
-              case INQUOTE: break;
+            case ESCAPE: state = INQUOTE; break;
+            case SPACE:  state = TOKEN;  break;
+            case TOKEN: 
+            case INQUOTE: break;
            }
        push_char:
            it.appendchartostring(current);
@ -1214,4 +1164,3 @@ bool TextSplit::stringToStrings(const string &s, vector<string> &tokens)
 {
    return u8stringToStrings<vector<string> >(s, tokens);
 }
-
--- a/src/common/textsplit.h
+++ b/src/common/textsplit.h
@ -50,6 +50,8 @@ public:
    TextSplit(Flags flags = Flags(TXTS_NONE))
        : m_flags(flags) {}
    virtual ~TextSplit() {}
+    TextSplit(const TextSplit&) = delete;
+    TextSplit& operator=(const TextSplit&) = delete;

    /** Call at program initialization to read non default values from the 
        configuration */
@ -71,6 +73,9 @@ public:
     * just don't know about pages. */
    virtual void newpage(int /*pos*/) {}

+    /** Called when we encounter newline \n 0x0a. Override to use the event. */
+    virtual void newline(int /*pos*/) {}
+
    // Static utility functions:

    /** Count words in string, as the splitter would generate them */
@ -157,12 +162,16 @@ private:
    static bool o_deHyphenate; // false
    static unsigned int o_CJKNgramLen; // 2
    static int o_maxWordLength; // 40
+    static int o_maxWordsInSpan; // 6

    Flags         m_flags;

    // Current span. Might be jf.dockes@wanadoo.f
    std::string        m_span; 

+    // Words in span: byte positions of start and end of words in m_span. For example:
+    // 0   4    9
+    // bill@some.com -> (0,4) (5,9) (10,13)
    std::vector <std::pair<int, int> > m_words_in_span;

    // Current word: no punctuation at all in there. Byte offset
--- a/src/common/textsplitko.cpp
+++ b/src/common/textsplitko.cpp
@ -133,7 +133,7 @@ bool TextSplit::ko_to_words(Utf8Iter *itp, unsigned int *cp)
    unordered_map<string, string> args;

    args.insert(pair<string,string>{"data", string()});
-    string& inputdata{args.begin()->second};
+    string& inputdata(args.begin()->second);

    // We send the tagger name every time but it's only used the first
    // one: can't change it after init. We could avoid sending it
--- a/src/common/unacpp.cpp
+++ b/src/common/unacpp.cpp
@ -1,4 +1,4 @@
-/* Copyright (C) 2004-2019 J.F.Dockes
+/* Copyright (C) 2004-2021 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
@ -29,7 +29,7 @@
 using namespace std;

 bool unacmaybefold(const string &in, string &out,
-           const char *encoding, UnacOp what)
+                   const char *encoding, UnacOp what)
 {
    char *cout = 0;
    size_t out_len;
@ -37,16 +37,13 @@ bool unacmaybefold(const string &in, string &out,

    switch (what) {
    case UNACOP_UNAC:
-        status = unac_string(encoding, in.c_str(), in.length(), 
-                             &cout, &out_len);
+        status = unac_string(encoding, in.c_str(), in.length(), &cout, &out_len);
        break;
    case UNACOP_UNACFOLD:
-        status = unacfold_string(encoding, in.c_str(), in.length(), 
-                                 &cout, &out_len);
+        status = unacfold_string(encoding, in.c_str(), in.length(), &cout, &out_len);
        break;
    case UNACOP_FOLD:
-        status = fold_string(encoding, in.c_str(), in.length(), 
-                             &cout, &out_len);
+        status = fold_string(encoding, in.c_str(), in.length(), &cout, &out_len);
        break;
    }

--- a/src/common/webstore.h
+++ b/src/common/webstore.h
@ -34,6 +34,8 @@ class WebStore {
 public:
    WebStore(RclConfig *config);
    ~WebStore();
+    WebStore(const WebStore&) = delete;
+    WebStore& operator=(const WebStore&) = delete;

    bool getFromCache(const std::string& udi, Rcl::Doc &doc, std::string& data,
                      std::string *hittype = 0);
--- a/src/configure.ac
+++ b/src/configure.ac
@ -1,7 +1,7 @@
-AC_INIT([Recoll], m4_esyscmd_s(cat VERSION))
+AC_INIT([Recoll],[m4_esyscmd_s(cat RECOLL-VERSION.txt)])
 AC_CONFIG_HEADERS([common/autoconfig.h])
 AH_BOTTOM([#include "conf_post.h"])
-AC_PREREQ(2.53)
+AC_PREREQ([2.69])
 AC_CONFIG_SRCDIR(index/recollindex.cpp)

 AM_INIT_AUTOMAKE([1.10 no-define subdir-objects foreign])
@ -21,7 +21,7 @@ if test C$CXX = C ; then
   AC_MSG_ERROR([C++ compiler needed. Please install one (ie: gnu g++)])
 fi
 AC_LANG_PUSH([C++])
-AC_TRY_LINK([],[], rcl_link_ok=yes, rcl_link_ok=no)
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[]], [[]])],[rcl_link_ok=yes],[rcl_link_ok=no])
 if test "$rcl_link_ok" = "no" ; then
   AC_MSG_ERROR([No working C++ compiler was found])
 fi
@ -42,7 +42,7 @@ esac

 AC_PROG_YACC

-AC_PROG_LIBTOOL
+LT_INIT
 AC_C_BIGENDIAN

 AC_SYS_LARGEFILE
@ -53,8 +53,7 @@ AC_CHECK_HEADERS([sys/param.h, spawn.h])

 if test "x$ac_cv_func_posix_spawn" = xyes; then :
   AC_ARG_ENABLE(posix_spawn,
-    AC_HELP_STRING([--enable-posix_spawn],
-   [Enable the use of posix_spawn().]),
+    AS_HELP_STRING([--enable-posix_spawn],[Enable the use of posix_spawn().]),
        posixSpawnEnabled=$enableval, posixSpawnEnabled=no)
 fi
 if test X$posixSpawnEnabled = Xyes ; then
@ -69,11 +68,35 @@ AC_CHECK_HEADERS([sys/mount.h sys/statfs.h sys/statvfs.h sys/vfs.h malloc.h mall

 AC_CHECK_FUNCS([posix_spawn setrlimit kqueue vsnprintf malloc_trim posix_fadvise])

+AC_CHECK_FUNCS(mkdtemp)
+AC_CHECK_LIB([pthread], [pthread_create], [], [])
+AC_SEARCH_LIBS([dlopen], [dl], [], [])
+if test "X$ac_cv_search_dlopen" != Xno ; then # AC_SEARCH_LIBS caches its result in ac_cv_search_<function>; may be "none required"
+   AC_DEFINE(HAVE_DLOPEN, 1, [dlopen function is available])
+fi   
+AC_CHECK_LIB([z], [zlibVersion], [], [])
+
+############# Putenv
+AC_MSG_CHECKING(for type of string parameter to putenv)
+AC_LANG_PUSH([C++])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+    #include <stdlib.h>
+  ]], [[
+    putenv((const char *)0);
+  ]])],[rcl_putenv_string_const="1"],[rcl_putenv_string_const="0"])
+if test X$rcl_putenv_string_const = X1 ; then
+  AC_DEFINE(PUTENV_ARG_CONST, 1, [putenv parameter is const])
+fi
+AC_LANG_POP([C++])
+
+
+PKG_CHECK_MODULES([XSLT], [libxslt], [], AC_MSG_ERROR([libxslt]))
+
+
 # Use specific 'file' command ? (Useful on solaris to specify
 # /usr/local/bin/file instead of the system's which doesn't understand '-i'
 AC_ARG_WITH(file-command, 
-    AC_HELP_STRING([--with-file-command],
-   [Specify version of 'file' command (ie: --with-file-command=/usr/local/bin/file)]),
+    AS_HELP_STRING([--with-file-command],[Specify version of 'file' command (ie: --with-file-command=/usr/local/bin/file)]),
        withFileCommand=$withval, withFileCommand=file)
 case $withFileCommand in
  file)
@ -96,8 +119,7 @@ AC_DEFINE(USE_SYSTEM_FILE_COMMAND, 1,
 # we do compile the aspell module using an internal copy of aspell.h
 # Only --with-aspell=no will completely disable aspell support
 AC_ARG_WITH(aspell, 
-    AC_HELP_STRING([--without-aspell],
-   [Disable use of aspell spelling package to provide term expansion to other spellings]),
+    AS_HELP_STRING([--without-aspell],[Disable use of aspell spelling package to provide term expansion to other spellings]),
        withAspell=$withval, withAspell=yes)
 case $withAspell in
     no);;
@ -126,8 +148,7 @@ fi

 # Real time monitoring with inotify
 AC_ARG_WITH(inotify, 
-    AC_HELP_STRING([--with-inotify],
-   [Use inotify for almost real time indexing of modified files (the default
+    AS_HELP_STRING([--with-inotify],[Use inotify for almost real time indexing of modified files (the default
    is yes on Linux).]),
        withInotify=$withval, withInotify=$inot_default)

@ -141,8 +162,7 @@ fi

 # Real time monitoring with FAM
 AC_ARG_WITH(fam, 
-    AC_HELP_STRING([--with-fam],
-   [Use File Alteration Monitor for almost real time indexing of modified files. Give the fam/gamin library as argument (ie: /usr/lib/libfam.so) if configure does not find the right one.]),
+    AS_HELP_STRING([--with-fam],[Use File Alteration Monitor for almost real time indexing of modified files. Give the fam/gamin library as argument (ie: /usr/lib/libfam.so) if configure does not find the right one.]),
        withFam=$withval, withFam=yes)

 if test X$withFam != Xno -a X$withInotify != Xno ; then
@ -206,21 +226,15 @@ if test X$idxthreadsEnabled = Xyes ; then
  AC_DEFINE(IDX_THREADS, 1, [Use multiple threads for indexing])
 fi

-AC_ARG_ENABLE(testmains,
-    AC_HELP_STRING([--enable-testmains],
-   [Enable building small test drivers. These are not unit tests.]),
-        buildtestmains=$enableval, buildtestmains=no)
-AM_CONDITIONAL([COND_TESTMAINS], [test "$buildtestmains" = yes])
-
 # Enable CamelCase word splitting. This is optional because it causes 
 # problems with phrases: with camelcase enabled, "MySQL manual"
 # will be matched by "MySQL manual" and "my sql manual" but not 
 # "mysql manual" (which would need increased slack as manual is now at pos
 # 2 instead of 1
 AC_ARG_ENABLE(camelcase,
-    AC_HELP_STRING([--enable-camelcase],
-   [Enable splitting camelCase words. This is not enabled by default as
-   this makes phrase matches more difficult: you need to use matching
+    AS_HELP_STRING([--enable-camelcase],
+    [Enable splitting camelCase words. This is not enabled by default as
+   it makes phrase matches more difficult: you need to use matching
   case in the phrase query to get a match. Ie querying for 
   "MySQL manual" and "my sql manual" are the same, but not the same as
   "mysql manual" (in phrases only and you could raise the phrase slack to
@ -230,109 +244,46 @@ if test X$camelcaseEnabled = Xyes ; then
  AC_DEFINE(RCL_SPLIT_CAMELCASE, 1, [Split camelCase words])
 fi

+
+AC_ARG_ENABLE(testmains,
+    AS_HELP_STRING([--enable-testmains],[Enable building small test drivers. These are not unit tests.]),
+        buildtestmains=$enableval, buildtestmains=no)
+AM_CONDITIONAL([COND_TESTMAINS], [test "$buildtestmains" = yes])
+
+AC_ARG_ENABLE(rclgrep,
+    AS_HELP_STRING([--enable-rclgrep],[Enable building the index-less search tool.]),
+        buildrclgrep=$enableval, buildrclgrep=no)
+AM_CONDITIONAL([COND_RCLGREP], [test "$buildrclgrep" = yes])
+
 # Disable building the python module.
 AC_ARG_ENABLE(python-module,
-    AC_HELP_STRING([--disable-python-module],
-    [Do not build the Python module.]),
+    AS_HELP_STRING([--disable-python-module],[Do not build the Python module.]),
        pythonEnabled=$enableval, pythonEnabled=yes)
-
 AM_CONDITIONAL(MAKEPYTHON, [test X$pythonEnabled = Xyes])

 # Disable building the libchm python wrapper
-AC_ARG_ENABLE(python-chm, AC_HELP_STRING([--disable-python-chm],
-    [Do not build the libchm Python wrapper.]),
+AC_ARG_ENABLE(python-chm,
+    AS_HELP_STRING([--disable-python-chm], [Do not build the libchm Python wrapper.]),
    pythonChmEnabled=$enableval, pythonChmEnabled=yes)
-
 if test X$pythonChmEnabled = Xyes; then
   AC_CHECK_LIB([chm], [chm_resolve_object], [],
   [AC_MSG_ERROR([--enable-python-chm is set but libchm is not found])])
 fi
-
 AM_CONDITIONAL(MAKEPYTHONCHM, [test X$pythonChmEnabled = Xyes])


-AC_CHECK_FUNCS(mkdtemp)
-AC_CHECK_LIB([pthread], [pthread_create], [], [])
-AC_SEARCH_LIBS([dlopen], [dl], [], [])
-if test X$ac_cv_search_function != Xno ; then
-   AC_DEFINE(HAVE_DLOPEN, 1, [dlopen function is available])
-fi   
-AC_CHECK_LIB([z], [zlibVersion], [], [])
-
-############# Putenv
-AC_MSG_CHECKING(for type of string parameter to putenv)
-AC_LANG_PUSH([C++])
-AC_TRY_COMPILE([
-    #include <stdlib.h>
-  ],[
-    putenv((const char *)0);
-  ], rcl_putenv_string_const="1", rcl_putenv_string_const="0")
-if test X$rcl_putenv_string_const = X1 ; then
-  AC_DEFINE(PUTENV_ARG_CONST, 1, [putenv parameter is const])
-fi
-AC_LANG_POP([C++])
-
-
-#### Look for Xapian. Done in a strange way to work around autoconf
-# cache
-XAPIAN_CONFIG=${XAPIAN_CONFIG:-no}
-if test "$XAPIAN_CONFIG" = "no"; then 
-    AC_PATH_PROG(XAPIAN_CONFIG0, [xapian-config], no)
-    XAPIAN_CONFIG=$XAPIAN_CONFIG0
-fi
-if test "$XAPIAN_CONFIG" = "no"; then 
-   AC_PATH_PROG(XAPIAN_CONFIG1, [xapian-config-1.3], no)
-   XAPIAN_CONFIG=$XAPIAN_CONFIG1
-fi
-if test "$XAPIAN_CONFIG" = "no"; then 
-   AC_PATH_PROG(XAPIAN_CONFIG2, [xapian-config-1.1], no)
-   XAPIAN_CONFIG=$XAPIAN_CONFIG2
-fi
-
-if test "$XAPIAN_CONFIG" = "no" ; then
-   AC_MSG_ERROR([Cannot find xapian-config command in $PATH. Is
-xapian-core installed ?])
-   exit 1
-fi
-LIBXAPIAN=`$XAPIAN_CONFIG --libs`
-# The --static thing fails with older Xapians. Happily enough they don't
-# need it either (because there are no needed libraries (no uuid and we
-# deal explicitly with libz)
-LIBXAPIANSTATICEXTRA=`$XAPIAN_CONFIG --static --libs 2> /dev/null`
-# Workaround for problem in xapian-config in some versions: wrongly lists
-# libstdc++.la in the lib list
-for i in $LIBXAPIAN ; do
-    case $i in
-    *stdc++*|-lm|-lgcc_s|-lc);;
-    *) tmpxaplib="$tmpxaplib $i";;
-    esac
-done
-LIBXAPIAN=$tmpxaplib
-LIBXAPIANDIR=`$XAPIAN_CONFIG --libs | awk '{print $1}'`
-case A"$LIBXAPIANDIR" in
-  A-L*) LIBXAPIANDIR=`echo $LIBXAPIANDIR | sed -e 's/-L//'`;;
-  *) LIBXAPIANDIR="";;
-esac
-XAPIANCXXFLAGS=`$XAPIAN_CONFIG --cxxflags`
-
-#echo XAPIAN_CONFIG: $XAPIAN_CONFIG 
-#echo LIBXAPIAN: $LIBXAPIAN
-#echo LIBXAPIANDIR: $LIBXAPIANDIR
-#echo LIBXAPIANSTATICEXTRA: $LIBXAPIANSTATICEXTRA
-#echo XAPIANCXXFLAGS: $XAPIANCXXFLAGS
-
-
-PKG_CHECK_MODULES([XSLT], [libxslt], [], AC_MSG_ERROR([libxslt]))
+AC_ARG_ENABLE(indexer, 
+    AS_HELP_STRING([--disable-indexer],[Disable building the recollindex indexer.]),
+        enableINDEXER=$enableval, enableINDEXER="yes")
+AM_CONDITIONAL(MAKEINDEXER, [test X$enableINDEXER = Xyes])

 AC_ARG_ENABLE(xadump, 
-    AC_HELP_STRING([--enable-xadump],
-   [Enable building the xadump low level Xapian access program.]),
+    AS_HELP_STRING([--enable-xadump],[Enable building the xadump low level Xapian access program.]),
        enableXADUMP=$enableval, enableXADUMP="no")
 AM_CONDITIONAL(MAKEXADUMP, [test X$enableXADUMP = Xyes])

 AC_ARG_ENABLE(userdoc,
-    AC_HELP_STRING([--disable-userdoc],
-       [Disable building the user manual. (Avoids the need for docbook xml/xsl files and TeX tools.]),
+    AS_HELP_STRING([--disable-userdoc],[Disable building the user manual. (Avoids the need for docbook xml/xsl files and TeX tools.)]),
        enableUserdoc=$enableval, enableUserdoc="yes")
 AM_CONDITIONAL(MAKEUSERDOC, [test X$enableUserdoc = Xyes])

@ -360,14 +311,12 @@ AM_CONDITIONAL(MAKEUSERDOC, [test X$enableUserdoc = Xyes])
 # will have failed, and we tell the user to check his environment.
 #
 AC_ARG_ENABLE(qtgui, 
-    AC_HELP_STRING([--disable-qtgui],
-   [Disable the QT-based graphical user interface.]),
+    AS_HELP_STRING([--disable-qtgui],[Disable the QT-based graphical user interface.]),
        enableQT=$enableval, enableQT="yes")
 AM_CONDITIONAL(MAKEQT, [test X$enableQT = Xyes])

 AC_ARG_ENABLE(recollq, 
-    AC_HELP_STRING([--enable-recollq],
-   [Enable building the recollq command line query tool (recoll -t without
+    AS_HELP_STRING([--enable-recollq],[Enable building the recollq command line query tool (recoll -t without
   need for Qt). This is done by default if --disable-qtgui is set but this
   option enables forcing it.]),
        enableRECOLLQ=$enableval, enableRECOLLQ="no")
@ -398,28 +347,11 @@ if test X$enableQT = Xyes ; then
  qt development files and tools and/or set the QTDIR environment variable?])
  fi
  QMAKE=$QMAKEPATH
-  
-  # Check Qt version
-  qmakevers="`${QMAKE} --version 2>&1`"
-  #echo "qmake version: $qmakevers"
-  v4=`expr "$qmakevers" : '.*Qt[ ][ ]*version[ ][ ]*4.*'`
-  v5=`expr "$qmakevers" : '.*Qt[ ][ ]*version[ ][ ]*5.*'`
-  if test X$v4 = X0 -a X$v5 = X0; then 
-     AC_MSG_ERROR([Bad qt/qmake version string (not 4 or 5?): $qmakevers])
-  else
-    if test X$v4 != X0 ; then
-       AC_MSG_ERROR([Qt version (from qmake found with QMAKE/QTDIR/PATH) is 4 but Recoll now needs version 5])
-    else
-       AC_MSG_NOTICE([using qt version 5 user interface])
-    fi
-    QTGUI=qtgui
-  fi
-
+  QTGUI=qtgui

 ##### Using Qt webkit for reslist display? Else Qt textbrowser
  AC_ARG_ENABLE(webkit,
-    AC_HELP_STRING([--disable-webkit],
-      [Disable use of qt-webkit (only meaningful if qtgui is enabled).]),
+    AS_HELP_STRING([--disable-webkit],[Disable use of qt-webkit (only meaningful if qtgui is enabled).]),
        enableWebkit=$enableval, enableWebkit="yes")

  if test "$enableWebkit" = "yes" ; then
@ -431,8 +363,7 @@ if test X$enableQT = Xyes ; then
  fi

  AC_ARG_ENABLE(webengine,
-    AC_HELP_STRING([--enable-webengine],
-      [Enable use of qt-webengine (only meaningful if qtgui is enabled), in
+    AS_HELP_STRING([--enable-webengine],[Enable use of qt-webengine (only meaningful if qtgui is enabled), in
      place of qt-webkit.]),
        enableWebengine=$enableval, enableWebengine="no")

@ -448,8 +379,7 @@ if test X$enableQT = Xyes ; then

 ##### Using QZeitGeist lib ? Default no for now
  AC_ARG_WITH(qzeitgeist,
-    AC_HELP_STRING([--with-qzeitgeist],
-      [Enable the use of the qzeitgeist library to send zeitgeist events.]),
+    AS_HELP_STRING([--with-qzeitgeist],[Enable the use of the qzeitgeist library to send zeitgeist events.]),
        withQZeitgeist=$withval, withQZeitgeist="no")

  case "$withQZeitgeist" in 
@ -466,22 +396,73 @@ if test X$enableQT = Xyes ; then
   QMAKE_DISABLE_ZEITGEIST=""
  fi

+ # Retain debugging symbols in GUI recoll ? This makes it enormous (~50MB)
+  AC_ARG_ENABLE(guidebug,
+    AS_HELP_STRING([--enable-guidebug],[Generate and retain debug symbols in GUI program (makes the file very big).]),
+        enableGuiDebug=$enableval, enableGuiDebug="no")
+
+  if test "$enableGuiDebug" = "yes" ; then
+   QMAKE_ENABLE_GUIDEBUG=""
+  else
+   QMAKE_ENABLE_GUIDEBUG="#"
+  fi
+
  AC_CONFIG_FILES($QTGUI/recoll.pro)

  ##################### End QT stuff
 fi

-### X11: this is needed for the session monitoring code (in recollindex -m)
-AC_ARG_ENABLE(x11mon, 
-    AC_HELP_STRING([--disable-x11mon],
-   [Disable recollindex support for X11 session monitoring.]),
-        enableX11mon=$enableval, enableX11mon="yes")
+dnl Borrow a macro definition from pkg.config,
+dnl for older installs that lack it.
+m4_ifndef([PKG_CHECK_VAR], [
+dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE,
+dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+dnl -------------------------------------------
+dnl Retrieves the value of the pkg-config variable for the given module.
+AC_DEFUN([PKG_CHECK_VAR],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl

-if test X$withInotify = Xno -a X$withFam = Xno ; then
-  enableX11mon=no
+_PKG_CONFIG([$1], [variable="][$3]["], [$2])
+AS_VAR_COPY([$1], [pkg_cv_][$1])
+
+AS_VAR_IF([$1], [""], [$5], [$4])dnl
+])dnl PKG_CHECK_VAR
+])
+
+### Systemd
+AC_ARG_WITH([systemd],
+    AS_HELP_STRING([--without-systemd],[Disable installation of the systemd unit files.]))
+AC_ARG_WITH([system-unit-dir],
+    AS_HELP_STRING([--with-system-unit-dir=DIR],[Install location for systemd system unit files]),
+    [SYSTEMD_SYSTEM_UNIT_DIR="$withval"],
+    [PKG_CHECK_VAR([SYSTEMD_SYSTEM_UNIT_DIR], [systemd], [systemdsystemunitdir])])
+AC_ARG_WITH([user-unit-dir],
+    AS_HELP_STRING([--with-user-unit-dir=DIR],[Install location for systemd user unit files]),
+    [SYSTEMD_USER_UNIT_DIR="$withval"],
+    [PKG_CHECK_VAR([SYSTEMD_USER_UNIT_DIR], [systemd], [systemduserunitdir])])
+
+if test X$enableINDEXER = Xno -o "x$SYSTEMD_SYSTEM_UNIT_DIR" = "x" -o \
+        "x$SYSTEMD_USER_UNIT_DIR" = "x"; then
+    with_systemd="no"
 fi

-if test "$enableX11mon" = "yes" ; then
+AM_CONDITIONAL([INSTALL_SYSTEMD_UNITS], [test "X$with_systemd" != "Xno"])
+
+### X11: this is needed for the session monitoring code (in recollindex -m)
+AC_ARG_ENABLE(x11mon, 
+    AS_HELP_STRING([--disable-x11mon],[Disable recollindex support for X11 session monitoring.]),
+        enableX11mon=$enableval, enableX11mon="yes")
+
+if test X$enableINDEXER = Xno ; then 
+  enableX11mon=no
+else
+  if test X$withInotify = Xno -a X$withFam = Xno ; then
+    enableX11mon=no
+  fi
+fi
+
+if test "$enableX11mon" = yes ; then
  AC_PATH_XTRA
  X_LIBX11=-lX11
 else
@ -491,6 +472,17 @@ fi
 #echo X_CFLAGS "'$X_CFLAGS'" X_PRE_LIBS "'$X_PRE_LIBS'" X_LIBS \
 #      "'$X_LIBS'" X_LIBX11 "'$X_LIBX11'" X_EXTRA_LIBS "'$X_EXTRA_LIBS'"

+# Check if anything needs Xapian. We also need to build the shared lib if this is the case.
+xapian_needed=yes
+if test X$buildtestmains = Xno -a X$pythonEnabled = Xno -a X$enableINDEXER = Xno \
+   -a X$enableXADUMP = Xno -a X$enableQT = Xno -a X$enableRECOLLQ = Xno ; then
+   xapian_needed=no
+fi
+if test X$xapian_needed = Xyes; then
+   PKG_CHECK_MODULES([XAPIAN], xapian-core, [], AC_MSG_ERROR([libxapian]))
+fi
+AM_CONDITIONAL([MAKE_RECOLL_LIB], [test X$xapian_needed = Xyes])
+   
 # For communicating the value of RECOLL_DATADIR to non-make-based
 # subpackages like python-recoll, we have to expand prefix in here, because
 # things like "datadir = ${prefix}/share" (which is what we'd get by
@ -514,17 +506,17 @@ AC_SUBST(X_LIBX11)
 AC_SUBST(X_EXTRA_LIBS)
 AC_SUBST(INCICONV)
 AC_SUBST(LIBICONV)
-AC_SUBST(LIBXAPIAN)
-AC_SUBST(LIBXAPIANDIR)
-AC_SUBST(LIBXAPIANSTATICEXTRA)
+AC_SUBST(XAPIAN_LIBS)
+AC_SUBST(XAPIAN_CFLAGS)
 AC_SUBST(LIBFAM)
 AC_SUBST(QMAKE)
 AC_SUBST(QTGUI)
-AC_SUBST(XAPIANCXXFLAGS)
 AC_SUBST(QMAKE_ENABLE_WEBKIT)
 AC_SUBST(QMAKE_DISABLE_WEBKIT)
 AC_SUBST(QMAKE_ENABLE_WEBENGINE)
 AC_SUBST(QMAKE_DISABLE_WEBENGINE)
+AC_SUBST(QMAKE_ENABLE_GUIDEBUG)
+AC_SUBST(QMAKE_DISABLE_GUIDEBUG)
 AC_SUBST(QMAKE_ENABLE_ZEITGEIST)
 AC_SUBST(QMAKE_DISABLE_ZEITGEIST)
 AC_SUBST(LIBQZEITGEIST)
@ -532,6 +524,8 @@ AC_SUBST(RCLVERSION)
 AC_SUBST(RCLLIBVERSION)
 AC_SUBST(XSLT_CFLAGS)
 AC_SUBST(XSLT_LIBS)
+AC_SUBST([SYSTEMD_SYSTEM_UNIT_DIR])
+AC_SUBST([SYSTEMD_USER_UNIT_DIR])

 AC_CONFIG_FILES([Makefile python/recoll/setup.py
 python/pychm/setup.py])
@ -539,5 +533,8 @@ AC_CONFIG_FILES([Makefile python/recoll/setup.py
 if test X$buildtestmains = Xyes ; then
   AC_CONFIG_FILES([testmains/Makefile])
 fi
+if test X$buildrclgrep = Xyes ; then
+   AC_CONFIG_FILES([rclgrep/Makefile])
+fi

 AC_OUTPUT
--- a/src/doc/man/rclgrep.1
+++ b/src/doc/man/rclgrep.1
@ -0,0 +1,12 @@
+.TH RCLGREP 1 "20 September 2022"
+.SH NAME
+rclgrep \- grep-like program based on the recoll data extraction functions
+.SH SYNOPSIS
+.B rclgrep
+[
+.B \-\-config
+<configdir>
+]
+
+.SH DESCRIPTION
+Some bla bla
--- a/src/doc/man/recoll.conf.5
+++ b/src/doc/man/recoll.conf.5
@ -148,7 +148,7 @@ not set, the daemon uses skippedPaths.
 .TP
 .BI "zipUseSkippedNames = "bool
 Use skippedNames inside Zip archives. Fetched
-directly by the rclzip handler. Skip the patterns defined by skippedNames
+directly by the rclzip.py handler. Skip the patterns defined by skippedNames
 inside Zip archives. Can be redefined for subdirectories.
 See https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html

@ -195,7 +195,7 @@ lets you turn off md5 computation for selected types. It is global (no
 redefinition for subtrees). At the moment, it only has an effect for
 external handlers (exec and execm). The file types can be specified by
 listing either MIME types (e.g. audio/mpeg) or handler names
-(e.g. rclaudio).
+(e.g. rclaudio.py).
 .TP
 .BI "compressedfilemaxkbs = "int
 Size limit for compressed
@ -613,8 +613,7 @@ location before copy, to allow path translation computations.  For
 example if a dataset originally indexed as '/home/me/mydata/config' has
 been mounted to '/media/me/mydata', and the GUI is running from a copied
 configuration, orgidxconfdir would be '/home/me/mydata/config', and
-curidxconfdir (as set in the copied configuration) would be
-'/media/me/mydata/config'.
+curidxconfdir (as set in the copied configuration) would be '/media/me/mydata/config'.
 .TP
 .BI "idxrundir = "dfn
 Indexing process current directory. The input
--- a/src/doc/man/recollq.1
+++ b/src/doc/man/recollq.1
@ -59,6 +59,10 @@ recollq \- command line / standard output Recoll query command.
 .B \-F
 <quoted space separated field name list>
 ]
+[
+.B \-\-extract\-to
+<file path>
+]
 <query string>

 .B recollq \-P
@ -120,9 +124,10 @@ sorts the results according to the specified field. Use
 for descending order.
 .PP
 .B \-n
-<cnt>
+<[first-]cnt>
 can be used to set the maximum number of results that should be
-printed. The default is 2000. Use a value of 0 for no limit.
+printed. The default is 2000. Use a value of 0 for no limit. If the argument is of the form
+first-cnt, it also defines the first result to output (from 0).
 .PP
 .B \-s
 <language>
@ -144,6 +149,11 @@ base64 and separated by one space character. Empty fields are indicated by
 consecutive space characters. There is one additional space character at
 the end of each line.
 .PP
+.B \-\-extract\-to
+<file path>
+Will extract the first result document of the query to the argument path, which must not exist. Use
+-n first-cnt to select the document.
+.PP
 .B recollq \-P
 (Period) will print the minimum and maximum modification years for
 documents in the index.
--- a/src/doc/user/Makefile
+++ b/src/doc/user/Makefile
@ -13,6 +13,7 @@
 #XSLDIR="/opt/local/share/xsl/docbook-xsl/"
 #Linux
 XSLDIR="/usr/share/xml/docbook/stylesheet/docbook-xsl/"
+UTILBUILDS=/home/dockes/tmp/builds/medocutils/


 # Options common to the single-file and chunked versions
@ -48,11 +49,10 @@ index.html: usermanual.xml recoll.conf.xml
 usermanual.pdf: usermanual.xml recoll.conf.xml
 	dblatex --xslt-opts="--xinclude" -tpdf $<

-UTILBUILDS=/home/dockes/tmp/builds/medocutils/
-recoll-conf-xml:
-	$(UTILBUILDS)/confxml --docbook \
+recoll.conf.xml: ../../sampleconf/recoll.conf
+	test -x $(UTILBUILDS)/confxml && $(UTILBUILDS)/confxml --docbook \
        --idprefix=RCL.INSTALL.CONFIG.RECOLLCONF  \
-        ../../sampleconf/recoll.conf  > recoll.conf.xml
+        ../../sampleconf/recoll.conf  > recoll.conf.xml || touch recoll.conf.xml

 # Generating a restructured text version, for uploading to readthedocs.
 # Does not really work, the internal links are botched. pandoc
@ -65,7 +65,7 @@ recoll-conf-xml:
 # script. 
 # Also could not get readthedocs to generate the left pane TOC? could
 # probably be fixed...
-#usermanual-rst: recoll-conf-xml
+#usermanual-rst: recoll.conf.xml
 #	tail -n +2 recoll.conf.xml > rcl-conf-tail.xml
 #	sed -e '/xi:include/r rcl-conf-tail.xml' \
 #		< usermanual.xml > full-man.xml
--- a/src/doc/user/recoll.conf.xml
+++ b/src/doc/user/recoll.conf.xml
@ -8,28 +8,34 @@
 <listitem><para>Space-separated list of files or
 directories to recursively index. Default to ~ (indexes
 $HOME). You can use symbolic links in the list, they will be followed,
-independently of the value of the followLinks variable.</para></listitem></varlistentry>
+independently of the value of the followLinks variable.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MONITORDIRS">
 <term><varname>monitordirs</varname></term>
 <listitem><para>Space-separated list of files or directories to monitor for
 updates. When running the real-time indexer, this allows monitoring only a
 subset of the whole indexed area. The elements must be included in the
-tree defined by the 'topdirs' members.</para></listitem></varlistentry>
+tree defined by the 'topdirs' members.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.SKIPPEDNAMES">
 <term><varname>skippedNames</varname></term>
-<listitem><para>Files and directories which should be ignored. 
-White space separated list of wildcard patterns (simple ones, not paths,
-must contain no / ), which will be tested against file and directory
-names.  The list in the default configuration does not exclude hidden
-directories (names beginning with a dot), which means that it may index
-quite a few things that you do not want. On the other hand, email user
-agents like Thunderbird usually store messages in hidden directories, and
-you probably want this indexed. One possible solution is to have ".*" in
-"skippedNames", and add things like "~/.thunderbird" "~/.evolution" to
-"topdirs".  Not even the file names are indexed for patterns in this
-list, see the "noContentSuffixes" variable for an alternative approach
-which indexes the file names. Can be redefined for any
-subtree.</para></listitem></varlistentry>
+<listitem><para>Files and directories which should be ignored.  White space separated list of wildcard patterns (simple ones, not paths, must contain no
+'/' characters), which will be tested against file and directory names.
+</para><para>
+Have a look at the default configuration for the initial value, some entries may not suit your
+situation. The easiest way to see it is through the GUI Index configuration "local parameters"
+panel.
+</para><para>
+The list in the default configuration does not exclude hidden directories (names beginning with a
+dot), which means that it may index quite a few things that you do not want. On the other hand,
+email user agents like Thunderbird usually store messages in hidden directories, and you probably
+want this indexed. One possible solution is to have ".*" in "skippedNames", and add things like
+"~/.thunderbird" "~/.evolution" to "topdirs".
+</para><para>
+Not even the file names are indexed for patterns in this list, see the "noContentSuffixes"
+variable for an alternative approach which indexes the file names. Can be redefined for any
+subtree.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.SKIPPEDNAMES-">
 <term><varname>skippedNames-</varname></term>
 <listitem><para>List of name endings to remove from the default skippedNames
@ -42,7 +48,8 @@ list. </para></listitem></varlistentry>
 <term><varname>onlyNames</varname></term>
 <listitem><para>Regular file name filter patterns If this is set, only the file names not in skippedNames and
 matching one of the patterns will be considered for indexing. Can be
-redefined per subtree. Does not apply to directories.</para></listitem></varlistentry>
+redefined per subtree. Does not apply to directories.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.NOCONTENTSUFFIXES">
 <term><varname>noContentSuffixes</varname></term>
 <listitem><para>List of name endings (not necessarily dot-separated suffixes) for
@ -53,7 +60,8 @@ which will go away in a future release (the move from mimemap to
 recoll.conf allows editing the list through the GUI). This is different
 from skippedNames because these are name ending matches only (not
 wildcard patterns), and the file name itself gets indexed normally. This
-can be redefined for subdirectories.</para></listitem></varlistentry>
+can be redefined for subdirectories.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.NOCONTENTSUFFIXES-">
 <term><varname>noContentSuffixes-</varname></term>
 <listitem><para>List of name endings to remove from the default noContentSuffixes
@ -64,19 +72,26 @@ list. </para></listitem></varlistentry>
 list. </para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.SKIPPEDPATHS">
 <term><varname>skippedPaths</varname></term>
-<listitem><para>Absolute paths we should not go into. Space-separated list of wildcard expressions for absolute
-filesystem paths. Must be defined at the top level of the configuration
-file, not in a subsection. Can contain files and directories. The database and
-configuration directories will automatically be added. The expressions
-are matched using 'fnmatch(3)' with the FNM_PATHNAME flag set by
-default. This means that '/' characters must be matched explicitly. You
-can set 'skippedPathsFnmPathname' to 0 to disable the use of FNM_PATHNAME
-(meaning that '/*/dir3' will match '/dir1/dir2/dir3'). The default value
-contains the usual mount point for removable media to remind you that it
-is a bad idea to have Recoll work on these (esp. with the monitor: media
-gets indexed on mount, all data gets erased on unmount). Explicitly
-adding '/media/xxx' to the 'topdirs' variable will override
-this.</para></listitem></varlistentry>
+<listitem><para>Absolute paths we should not go into. Space-separated list of wildcard expressions for absolute filesystem paths (for files or
+directories). The variable must be defined at the top level of the configuration file, not in a
+subsection.
+</para><para>
+Any value in the list must be textually consistent with the values in topdirs, no attempts are
+made to resolve symbolic links. In practice, if, as is frequently the case, /home is a link to
+/usr/home, your default topdirs will have a single entry '~' which will be translated to
+'/home/yourlogin'. In this case, any skippedPaths entry should start with '/home/yourlogin' *not*
+with '/usr/home/yourlogin'.
+</para><para>
+The index and configuration directories will automatically be added to the list.
+</para><para>
+The expressions are matched using 'fnmatch(3)' with the FNM_PATHNAME flag set by default. This
+means that '/' characters must be matched explicitly. You can set 'skippedPathsFnmPathname' to 0
+to disable the use of FNM_PATHNAME (meaning that '/*/dir3' will match '/dir1/dir2/dir3').
+</para><para>
+The default value contains the usual mount point for removable media to remind you that it is in
+most cases a bad idea to have Recoll work on these. Explicitly adding '/media/xxx' to the 'topdirs'
+variable will override this.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.SKIPPEDPATHSFNMPATHNAME">
 <term><varname>skippedPathsFnmPathname</varname></term>
 <listitem><para>Set to 0 to
@ -85,17 +100,19 @@ paths. </para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.NOWALKFN">
 <term><varname>nowalkfn</varname></term>
 <listitem><para>File name which will cause its parent directory to be skipped. Any directory containing a file with this name will be skipped as
-if it was part of the skippedPaths list. Ex: .recoll-noindex</para></listitem></varlistentry>
+if it was part of the skippedPaths list. Ex: .recoll-noindex
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.DAEMSKIPPEDPATHS">
 <term><varname>daemSkippedPaths</varname></term>
 <listitem><para>skippedPaths equivalent specific to
 real time indexing. This enables having parts of the tree
 which are initially indexed but not monitored. If daemSkippedPaths is
-not set, the daemon uses skippedPaths.</para></listitem></varlistentry>
+not set, the daemon uses skippedPaths.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ZIPUSESKIPPEDNAMES">
 <term><varname>zipUseSkippedNames</varname></term>
 <listitem><para>Use skippedNames inside Zip archives. Fetched
-directly by the rclzip handler. Skip the patterns defined by skippedNames
+directly by the rclzip.py handler. Skip the patterns defined by skippedNames
 inside Zip archives. Can be redefined for subdirectories.
 See https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html
 </para></listitem></varlistentry>
@ -117,7 +134,8 @@ multiple indexing of linked files. No effort is made to avoid duplication
 when this option is set to true. This option can be set individually for
 each of the 'topdirs' members by using sections. It can not be changed
 below the 'topdirs' level. Links in the 'topdirs' list itself are always
-followed.</para></listitem></varlistentry>
+followed.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.INDEXEDMIMETYPES">
 <term><varname>indexedmimetypes</varname></term>
 <listitem><para>Restrictive list of
@ -126,14 +144,16 @@ supported types are indexed). If it is set, only the types from the list
 will have their contents indexed. The names will be indexed anyway if
 indexallfilenames is set (default). MIME type names should be taken from
 the mimemap file (the values may be different from xdg-mime or file -i
-output in some cases). Can be redefined for subtrees.</para></listitem></varlistentry>
+output in some cases). Can be redefined for subtrees.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.EXCLUDEDMIMETYPES">
 <term><varname>excludedmimetypes</varname></term>
 <listitem><para>List of excluded MIME
 types. Lets you exclude some types from indexing. MIME type
 names should be taken from the mimemap file (the values may be different
 from xdg-mime or file -i output in some cases) Can be redefined for
-subtrees.</para></listitem></varlistentry>
+subtrees.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.NOMD5TYPES">
 <term><varname>nomd5types</varname></term>
 <listitem><para>Don't compute md5 for these types. md5 checksums are used only for deduplicating results, and can be
@ -142,32 +162,43 @@ lets you turn off md5 computation for selected types. It is global (no
 redefinition for subtrees). At the moment, it only has an effect for
 external handlers (exec and execm). The file types can be specified by
 listing either MIME types (e.g. audio/mpeg) or handler names
-(e.g. rclaudio).</para></listitem></varlistentry>
+(e.g. rclaudio.py).
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.COMPRESSEDFILEMAXKBS">
 <term><varname>compressedfilemaxkbs</varname></term>
 <listitem><para>Size limit for compressed
 files. We need to decompress these in a
 temporary directory for identification, which can be wasteful in some
 cases. Limit the waste. Negative means no limit. 0 results in no
-processing of any compressed file. Default 50 MB.</para></listitem></varlistentry>
+processing of any compressed file. Default 100 MB.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.TEXTFILEMAXMBS">
 <term><varname>textfilemaxmbs</varname></term>
-<listitem><para>Size limit for text
-files. Mostly for skipping monster
-logs. Default 20 MB.</para></listitem></varlistentry>
+<listitem><para>Size limit for text files. Mostly for skipping monster logs. Default 20 MB. Use a value of -1 to
+disable.
+</para></listitem></varlistentry>
+<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.TEXTUNKNOWNASPLAIN">
+<term><varname>textunknownasplain</varname></term>
+<listitem><para>Process unknown text/xxx files as text/plain. Allows indexing misc. text files identified as text/whatever by 'file' or 'xdg-mime'
+without having to explicitly set config entries for them. This works fine for indexing (but will
+cause processing of a lot of garbage though), but the documents indexed this way will be opened by
+the desktop viewer, even if text/plain has a specific editor.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.INDEXALLFILENAMES">
 <term><varname>indexallfilenames</varname></term>
 <listitem><para>Index the file names of
 unprocessed files Index the names of files the contents of
 which we don't index because of an excluded or unsupported MIME
-type.</para></listitem></varlistentry>
+type.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.USESYSTEMFILECOMMAND">
 <term><varname>usesystemfilecommand</varname></term>
 <listitem><para>Use a system command
 for file MIME type guessing as a final step in file type
 identification This is generally useful, but will usually
 cause the indexing of many bogus 'text' files. See 'systemfilecommand'
-for the command used.</para></listitem></varlistentry>
+for the command used.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.SYSTEMFILECOMMAND">
 <term><varname>systemfilecommand</varname></term>
 <listitem><para>Command used to guess
@ -175,12 +206,14 @@ MIME types if the internal methods fails This should be a
 "file -i" workalike.  The file path will be added as a last parameter to
 the command line. "xdg-mime" works better than the traditional "file"
 command, and is now the configured default (with a hard-coded fallback to
-"file")</para></listitem></varlistentry>
+"file")
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PROCESSWEBQUEUE">
 <term><varname>processwebqueue</varname></term>
 <listitem><para>Decide if we process the
 Web queue. The queue is a directory where the Recoll Web
-browser plugins create the copies of visited pages.</para></listitem></varlistentry>
+browser plugins create the copies of visited pages.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.TEXTFILEPAGEKBS">
 <term><varname>textfilepagekbs</varname></term>
 <listitem><para>Page size for text
@ -189,12 +222,14 @@ into documents of approximately this size. Will reduce memory usage at
 index time and help with loading data in the preview window at query
 time. Particularly useful with very big files, such as application or
 system logs. Also see textfilemaxmbs and
-compressedfilemaxkbs.</para></listitem></varlistentry>
+compressedfilemaxkbs.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MEMBERMAXKBS">
 <term><varname>membermaxkbs</varname></term>
 <listitem><para>Size limit for archive
 members. This is passed to the filters in the environment
-as RECOLL_FILTER_MAXMEMBERKB.</para></listitem></varlistentry>
+as RECOLL_FILTER_MAXMEMBERKB.
+</para></listitem></varlistentry>
 </variablelist></sect3>
 <sect3 id="RCL.INSTALL.CONFIG.RECOLLCONF.TERMS">
 <title>Parameters affecting how we generate terms and organize the index </title><variablelist>
@ -206,28 +241,34 @@ searches sensitive to case and diacritics can be performed, but the index
 will be bigger, and some marginal weirdness may sometimes occur. The
 default is a stripped index. When using multiple indexes for a search,
 this parameter must be defined identically for all. Changing the value
-implies an index reset.</para></listitem></varlistentry>
+implies an index reset.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.INDEXSTOREDOCTEXT">
 <term><varname>indexStoreDocText</varname></term>
 <listitem><para>Decide if we store the
 documents' text content in the index. Storing the text
 allows extracting snippets from it at query time, instead of building
 them from index position data.
+</para><para>
 Newer Xapian index formats have rendered our use of positions list
 unacceptably slow in some cases. The last Xapian index format with good
 performance for the old method is Chert, which is default for 1.2, still
 supported but not default in 1.4 and will be dropped in 1.6.
+</para><para>
 The stored document text is translated from its original format to UTF-8
 plain text, but not stripped of upper-case, diacritics, or punctuation
 signs. Storing it increases the index size by 10-20% typically, but also
 allows for nicer snippets, so it may be worth enabling it even if not
 strictly needed for performance if you can afford the space.
+</para><para>
 The variable only has an effect when creating an index, meaning that the
 xapiandb directory must not exist yet. Its exact effect depends on the
 Xapian version.
+</para><para>
 For Xapian 1.4, if the variable is set to 0, the Chert format will be
 used, and the text will not be stored. If the variable is 1, Glass will
 be used, and the text stored.
+</para><para>
 For Xapian 1.2, and for versions after 1.5 and newer, the index format is
 always the default, but the variable controls if the text is stored or
 not, and the abstract generation method. With Xapian 1.5 and later, and
@ -244,26 +285,31 @@ still be). Numbers are often quite interesting to search for, and this
 should probably not be set except for special situations, ie, scientific
 documents with huge amounts of numbers in them, where setting nonumbers
 will reduce the index size. This can only be set for a whole index, not
-for a subtree.</para></listitem></varlistentry>
+for a subtree.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.DEHYPHENATE">
 <term><varname>dehyphenate</varname></term>
 <listitem><para>Determines if we index 'coworker'
 also when the input is 'co-worker'. This is new
 in version 1.22, and on by default. Setting the variable to off allows
-restoring the previous behaviour.</para></listitem></varlistentry>
+restoring the previous behaviour.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.BACKSLASHASLETTER">
 <term><varname>backslashasletter</varname></term>
 <listitem><para>Process backslash as normal letter. This may make sense for people wanting to index TeX commands as
-such but is not of much general use.</para></listitem></varlistentry>
+such but is not of much general use.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.UNDERSCOREASLETTER">
 <term><varname>underscoreasletter</varname></term>
 <listitem><para>Process underscore as normal letter. This makes sense in so many cases that one wonders if it should
-not be the default.</para></listitem></varlistentry>
+not be the default.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMLENGTH">
 <term><varname>maxtermlength</varname></term>
 <listitem><para>Maximum term length. Words longer than this will be discarded.
 The default is 40 and used to be hard-coded, but it can now be
-adjusted. You need an index reset if you change the value.</para></listitem></varlistentry>
+adjusted. You need an index reset if you change the value.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.NOCJK">
 <term><varname>nocjk</varname></term>
 <listitem><para>Decides if specific East Asian
@ -271,20 +317,23 @@ adjusted. You need an index reset if you change the value.</para></listitem></va
 off. This will save a small amount of CPU if you have no CJK
 documents. If your document base does include such text but you are not
 interested in searching it, setting nocjk may be a
-significant time and space saver.</para></listitem></varlistentry>
+significant time and space saver.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.CJKNGRAMLEN">
 <term><varname>cjkngramlen</varname></term>
 <listitem><para>This lets you adjust the size of
 n-grams used for indexing CJK text. The default value of 2 is
 probably appropriate in most cases. A value of 3 would allow more precision
 and efficiency on longer words, but the index will be approximately twice
-as large.</para></listitem></varlistentry>
+as large.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.INDEXSTEMMINGLANGUAGES">
 <term><varname>indexstemminglanguages</varname></term>
 <listitem><para>Languages for which to create stemming expansion
 data. Stemmer names can be found by executing 'recollindex
 -l', or this can also be set from a list in the GUI. The values are full
-language names, e.g. english, french...</para></listitem></varlistentry>
+language names, e.g. english, french...
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.DEFAULTCHARSET">
 <term><varname>defaultcharset</varname></term>
 <listitem><para>Default character
@ -295,37 +344,39 @@ set, the default character set is the one defined by the NLS environment
 ($LC_ALL, $LC_CTYPE, $LANG), or ultimately iso-8859-1 (cp-1252 in fact).
 If for some reason you want a general default which does not match your
 LANG and is not 8859-1, use this variable. This can be redefined for any
-sub-directory.</para></listitem></varlistentry>
+sub-directory.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.UNAC_EXCEPT_TRANS">
 <term><varname>unac_except_trans</varname></term>
-<listitem><para>A list of characters,
-encoded in UTF-8, which should be handled specially
-when converting text to unaccented lowercase. For
-example, in Swedish, the letter a with diaeresis has full alphabet
-citizenship and should not be turned into an a.
-Each element in the space-separated list has the special character as
-first element and the translation following. The handling of both the
-lowercase and upper-case versions of a character should be specified, as
-appartenance to the list will turn-off both standard accent and case
-processing. The value is global and affects both indexing and querying.
+<listitem><para>A list of characters, encoded in UTF-8, which should be handled specially when converting
+text to unaccented lowercase. For example, in Swedish, the letter a with diaeresis has full alphabet citizenship and
+should not be turned into an a.  Each element in the space-separated list has the special
+character as first element and the translation following. The handling of both the lowercase and
+upper-case versions of a character should be specified, as membership in the list will turn off
+both standard accent and case processing. The value is global and affects both indexing and
+querying.  We also convert a few confusing Unicode characters (quotes, hyphen) to their ASCII
+equivalent to avoid "invisible" search failures.
+</para><para>
 Examples:
 Swedish:
-unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl åå Åå
+unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl åå Åå ’' ❜' ʼ' ‐-
 . German:
-unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl
-In French, you probably want to decompose oe and ae and nobody would type
+unac_except_trans = ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl ’' ❜' ʼ' ‐-
+. French: you probably want to decompose oe and ae and nobody would type
 a German ß
-unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl
+unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl ’' ❜' ʼ' ‐-
 . The default for all until someone protests follows. These decompositions
 are not performed by unac, but it is unlikely that someone would type the
 composed forms in a search.
-unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl</para></listitem></varlistentry>
+unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl ’' ❜' ʼ' ‐-
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAILDEFCHARSET">
 <term><varname>maildefcharset</varname></term>
 <listitem><para>Overrides the default
 character set for email messages which don't specify
 one. This is mainly useful for readpst (libpst) dumps,
-which are utf-8 but do not say so.</para></listitem></varlistentry>
+which are utf-8 but do not say so.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.LOCALFIELDS">
 <term><varname>localfields</varname></term>
 <listitem><para>Set fields on all files
@ -333,7 +384,8 @@ which are utf-8 but do not say so.</para></listitem></varlistentry>
 name = value ; attr1 = val1 ; [...]
 value is empty so this needs an initial semi-colon. This is useful, e.g.,
 for setting the rclaptg field for application selection inside
-mimeview.</para></listitem></varlistentry>
+mimeview.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.TESTMODIFUSEMTIME">
 <term><varname>testmodifusemtime</varname></term>
 <listitem><para>Use mtime instead of
@ -355,12 +407,12 @@ undetected). Perform a full index reset after changing this.
 <term><varname>noxattrfields</varname></term>
 <listitem><para>Disable extended attributes
 conversion to metadata fields. This probably needs to be
-set if testmodifusemtime is set.</para></listitem></varlistentry>
+set if testmodifusemtime is set.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.METADATACMDS">
 <term><varname>metadatacmds</varname></term>
 <listitem><para>Define commands to
-gather external metadata, e.g. tmsu tags. 
-There can be several entries, separated by semi-colons, each defining
+gather external metadata, e.g. tmsu tags. There can be several entries, separated by semi-colons, each defining
 which field name the data goes into and the command to use. Don't forget the
 initial semi-colon. All the field names must be different. You can use
 aliases in the "field" file if necessary.
@ -385,13 +437,15 @@ cachedir is ~/.cache/recoll, the default dbdir would be
 mboxcachedir, aspellDicDir, which can still be individually specified to
 override cachedir.  Note that if you have multiple configurations, each
 must have a different cachedir, there is no automatic computation of a
-subpath under cachedir.</para></listitem></varlistentry>
+subpath under cachedir.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXFSOCCUPPC">
 <term><varname>maxfsoccuppc</varname></term>
 <listitem><para>Maximum file system occupation
 over which we stop indexing. The value is a percentage,
 corresponding to what the "Capacity" df output column shows. The default
-value is 0, meaning no checking.</para></listitem></varlistentry>
+value is 0, meaning no checking.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.DBDIR">
 <term><varname>dbdir</varname></term>
 <listitem><para>Xapian database directory
@ -399,36 +453,43 @@ location. This will be created on first indexing. If the
 value is not an absolute path, it will be interpreted as relative to
 cachedir if set, or the configuration directory (-c argument or
 $RECOLL_CONFDIR).  If nothing is specified, the default is then
-~/.recoll/xapiandb/</para></listitem></varlistentry>
+~/.recoll/xapiandb/
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXSTATUSFILE">
 <term><varname>idxstatusfile</varname></term>
 <listitem><para>Name of the scratch file where the indexer process updates its
 status. Default: idxstatus.txt inside the configuration
-directory.</para></listitem></varlistentry>
+directory.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MBOXCACHEDIR">
 <term><varname>mboxcachedir</varname></term>
 <listitem><para>Directory location for storing mbox message offsets cache
 files. This is normally 'mboxcache' under cachedir if set,
 or else under the configuration directory, but it may be useful to share
-a directory between different configurations.</para></listitem></varlistentry>
+a directory between different configurations.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MBOXCACHEMINMBS">
 <term><varname>mboxcacheminmbs</varname></term>
 <listitem><para>Minimum mbox file size over which we cache the offsets. There is really no sense in caching offsets for small files. The
-default is 5 MB.</para></listitem></varlistentry>
+default is 5 MB.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MBOXMAXMSGMBS">
 <term><varname>mboxmaxmsgmbs</varname></term>
 <listitem><para>Maximum mbox member message size in megabytes. Size over which we assume that the mbox format is bad or we
-misinterpreted it, at which point we just stop processing the file.</para></listitem></varlistentry>
+misinterpreted it, at which point we just stop processing the file.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBCACHEDIR">
 <term><varname>webcachedir</varname></term>
 <listitem><para>Directory where we store the archived web pages. This is only used by the web history indexing code
 Default: cachedir/webcache if cachedir is set, else
-$RECOLL_CONFDIR/webcache</para></listitem></varlistentry>
+$RECOLL_CONFDIR/webcache
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBCACHEMAXMBS">
 <term><varname>webcachemaxmbs</varname></term>
 <listitem><para>Maximum size in MB of the Web archive. This is only used by the web history indexing code.
 Default: 40 MB.
-Reducing the size will not physically truncate the file.</para></listitem></varlistentry>
+Reducing the size will not physically truncate the file.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBQUEUEDIR">
 <term><varname>webqueuedir</varname></term>
 <listitem><para>The path to the Web indexing queue. This used to be
@ -436,29 +497,42 @@ hard-coded in the old plugin as ~/.recollweb/ToIndex so there would be no
 need or possibility to change it, but the WebExtensions plugin now downloads
 the files to the user Downloads directory, and a script moves them to
 webqueuedir. The script reads this value from the config so it has become
-possible to change it.</para></listitem></varlistentry>
+possible to change it.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBDOWNLOADSDIR">
 <term><varname>webdownloadsdir</varname></term>
 <listitem><para>The path to browser downloads directory. This is
 where the new browser add-on extension has to create the files. They are
-then moved by a script to webqueuedir.</para></listitem></varlistentry>
+then moved by a script to webqueuedir.
+</para></listitem></varlistentry>
+<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.WEBCACHEKEEPINTERVAL">
+<term><varname>webcachekeepinterval</varname></term>
+<listitem><para>Page recycle interval. By default, only one instance of a URL is kept in the cache. This
+can be changed by setting this to a value determining at what frequency
+we keep multiple instances ('day', 'week', 'month',
+'year'). Note that increasing the interval will not erase existing
+entries.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ASPELLDICDIR">
 <term><varname>aspellDicDir</varname></term>
 <listitem><para>Aspell dictionary storage directory location. The
 aspell dictionary (aspdict.(lang).rws) is normally stored in the
 directory specified by cachedir if set, or under the configuration
-directory.</para></listitem></varlistentry>
+directory.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.FILTERSDIR">
 <term><varname>filtersdir</varname></term>
 <listitem><para>Directory location for executable input handlers. If
 RECOLL_FILTERSDIR is set in the environment, we use it instead. Defaults
 to $prefix/share/recoll/filters. Can be redefined for
-subdirectories.</para></listitem></varlistentry>
+subdirectories.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ICONSDIR">
 <term><varname>iconsdir</varname></term>
 <listitem><para>Directory location for icons. The only reason to
 change this would be if you want to change the icons displayed in the
-result list. Defaults to $prefix/share/recoll/images</para></listitem></varlistentry>
+result list. Defaults to $prefix/share/recoll/images
+</para></listitem></varlistentry>
 </variablelist></sect3>
 <sect3 id="RCL.INSTALL.CONFIG.RECOLLCONF.PERFS">
 <title>Parameters affecting indexing performance and resource usage </title><variablelist>
@ -476,20 +550,24 @@ value (from this file) is now 50 MB, and should be ok in many cases.
 You can set it as low as 10 to conserve memory, but if you are looking
 for maximum speed, you may want to experiment with values between 20 and
 200. In my experience, values beyond this are always counterproductive. If
-you find otherwise, please drop me a note.</para></listitem></varlistentry>
+you find otherwise, please drop me a note.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.FILTERMAXSECONDS">
 <term><varname>filtermaxseconds</varname></term>
 <listitem><para>Maximum external filter execution time in
 seconds. Default 1200 (20mn). Set to 0 for no limit. This
 is mainly to avoid infinite loops in postscript files
-(loop.ps)</para></listitem></varlistentry>
+(loop.ps)
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.FILTERMAXMBYTES">
 <term><varname>filtermaxmbytes</varname></term>
 <listitem><para>Maximum virtual memory space for filter processes
-(setrlimit(RLIMIT_AS)), in megabytes. Note that this
-includes any mapped libs (there is no reliable Linux way to limit the
-data space only), so we need to be a bit generous here. Anything over
-2000 will be ignored on 32 bits machines.</para></listitem></varlistentry>
+(setrlimit(RLIMIT_AS)), in megabytes. Note that this includes any mapped libs (there is no reliable
+Linux way to limit the data space only), so we need to be a bit generous
+here. Anything over 2000 will be ignored on 32 bits machines. The
+previous default value of 2000 would prevent java pdftk from working when
+executed from Python rclpdf.py.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.THRQSIZES">
 <term><varname>thrQSizes</varname></term>
 <listitem><para>Stage input queues configuration. There are three
@ -501,7 +579,8 @@ next stage. In practise, deep queues have not been shown to increase
 performance. Default: a value of 0 for the first queue tells Recoll to
 perform autoconfiguration based on the detected number of CPUs (no need
 for the two other values in this case).  Use thrQSizes = -1 -1 -1 to
-disable multithreading entirely.</para></listitem></varlistentry>
+disable multithreading entirely.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.THRTCOUNTS">
 <term><varname>thrTCounts</varname></term>
 <listitem><para>Number of threads used for each indexing stage. The
@ -511,7 +590,8 @@ in thrQSizes: if the first queue depth is 0, all counts are ignored
 (autoconfigured); if a value of -1 is used for a queue depth, the
 corresponding thread count is ignored. It makes no sense to use a value
 other than 1 for the last stage because updating the Xapian index is
-necessarily single-threaded (and protected by a mutex).</para></listitem></varlistentry>
+necessarily single-threaded (and protected by a mutex).
+</para></listitem></varlistentry>
 </variablelist></sect3>
 <sect3 id="RCL.INSTALL.CONFIG.RECOLLCONF.MISC">
 <title>Miscellaneous parameters </title><variablelist>
@ -519,7 +599,8 @@ necessarily single-threaded (and protected by a mutex).</para></listitem></varli
 <term><varname>loglevel</varname></term>
 <listitem><para>Log file verbosity 1-6. A value of 2 will print
 only errors and warnings. 3 will print information like document updates,
-4 is quite verbose and 6 very verbose.</para></listitem></varlistentry>
+4 is quite verbose and 6 very verbose.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.LOGFILENAME">
 <term><varname>logfilename</varname></term>
 <listitem><para>Log file destination. Use 'stderr' (default) to write to the
@ -530,16 +611,25 @@ console. </para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXLOGFILENAME">
 <term><varname>idxlogfilename</varname></term>
 <listitem><para>Override logfilename for the indexer. </para></listitem></varlistentry>
+<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.HELPERLOGFILENAME">
+<term><varname>helperlogfilename</varname></term>
+<listitem><para>Destination file for external helpers standard error output. The external program error output is left alone by default,
+e.g. going to the terminal when the recoll[index] program is executed
+from the command line. Use /dev/null or a file inside a non-existent
+directory to completely suppress the output.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.DAEMLOGLEVEL">
 <term><varname>daemloglevel</varname></term>
 <listitem><para>Override loglevel for the indexer in real time
 mode. The default is to use the idx... values if set, else
-the log... values.</para></listitem></varlistentry>
+the log... values.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.DAEMLOGFILENAME">
 <term><varname>daemlogfilename</varname></term>
 <listitem><para>Override logfilename for the indexer in real time
 mode. The default is to use the idx... values if set, else
-the log... values.</para></listitem></varlistentry>
+the log... values.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PYLOGLEVEL">
 <term><varname>pyloglevel</varname></term>
 <listitem><para>Override loglevel for the python module. </para></listitem></varlistentry>
@ -552,7 +642,8 @@ the log... values.</para></listitem></varlistentry>
 configuration directory inside the directory tree makes it possible to
 provide automatic query time path translations once the data set has
 moved (for example, because it has been mounted on another
-location).</para></listitem></varlistentry>
+location).
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.CURIDXCONFDIR">
 <term><varname>curidxconfdir</varname></term>
 <listitem><para>Current location of the configuration directory. Complement orgidxconfdir for movable datasets. This should be used
@ -564,7 +655,8 @@ example if a dataset originally indexed as '/home/me/mydata/config' has
 been mounted to '/media/me/mydata', and the GUI is running from a copied
 configuration, orgidxconfdir would be '/home/me/mydata/config', and
 curidxconfdir (as set in the copied configuration) would be
-'/media/me/mydata/config'.</para></listitem></varlistentry>
+'/media/me/mydata/config'.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXRUNDIR">
 <term><varname>idxrundir</varname></term>
 <listitem><para>Indexing process current directory. The input
@ -573,17 +665,22 @@ makes sense to have recollindex chdir to some temporary directory. If the
 value is empty, the current directory is not changed. If the
 value is (literal) tmp, we use the temporary directory as set by the
 environment (RECOLL_TMPDIR else TMPDIR else /tmp). If the value is an
-absolute path to a directory, we go there.</para></listitem></varlistentry>
+absolute path to a directory, we go there.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.CHECKNEEDRETRYINDEXSCRIPT">
 <term><varname>checkneedretryindexscript</varname></term>
 <listitem><para>Script used to heuristically check if we need to retry indexing
 files which previously failed.  The default script checks
 the modified dates on /usr/bin and /usr/local/bin. A relative path will
 be looked up in the filters dirs, then in the path. Use an absolute path
-to do otherwise.</para></listitem></varlistentry>
+to do otherwise.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.RECOLLHELPERPATH">
 <term><varname>recollhelperpath</varname></term>
-<listitem><para>Additional places to search for helper executables. This is only used on Windows for now.</para></listitem></varlistentry>
+<listitem><para>Additional places to search for helper executables. This is used, e.g., on Windows by the Python code, and on Mac OS by the bundled recoll.app
+(because I could find no reliable way to tell launchd to set the PATH). The example below is for
+Windows. Use ':' as the entry separator on Mac and Unix-like systems; ';' is for Windows only.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXABSMLEN">
 <term><varname>idxabsmlen</varname></term>
 <listitem><para>Length of abstracts we store while indexing. Recoll stores an abstract for each indexed file.
@ -595,57 +692,72 @@ defines the size of the stored abstract. The default value is 250
 bytes. The search interface gives you the choice to display this stored
 text or a synthetic abstract built by extracting text around the search
 terms. If you always prefer the synthetic abstract, you can reduce this
-value and save a little space.</para></listitem></varlistentry>
+value and save a little space.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXMETASTOREDLEN">
 <term><varname>idxmetastoredlen</varname></term>
 <listitem><para>Truncation length of stored metadata fields. This
 does not affect indexing (the whole field is processed anyway), just the
 amount of data stored in the index for the purpose of displaying fields
 inside result lists or previews. The default value is 150 bytes which
-may be too low if you have custom fields.</para></listitem></varlistentry>
+may be too low if you have custom fields.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXTEXTTRUNCATELEN">
 <term><varname>idxtexttruncatelen</varname></term>
 <listitem><para>Truncation length for all document texts. Only index
 the beginning of documents. This is not recommended except if you are
 sure that the interesting keywords are at the top and have severe disk
-space issues.</para></listitem></varlistentry>
+space issues.
+</para></listitem></varlistentry>
+<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXSYNONYMS">
+<term><varname>idxsynonyms</varname></term>
+<listitem><para>Name of the index-time synonyms file. This is used for indexing multiword synonyms as single terms,
+which in turn is only useful if you want to perform proximity searches
+with such terms.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ASPELLLANGUAGE">
 <term><varname>aspellLanguage</varname></term>
 <listitem><para>Language definitions to use when creating the aspell
 dictionary. The value must match a set of aspell language
 definition files. You can type "aspell dicts" to see a list The default
 if this is not set is to use the NLS environment to guess the value. The
-values are the 2-letter language codes (e.g. 'en', 'fr'...)</para></listitem></varlistentry>
+values are the 2-letter language codes (e.g. 'en', 'fr'...)
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ASPELLADDCREATEPARAM">
 <term><varname>aspellAddCreateParam</varname></term>
 <listitem><para>Additional option and parameter to aspell dictionary creation
 command. Some aspell packages may need an additional option
 (e.g. on Debian Jessie: --local-data-dir=/usr/lib/aspell). See Debian bug
-772415.</para></listitem></varlistentry>
+772415.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ASPELLKEEPSTDERR">
 <term><varname>aspellKeepStderr</varname></term>
 <listitem><para>Set this to have a look at aspell dictionary creation
 errors. There are always many, so this is mostly for
-debugging.</para></listitem></varlistentry>
+debugging.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.NOASPELL">
 <term><varname>noaspell</varname></term>
 <listitem><para>Disable aspell use. The aspell dictionary generation
 takes time, and some combinations of aspell version, language, and local
 terms, result in aspell crashing, so it sometimes makes sense to just
-disable the thing.</para></listitem></varlistentry>
+disable the thing.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MONAUXINTERVAL">
 <term><varname>monauxinterval</varname></term>
 <listitem><para>Auxiliary database update interval. The real time
 indexer only updates the auxiliary databases (stemdb, aspell)
 periodically, because it would be too costly to do it for every document
-change. The default period is one hour.</para></listitem></varlistentry>
+change. The default period is one hour.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MONIXINTERVAL">
 <term><varname>monixinterval</varname></term>
 <listitem><para>Minimum interval (seconds) between processings of the indexing
 queue. The real time indexer does not process each event
 when it comes in, but lets the queue accumulate, to diminish overhead and
 to aggregate multiple events affecting the same file. Default 30
-S.</para></listitem></varlistentry>
+S.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MONDELAYPATTERNS">
 <term><varname>mondelaypatterns</varname></term>
 <listitem><para>Timing parameters for the real time indexing. Definitions for files which get a longer delay before reindexing
@ -654,21 +766,25 @@ reindexed once in a while. A list of wildcardPattern:seconds pairs. The
 patterns are matched with fnmatch(pattern, path, 0) You can quote entries
 containing white space with double quotes (quote the whole entry, not the
 pattern). The default is empty.
-Example: mondelaypatterns = *.log:20 "*with spaces.*:30"</para></listitem></varlistentry>
+Example: mondelaypatterns = *.log:20 "*with spaces.*:30"
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXNICEPRIO">
 <term><varname>idxniceprio</varname></term>
 <listitem><para>"nice" process priority for the indexing processes. Default: 19
-(lowest) Appeared with 1.26.5. Prior versions were fixed at 19.</para></listitem></varlistentry>
+(lowest) Appeared with 1.26.5. Prior versions were fixed at 19.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MONIONICECLASS">
 <term><varname>monioniceclass</varname></term>
 <listitem><para>ionice class for the indexing process. Despite the misleading name, and on platforms where this is
 supported, this affects all indexing processes,
 not only the real time/monitoring ones. The default value is 3 (use
-lowest "Idle" priority).</para></listitem></varlistentry>
+lowest "Idle" priority).
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MONIONICECLASSDATA">
 <term><varname>monioniceclassdata</varname></term>
 <listitem><para>ionice class level parameter if the class supports it. The default is empty, as the default "Idle" class has no
-levels.</para></listitem></varlistentry>
+levels.
+</para></listitem></varlistentry>
 </variablelist></sect3>
 <sect3 id="RCL.INSTALL.CONFIG.RECOLLCONF.QUERY">
 <title>Query-time parameters (no impact on the index) </title><variablelist>
@ -677,7 +793,8 @@ levels.</para></listitem></varlistentry>
 <listitem><para>auto-trigger diacritics sensitivity (raw index only). IF the index is not stripped, decide if we automatically trigger
 diacritics sensitivity if the search term has accented characters (not in
 unac_except_trans). Else you need to use the query language and the "D"
-modifier to specify diacritics sensitivity. Default is no.</para></listitem></varlistentry>
+modifier to specify diacritics sensitivity. Default is no.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.AUTOCASESENS">
 <term><varname>autocasesens</varname></term>
 <listitem><para>auto-trigger case sensitivity (raw index only). IF
@ -685,40 +802,46 @@ the index is not stripped (see indexStripChars), decide if we
 automatically trigger character case sensitivity if the search term has
 upper-case characters in any but the first position. Else you need to use
 the query language and the "C" modifier to specify character-case
-sensitivity. Default is yes.</para></listitem></varlistentry>
+sensitivity. Default is yes.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXTERMEXPAND">
 <term><varname>maxTermExpand</varname></term>
 <listitem><para>Maximum query expansion count
 for a single term (e.g.: when using wildcards). This only
 affects queries, not indexing. We used to not limit this at all (except
 for filenames where the limit was too low at 1000), but it is
-unreasonable with a big index. Default 10000.</para></listitem></varlistentry>
+unreasonable with a big index. Default 10000.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MAXXAPIANCLAUSES">
 <term><varname>maxXapianClauses</varname></term>
 <listitem><para>Maximum number of clauses
 we add to a single Xapian query. This only affects queries,
 not indexing. In some cases, the result of term expansion can be
 multiplicative, and we want to avoid eating all the memory. Default
-50000.</para></listitem></varlistentry>
+50000.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.SNIPPETMAXPOSWALK">
 <term><varname>snippetMaxPosWalk</varname></term>
 <listitem><para>Maximum number of positions we walk while populating a snippet for
 the result list. The default of 1,000,000 may be
 insufficient for very big documents, the consequence would be snippets
-with possibly meaning-altering missing words.</para></listitem></varlistentry>
+with possibly meaning-altering missing words.
+</para></listitem></varlistentry>
 </variablelist></sect3>
 <sect3 id="RCL.INSTALL.CONFIG.RECOLLCONF.PDF">
 <title>Parameters for the PDF input script </title><variablelist>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR">
 <term><varname>pdfocr</varname></term>
 <listitem><para>Attempt OCR of PDF files with no text content. This can be defined in subdirectories. The default is off because
-OCR is so very slow.</para></listitem></varlistentry>
+OCR is so very slow.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFATTACH">
 <term><varname>pdfattach</varname></term>
 <listitem><para>Enable PDF attachment extraction by executing pdftk (if
 available). This is
 normally disabled, because it does slow down PDF indexing a bit even if
-not one attachment is ever found.</para></listitem></varlistentry>
+not one attachment is ever found.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETA">
 <term><varname>pdfextrameta</varname></term>
 <listitem><para>Extract text from selected XMP metadata tags. This
@ -726,7 +849,8 @@ is a space-separated list of qualified XMP tag names. Each element can also
 include a translation to a Recoll field name, separated by a '|'
 character. If the second element is absent, the tag name is used as the
 Recoll field names. You will also need to add specifications to the
-"fields" file to direct processing of the extracted data.</para></listitem></varlistentry>
+"fields" file to direct processing of the extracted data.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETAFIX">
 <term><varname>pdfextrametafix</varname></term>
 <listitem><para>Define name of XMP field editing script. This
@ -735,7 +859,8 @@ values. The script should define a 'MetaFixer' class with a metafix()
 method which will be called with the qualified tag name and value of each
 selected field, for editing or erasing. A new instance is created for
 each document, so that the object can keep state for, e.g. eliminating
-duplicate values.</para></listitem></varlistentry>
+duplicate values.
+</para></listitem></varlistentry>
 </variablelist></sect3>
 <sect3 id="RCL.INSTALL.CONFIG.RECOLLCONF.OCR">
 <title>Parameters for OCR processing </title><variablelist>
@ -747,17 +872,20 @@ the input file. Modules for tesseract (tesseract) and ABBYY FineReader
 (abbyy) are present in the standard distribution. For compatibility with
 the previous version, if this is not defined at all, the default value is
 "tesseract". Use an explicit empty value if needed. A value of "abbyy
-tesseract" will try everything.</para></listitem></varlistentry>
+tesseract" will try everything.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.OCRCACHEDIR">
 <term><varname>ocrcachedir</varname></term>
 <listitem><para>Location for caching OCR data. The default if this is empty or undefined is to store the cached
-OCR data under $RECOLL_CONFDIR/ocrcache.</para></listitem></varlistentry>
+OCR data under $RECOLL_CONFDIR/ocrcache.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.TESSERACTLANG">
 <term><varname>tesseractlang</varname></term>
 <listitem><para>Language to assume for tesseract OCR. Important for improving the OCR accuracy. This can also be set
 through the contents of a file in
 the currently processed directory. See the rclocrtesseract.py
-script. Example values: eng, fra... See the tesseract documentation.</para></listitem></varlistentry>
+script. Example values: eng, fra... See the tesseract documentation.
+</para></listitem></varlistentry>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.TESSERACTCMD">
 <term><varname>tesseractcmd</varname></term>
 <listitem><para>Path for the tesseract command. Do not quote. This is mostly useful on Windows, or for specifying a non-default
@ -776,11 +904,19 @@ script. Typical values: English, French... See the ABBYY documentation.
 <listitem><para>Path for the abbyy command The ABBY directory is usually not in the path, so you should set this.
 </para></listitem></varlistentry>
 </variablelist></sect3>
+<sect3 id="RCL.INSTALL.CONFIG.RECOLLCONF.MISCHANDLERPARAMS">
+<title>Parameters for specific handlers </title><variablelist>
+<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ORGMODESUBDOCS">
+<term><varname>orgmodesubdocs</varname></term>
+<listitem><para>Index org-mode level 1 sections as separate sub-documents. This is the default. If set to false, org-mode files will be indexed as plain text.
+</para></listitem></varlistentry>
+</variablelist></sect3>
 <sect3 id="RCL.INSTALL.CONFIG.RECOLLCONF.SPECLOCATIONS">
 <title>Parameters set for specific locations </title><variablelist>
 <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.MHMBOXQUIRKS">
 <term><varname>mhmboxquirks</varname></term>
 <listitem><para>Enable thunderbird/mozilla-seamonkey mbox format quirks Set this for the directory where the email mbox files are
-stored.</para></listitem></varlistentry>
+stored.
+</para></listitem></varlistentry>
 </variablelist></sect3>
 </sect2>
--- a/src/doc/user/usermanual.html
+++ b/src/doc/user/usermanual.html
--- a/src/doc/user/usermanual.xml
+++ b/src/doc/user/usermanual.xml
--- a/src/filters/cmdtalk.py
+++ b/src/filters/cmdtalk.py
@ -20,8 +20,6 @@
 # All data is binary. This is important for Python3
 # All parameter names are converted to and processed as str/unicode

-from __future__ import print_function
-
 import sys
 import os
 import tempfile
@ -29,25 +27,13 @@ import shutil
 import getopt
 import traceback

-PY3 = sys.version > '3'
-
-if PY3:
-    def makebytes(data):
-        if data is None:
-            return b""
-        if isinstance(data, bytes):
-            return data
-        else:
-            return data.encode("UTF-8")
-else:
-    def makebytes(data):
-        if data is None:
-            return ""
-        if isinstance(data, unicode):
-            return data.encode("UTF-8")
-        else:
-            return data
-
+def makebytes(data):
+    if data is None:
+        return b""
+    if isinstance(data, bytes):
+        return data
+    else:
+        return data.encode("UTF-8")

 ############################################
 # CmdTalk implements the communication protocol with the master
@ -116,10 +102,7 @@ class CmdTalk(object):
    # followed by data. The param name is returned as str/unicode, the data
    # as bytes
    def readparam(self):
-        if PY3:
-            inf = self.infile.buffer
-        else:
-            inf = self.infile
+        inf = self.infile.buffer
        s = inf.readline()
        if s == b'':
            if self.exitfunc:
@ -143,7 +126,7 @@ class CmdTalk(object):
                      (paramsize, len(paramdata)), 1, 1)
        else:
            paramdata = b''
-        if PY3 and not self.nodecodeinput:
+        if not self.nodecodeinput:
            try:
                paramdata = paramdata.decode('utf-8')
            except Exception as ex:
@ -154,18 +137,11 @@ class CmdTalk(object):
        #          (paramname, paramsize, paramdata))
        return (paramname, paramdata)

-    if PY3:
-        def senditem(self, nm, data):
-            data = makebytes(data)
-            l = len(data)
-            self.outfile.buffer.write(makebytes("%s: %d\n" % (nm, l)))
-            self.breakwrite(self.outfile.buffer, data)
-    else:
-        def senditem(self, nm, data):
-            data = makebytes(data)
-            l = len(data)
-            self.outfile.write(makebytes("%s: %d\n" % (nm, l)))
-            self.breakwrite(self.outfile, data)
+    def senditem(self, nm, data):
+        data = makebytes(data)
+        l = len(data)
+        self.outfile.buffer.write(makebytes("%s: %d\n" % (nm, l)))
+        self.breakwrite(self.outfile.buffer, data)
        
    # Send answer: document, ipath, possible eof.
    def answer(self, outfields):
@ -242,7 +218,7 @@ def main(proto, processor):
        params[args[2*i]] = args[2*i+1]
    res = processor.process(params)

-    ioout = sys.stdout.buffer if PY3 else sys.stdout
+    ioout = sys.stdout.buffer

    for nm,value in res.items():
        #self.log("Senditem: [%s] -> [%s]" % (nm, value))
--- a/src/filters/kosplitter.py
+++ b/src/filters/kosplitter.py
@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 #################################
 # Copyright (C) 2020 J.F.Dockes
 #   This program is free software; you can redistribute it and/or modify
--- a/src/filters/rcl7z.py
+++ b/src/filters/rcl7z.py
@ -3,7 +3,7 @@
 # 7-Zip file filter for Recoll

 # Thanks to Recoll user Martin Ziegler
-# This is a modified version of rclzip, with some help from rcltar
+# This is a modified version of rclzip.py, with some help from rcltar.py
 #
 # Normally using py7zr https://github.com/miurahr/py7zr
 #
--- a/src/filters/rclaudio.py
+++ b/src/filters/rclaudio.py
@ -238,7 +238,7 @@ class AudioTagExtractor(RclBaseHandler):
                if tagname.startswith('APIC:'):
                    #self.em.rclog("mp3 img: %s" % mutf[tagname].mime)
                    return 'jpg' if mutf[tagname].mime == 'image/jpeg' else 'png'
-        elif 'audio/x-flac' in mime:
+        elif 'audio/flac' in mime:
            if mutf.pictures:
                return 'jpg' if mutf.pictures[0].mime == 'image/jpeg' else 'png'
        elif 'audio/mp4' in mime:
@ -351,6 +351,11 @@ class AudioTagExtractor(RclBaseHandler):
        # Metadata tags. The names vary depending on the file type. We
        # just have a big translation dictionary for all
        for tag,val in mutf.items():
+            #print(f"TAG {tag} VAL {val}", file=sys.stderr)
+            # Mutagen sends out COMM==eng= with tag COMM::eng. We don't know what to do with the
+            # language (or possible other attributes), so get rid of it for now:
+            if tag.find("COMM::") == 0:
+                tag = "COMM"
            if tag.find('TXXX:') == 0:
                tag = tag[5:].upper()
            elif tag.find('TXX:') == 0:
--- a/src/filters/rclbasehandler.py
+++ b/src/filters/rclbasehandler.py
@ -18,8 +18,6 @@
 # Base for extractor classes. With some common generic implementations
 # for the boilerplate functions.

-from __future__ import print_function
-
 import os
 import sys
 import rclexecm
--- a/src/filters/rclcheckneedretry.sh
+++ b/src/filters/rclcheckneedretry.sh
@ -17,9 +17,15 @@
 # with retry set).
 #

+# If $HOME does not exist, there is nothing we can do (this happens, for example, when run as upmpdcli)
+if test ! -d "$HOME" ; then
+    exit 0
+fi
+
 # Bin dirs to be tested:
 bindirs="/usr/bin /usr/local/bin $HOME/bin /opt/*/bin"

+
 rfiledir=$HOME/.config/Recoll.org
 rfile=$rfiledir/needidxretrydate
 nrfile=$rfiledir/tneedidxretrydate
--- a/src/filters/rclchm.py
+++ b/src/filters/rclchm.py
--- a/src/filters/rcldia.py
+++ b/src/filters/rcldia.py
@ -1,12 +1,11 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-from __future__ import print_function

 # dia (http://live.gnome.org/Dia) file filter for recoll
 # stefan.friedel@iwr.uni-heidelberg.de 2012
 #
 # add the following to ~/.recoll/mimeconf into the [index] section:
-# application/x-dia-diagram = execm rcldia;mimetype=text/plain;charset=utf-8
+# application/x-dia-diagram = execm rcldia.py;mimetype=text/plain;charset=utf-8
 # and into the [icons] section:
 # application/x-dia-diagram = drawing
 # and finally under [categories]:
--- a/src/filters/rcldjvu.py
+++ b/src/filters/rcldjvu.py
@ -17,8 +17,6 @@

 # Recoll DJVU extractor

-from __future__ import print_function
-
 import os
 import sys
 import re
--- a/src/filters/rcldoc.py
+++ b/src/filters/rcldoc.py
@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-from __future__ import print_function

 import rclexecm
 import rclexec1
--- a/src/filters/rclepub.py
+++ b/src/filters/rclepub.py
@ -1,6 +1,5 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 """Extract Html content from an EPUB file (.epub)"""
-from __future__ import print_function

 rclepub_html_mtype = "text/html"

--- a/src/filters/rclepub1.py
+++ b/src/filters/rclepub1.py
@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 """Extract Html content from an EPUB file (.chm), concatenating all sections"""
-from __future__ import print_function

 import sys
 import os
--- a/src/filters/rclexec1.py
+++ b/src/filters/rclexec1.py
@ -26,8 +26,6 @@
 # this would be to slow. So this helps implementing a permanent script
 # to repeatedly execute single commands.

-from __future__ import print_function
-
 import subprocess
 import rclexecm
 from rclbasehandler import RclBaseHandler
--- a/src/filters/rclexecm.py
+++ b/src/filters/rclexecm.py
@ -20,8 +20,6 @@
 # All data is binary. This is important for Python3
 # All parameter names are converted to and processed as str/unicode

-from __future__ import print_function
-
 import sys
 import os
 import tempfile
@ -30,7 +28,6 @@ import getopt
 import rclconfig
 import cmdtalk

-PY3 = (sys.version > '3')
 _g_mswindows = (sys.platform == "win32")
 _g_execdir = os.path.dirname(sys.argv[0])

@ -62,12 +59,11 @@ def makebytes(data):
 # Possibly decode binary file name for use as subprocess argument,
 # depending on platform.
 def subprocfile(fn):
-    # On Windows PY3 the list2cmdline() method in subprocess assumes that
-    # all args are str, and we receive file names as UTF-8. So we need
-    # to convert.
-    # On Unix all list elements get converted to bytes in the C
-    # _posixsubprocess module, nothing to do.
-    if PY3 and _g_mswindows and type(fn) != type(''):
+    # On Windows Python 3 the list2cmdline() method in subprocess assumes that all args are str, and
+    # we receive file names as UTF-8. So we need to convert.
+    # On Unix all list elements get converted to bytes in the C _posixsubprocess module, nothing to
+    # do.
+    if _g_mswindows and type(fn) != type(''):
        return fn.decode('UTF-8')
    else:
        return fn
@ -265,19 +261,30 @@ def execPythonScript(icmd):
    
 # Temp dir helper
 class SafeTmpDir:
-    def __init__(self, em):
+    def __init__(self, tag, em=None):
+        self.tag = tag
        self.em = em
-        self.toptmp = ""
-        self.tmpdir = ""
+        self.toptmp = None
+        self.tmpdir = None

    def __del__(self):
-        try:
-            if self.toptmp:
-                shutil.rmtree(self.tmpdir, True)
+        if self.toptmp:
+            try:
+                if self.tmpdir:
+                    shutil.rmtree(self.tmpdir, True)
                os.rmdir(self.toptmp)
-        except Exception as err:
-            self.em.rclog("delete dir failed for " + self.toptmp)
+            except Exception as err:
+                if self.em:
+                    self.em.rclog("delete dir failed for " + self.toptmp)

+    def vacuumdir(self):
+        if self.tmpdir:
+            for fn in os.listdir(self.tmpdir):
+                path = os.path.join(self.tmpdir, fn)
+                if os.path.isfile(path):
+                    os.unlink(path)
+        return True
+    
    def getpath(self):
        if not self.tmpdir:
            envrcltmp = os.getenv('RECOLL_TMPDIR')
@ -286,7 +293,7 @@ class SafeTmpDir:
            else:
                self.toptmp = tempfile.mkdtemp(prefix='rcltmp')

-            self.tmpdir = os.path.join(self.toptmp, 'rclsofftmp')
+            self.tmpdir = os.path.join(self.toptmp, self.tag)
            os.makedirs(self.tmpdir)

        return self.tmpdir
@ -305,8 +312,7 @@ def main(proto, extract):
    # Not running the main loop: either acting as single filter (when called
    # from other filter for example), or debugging
    def usage():
-        print("Usage: rclexecm.py [-d] [-s] [-i ipath] <filename>",
-              file=sys.stderr)
+        print("Usage: rclexecm.py [-d] [-f] [-h] [-i ipath] [-s] <filename>", file=sys.stderr)
        print("       rclexecm.py -w <prog>", file=sys.stderr)
        sys.exit(1)
        
@ -361,7 +367,7 @@ def main(proto, extract):

    params = {'filename' : makebytes(path)}

-    # Some filters (e.g. rclaudio) need/get a MIME type from the indexer.
+    # Some filters (e.g. rclaudio.py) need/get a MIME type from the indexer.
    # We make a half-assed attempt to emulate:
    mimetype = _g_config.mimeType(path)
    if not mimetype and not _g_mswindows:
@ -373,10 +379,7 @@ def main(proto, extract):
        print("Open error", file=sys.stderr)
        sys.exit(1)

-    if PY3:
-        ioout = sys.stdout.buffer
-    else:
-        ioout = sys.stdout
+    ioout = sys.stdout.buffer
    if ipath != b"" or actAsSingle:
        params['ipath'] = ipath
        ok, data, ipath, eof = extract.getipath(params)
--- a/src/filters/rclfb2.py
+++ b/src/filters/rclfb2.py
@ -16,8 +16,6 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ######################################

-from __future__ import print_function
-
 import sys
 import rclexecm
 import rclxslt
--- a/src/filters/rclgenxslt.py
+++ b/src/filters/rclgenxslt.py
@ -18,8 +18,6 @@

 # Base class for simple (one stylesheet) xslt-based handlers

-from __future__ import print_function
-
 import sys
 import rclxslt
 import gzip
--- a/src/filters/rclhwp.py
+++ b/src/filters/rclhwp.py
@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 # Copyright (C) 2020 J.F.Dockes
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
--- a/src/filters/rclics.py
+++ b/src/filters/rclics.py
@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-from __future__ import print_function

 # Read an ICS file, break it into "documents" which are events, todos,
 # or journal entries, and interface with recoll execm
--- a/src/filters/rclimg.py
+++ b/src/filters/rclimg.py
@ -6,7 +6,6 @@
 #
 # Uses pyexiv2. Also tried Pillow, found it useless for tags.
 #
-from __future__ import print_function

 import sys
 import os
--- a/src/filters/rclinfo.py
+++ b/src/filters/rclinfo.py
@ -3,8 +3,6 @@
 # Read a file in GNU info format and output its nodes as subdocs,
 # interfacing with recoll execm

-from __future__ import print_function
-
 import rclexecm
 import sys
 import os
@ -141,7 +139,7 @@ class InfoSimpleSplitter:
                        if name == b'File':
                            infofile = value
                except Exception as err:
-                    print("rclinfo: bad line in %s: [%s] %s\n" % \
+                    print("rclinfo.py: bad line in %s: [%s] %s\n" % \
                          (infofile, line, err), file = sys.stderr)
                    nodename = prevnodename
                    node += line
--- a/src/filters/rclipynb.py
+++ b/src/filters/rclipynb.py
@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# Copyright (C) 2021 J.F.Dockes
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+# Recoll handler for iPython / Jupyter notebook files.
+
+import os
+import sys
+import json
+
+import rclexecm
+from rclbasehandler import RclBaseHandler
+
+class IPYNBextractor(RclBaseHandler):
+
+    def __init__(self, em):
+        super(IPYNBextractor, self).__init__(em)
+
+    def html_text(self, fn):
+        text = open(fn, 'rb').read()
+        data = json.loads(text)
+        mdtext = ""
+        if "worksheets" in data:
+            cells = data["worksheets"][0]["cells"]
+        else:
+            cells = data["cells"]
+        for cell in cells:
+            if cell["cell_type"] == "markdown":
+                mdtext += "\n"
+                for line in cell["source"]:
+                    mdtext += "# " + line + "\n"
+            elif cell["cell_type"] == "code":
+                mdtext += "\n\n"
+                key = "source" if "source" in cell else "input"
+                for line in cell[key]:
+                    mdtext += line
+                mdtext += "\n"
+        #print("%s"%mdtext, file=sys.stderr)
+        self.outputmimetype = 'text/plain'
+        return mdtext
+
+
+# Main program: create protocol handler and extractor and run them
+proto = rclexecm.RclExecM()
+extract = IPYNBextractor(proto)
+rclexecm.main(proto, extract)
--- a/src/filters/rclkar.py
+++ b/src/filters/rclkar.py
@ -1,8 +1,6 @@
 #!/usr/bin/env python3

 # Read a .kar midi karaoke file and translate to recoll indexable format
-# This does not work with Python3 yet because python:midi doesn't 
-from __future__ import print_function

 import rclexecm
 import sys
@ -46,11 +44,7 @@ htmltemplate = '''

 nlbytes = b'\n'
 bsbytes = b'\\'
-PY3 = sys.version > '3'
-if PY3:
-    nullchar = 0
-else:
-    nullchar = chr(0)
+nullchar = 0
    
 class KarTextExtractor(RclBaseHandler):
    # Afaik, the only charset encodings with null bytes are variations on
--- a/src/filters/rcllatinclass.py
+++ b/src/filters/rcllatinclass.py
@ -13,12 +13,7 @@ epsilon with dasia (in unicode but not iso). Can this be replaced by either epsi
 with acute accent ?
 """

-from __future__ import print_function
-
 import sys
-PY3 = sys.version > '3'
-if not PY3:
-    import string
 import glob
 import os
 import os.path
@ -38,10 +33,7 @@ class European8859TextClassifier:
        # Table to translate from punctuation to spaces
        self.punct = b'''0123456789<>/*?[].@+-,#_$%&={};.,:!"''' + b"'\n\r"
        spaces = len(self.punct) * b' '
-        if PY3:
-            self.spacetable = bytes.maketrans(self.punct, spaces)
-        else:
-            self.spacetable = string.maketrans(self.punct, spaces)
+        self.spacetable = bytes.maketrans(self.punct, spaces)

    def readlanguages(self, langzip):
        """Extract the stop words lists from the zip file.
--- a/src/filters/rclmidi.py
+++ b/src/filters/rclmidi.py
@ -23,24 +23,15 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 # 
-from __future__ import print_function

 import sys
 from struct import unpack, pack
 import six

-PY3 = sys.version > '3'
-
-if PY3:
-    def next_byte_as_int(data):
-        return next(data)
-    def next_byte_as_char(data):
-        return bytes([next(data)])
-else:
-    def next_byte_as_int(data):
-        return ord(data.next())
-    def next_byte_as_char(data):
-        return next(data)
+def next_byte_as_int(data):
+    return next(data)
+def next_byte_as_char(data):
+    return bytes([next(data)])

 ##
 ## Constants
@ -270,12 +261,8 @@ class NoteEvent(Event):
                                self.velocity)

    def decode_data(self):
-        if PY3:
-            self.pitch = self.data[0]
-            self.velocity = self.data[1]
-        else:
-            self.pitch = ord(self.data[0])
-            self.velocity = ord(self.data[1])
+        self.pitch = self.data[0]
+        self.velocity = self.data[1]


 class NoteOnEvent(NoteEvent):
@ -309,12 +296,8 @@ class ControlChangeEvent(Event):
                                hex(ord(self.data[1])))

    def decode_data(self):
-        if PY3:
-            self.control = self.data[0]
-            self.value = self.data[1]
-        else:
-            self.control = ord(self.data[0])
-            self.value = ord(self.data[1])
+        self.control = self.data[0]
+        self.value = self.data[1]


 class ProgramChangeEvent(Event):
@ -328,10 +311,7 @@ class ProgramChangeEvent(Event):
                                hex(ord(self.data[0])))

    def decode_data(self):
-        if PY3:
-            self.value = self.data[0]
-        else:
-            self.value = ord(self.data[0])
+        self.value = self.data[0]


 class ChannelAfterTouchEvent(Event):
@ -356,12 +336,8 @@ class PitchWheelEvent(Event):
                                hex(ord(self.data[1])))

    def decode_data(self):
-        if PY3:
-            first = self.data[0]
-            second = self.data[1]
-        else:
-            first = ord(self.data[0]) 
-            second = ord(self.data[1])
+        first = self.data[0]
+        second = self.data[1]
        self.value = ((second << 7) | first) - 0x2000


@ -461,10 +437,7 @@ class PortEvent(MetaEvent):

    def decode_data(self):
        assert(len(self.data) == 1)
-        if PY3:
-            self.port = self.data[0]
-        else:
-            self.port = ord(self.data[0])
+        self.port = self.data[0]

 class TrackLoopEvent(MetaEvent):
    name = 'Track Loop'
@ -498,13 +471,7 @@ class SetTempoEvent(MetaEvent):

    def decode_data(self):
        assert(len(self.data) == 3)
-        if PY3:
-            self.mpqn = (self.data[0] << 16) + (self.data[1] << 8) \
-                        + self.data[2]
-        else:
-            self.mpqn = (ord(self.data[0]) << 16) + (ord(self.data[1]) << 8) \
-                        + ord(self.data[2])
-
+        self.mpqn = (self.data[0] << 16) + (self.data[1] << 8) + self.data[2]
        self.tempo = float(6e7) / self.mpqn


@ -523,22 +490,13 @@ class TimeSignatureEvent(MetaEvent):
                            (super(TimeSignatureEvent, self).__str__(),
                                self.numerator, self.denominator,
                                self.metronome, self.thirtyseconds)
-    if PY3:
-        def decode_data(self):
-            assert(len(self.data) == 4)
-            self.numerator = self.data[0]
-            # Weird: the denominator is two to the power of the data variable
-            self.denominator = 2 ** self.data[1]
-            self.metronome = self.data[2]
-            self.thirtyseconds = self.data[3]
-    else:
-        def decode_data(self):
-            assert(len(self.data) == 4)
-            self.numerator = ord(self.data[0])
-            # Weird: the denominator is two to the power of the data variable
-            self.denominator = 2 ** ord(self.data[1])
-            self.metronome = ord(self.data[2])
-            self.thirtyseconds = ord(self.data[3])
+    def decode_data(self):
+        assert(len(self.data) == 4)
+        self.numerator = self.data[0]
+        # Weird: the denominator is two to the power of the data variable
+        self.denominator = 2 ** self.data[1]
+        self.metronome = self.data[2]
+        self.thirtyseconds = self.data[3]


 class KeySignatureEvent(MetaEvent):
--- a/src/filters/rclocr.py
+++ b/src/filters/rclocr.py
@ -26,6 +26,8 @@

 import os
 import sys
+import atexit
+import signal
 import importlib.util

 import rclconfig
@ -33,7 +35,27 @@ import rclocrcache
 import rclexecm

 def _deb(s):
-    rclexecm.logmsg(s)
+    rclexecm.logmsg("rclocr: %s" % s)
+
+ocrcleanupmodule = None
+@atexit.register
+def finalcleanup():
+    if ocrcleanupmodule:
+        ocrcleanupmodule.cleanocr()
+
+def signal_handler(sig, frame):
+    sys.exit(1)
+
+# Not all of these signals necessarily exist on all systems, hence the try/except around each call
+try: signal.signal(signal.SIGHUP, signal_handler)
+except: pass
+try: signal.signal(signal.SIGINT, signal_handler)
+except: pass
+try: signal.signal(signal.SIGQUIT, signal_handler)
+except: pass
+try: signal.signal(signal.SIGTERM, signal_handler)
+except: pass
+
    
 def Usage():
    _deb("Usage: rclocr.py <imagefilename>")
@ -72,7 +94,7 @@ if incache:
    try:
        breakwrite(sys.stdout.buffer, data)
    except Exception as e:
-        _deb("RCLOCR error writing: %s" % e)
+        _deb("error writing: %s" % e)
        sys.exit(1)
    sys.exit(0)
    
@ -112,6 +134,7 @@ if not ok:

 # The OCR module will retrieve its specific parameters from the
 # configuration
+ocrcleanupmodule = ocr
 status, data = ocr.runocr(config, path)

 if not status:
--- a/src/filters/rclocrabbyy.py
+++ b/src/filters/rclocrabbyy.py
@ -42,6 +42,9 @@ abbyocrdir = ""
 def _deb(s):
    rclexecm.logmsg(s)

+def cleanocr():
+    pass
+
 # Return true if abbyy appears to be available
 def ocrpossible(config, path):
    global abbyyocrcmd
--- a/src/filters/rclocrcache.py
+++ b/src/filters/rclocrcache.py
@ -22,37 +22,63 @@
 # OCR is extremely slow, caching the results is necessary.
 #
 # The cache stores 2 kinds of objects:
-# - Path files are named from the hash of the image file path and
-#   contain the image data hash, the modification time and size of the
-#   image file at the time the OCR'd data was stored in the cache, and
-#   the image path itself (the last is for purging only).
-# - Data files are named with the hash of the image data and contain
-#   the zlib-compressed OCR'd data.
+# - Path files are named from the hash of the image file path and contain the
+#   image data hash, the modification time and size of the image file at the
+#   time the OCR'd data was stored in the cache, and the image path itself (the
+#   last is for purging only).
+# - Data files are named with the hash of the image data and contain the
+#   zlib-compressed OCR'd data.
+# - The cache Path and Data files are stored under top subdirectories: objects/
+#   and paths/.
 #
 # When retrieving data from the cache:
-#  - We first use the image file size and modification time: if an
-#    entry exists for the imagepath/mtime/size triplet, and is up to
-#    date, the corresponding data is obtained from the data file and
-#    returned.
-#  - Else we then use the image data: if an entry exists for the
-#    computed hashed value of the data, it is returned. This allows
-#    moving files around without needing to run OCR again, but of
-#    course, it is more expensive than the first step
+#  - We first use the image file size and modification time: if an entry exists
+#    for the imagepath/mtime/size triplet, and is up to date, the corresponding
+#    data is obtained from the data file and returned.
+#  - Else we then use the image data: if an entry exists for the computed hashed
+#    value of the data, it is returned. This allows moving files around without
+#    needing to run OCR again, but of course, it is more expensive than the
+#    first step
 #
-#  If we need to use the second step, as a side effect, a path file is
-#  created or updated so that the data will be found with the first
-#  step next time around.
+# In both cases, the key (image path or image data) is hashed with sha1, and the
+# first two characters of the hash are used as a top level directory, the rest as
+# a file name. E.g. for pd,pf = self._hashpath(path), the file is stored as pd/pf
 #
-# Purging the cache of obsolete data.
+# If we need to use the second step, as a side effect, a path file is created or
+# updated so that the data will be found with the first step next time around.
 #
-#  - The cache path and data files are stored under 2 different
-#    directories (objects, paths) to make purging easier.
-#  - Purging the paths tree just involves walking it, reading the
-#    files, and checking the existence of the recorded paths.
-#  - There is no easy way to purge the data tree. The only possibility
-#    is to input a list of possible source files (e.g. result of a
-#    find in the image files area), and compute all the hashes. Data
-#    files which do not match one of the hashes are deleted.
+# When processing embedded documents like email attachments, recoll uses
+# temporary copies in TMPDIR (which defaults to /tmp) or RECOLL_TMPDIR. Of
+# course the paths for the temporary files change when re-processing a given
+# document. We do not store the Path file for data stored in TMPDIR or
+# RECOLL_TMPDIR, because doing so would cause an indefinite accumulation of
+# unusable Path files. This means that access to the OCR data for these
+# documents always causes the computation of the data hash, and is slower. With
+# recent Recoll versions which cache the text content in the index, this only
+# occurs when reindexing (with older versions, this could also occur for
+# Preview).
+#
+# Purging the cache of obsolete data:
+#
+# This can be done by running this file as a top level script with a --purge
+# option (possibly completed by a --purgedata option but see below)
+#  - Purging the paths tree just involves walking it, reading the files, and
+#    checking the existence of the recorded paths. Path files for non-existent
+#    files are deleted.
+#  - Purging the data tree: we make a list of all Data files referenced by at
+#    least one Path file, then walk the data tree, deleting unreferenced
+#    files. This means that Data files from temporary document copies (see
+#    above) will be deleted, which is quite unsatisfying. This would be
+#    difficult to change:
+#    - There is no way to detect the affected files because the Data files store
+#      no origin information
+#    - Even if we wanted to store an indication that the data file comes from a
+#      temporary document, we'd have no way to access the original document
+#      because the full ipath is not available. Changing this would be close to
+#      impossible because internfile...
+# In consequence, the --purgedata option must be explicitly added for a data
+# purge to be performed. Only set it if re-OCRing all embedded documents is reasonable.
+

 import sys
 import os
@ -61,11 +87,19 @@ import urllib.parse
 import zlib
 import glob

-import rclexecm
+from rclexecm import logmsg as _deb
+
+def _catslash(p):
+    if p and p[-1] != "/":
+        p += "/"
+    return p
+
+
+_tmpdir = os.environ["TMPDIR"] if "TMPDIR" in os.environ else "/tmp"
+_tmpdir = _catslash(_tmpdir)
+_recoll_tmpdir = os.environ["RECOLL_TMPDIR"] if "RECOLL_TMPDIR" in os.environ else None
+_recoll_tmpdir = _catslash(_recoll_tmpdir)

-def _deb(s):
-    rclexecm.logmsg(s)
-    

 class OCRCache(object):
    def __init__(self, conf):
@ -90,7 +124,7 @@ class OCRCache(object):

    # Compute sha1 of path data contents, as two parts of 2 and 38 chars
    def _hashdata(self, path):
-        #_deb("Hashing DATA")
+        # _deb("Hashing DATA")
        m = hashlib.sha1()
        with open(path, "rb") as f:
            while True:
@ -101,39 +135,39 @@ class OCRCache(object):
                h = m.hexdigest()
        return h[0:2], h[2:]

-    
    def _readpathfile(self, ppf):
        '''Read path file and return values. We do not decode the image path
        as this is only used for purging'''
        with open(ppf, 'r') as f:
            line = f.read()
-        dd,df,tm,sz,pth = line.split()
+        dd, df, tm, sz, pth = line.split()
        tm = int(tm)
        sz = int(sz)
-        return dd,df,tm,sz,pth
-        
+        return dd, df, tm, sz, pth
+
    # Try to read the stored attributes for a given path: data hash,
    # modification time and size. If this fails, the path itself is
    # not cached (but the data still might be, maybe the file was moved)
    def _cachedpathattrs(self, path):
-        pd,pf = self._hashpath(path)
+        pd, pf = self._hashpath(path)
        pathfilepath = os.path.join(self.pathdir, pd, pf)
        if not os.path.exists(pathfilepath):
            return False, None, None, None, None
        try:
            dd, df, tm, sz, pth = self._readpathfile(pathfilepath)
            return True, dd, df, tm, sz
-        except:
+        except Exception as ex:
+            _deb(f"Error while trying to access pathfile {pathfilepath}: {ex}")
            return False, None, None, None, None

    # Compute the path hash, and get the mtime and size for given
    # path, for updating the cache path file
    def _newpathattrs(self, path):
-        pd,pf = self._hashpath(path)
+        pd, pf = self._hashpath(path)
        tm = int(os.path.getmtime(path))
        sz = int(os.path.getsize(path))
        return pd, pf, tm, sz
-    
+
    # Check if the cache appears up to date for a given path, only
    # using the modification time and size. Return the data file path
    # elements if we get a hit.
@ -142,31 +176,25 @@ class OCRCache(object):
        if not ret:
            return False, None, None
        pd, pf, ntm, nsz = self._newpathattrs(path)
-        #_deb(" tm %d  sz %d" % (ntm, nsz))
-        #_deb("otm %d osz %d" % (otm, osz))
+        # _deb(" tm %d  sz %d" % (ntm, nsz))
+        # _deb("otm %d osz %d" % (otm, osz))
        if otm != ntm or osz != nsz:
            return False, None, None
        return True, od, of

-    # Check if cache appears up to date for path (no data check),
-    # return True/False
-    def pathincache(self, path):
-        ret, dd, df = self._pathincache(path)
-        return ret
-    
    # Compute the data file name for path. Expensive: we compute the data hash.
    # Return both the data file path and path elements (for storage in path file)
    def _datafilename(self, path):
        d, f = self._hashdata(path)
        return os.path.join(self.objdir, d, f), d, f

-    # Check if the data for path is in cache: expensive, needs to
-    # compute the hash for the path's data contents. Returns True/False
-    def dataincache(self, path):
-        return os.path.exists(self._datafilename(path)[0])
-
    # Create path file with given elements.
    def _updatepathfile(self, pd, pf, dd, df, tm, sz, path):
+        global _tmpdir, _recoll_tmpdir
+        if (_tmpdir and path.startswith(_tmpdir)) or \
+           (_recoll_tmpdir and path.startswith(_recoll_tmpdir)):
+            _deb(f"ocrcache: not storing path data for temporary file {path}")
+            return
        dir = os.path.join(self.pathdir, pd)
        if not os.path.exists(dir):
            os.makedirs(dir)
@ -178,7 +206,7 @@ class OCRCache(object):
    # Store data for path. Only rewrite an existing data file if told
    # to do so: this is only useful if we are forcing an OCR re-run.
    def store(self, path, datatostore, force=False):
-        dd,df = self._hashdata(path)
+        dd, df = self._hashdata(path)
        pd, pf, tm, sz = self._newpathattrs(path)
        self._updatepathfile(pd, pf, dd, df, tm, sz, path)
        dir = os.path.join(self.objdir, dd)
@ -186,7 +214,7 @@ class OCRCache(object):
            os.makedirs(dir)
        dfile = os.path.join(dir, df)
        if force or not os.path.exists(dfile):
-            #_deb("Storing data")
+            # _deb("Storing data")
            cpressed = zlib.compress(datatostore)
            with open(dfile, "wb") as f:
                f.write(cpressed)
@ -203,11 +231,12 @@ class OCRCache(object):
            dfn, dd, df = self._datafilename(path)

        if not os.path.exists(dfn):
+            _deb(f"ocrcache: no existing OCR data file for {path}")
            return False, b""

        if not pincache:
-            # File has moved. create/Update path file for next time
-            _deb("ocrcache::get file %s was moved, updating path data" % path)
+            # File may have moved. Create/Update path file for next time
+            _deb(f"ocrcache::get: data ok but path file for {path} does not exist: creating it")
            pd, pf, tm, sz = self._newpathattrs(path)
            self._updatepathfile(pd, pf, dd, df, tm, sz, path)

@ -223,10 +252,10 @@ class OCRCache(object):
        ntm = int(os.path.getmtime(origpath))
        nsz = int(os.path.getsize(origpath))
        if ntm != otm or nsz != osz:
-            #_deb("Purgepaths otm %d ntm %d osz %d nsz %d"%(otm, ntm, osz, nsz))
+            # _deb("Purgepaths otm %d ntm %d osz %d nsz %d"%(otm, ntm, osz, nsz))
            return True
        return False
-    
+
    def purgepaths(self):
        '''Remove all stale pathfiles: source image does not exist or has
        been changed. Mostly useful for removed files, modified ones would be
@ -251,15 +280,15 @@ class OCRCache(object):
    def _pgdt_pathcb(self, f):
        '''Get a pathfile name, read it, and record datafile identifier
        (concatenate data file subdir and file name)'''
-        #_deb("_pgdt_pathcb: %s" % f)
+        # _deb("_pgdt_pathcb: %s" % f)
        dd, df, tm, sz, orgpath = self._readpathfile(f)
        self._pgdt_alldatafns.add(dd+df)

    def _pgdt_datacb(self, datafn):
        '''Get a datafile name and check that it is referenced by a previously
        seen pathfile'''
-        p1,fn = os.path.split(datafn)
-        p2,dn = os.path.split(p1)
+        p1, fn = os.path.split(datafn)
+        p2, dn = os.path.split(p1)
        tst = dn+fn
        if tst in self._pgdt_alldatafns:
            _deb("purgedata: ok         : %s" % datafn)
@ -267,7 +296,7 @@ class OCRCache(object):
        else:
            _deb("purgedata: removing   : %s" % datafn)
            os.remove(datafn)
-            
+
    def purgedata(self):
        '''Remove all data files which do not match any from the input list,
        based on data contents hash. We make a list of all data files
@ -280,50 +309,61 @@ class OCRCache(object):
        self._pgdt_alldatafns = set()
        self._walk(self.pathdir, self._pgdt_pathcb)
        self._walk(self.objdir, self._pgdt_datacb)
-        
-
+   

 if __name__ == '__main__':
    import rclconfig
-    def _Usage():
-        _deb("Usage: rclocrcache.py --purge")
+    import getopt
+
+    def Usage(f=sys.stderr):
+        print("Usage: rclocrcache.py --purge [--purgedata]", file=f)
+        print("Usage: rclocrcache.py --store <imgdatapath> <ocrdatapath>", file=f)
+        print("Usage: rclocrcache.py --get <imgdatapath>", file=f)
        sys.exit(1)
-    if len(sys.argv) != 2:
-        _Usage()
-    if sys.argv[1] != "--purge":
-        _Usage()
-    
+
    conf = rclconfig.RclConfig()
    cache = OCRCache(conf)
-    cache.purgepaths()
-    cache.purgedata()
-    sys.exit(0)
-    
-#    def trycache(p):
-#        _deb("== CACHE tests for %s"%p)
-#        ret = cache.pathincache(p)
-#        s = "" if ret else " not"
-#        _deb("path for %s%s in cache" % (p, s))
-#        if not ret:
-#            return False
-#        ret = cache.dataincache(p)
-#        s = "" if ret else " not"
-#        _deb("data for %s%s in cache" % (p, s))
-#        return ret
-#    def trystore(p):
-#        _deb("== STORE test for %s" % p)
-#        cache.store(p, b"my OCR'd text is one line\n", force=False)
-#    def tryget(p):
-#        _deb("== GET test for %s" % p)
-#        incache, data = cache.get(p)
-#        if incache:
-#            _deb("Data from cache [%s]" % data)
-#        else:
-#            _deb("Data was not found in cache")
-#        return incache, data
-#    if False:
-#        path = sys.argv[1]
-#        incache, data = tryget(path)
-#        if not incache:
-#            trystore(path)
-#
+    opts, args = getopt.getopt(sys.argv[1:], "h", ["help", "purge", "purgedata", "store", "get"])
+    purgedata = False
+    purge = False
+
+    for opt, arg in opts:
+        if opt in ['-h', '--help']:
+            Usage(sys.stdout)
+        elif opt in ['--purgedata']:
+            purgedata = True
+        elif opt in ['--purge']:
+            if len(args) != 0:
+                Usage()
+            purge = True
+        elif opt in ['--store']:
+            if len(args) != 2:
+                Usage()
+            imgdatapath = args[0]
+            ocrdatapath = args[1]
+            ocrdata = open(ocrdatapath, "rb").read()
+            cache.store(imgdatapath, ocrdata, force=False)
+            sys.exit(0)
+        elif opt in ['--get']:
+            if len(args) != 1:
+                Usage()
+            imgdatapath = args[0]
+            incache, data = cache.get(imgdatapath)
+            if incache:
+                print(f"OCR data from cache {data}")
+                sys.exit(0)
+            else:
+                print("OCR Data was not found in cache", file=sys.stderr)
+                sys.exit(1)
+        else:
+            print(f"Unknown option {opt}", file=sys.stderr)
+            Usage()
+
+    # End options. Purge if requested and exit 0; otherwise no action was given: show usage.
+    if purge:
+        cache.purgepaths()
+        if purgedata:
+            cache.purgedata()
+        sys.exit(0)
+    Usage()
+        
--- a/src/filters/rclocrtesseract.py
+++ b/src/filters/rclocrtesseract.py
@ -21,7 +21,6 @@

 import os
 import sys
-import atexit
 import tempfile
 import subprocess
 import glob
@ -38,39 +37,28 @@ _okexts = ('.tif', '.tiff', '.jpg', '.png', '.jpeg')

 tesseractcmd = None
 pdftoppmcmd = None
-
+pdftocairocmd = None

 def _deb(s):
-    rclexecm.logmsg(s)
-
-
-def vacuumdir(dir):
-    if dir:
-        for fn in os.listdir(dir):
-            path = os.path.join(dir, fn)
-            if os.path.isfile(path):
-                os.unlink(path)
-    return True
-
+    rclexecm.logmsg("rclocrtesseract: %s" % s)

 tmpdir = None
+
 def _maybemaketmpdir():
    global tmpdir
    if tmpdir:
-        if not vacuumdir(tmpdir):
-            _deb("openfile: vacuumdir %s failed" % tmpdir)
+        if not tmpdir.vacuumdir():
+            _deb("openfile: vacuumdir %s failed" % tmpdir.getpath())
            return False
    else:
-        tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
+        tmpdir = rclexecm.SafeTmpDir("rclocrtesseract")


-def finalcleanup():
+def cleanocr():
+    global tmpdir
    if tmpdir:
-        vacuumdir(tmpdir)
-        os.rmdir(tmpdir)
-
-
-atexit.register(finalcleanup)
+        del tmpdir
+        tmpdir = None


 # Return true if tesseract and the appropriate conversion program for
@ -107,12 +95,16 @@ def ocrpossible(config, path):
        # legacy code used pdftoppm for some reason, and it appears
        # that the newest builds from conda-forge do not include
        # pdftocairo. So stay with pdftoppm.
-        global pdftoppmcmd
-        if not pdftoppmcmd:
-            pdftoppmcmd = rclexecm.which("pdftoppm")
-            if not pdftoppmcmd:
-                pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
-        if pdftoppmcmd:
+        global pdftoppmcmd, pdftocairocmd
+        if not pdftoppmcmd and not pdftocairocmd:
+            pdftocairocmd = rclexecm.which("pdftocairo")
+            if not pdftocairocmd:
+                pdftocairocmd = rclexecm.which("poppler/pdftocairo")
+            if not pdftocairocmd:
+                pdftoppmcmd = rclexecm.which("pdftoppm")
+                if not pdftoppmcmd:
+                    pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
+        if pdftoppmcmd or pdftocairocmd:
            return True

    return False
@ -169,14 +161,17 @@ def _pdftesseract(config, path):

    tesseractlang = _guesstesseractlang(config, path)

-    #tesserrorfile = os.path.join(tmpdir, "tesserrorfile")
-    tmpfile = os.path.join(tmpdir, "ocrXXXXXX")
+    #tesserrorfile = os.path.join(tmpdir.getpath(), "tesserrorfile")
+    tmpfile = os.path.join(tmpdir.getpath(), "ocrXXXXXX")

    # Split pdf pages
    try:
-        vacuumdir(tmpdir)
-        cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
-        #_deb("Executing %s" % cmd)
+        tmpdir.vacuumdir()
+        if pdftocairocmd:
+            cmd = [pdftocairocmd, "-tiff", "-tiffcompression", "lzw", "-r", "300", path, tmpfile]
+        else:
+            cmd = [pdftoppmcmd, "-r", "300", path, tmpfile]
+            #_deb("Executing %s" % cmd)
        subprocess.check_call(cmd)
    except Exception as e:
        _deb("%s failed: %s" % (pdftoppmcmd,e))
@ -186,8 +181,8 @@ def _pdftesseract(config, path):
    # system is full. There is no really good way to check for
    # this. We consider any empty file to signal an error
    
-    ppmfiles = glob.glob(tmpfile + "*")
-    for f in ppmfiles:
+    pages = glob.glob(tmpfile + "*")
+    for f in pages:
        size = os.path.getsize(f)
        if os.path.getsize(f) == 0:
            _deb("pdftoppm created empty files. "
@ -203,7 +198,7 @@ def _pdftesseract(config, path):
        except:
            pass

-    for f in sorted(ppmfiles):
+    for f in sorted(pages):
        out = b''
        try:
            out = subprocess.check_output(
--- a/src/filters/rclorgmode.py
+++ b/src/filters/rclorgmode.py
@ -1,19 +1,38 @@
 #!/usr/bin/env python3
-from __future__ import print_function
+# Copyright (C) 2020-2022 J.F.Dockes
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the
+# Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-# Read an org-mode file, break it into "documents" along the separator lines
-# and interface with recoll execm
+'''Read an org-mode file, optionally break it into subdocs along level 1 headings'''

-import rclexecm
 import sys
 import re

+import rclexecm
+import rclconfig
+import conftree
+
 class OrgModeExtractor:
    def __init__(self, em):
        self.file = ""
-        self.contents = []
        self.em = em
-
+        self.selftext = ""
+        self.docs = []
+        config = rclconfig.RclConfig()
+        self.createsubdocs = conftree.valToBool(config.getConfParam("orgmodesubdocs"))
+        
    def extractone(self, index):
        if index >= len(self.docs):
            return(False, "", "", True)
@ -23,7 +42,7 @@ class OrgModeExtractor:
        iseof = rclexecm.RclExecM.noteof
        if self.currentindex >= len(self.docs) -1:
            iseof = rclexecm.RclExecM.eofnext
-        self.em.setmimetype("text/plain")
+        self.em.setmimetype("text/x-orgmode-sub")
        try:
            self.em.setfield("title", docdata.splitlines()[0])
        except:
@ -33,7 +52,6 @@ class OrgModeExtractor:
    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        self.file = params["filename"]
-
        try:
            data = open(self.file, "rb").read()
        except Exception as e:
@ -41,9 +59,15 @@ class OrgModeExtractor:
            return False

        self.currentindex = -1
+        if not self.createsubdocs:
+            self.selftext = data
+            return True

        res = rb'''^\* '''
        self.docs = re.compile(res, flags=re.MULTILINE).split(data)
+        # Note that there can be text before the first heading. This goes into the self doc,
+        # because it's not a proper entry.
+        self.selftext = self.docs[0]
        self.docs = self.docs[1:]
        #self.em.rclog("openfile: Entry count: %d" % len(self.docs))
        return True
@ -59,6 +83,8 @@ class OrgModeExtractor:
        return self.extractone(index)
        
    def getnext(self, params):
+        if not self.createsubdocs:
+            return (True, self.selftext, "", rclexecm.RclExecM.eofnext)

        if self.currentindex == -1:
            # Return "self" doc
@ -68,7 +94,7 @@ class OrgModeExtractor:
                eof = rclexecm.RclExecM.eofnext
            else:
                eof = rclexecm.RclExecM.noteof
-            return (True, "", "", eof)
+            return (True, self.selftext, "", eof)

        if self.currentindex >= len(self.docs):
            self.em.rclog("getnext: EOF hit")
--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@ -33,6 +33,7 @@ import glob
 import traceback
 import atexit
 import signal
+import time

 import rclexecm
 import rclconfig
@ -66,11 +67,17 @@ _htmlprefix =b'''<html><head>
 _htmlsuffix = b'''</pre></body></html>'''

 def finalcleanup():
+    global tmpdir
    if tmpdir:
-        vacuumdir(tmpdir)
-        os.rmdir(tmpdir)
+        del tmpdir
+        tmpdir = None

+ocrproc = None
 def signal_handler(signal, frame):
+    global ocrproc
+    if ocrproc:
+        ocrproc.wait()
+        ocrproc = None
    sys.exit(1)

 atexit.register(finalcleanup)
@ -85,14 +92,6 @@ except: pass
 try: signal.signal(signal.SIGTERM, signal_handler)
 except: pass

-def vacuumdir(dir):
-    if dir:
-        for fn in os.listdir(dir):
-            path = os.path.join(dir, fn)
-            if os.path.isfile(path):
-                os.unlink(path)
-    return True
-
 class PDFExtractor:
    def __init__(self, em):
        self.currentindex = 0
@ -213,7 +212,7 @@ class PDFExtractor:
            # no big deal
            return True
        try:
-            vacuumdir(tmpdir)
+            tmpdir.vacuumdir()
            # Note: the java version of pdftk sometimes/often fails
            # here with writing to stdout:
            #    Error occurred during initialization of VM
@ -223,9 +222,9 @@ class PDFExtractor:
            # output, until we fix the error or preferably find a way
            # to do it with poppler...
            subprocess.check_call(
-                [self.pdftk, self.filename, "unpack_files", "output",
-                 tmpdir], stdout=sys.stderr)
-            self.attachlist = sorted(os.listdir(tmpdir))
+                [self.pdftk, self.filename, "unpack_files", "output", tmpdir.getpath()],
+                stdout=sys.stderr)
+            self.attachlist = sorted(os.listdir(tmpdir.getpath()))
            return True
        except Exception as e:
            self.em.rclog("extractAttach: failed: %s" % e)
@ -399,11 +398,12 @@ class PDFExtractor:
    def maybemaketmpdir(self):
        global tmpdir
        if tmpdir:
-            if not vacuumdir(tmpdir):
-                self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
+            if not tmpdir.vacuumdir():
+                self.em.rclog("openfile: vacuumdir %s failed" % tmpdir.getpath())
                return False
        else:
-            tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
+            tmpdir = rclexecm.SafeTmpDir("rclpdf", self.em)
+            #self.em.rclog("Using temporary directory %s" % tmpdir.getpath())
            if self.pdftk and re.match("/snap/", self.pdftk):
                # We know this is Unix (Ubuntu actually). Check that tmpdir
                # belongs to the user as snap commands can't use /tmp to share
@ -415,9 +415,7 @@ class PDFExtractor:
                    if st.st_uid == os.getuid():
                        ok = True
                if not ok:
-                    self.em.rclog(
-                        "pdftk is a snap command and needs TMPDIR to be "
-                        "a directory you own")
+                    self.em.rclog("pdftk is a snap command and needs TMPDIR to be owned by you")

    def _process_annotations(self, html):
        doc = Poppler.Document.new_from_file(
@ -491,9 +489,11 @@ class PDFExtractor:
            s = self.config.getConfParam("pdfocr")
            if rclexecm.configparamtrue(s):
                try:
-                    cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
-                           self.filename]
-                    data = subprocess.check_output(cmd)
+                    cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"), self.filename]
+                    global ocrproc
+                    ocrproc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+                    data, stderr = ocrproc.communicate()
+                    ocrproc = None
                    html = _htmlprefix + rclexecm.htmlescape(data) + _htmlsuffix
                except Exception as e:
                    self.em.rclog("%s failed: %s" % (cmd, e))
@ -520,7 +520,9 @@ class PDFExtractor:
        if not self.attextractdone:
            if not self.extractAttach():
                return (False, "", "", rclexecm.RclExecM.eofnow)
-        path = os.path.join(tmpdir, ipath)
+        if type(ipath) != type(""):
+            ipath = ipath.decode('utf-8')
+        path = os.path.join(tmpdir.getpath(), ipath)
        if os.path.isfile(path):
            f = open(path, "rb")
            docdata = f.read();
--- a/src/filters/rclppt.py
+++ b/src/filters/rclppt.py
@ -2,8 +2,6 @@

 # Recoll PPT text extractor

-from __future__ import print_function
-
 import rclexecm
 import rclexec1
 import re
--- a/src/filters/rclpst.py
+++ b/src/filters/rclpst.py
@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 #################################
 # Copyright (C) 2019 J.F.Dockes
 #   This program is free software; you can redistribute it and/or modify
@ -28,12 +28,14 @@ import os
 import pathlib
 import email.parser
 import email.policy
+import email.message
 import mailbox
 import subprocess
 import rclexecm
 import rclconfig
 import conftree
 import base64
+import traceback

 _mswindows = (sys.platform == "win32" or sys.platform == "msys")
 if _mswindows:
@ -95,14 +97,26 @@ class EmailBuilder(object):
        newmsg = email.message.EmailMessage(policy=email.policy.default)
        headerstr = self.headers.decode("UTF-8", errors='replace')
        # print("%s" % headerstr)
-        headers = self.parser.parsestr(headerstr, headersonly=True)
+        try:
+            headers = self.parser.parsestr(headerstr, headersonly=True)
+        except:
+            # This sometimes fails, for example with 'day is out of range for month'. Try to go on
+            # without headers
+            headers = email.message.EmailMessage()
+            
        #self.log("EmailBuilder: content-type %s" % headers['content-type'])
-        for nm in ('from', 'subject'):
+        for nm in ('from', 'subject', 'date'):
            if nm in headers:
-                newmsg.add_header(nm, headers[nm])
+                try:
+                    newmsg.add_header(nm, headers[nm])
+                except:
+                    pass

        for h in ('to', 'cc'):
-            tolist = headers.get_all(h)
+            try:
+                tolist = headers.get_all(h)
+            except:
+                tolist = []
            if not tolist:
                continue
            alldests = ""
@ -113,7 +127,10 @@ class EmailBuilder(object):
                    alldests += sd + ", "
            if alldests:
                alldests = alldests.rstrip(", ")
-                newmsg.add_header(h, alldests)
+                try:
+                    newmsg.add_header(h, alldests)
+                except:
+                    pass

 # Decoding the body: the .pst contains the text value decoded from qp
 # or base64 (at least that's what libpff sends). Unfortunately, it
@ -135,8 +152,13 @@ class EmailBuilder(object):
                charset = headers.get_content_charset()
                body = ''
                if charset:
-                    body = self.body.decode(charset, errors='replace')
-                    #self.log("DECODE FROM HEADER CHARSET %s SUCCEEDED"% charset)
+                    if charset == 'unicode':
+                        charset = 'utf-16'
+                    try:
+                        body = self.body.decode(charset, errors='replace')
+                        #self.log("DECODE FROM HEADER CHARSET %s SUCCEEDED"% charset)
+                    except:
+                        pass
                else:
                    try:
                        body = self.body.decode('utf-8')
@ -377,6 +399,7 @@ class PstExtractor(object):
                return(False, "", "", rclexecm.RclExecM.eofnow)
        except Exception as ex:
            self.em.rclog("getnext: exception: %s" % ex)
+            traceback.print_exc()
            return(False, "", "", rclexecm.RclExecM.eofnow)
            
        return (True, doc, ipath, rclexecm.RclExecM.noteof)
--- a/src/filters/rclpython.py
+++ b/src/filters/rclpython.py
@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3

 # Rclpython is based on "colorize.py" from:
 # http://chrisarndt.de/en/software/python/colorize.html
@ -51,6 +51,12 @@ _css_classes = {
    _TEXT:              'text',
 }

+# python3.8 token.py sends an ENCODING token which we ignore
+try:
+    token_encoding_type = token.ENCODING
+except:
+    token_encoding_type = 62
+    
 _HTML_HEADER = """\
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
  "http://www.w3.org/TR/html4/loose.dtd">
@ -146,17 +152,21 @@ class Parser:
    def __call__(self, toktype, toktext, startpos, endpos, line):
        """ Token handler.
        """
+        srow, scol = startpos
+        erow, ecol = endpos
        if 0:
            print("type %s %s text %s start %s %s end %s %s<br>\n" % \
                  (toktype, token.tok_name[toktype], toktext, \
-                   srow, scol,erow,ecol))
-        srow, scol = startpos
-        erow, ecol = endpos
+                   srow, scol,erow,ecol), file=sys.stderr)
+
        # calculate new positions
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

+        if toktype == token_encoding_type:
+            return
+
        # handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.out.write(b'\n')
--- a/src/filters/rclrar.py
+++ b/src/filters/rclrar.py
@ -18,8 +18,6 @@
 #   Free Software Foundation, Inc.,
 #   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

-from __future__ import print_function
-
 import sys
 import rclexecm
 import os
@ -60,7 +58,7 @@ except Exception as ex:
 # (https://www.rarlab.com/rar_add.htm). The unrar-free version fails
 # with the message "Failed the read enough data"
 # 
-# This is identical to rclzip except I did a search/replace from zip
+# This is identical to rclzip.py except I did a search/replace from zip
 # to rar, and changed this comment.
 class RarExtractor:
    def __init__(self, em):
--- a/src/filters/rclrtf.py
+++ b/src/filters/rclrtf.py
@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-from __future__ import print_function

 import rclexecm
 import rclexec1
--- a/src/filters/rcltar.py
+++ b/src/filters/rcltar.py
@ -2,12 +2,10 @@

 # Tar-file filter for Recoll
 # Thanks to Recoll user Martin Ziegler
-# This is a modified version of /usr/share/recoll/filters/rclzip
+# This is a modified version of /usr/share/recoll/filters/rclzip.py
 # It works not only for tar-files, but automatically for gzipped and
 # bzipped tar-files at well.

-from __future__ import print_function
-
 import rclexecm

 try:
--- a/src/filters/rcltext.py
+++ b/src/filters/rcltext.py
@ -18,8 +18,6 @@
 # Wrapping a text file. Recoll does it internally in most cases, but
 # this is for use by another filter.

-from __future__ import print_function
-
 import rclexecm
 import sys
 from rclbasehandler import RclBaseHandler
--- a/src/filters/rcltxtlines.py
+++ b/src/filters/rcltxtlines.py
@ -2,7 +2,6 @@
 """Index text lines as document (execm handler sample). This exists
 to demonstrate the execm interface and is not meant to be useful or
 efficient"""
-from __future__ import print_function

 import sys
 import os
--- a/src/filters/rcluncomp.py
+++ b/src/filters/rcluncomp.py
@ -1,5 +1,4 @@
 # No shebang: this is only used on Windows. We use a shell script on Linux
-from __future__ import print_function

 import rclexecm
 import sys
--- a/src/filters/rclwar.py
+++ b/src/filters/rclwar.py
@ -2,8 +2,6 @@

 # WAR web archive filter for recoll. War file are gzipped tar files

-from __future__ import print_function
-
 import rclexecm
 import tarfile

--- a/src/filters/rclxmp.py
+++ b/src/filters/rclxmp.py
@ -16,7 +16,6 @@
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

 # Code to extract XMP tags using libexempi and python-xmp
-from __future__ import print_function

 can_xmp = True
 try:
--- a/src/filters/rclzip.py
+++ b/src/filters/rclzip.py
@ -18,11 +18,11 @@

 # Zip file extractor for Recoll

-from __future__ import print_function
-
 import os
 import posixpath
 import fnmatch
+import datetime
+
 import rclexecm
 from zipfile import ZipFile

@ -49,7 +49,7 @@ if not hasrclconfig:
 # and stores it in the catalog as an unicode object. Else it uses the
 # binary string, which it decodes as CP437 (zip standard).
 #
-# When reading the file, the input file name is used by rclzip
+# When reading the file, the input file name is used by rclzip.py
 # directly as an index into the catalog.
 #
 # When we send the file name data to the indexer, we have to serialize
@ -119,6 +119,8 @@ class ZipExtractor:
                # element).
                filename = posixpath.basename(ipath)
                self.em.setfield("filename", filename)
+                dt = datetime.datetime(*info.date_time)
+                self.em.setfield("modificationdate", str(int(dt.timestamp())))
            except:
                pass
            ok = True
@ -151,14 +153,11 @@ class ZipExtractor:
            if skipped is not None:
                self.skiplist += conftree.stringToStrings(skipped)
        try:
-            if rclexecm.PY3:
-                # Note: py3 ZipFile wants an str file name, which
-                # is wrong: file names are binary. But it accepts an
-                # open file, and open() has no such restriction
-                self.f = open(filename, 'rb')
-                self.zip = ZipFile(self.f)
-            else:
-                self.zip = ZipFile(filename)
+            # Note: py3 ZipFile wants an str file name, which
+            # is wrong: file names are binary. But it accepts an
+            # open file, and open() has no such restriction
+            self.f = open(filename, 'rb')
+            self.zip = ZipFile(self.f)
            return True
        except Exception as err:
            self.em.rclog("openfile: failed: [%s]" % err)
--- a/src/filters/recoll-we-move-files.py
+++ b/src/filters/recoll-we-move-files.py
@ -1,5 +1,5 @@
-#!/usr/bin/env python3
-# Copyright (C) 2017 J.F.Dockes
+#!/usr/bin/python3
+# Copyright (C) 2017-2022 J.F.Dockes
 #   This program is free software; you can redistribute it and/or modify
 #   it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
@ -31,6 +31,7 @@ but it can also be run by hand.
 import sys
 import os
 import re
+import getopt
 try:
    from hashlib import md5 as md5
 except:
@ -94,28 +95,44 @@ def list_all_files(dir):
    return mfiles,cfiles

 #######################
+def msg(s):
+    print(f"{s}", file=sys.stderr)
 def usage():
-    print("Usage: recoll-we-move-files.py [<downloaddir>]", file=sys.stderr)
+    msg("Usage: recoll-we-move-files.py [-c <recollconfigdir>]")
+    msg(" The script needs the recoll configuration directory. This can be set either through")
+    msg(" the RECOLL_CONFDIR environment variable or the '-c' command line option (which takes")
+    msg(" precedence). If none is set, the default configuration directory will be used.")
    sys.exit(1)

-config = rclconfig.RclConfig()

-# Source dir is parameter, else from config else default Downloads directory
+opts, args = getopt.getopt(sys.argv[1:], "c:")
+if not len(args) == 0:
+    usage()
+
+configdir = None
+for opt,val in opts:
+    #logdeb(f"opt {opt} val {val}")
+    if opt == "-c":
+        configdir = val
+        if not os.path.isdir(val):
+            msg(f"{val} is not a directory")
+            usage()
+    else:
+        usage()
+
+config = rclconfig.RclConfig(argcnf=configdir)
+
+# Get the directory where the browser extension creates the page files. Our user can set it as a
+# subdirectory of the default Downloads directory, for tidiness
 downloadsdir = config.getConfParam("webdownloadsdir")
 if not downloadsdir:
    downloadsdir = "~/Downloads"
 downloadsdir = os.path.expanduser(downloadsdir)
+if not os.path.isdir(downloadsdir):
+    msg(f"Downloads directory {downloadsdir} does not exist")
+    sys.exit(1)

-if len(sys.argv) == 2:
-    mydir = sys.argv[1]
-elif len(sys.argv) == 1:
-    mydir = downloadsdir
-else:
-    usage()
-if not os.path.isdir(mydir):
-    usage()
-
-# Get target webqueue recoll directory from recoll configuration
+# Get the target recoll webqueue directory, into which we are going to move the downloaded files.
 webqueuedir = config.getConfParam("webqueuedir")
 if not webqueuedir:
    if _mswindows:
@ -125,10 +142,11 @@ if not webqueuedir:
 webqueuedir = os.path.expanduser(webqueuedir)
 os.makedirs(webqueuedir, exist_ok = True)

-# logdeb("webqueuedir is %s" % webqueuedir)
+
+#logdeb(f"recoll confdir [{configdir}] downloadsdir [{downloadsdir}] webqueuedir [{webqueuedir}]")

 # Get the lists of all files created by the browser addon
-mfiles, cfiles = list_all_files(mydir)
+mfiles, cfiles = list_all_files(downloadsdir)

 # Only keep the last version
 mfiles = delete_previous_instances(mfiles, downloadsdir)
@ -143,7 +161,7 @@ cfiles = delete_previous_instances(cfiles, downloadsdir)
 # The old plugin created the data first, so we move data then meta
 for hash in cfiles.keys():
    if hash in mfiles.keys():
-        newname = "firefox-recoll-web-"+hash
+        newname = "firefox-recoll-web-" + hash
        shutil.move(os.path.join(downloadsdir, cfiles[hash]),
                    os.path.join(webqueuedir, newname))
        shutil.move(os.path.join(downloadsdir, mfiles[hash]),
--- a/src/filters/xlsxmltocsv.py
+++ b/src/filters/xlsxmltocsv.py
@ -23,8 +23,6 @@
 # the minimum version supported.


-from __future__ import print_function
-
 import sys
 import xml.sax

--- a/src/index/exefetcher.cpp
+++ b/src/index/exefetcher.cpp
@ -61,8 +61,7 @@ public:
 EXEDocFetcher::EXEDocFetcher(const EXEDocFetcher::Internal& _m)
 {
    m = new Internal(_m);
-    LOGDEB("EXEDocFetcher::EXEDocFetcher: fetch is " <<
-           stringsToString(m->sfetch) << "\n");
+    LOGDEB("EXEDocFetcher::EXEDocFetcher: fetch is " << stringsToString(m->sfetch) << "\n");
 }

 bool EXEDocFetcher::fetch(RclConfig*, const Rcl::Doc& idoc, RawDoc& out)
@ -77,8 +76,7 @@ bool EXEDocFetcher::makesig(RclConfig*, const Rcl::Doc& idoc, string& sig)
 }

 // Lookup bckid in the config and create an appropriate fetcher.
-std::unique_ptr<EXEDocFetcher> exeDocFetcherMake(RclConfig *config,
-                                                 const string& bckid)
+std::unique_ptr<EXEDocFetcher> exeDocFetcherMake(RclConfig *config, const string& bckid)
 {
    // The config we only read once, not gonna change.
    static ConfSimple *bconf;
--- a/src/index/exefetcher.h
+++ b/src/index/exefetcher.h
@ -40,6 +40,8 @@ public:
    class Internal;
    EXEDocFetcher(const Internal&);
    virtual ~EXEDocFetcher() {}
+    EXEDocFetcher(const EXEDocFetcher&) = delete;
+    EXEDocFetcher& operator=(const EXEDocFetcher&) = delete;

    virtual bool fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out);
    /** Calls stat to retrieve file signature data */
@ -51,7 +53,6 @@ private:
 };

 // Lookup bckid in the config and create an appropriate fetcher.
-std::unique_ptr<EXEDocFetcher> exeDocFetcherMake(RclConfig *config,
-                                                 const std::string& bckid);
+std::unique_ptr<EXEDocFetcher> exeDocFetcherMake(RclConfig *config, const std::string& bckid);

 #endif /* _EXEFETCHER_H_INCLUDED_ */
--- a/src/index/fetcher.h
+++ b/src/index/fetcher.h
@ -72,18 +72,18 @@ public:
     * @param idoc the data gathered from the index for this doc (udi/ipath)
     * @param sig output. 
     */
-    virtual bool makesig(RclConfig* cnf, const Rcl::Doc& idoc,
-                         std::string& sig) = 0;
+    virtual bool makesig(RclConfig* cnf, const Rcl::Doc& idoc, std::string& sig) = 0;
    enum Reason{FetchOk, FetchNotExist, FetchNoPerm, FetchOther};
    virtual Reason testAccess(RclConfig*, const Rcl::Doc&) {
        return FetchOther;
    }
+    DocFetcher() {}
    virtual ~DocFetcher() {}
+    DocFetcher(const DocFetcher&) = delete;
+    DocFetcher& operator=(const DocFetcher&) = delete;
 };

-/** Return an appropriate fetcher object given the backend string 
- * identifier inside idoc*/
-std::unique_ptr<DocFetcher> docFetcherMake(RclConfig *config,
-                                           const Rcl::Doc& idoc);
+/** Return an appropriate fetcher object given the backend string identifier inside idoc. */
+std::unique_ptr<DocFetcher> docFetcherMake(RclConfig *config, const Rcl::Doc& idoc);

 #endif /* _FETCHER_H_INCLUDED_ */
--- a/src/index/fsfetcher.h
+++ b/src/index/fsfetcher.h
@ -23,14 +23,18 @@
 /** 
 * The file-system fetcher: 
 */
-class FSDocFetcher : public DocFetcher{
+class FSDocFetcher : public DocFetcher {
+public:
    /** FSDocFetcher::fetch always returns a file name */
    virtual bool fetch(RclConfig* cnf, const Rcl::Doc& idoc, RawDoc& out);
    
    /** Calls stat to retrieve file signature data */
    virtual bool makesig(RclConfig* cnf,const Rcl::Doc& idoc, std::string& sig);
    virtual DocFetcher::Reason testAccess(RclConfig* cnf, const Rcl::Doc& idoc);
+    FSDocFetcher() {}
    virtual ~FSDocFetcher() {}
+    FSDocFetcher(const FSDocFetcher&) = delete;
+    FSDocFetcher& operator=(const FSDocFetcher&) = delete;
 };

 extern void fsmakesig(const struct PathStat *stp, std::string& out);
--- a/src/index/fsindexer.cpp
+++ b/src/index/fsindexer.cpp
@ -195,6 +195,7 @@ bool FsIndexer::index(int flags)
        m_walker.setMaxDepth(2);
    }

+    bool walkok(true);
    for (const auto& topdir : m_tdl) {
        LOGDEB("FsIndexer::index: Indexing " << topdir << " into " <<
               getDbDir() << "\n");
@ -229,29 +230,46 @@ bool FsIndexer::index(int flags)
        if (m_walker.walk(topdir, *this) != FsTreeWalker::FtwOk) {
            LOGERR("FsIndexer::index: error while indexing " << topdir <<
                   ": " << m_walker.getReason() << "\n");
-            return false;
+            // DO NOT return: we need to flush the queues before the Db can be closed!
+            walkok = false;
+            break;
        }
    }

-#ifdef IDX_THREADS
-    if (m_haveInternQ) 
-        m_iwqueue.waitIdle();
-    if (m_haveSplitQ)
-        m_dwqueue.waitIdle();
-    m_db->waitUpdIdle();
-#endif // IDX_THREADS
-
+    shutdownQueues(walkok);
    if (m_missing) {
        string missing;
        m_missing->getMissingDescription(missing);
        if (!missing.empty()) {
-            LOGINFO("FsIndexer::index missing helper program(s):\n" <<
-                    missing << "\n");
+            LOGINFO("FsIndexer::index missing helper program(s):\n" << missing << "\n");
        }
        m_config->storeMissingHelperDesc(missing);
    }
-    LOGINFO("fsindexer index time:  " << chron.millis() << " mS\n");
-    return true;
+    LOGINFO("fsindexer: status: " << walkok << " index time:  " << chron.millis() << " mS\n");
+    return walkok;
+}
+
+void FsIndexer::shutdownQueues(bool ok)
+{
+#ifdef IDX_THREADS
+    if (!ok) {
+        // Error or, more probably, an interrupt. Discard everything for a fast shutdown.
+        if (m_haveInternQ)  {
+            m_iwqueue.closeShop();
+        }
+        if (m_haveSplitQ) {
+            m_dwqueue.closeShop();
+        }
+        m_db->closeQueue();
+    }
+    if (m_haveInternQ)  {
+        m_iwqueue.waitIdle();
+    }
+    if (m_haveSplitQ) {
+        m_dwqueue.waitIdle();
+    }
+    m_db->waitUpdIdle();
+#endif // IDX_THREADS
 }

 static bool matchesSkipped(
@ -359,7 +377,7 @@ bool FsIndexer::indexFiles(list<string>& files, int flags)
    FsTreeWalker walker;
    walker.setSkippedPaths(m_config->getSkippedPaths());

-    for (list<string>::iterator it = files.begin(); it != files.end(); ) {
+    for (auto it = files.begin(); it != files.end(); ) {
        LOGDEB2("FsIndexer::indexFiles: [" << *it << "]\n");

        m_config->setKeyDir(path_getfather(*it));
@ -403,22 +421,14 @@ bool FsIndexer::indexFiles(list<string>& files, int flags)

    ret = true;
 out:
-#ifdef IDX_THREADS
-    if (m_haveInternQ) 
-        m_iwqueue.waitIdle();
-    if (m_haveSplitQ)
-        m_dwqueue.waitIdle();
-    m_db->waitUpdIdle();
-#endif // IDX_THREADS
+    shutdownQueues(ret);

    // Purge possible orphan documents
    if (ret == true) {
        LOGDEB("Indexfiles: purging orphans\n");
-        const vector<string>& purgecandidates = m_purgeCandidates.getCandidates();
-        for (vector<string>::const_iterator it = purgecandidates.begin();
-             it != purgecandidates.end(); it++) {
-            LOGDEB("Indexfiles: purging orphans for " << *it << "\n");
-            m_db->purgeOrphans(*it);
+        for (const auto& udi : m_purgeCandidates.getCandidates()) {
+            LOGDEB("Indexfiles: purging orphans for " << udi << "\n");
+            m_db->purgeOrphans(udi);
        }
 #ifdef IDX_THREADS
        m_db->waitUpdIdle();
@ -458,13 +468,7 @@ bool FsIndexer::purgeFiles(list<string>& files)

    ret = true;
 out:
-#ifdef IDX_THREADS
-    if (m_haveInternQ) 
-        m_iwqueue.waitIdle();
-    if (m_haveSplitQ)
-        m_dwqueue.waitIdle();
-    m_db->waitUpdIdle();
-#endif // IDX_THREADS
+    shutdownQueues(ret);
    LOGDEB("FsIndexer::purgeFiles: done\n");
    return ret;
 }
@ -488,10 +492,9 @@ void FsIndexer::localfieldsfromconf()
    ConfSimple attrs;
    m_config->valueSplitAttributes(sfields, value, attrs);
    vector<string> nmlst = attrs.getNames(cstr_null);
-    for (vector<string>::const_iterator it = nmlst.begin();
-         it != nmlst.end(); it++) {
-        string nm = m_config->fieldCanon(*it);
-        attrs.get(*it, m_localfields[nm]);
+    for (const auto& anm : nmlst) {
+        string nm = m_config->fieldCanon(anm);
+        attrs.get(anm, m_localfields[nm]);
        LOGDEB2("FsIndexer::localfieldsfromconf: [" << nm << "]->[" <<
                m_localfields[nm] << "]\n");
    }
@ -499,12 +502,11 @@ void FsIndexer::localfieldsfromconf()

 void FsIndexer::setlocalfields(const map<string, string>& fields, Rcl::Doc& doc)
 {
-    for (map<string, string>::const_iterator it = fields.begin();
-         it != fields.end(); it++) {
+    for (const auto& field : fields) {
        // Being chosen by the user, localfields override values from
        // the filter. The key is already canonic (see
        // localfieldsfromconf())
-        doc.meta[it->first] = it->second;
+        doc.meta[field.first] = field.second;
    }
 }

@ -840,9 +842,7 @@ FsTreeWalker::Status FsIndexer::processonefile(
            }
        }
 #if defined(HAVE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-        // See framagit issue 26. If this appears to be a good idea
-        // after all (not sure), we'll need a command line switch to
-        // control it. For now it's compile-time only.
+        // See framagit issue 26. This is off by default and controlled by a command line switch.
        if (m_cleancache) {
            int fd = open(fn.c_str(), O_RDONLY);
            if (fd >= 0) {
--- a/src/index/fsindexer.h
+++ b/src/index/fsindexer.h
@ -27,7 +27,6 @@
 #endif // IDX_THREADS

 class FIMissingStore;
-struct PathStat;

 class DbUpdTask;
 class InternfileTask;
@ -55,6 +54,8 @@ public:
     */
    FsIndexer(RclConfig *cnf, Rcl::Db *db);
    virtual ~FsIndexer();
+    FsIndexer(const FsIndexer&) = delete;
+    FsIndexer& operator=(const FsIndexer&) = delete;

    /** 
     * Top level file system tree index method for updating a given database.
@ -157,6 +158,7 @@ private:
    processonefile(RclConfig *config, const string &fn, 
                   const struct PathStat *,
                   const map<string,string>& localfields);
+    void shutdownQueues(bool);
 };

 #endif /* _fsindexer_h_included_ */
--- a/src/index/idxstatus.h
+++ b/src/index/idxstatus.h
@ -57,6 +57,8 @@ class DbIxStatusUpdater {
 public:
    DbIxStatusUpdater(const RclConfig *config, bool nox11monitor);
    virtual ~DbIxStatusUpdater(){}
+    DbIxStatusUpdater(const DbIxStatusUpdater&) = delete;
+    DbIxStatusUpdater& operator=(const DbIxStatusUpdater&) = delete;

    enum Incr {IncrNone, IncrDocsDone = 0x1, IncrFilesDone = 0x2, IncrFileErrors = 0x4};
    // Change phase/fn and update
--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@ -59,12 +59,10 @@ bool runWebFilesMoverScript(RclConfig *config)
    static string downloadsdir;
    if (downloadsdir.empty()) {
        if (!config->getConfParam("webdownloadsdir", downloadsdir)) {
-            downloadsdir = path_tildexpand("~/Downloads");
+            downloadsdir = "~/Downloads";
        }
+        downloadsdir = path_tildexpand(downloadsdir);
    }
-    vector<string> cmdvec;
-    config->pythonCmd("recoll-we-move-files.py", cmdvec);
-    
    /* Arrange to not actually run the script if the directory did not change */
    static time_t dirmtime;
    time_t ndirmtime = 0;
@ -72,17 +70,17 @@ bool runWebFilesMoverScript(RclConfig *config)
    if (path_fileprops(downloadsdir.c_str(), &st) == 0) {
        ndirmtime = st.pst_mtime;
    }
-    /* If stat fails, presumably Downloads does not exist or is not
-       accessible, dirmtime and mdirmtime stay at 0, and we never
-       execute the script, which is the right thing. */
+    // If stat fails, presumably Downloads does not exist or is not accessible, dirmtime and
+    // ndirmtime stay at 0, and we never execute the script, which is the right thing.
    if (dirmtime != ndirmtime) {
-        /* The script is going to change the directory, so updating
-           dirmtime before it runs means that we are going to execute
-           it one time too many (it will run without doing anything),
-           but we can't set the mtime to after the run in case files
-           are created during the run. */
+        // The script is going to change the directory, so updating dirmtime before it runs means
+        // that we are going to execute it one time too many (it will run without doing anything),
+        // but we can't set the mtime to after the run in case files are created during the run.
        dirmtime = ndirmtime;
+        vector<string> cmdvec;
+        config->pythonCmd("recoll-we-move-files.py", cmdvec);
        ExecCmd cmd;
+        cmd.putenv("RECOLL_CONFDIR", config->getConfDir());
        int status = cmd.doexec1(cmdvec);
        return status == 0;
    }
--- a/Show More
+++ b/Show More