5364 lines
396 KiB
HTML
5364 lines
396 KiB
HTML
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
|
|
|
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
<head>
|
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|
|
|
<title>Recoll user manual — recoll 1.25.12 documentation</title>
|
|
|
|
<link rel="stylesheet" href="_static/alabaster.css" type="text/css" />
|
|
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
|
|
|
|
<script type="text/javascript">
|
|
var DOCUMENTATION_OPTIONS = {
|
|
URL_ROOT: './',
|
|
VERSION: '1.25.12',
|
|
COLLAPSE_INDEX: false,
|
|
FILE_SUFFIX: '.html',
|
|
HAS_SOURCE: true
|
|
};
|
|
</script>
|
|
<script type="text/javascript" src="_static/jquery.js"></script>
|
|
<script type="text/javascript" src="_static/underscore.js"></script>
|
|
<script type="text/javascript" src="_static/doctools.js"></script>
|
|
<link rel="index" title="Index" href="genindex.html" />
|
|
<link rel="search" title="Search" href="search.html" />
|
|
<link rel="top" title="recoll 1.25.12 documentation" href="#" />
|
|
|
|
<link rel="stylesheet" href="_static/custom.css" type="text/css" />
|
|
|
|
<meta name="viewport" content="width=device-width, initial-scale=0.9, maximum-scale=0.9" />
|
|
|
|
</head>
|
|
<body role="document">
|
|
|
|
|
|
<div class="document">
|
|
<div class="documentwrapper">
|
|
<div class="bodywrapper">
|
|
<div class="body" role="main">
|
|
|
|
<div class="section" id="recoll-user-manual">
|
|
<h1><a class="toc-backref" href="#id4">Recoll user manual</a><a class="headerlink" href="#recoll-user-manual" title="Permalink to this headline">¶</a></h1>
|
|
<table class="docutils field-list" frame="void" rules="none">
|
|
<col class="field-name" />
|
|
<col class="field-body" />
|
|
<tbody valign="top">
|
|
<tr class="field-odd field"><th class="field-name">Author:</th><td class="field-body">Jean-Francois Dockes</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<div class="contents topic" id="contents">
|
|
<p class="topic-title first">Contents</p>
|
|
<ul class="simple">
|
|
<li><a class="reference internal" href="#recoll-user-manual" id="id4">Recoll user manual</a><ul>
|
|
<li><a class="reference internal" href="#introduction" id="id5">Introduction</a><ul>
|
|
<li><a class="reference internal" href="#giving-it-a-try" id="id6">Giving it a try</a></li>
|
|
<li><a class="reference internal" href="#full-text-search" id="id7">Full text search</a></li>
|
|
<li><a class="reference internal" href="#recoll-overview" id="id8">Recoll overview</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#indexing" id="id9">Indexing</a><ul>
|
|
<li><a class="reference internal" href="#id1" id="id10">Introduction</a></li>
|
|
<li><a class="reference internal" href="#index-storage" id="id11">Index storage</a></li>
|
|
<li><a class="reference internal" href="#index-configuration" id="id12">Index configuration</a></li>
|
|
<li><a class="reference internal" href="#indexing-the-web-pages-which-you-wisit" id="id13">Indexing the WEB pages which you visit.</a></li>
|
|
<li><a class="reference internal" href="#extended-attributes-data" id="id14">Extended attributes data</a></li>
|
|
<li><a class="reference internal" href="#importing-external-tags" id="id15">Importing external tags</a></li>
|
|
<li><a class="reference internal" href="#the-pdf-input-handler" id="id16">The PDF input handler</a></li>
|
|
<li><a class="reference internal" href="#periodic-indexing" id="id17">Periodic indexing</a></li>
|
|
<li><a class="reference internal" href="#real-time-indexing" id="id18">Real time indexing</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#searching" id="id19">Searching</a><ul>
|
|
<li><a class="reference internal" href="#searching-with-the-qt-graphical-user-interface" id="id20">Searching with the Qt graphical user interface</a></li>
|
|
<li><a class="reference internal" href="#searching-with-the-kde-kio-slave" id="id21">Searching with the KDE KIO slave</a></li>
|
|
<li><a class="reference internal" href="#searching-on-the-command-line" id="id22">Searching on the command line</a></li>
|
|
<li><a class="reference internal" href="#using-synonyms-1-22" id="id23">Using Synonyms (1.22)</a></li>
|
|
<li><a class="reference internal" href="#path-translations" id="id24">Path translations</a></li>
|
|
<li><a class="reference internal" href="#the-query-language" id="id25">The query language</a></li>
|
|
<li><a class="reference internal" href="#search-case-and-diacritics-sensitivity" id="id26">Search case and diacritics sensitivity</a></li>
|
|
<li><a class="reference internal" href="#anchored-searches-and-wildcards" id="id27">Anchored searches and wildcards</a></li>
|
|
<li><a class="reference internal" href="#desktop-integration" id="id28">Desktop integration</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#removable-volumes" id="id29">Removable volumes</a><ul>
|
|
<li><a class="reference internal" href="#indexing-removable-volumes-in-the-main-index" id="id30">Indexing removable volumes in the main index</a></li>
|
|
<li><a class="reference internal" href="#self-contained-volumes" id="id31">Self contained volumes</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#programming-interface" id="id32">Programming interface</a><ul>
|
|
<li><a class="reference internal" href="#writing-a-document-input-handler" id="id33">Writing a document input handler</a></li>
|
|
<li><a class="reference internal" href="#field-data-processing" id="id34">Field data processing</a></li>
|
|
<li><a class="reference internal" href="#python-api" id="id35">Python API</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#installation-and-configuration" id="id36">Installation and configuration</a><ul>
|
|
<li><a class="reference internal" href="#installing-a-binary-copy" id="id37">Installing a binary copy</a></li>
|
|
<li><a class="reference internal" href="#supporting-packages" id="id38">Supporting packages</a></li>
|
|
<li><a class="reference internal" href="#building-from-source" id="id39">Building from source</a></li>
|
|
<li><a class="reference internal" href="#configuration-overview" id="id40">Configuration overview</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</div>
|
|
<div class="section" id="introduction">
|
|
<h2><a class="toc-backref" href="#id5">Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this headline">¶</a></h2>
|
|
<p>This document introduces full text search notions and describes the
|
|
installation and use of the Recoll application. It is updated for Recoll
|
|
1.25.12.</p>
|
|
<p>RCL was for a long time dedicated to Unix-like systems. It was only
|
|
lately (2015) ported to MS-Windows. Many references in this manual,
|
|
especially file locations, are specific to Unix, and not valid on Windows,
|
|
where some described features are also not available. The manual will be
|
|
progressively updated. Until this happens, on Windows, most references to
|
|
shared files can be translated by looking under the Recoll installation
|
|
directory (esp. the <code class="docutils literal"><span class="pre">Share</span></code> subdirectory). The user configuration is
|
|
stored by default under <code class="docutils literal"><span class="pre">AppData/Local/Recoll</span></code> inside the user
|
|
directory, along with the index itself.</p>
|
|
<div class="section" id="giving-it-a-try">
|
|
<h3><a class="toc-backref" href="#id6">Giving it a try</a><a class="headerlink" href="#giving-it-a-try" title="Permalink to this headline">¶</a></h3>
|
|
<p>If you do not like reading manuals (who does?) but wish to give RCL a
|
|
try, just <a class="reference external" href="#RCL.INSTALL.BINARY">install</a> the application and start
|
|
the <code class="docutils literal"><span class="pre">recoll</span></code> graphical user interface (GUI), which will ask permission
|
|
to index your home directory by default, allowing you to search
|
|
immediately after indexing completes.</p>
|
|
<p>Do not do this if your home directory contains a huge number of
|
|
documents and you do not want to wait or are very short on disk space.
|
|
In this case, you may first want to customize the
|
|
<a class="reference external" href="#RCL.INDEXING.CONFIG">configuration</a> to restrict the indexed area
|
|
(for the very impatient with a completed package install, from the
|
|
<code class="docutils literal"><span class="pre">recoll</span></code> GUI: Preferences > Indexing configuration, then adjust the
|
|
Top directories section).</p>
|
|
<p>On Unix/Linux, you may need to install the appropriate <a class="reference external" href="#RCL.INSTALL.EXTERNAL">supporting
|
|
applications</a> for document types that need
|
|
them (for example antiword for Microsoft Word files).</p>
|
|
<p>The Recoll for Windows package is self-contained and includes most useful
|
|
auxiliary programs. You will just need to install Python 2.7.</p>
|
|
</div>
|
|
<div class="section" id="full-text-search">
|
|
<h3><a class="toc-backref" href="#id7">Full text search</a><a class="headerlink" href="#full-text-search" title="Permalink to this headline">¶</a></h3>
|
|
<p>RCL is a full text search application, which means that it finds your
|
|
data by content rather than by external attributes (like the file name).
|
|
You specify words (terms) which should or should not appear in the text
|
|
you are looking for, and receive in return a list of matching documents,
|
|
ordered so that the most <em>relevant</em> documents will appear first.</p>
|
|
<p>You do not need to remember in what file or email message you stored a
|
|
given piece of information. You just ask for related terms, and the tool
|
|
will return a list of documents where these terms are prominent, in a
|
|
similar way to Internet search engines.</p>
|
|
<p>Full text search applications try to determine which documents are most
|
|
relevant to the search terms you provide. Computer algorithms for
|
|
determining relevance can be very complex, and in general are inferior
|
|
to the power of the human mind to rapidly determine relevance. The
|
|
quality of relevance guessing is probably the most important aspect when
|
|
evaluating a search application. RCL relies on the XAP probabilistic
|
|
information retrieval library to determine relevance.</p>
|
|
<p>In many cases, you are looking for all the forms of a word, including
|
|
plurals, different tenses for a verb, or terms derived from the same
|
|
root or <em>stem</em> (example: floor, floors, floored, flooring...). Queries
|
|
are usually automatically expanded to all such related terms (words that
|
|
reduce to the same stem). This can be prevented for searching for a
|
|
specific form.</p>
|
|
<p>Stemming, by itself, does not accommodate misspellings or phonetic
|
|
searches. A full text search application may also support this form of
|
|
approximation. For example, a search for aliterattion returning no
|
|
result might propose alliteration, alteration, alterations, or
|
|
altercation as possible replacement terms. RCL bases its suggestions on
|
|
the actual index contents, so that suggestions may be made for words
|
|
which would not appear in a standard dictionary.</p>
|
|
</div>
|
|
<div class="section" id="recoll-overview">
|
|
<h3><a class="toc-backref" href="#id8">Recoll overview</a><a class="headerlink" href="#recoll-overview" title="Permalink to this headline">¶</a></h3>
|
|
<p>Recoll uses the <a class="reference external" href="http://www.xapian.org">Xapian</a> information retrieval
|
|
library as its storage and retrieval engine. XAP is a very mature
|
|
package using <a class="reference external" href="http://www.xapian.org/docs/intro_ir.html">a sophisticated probabilistic ranking
|
|
model</a>.</p>
|
|
<p>The XAP library manages an index database which describes where terms
|
|
appear in your document files. It efficiently processes the complex
|
|
queries which are produced by the RCL query expansion mechanism, and is
|
|
in charge of the all-important relevance computation task.</p>
|
|
<p>RCL provides the mechanisms and interface to get data into and out of
|
|
the index. This includes translating the many possible document formats
|
|
into pure text, handling term variations (using XAP stemmers), and
|
|
spelling approximations (using the aspell speller), interpreting user
|
|
queries and presenting results.</p>
|
|
<p>In short, Recoll does the dirty footwork, and Xapian deals with the
|
|
intelligent parts of the process.</p>
|
|
<p>The XAP index can be big (roughly the size of the original document
|
|
set), but it is not a document archive. RCL can only display documents
|
|
that still exist at the place from which they were indexed. (Actually,
|
|
there is a way to reconstruct a document from the information in the
|
|
index, but only the pure text is saved, possibly without punctuation and
|
|
capitalization, depending on RCL version).</p>
|
|
<p>RCL stores all internal data in Unicode UTF-8 format, and it can index
|
|
files of many types with different character sets, encodings, and
|
|
languages into the same index. It can process documents embedded inside
|
|
other documents (for example a pdf document stored inside a Zip archive
|
|
sent as an email attachment...), down to an arbitrary depth.</p>
|
|
<p>Stemming is the process by which RCL reduces words to their radicals so
|
|
that searching does not depend, for example, on a word being singular or
|
|
plural (floor, floors), or on a verb tense (flooring, floored). Because
|
|
the mechanisms used for stemming depend on the specific grammatical
|
|
rules for each language, there is a separate XAP stemmer module for most
|
|
common languages where stemming makes sense.</p>
|
|
<p>RCL stores the unstemmed versions of terms in the main index and uses
|
|
auxiliary databases for term expansion (one for each stemming language),
|
|
which means that you can switch stemming languages between searches, or
|
|
add a language without needing a full reindex.</p>
|
|
<p>Storing documents written in different languages in the same index is
|
|
possible, and commonly done. In this situation, you can specify several
|
|
stemming languages for the index.</p>
|
|
<p>RCL currently makes no attempt at automatic language recognition, which
|
|
means that the stemmer will sometimes be applied to terms from other
|
|
languages with potentially strange results. In practice, even if this
|
|
introduces possibilities of confusion, this approach has been proven
|
|
quite useful, and it is much less cumbersome than separating your
|
|
documents according to what language they are written in.</p>
|
|
<p>By default, RCL strips most accents and diacritics from terms, and
|
|
converts them to lower case before either storing them in the index or
|
|
searching for them. As a consequence, it is impossible to search for a
|
|
particular capitalization of a term (<code class="docutils literal"><span class="pre">US</span></code> / <code class="docutils literal"><span class="pre">us</span></code>), or to
|
|
discriminate two terms based on diacritics (<code class="docutils literal"><span class="pre">sake</span></code> / <code class="docutils literal"><span class="pre">saké</span></code>,
|
|
<code class="docutils literal"><span class="pre">mate</span></code> / <code class="docutils literal"><span class="pre">maté</span></code>).</p>
|
|
<p>RCL can optionally store the raw terms, without accent stripping or case
|
|
conversion. In this configuration, default searches will behave as
|
|
before, but it is possible to perform searches sensitive to case and
|
|
diacritics. This is described in more detail in the <a class="reference external" href="#RCL.INDEXING.CONFIG.SENS">section about index
|
|
case and diacritics sensitivity</a>.</p>
|
|
<p>RCL has many parameters which define exactly what to index, and how to
|
|
classify and decode the source documents. These are kept in
|
|
<a class="reference external" href="#RCL.INDEXING.CONFIG">configuration files</a>. A default configuration
|
|
is copied into a standard location (usually something like
|
|
<code class="docutils literal"><span class="pre">/usr/share/recoll/examples</span></code>) during installation. The default values
|
|
set by the configuration files in this directory may be overridden by
|
|
values set inside your personal configuration, found by default in the
|
|
<code class="docutils literal"><span class="pre">.recoll</span></code> sub-directory of your home directory. The default
|
|
configuration will index your home directory with default parameters and
|
|
should be sufficient for giving RCL a try, but you may want to adjust it
|
|
later, which can be done either by editing the text files or by using
|
|
configuration menus in the <code class="docutils literal"><span class="pre">recoll</span></code> GUI. Some other parameters
|
|
affecting only the <code class="docutils literal"><span class="pre">recoll</span></code> GUI are stored in the standard location
|
|
defined by Qt.</p>
|
|
<p>The <a class="reference external" href="#RCL.INDEXING.PERIODIC.EXEC">indexing process</a> is started
|
|
automatically (after asking permission), the first time you execute the
|
|
<code class="docutils literal"><span class="pre">recoll</span></code> GUI. Indexing can also be performed by executing the
|
|
<code class="docutils literal"><span class="pre">recollindex</span></code> command. RCL indexing is multithreaded by default when
|
|
appropriate hardware resources are available, and can perform in
|
|
parallel multiple tasks for text extraction, segmentation and index
|
|
updates.</p>
|
|
<p><a class="reference external" href="#RCL.SEARCH">Searches</a> are usually performed inside the <code class="docutils literal"><span class="pre">recoll</span></code>
|
|
GUI, which has many options to help you find what you are looking for.
|
|
However, there are other ways to perform RCL searches:</p>
|
|
<ul class="simple">
|
|
<li>A <a class="reference external" href="#RCL.SEARCH.COMMANDLINE">command line interface</a>.</li>
|
|
<li>A <a class="reference external" href="#RCL.PROGRAM.PYTHONAPI">Python programming interface</a>.</li>
|
|
<li>A <a class="reference external" href="#RCL.SEARCH.KIO">KDE KIO slave module</a>.</li>
|
|
<li>An Ubuntu Unity
|
|
<a class="reference external" href="https://www.lesbonscomptes.com/recoll/download.html">Scope</a>
|
|
module.</li>
|
|
<li>A Gnome Shell <a class="reference external" href="https://www.lesbonscomptes.com/recoll/download.html">Search
|
|
Provider</a>.</li>
|
|
<li>A <a class="reference external" href="https://github.com/koniu/recoll-webui">WEB interface</a>.</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="indexing">
|
|
<h2><a class="toc-backref" href="#id9">Indexing</a><a class="headerlink" href="#indexing" title="Permalink to this headline">¶</a></h2>
|
|
<div class="section" id="id1">
|
|
<h3><a class="toc-backref" href="#id10">Introduction</a><a class="headerlink" href="#id1" title="Permalink to this headline">¶</a></h3>
|
|
<p>Indexing is the process by which the set of documents is analyzed and
|
|
the data entered into the database. RCL indexing is normally
|
|
incremental: documents will only be processed if they have been modified
|
|
since the last run. On the first execution, all documents will need
|
|
processing. A full index build can be forced later by specifying an
|
|
option to the indexing command (<code class="docutils literal"><span class="pre">recollindex</span></code> <code class="docutils literal"><span class="pre">-z</span></code> or <code class="docutils literal"><span class="pre">-Z</span></code>).</p>
|
|
<p><code class="docutils literal"><span class="pre">recollindex</span></code> skips files which caused an error during a previous
|
|
pass. This is a performance optimization, and a new behaviour in version
|
|
1.21 (failed files were always retried by previous versions). The
|
|
command line option <code class="docutils literal"><span class="pre">-k</span></code> can be set to retry failed files, for example
|
|
after updating an input handler.</p>
|
|
<p>The following sections give an overview of different aspects of the
|
|
indexing processes and configuration, with links to detailed sections.</p>
|
|
<p>Depending on your data, temporary files may be needed during indexing,
|
|
some of them possibly quite big. You can use the RECOLL_TMPDIR or
|
|
TMPDIR environment variables to determine where they are created (the
|
|
default is to use <code class="docutils literal"><span class="pre">/tmp</span></code>). Using TMPDIR has the nice property that it
|
|
may also be taken into account by auxiliary commands executed by
|
|
<code class="docutils literal"><span class="pre">recollindex</span></code>.</p>
|
|
<div class="section" id="indexing-modes">
|
|
<h4>Indexing modes<a class="headerlink" href="#indexing-modes" title="Permalink to this headline">¶</a></h4>
|
|
<p>RCL indexing can be performed along two main modes:</p>
|
|
<ul>
|
|
<li><p class="first"><strong><a class="reference external" href="#RCL.INDEXING.PERIODIC">Periodic (or batch) indexing</a>:</strong></p>
|
|
<p><code class="docutils literal"><span class="pre">recollindex</span></code> is executed at discrete times. The typical usage is
|
|
to have a nightly run <a class="reference external" href="#RCL.INDEXING.PERIODIC.AUTOMAT">programmed</a>
|
|
into your <code class="docutils literal"><span class="pre">cron</span></code> file.</p>
|
|
</li>
|
|
<li><p class="first"><strong><a class="reference external" href="#RCL.INDEXING.MONITOR">Real time indexing</a>:</strong></p>
|
|
<p><code class="docutils literal"><span class="pre">recollindex</span></code> runs permanently as a daemon and uses a file system
|
|
alteration monitor (e.g. inotify) to detect file changes. New or
|
|
updated files are indexed at once.</p>
|
|
</li>
|
|
</ul>
|
|
<p>The choice between the two methods is mostly a matter of preference, and
|
|
they can be combined by setting up multiple indexes (e.g. use periodic
|
|
indexing on a big documentation directory, and real time indexing on a
|
|
small home directory). Monitoring a big file system tree can consume
|
|
significant system resources.</p>
|
|
<p>With RCL 1.24 and newer, it is also possible to set up an index so that
|
|
only a subset of the tree will be monitored and the rest will be covered
|
|
by batch/incremental indexing. (See the details in the <a class="reference external" href="#RCL.INDEXING.MONITOR">Real time
|
|
indexing</a> section.)</p>
|
|
<p>The choice of method and the parameters used can be configured from the
|
|
<code class="docutils literal"><span class="pre">recoll</span></code> GUI: Preferences &gt; Indexing schedule.</p>
|
|
<p>The GUI File menu also has entries to start or stop the current indexing
|
|
operation. Stopping indexing is performed by killing the <code class="docutils literal"><span class="pre">recollindex</span></code>
|
|
process, which will checkpoint its state and exit. A later restart of
|
|
indexing will mostly resume from where things stopped (the file tree
|
|
walk has to be restarted from the beginning).</p>
|
|
<p>When the real time indexer is running, two operations are available from
|
|
the menu: ‘Stop’ and ‘Trigger incremental pass’. When no indexing is
|
|
running, you have a choice of updating the index or rebuilding it (the
|
|
first choice only processes changed files, the second one zeroes the
|
|
index before starting so that all files are processed).</p>
|
|
</div>
|
|
<div class="section" id="configurations-multiple-indexes">
|
|
<h4>Configurations, multiple indexes<a class="headerlink" href="#configurations-multiple-indexes" title="Permalink to this headline">¶</a></h4>
|
|
<p>RCL supports defining multiple indexes, each defined by its own
|
|
<a class="reference external" href="#RCL.INDEXING.CONFIG">configuration directory</a>, in which several
|
|
configuration files describe what should be indexed and how.</p>
|
|
<p>A default personal configuration directory (<code class="docutils literal"><span class="pre">$HOME/.recoll/</span></code>) is
|
|
created when a RCL program is first executed. This configuration is the
|
|
one used for indexing and querying when no specific configuration is
|
|
specified.</p>
|
|
<p>All configuration parameters have defaults, defined in system-wide
|
|
files. Without further customisation, the default configuration will
|
|
process your complete home directory, with a reasonable set of defaults.
|
|
It can be changed to process a different area of the file system, select
|
|
files in different ways, and many other things.</p>
|
|
<p>In some cases, it may be useful to create additional configuration
|
|
directories, for example, to separate personal and shared indexes, or to
|
|
take advantage of the organization of your data to improve search
|
|
precision.</p>
|
|
<p>A plausible usage scenario for the multiple index feature would be for a
|
|
system administrator to set up a central index for shared data, that you
|
|
choose to search or not in addition to your personal data. Of course,
|
|
there are other possibilities. For example, there are many cases where
|
|
you know the subset of files that should be searched, and where
|
|
narrowing the search can improve the results. You can achieve
|
|
approximately the same effect with the directory filter in advanced
|
|
search, but multiple indexes may have better performance and may be
|
|
worth the trouble in some cases.</p>
|
|
<p>A more advanced use case would be to use multiple indexes to improve
|
|
indexing performance, by updating several indexes in parallel (using
|
|
multiple CPU cores and disks, or possibly several machines), and then
|
|
merging them, or querying them in parallel.</p>
|
|
<p>A specific configuration can be selected by setting the RECOLL_CONFDIR
|
|
environment variable, or giving the <code class="docutils literal"><span class="pre">-c</span></code> option to any of the RCL
|
|
commands.</p>
|
|
<p>When creating or updating indexes, the different configurations are
|
|
entirely independent (no parameters are ever shared between
|
|
configurations when indexing). The <code class="docutils literal"><span class="pre">recollindex</span></code> program always works
|
|
on a single index.</p>
|
|
<p>When querying, multiple indexes can be accessed concurrently, either
|
|
from the GUI or the command line. When doing this, there is always one
|
|
main configuration, from which both configuration and index data are
|
|
used. Only the index data from the additional indexes is used (their
|
|
configuration parameters are ignored).</p>
|
|
<p>The behaviour of index update and query regarding multiple
|
|
configurations is important and sometimes confusing, so it will be
|
|
rephrased here: for index generation, multiple configurations are
|
|
totally independent from each other. When querying, configuration and
|
|
data are used from the main index (the one designated by <code class="docutils literal"><span class="pre">-c</span></code> or
|
|
RECOLL_CONFDIR), and only the data from the additional indexes is used.
|
|
This implies that some parameters should be consistent among the
|
|
configurations for indexes which are to be used together.</p>
|
|
<p>See the section about <a class="reference external" href="#RCL.INDEXING.CONFIG.MULTIPLE">configuring multiple
|
|
indexes</a> for more detail.</p>
|
|
</div>
|
|
<div class="section" id="document-types">
|
|
<h4>Document types<a class="headerlink" href="#document-types" title="Permalink to this headline">¶</a></h4>
|
|
<p>RCL knows about quite a few different document types. The parameters for
|
|
document types recognition and processing are set in <a class="reference external" href="#RCL.INDEXING.CONFIG">configuration
|
|
files</a>.</p>
|
|
<p>Most file types, like HTML or word processing files, only hold one
|
|
document. Some file types, like email folders or zip archives, can hold
|
|
many individually indexed documents, which may themselves be compound
|
|
ones. Such hierarchies can go quite deep, and RCL can process, for
|
|
example, a LibreOffice document stored as an attachment to an email
|
|
message inside an email folder archived in a zip file...</p>
|
|
<p><code class="docutils literal"><span class="pre">recollindex</span></code> processes plain text, HTML, OpenDocument
|
|
(Open/LibreOffice), email formats, and a few others internally.</p>
|
|
<p>Other file types (e.g. postscript, pdf, ms-word, rtf ...) need external
|
|
applications for preprocessing. The list is in the
|
|
<a class="reference external" href="#RCL.INSTALL.EXTERNAL">installation</a> section. After every indexing
|
|
operation, RCL updates a list of commands that would be needed for
|
|
indexing existing file types. This list can be displayed by selecting
|
|
the menu option File > Show Missing Helpers in the <code class="docutils literal"><span class="pre">recoll</span></code> GUI. It is
|
|
stored in the <code class="docutils literal"><span class="pre">missing</span></code> text file inside the configuration directory.</p>
|
|
<p>By default, RCL will try to index any file type that it has a way to
|
|
read. This is sometimes not desirable, and there are ways to either
|
|
exclude some types, or on the contrary define a positive list of types
|
|
to be indexed. In the latter case, any type not in the list will be
|
|
ignored.</p>
|
|
<p>Excluding files by name can be done by adding wildcard name patterns to
|
|
the <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.SKIPPEDNAMES">skippedNames</a> list,
|
|
which can be done from the GUI Index configuration menu. Excluding by
|
|
type can be done by setting the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.EXCLUDEDMIMETYPES">excludedmimetypes</a>
|
|
list in the configuration file (1.20 and later). This can be redefined
|
|
for subdirectories.</p>
|
|
<p>You can also define an exclusive list of MIME types to be indexed (no
|
|
others will be indexed), by setting the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.INDEXEDMIMETYPES">indexedmimetypes</a>
|
|
configuration variable. Example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">indexedmimetypes</span> <span class="o">=</span> <span class="n">text</span><span class="o">/</span><span class="n">html</span> <span class="n">application</span><span class="o">/</span><span class="n">pdf</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>It is possible to redefine this parameter for subdirectories. Example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">my</span><span class="o">/</span><span class="nb">dir</span><span class="p">]</span>
|
|
<span class="n">indexedmimetypes</span> <span class="o">=</span> <span class="n">application</span><span class="o">/</span><span class="n">pdf</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>(When using sections like this, don’t forget that they remain in effect
|
|
until the end of the file or another section indicator).</p>
|
|
<p><code class="docutils literal"><span class="pre">excludedmimetypes</span></code> or <code class="docutils literal"><span class="pre">indexedmimetypes</span></code> can be set either by
|
|
editing the <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF">configuration file
|
|
(<code class="docutils literal"><span class="pre">recoll.conf</span></code>)</a> for the index, or
|
|
by using the GUI index configuration tool.</p>
|
|
<blockquote>
|
|
<div><p><strong>Note</strong></p>
|
|
<p>When editing the <code class="docutils literal"><span class="pre">indexedmimetypes</span></code> or <code class="docutils literal"><span class="pre">excludedmimetypes</span></code>
|
|
lists, you should use the MIME values listed in the <code class="docutils literal"><span class="pre">mimemap</span></code> file
|
|
or in Recoll result lists in preference to <code class="docutils literal"><span class="pre">file</span> <span class="pre">-i</span></code> output: there
|
|
are a number of differences. The <code class="docutils literal"><span class="pre">file</span> <span class="pre">-i</span></code> output should only be
|
|
used for files without extensions, or for which the extension is not
|
|
listed in <code class="docutils literal"><span class="pre">mimemap</span></code>.</p>
|
|
</div></blockquote>
|
|
</div>
|
|
<div class="section" id="indexing-failures">
|
|
<h4>Indexing failures<a class="headerlink" href="#indexing-failures" title="Permalink to this headline">¶</a></h4>
|
|
<p>Indexing may fail for some documents, for a number of reasons: a helper
|
|
program may be missing, the document may be corrupt, we may fail to
|
|
uncompress a file because no file system space is available, etc.</p>
|
|
<p>RCL versions prior to 1.21 always retried to index files which had
|
|
previously caused an error. This guaranteed that anything that may have
|
|
become indexable (for example because a helper had been installed) would
|
|
be indexed. However this was bad for performance because some indexing
|
|
failures may be quite costly (for example failing to uncompress a big
|
|
file because of insufficient disk space).</p>
|
|
<p>The indexer in RCL versions 1.21 and later does not retry failed files
|
|
by default. Retrying will only occur if an explicit option (<code class="docutils literal"><span class="pre">-k</span></code>) is
|
|
set on the <code class="docutils literal"><span class="pre">recollindex</span></code> command line, or if a script executed when
|
|
<code class="docutils literal"><span class="pre">recollindex</span></code> starts up says so. The script is defined by a
|
|
configuration variable (<code class="docutils literal"><span class="pre">checkneedretryindexscript</span></code>), and makes a
|
|
rather lame attempt at deciding if a helper command may have been
|
|
installed, by checking if any of the common <code class="docutils literal"><span class="pre">bin</span></code> directories have
|
|
changed.</p>
|
|
</div>
|
|
<div class="section" id="recovery">
|
|
<h4>Recovery<a class="headerlink" href="#recovery" title="Permalink to this headline">¶</a></h4>
|
|
<p>In the rare case where the index becomes corrupted (which can signal
|
|
itself by weird search results or crashes), the index files need to be
|
|
erased before restarting a clean indexing pass. Just delete the
|
|
<code class="docutils literal"><span class="pre">xapiandb</span></code> directory (see <a class="reference external" href="#RCL.INDEXING.STORAGE">next section</a>),
|
|
or, alternatively, start the next <code class="docutils literal"><span class="pre">recollindex</span></code> with the <code class="docutils literal"><span class="pre">-z</span></code>
|
|
option, which will reset the database before indexing. The difference
|
|
between the two methods is that the second will not change the current
|
|
index format, which may be undesirable if a newer format is supported by
|
|
the XAP version.</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="index-storage">
|
|
<h3><a class="toc-backref" href="#id11">Index storage</a><a class="headerlink" href="#index-storage" title="Permalink to this headline">¶</a></h3>
|
|
<p>The default location for the index data is the <code class="docutils literal"><span class="pre">xapiandb</span></code> subdirectory
|
|
of the RCL configuration directory, typically
|
|
<code class="docutils literal"><span class="pre">$HOME/.recoll/xapiandb/</span></code>. This can be changed via two different
|
|
methods (with different purposes):</p>
|
|
<ol class="arabic">
|
|
<li><p class="first">For a given configuration directory, you can specify a non-default
|
|
storage location for the index by setting the <code class="docutils literal"><span class="pre">dbdir</span></code> parameter in
|
|
the configuration file (see the <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF">configuration
|
|
section</a>). This method would
|
|
mainly be of use if you wanted to keep the configuration directory in
|
|
its default location, but desired another location for the index,
|
|
typically out of disk occupation or performance concerns.</p>
|
|
</li>
|
|
<li><p class="first">You can specify a different configuration directory by setting the
|
|
RECOLL_CONFDIR environment variable, or using the <code class="docutils literal"><span class="pre">-c</span></code> option to
|
|
the RCL commands. This method would typically be used to index
|
|
different areas of the file system to different indexes. For example,
|
|
if you were to issue the following command:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">recoll</span> <span class="o">-</span><span class="n">c</span> <span class="o">~/.</span><span class="n">indexes</span><span class="o">-</span><span class="n">email</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Then RCL would use configuration files stored in
|
|
<code class="docutils literal"><span class="pre">~/.indexes-email/</span></code> and, (unless specified otherwise in
|
|
<code class="docutils literal"><span class="pre">recoll.conf</span></code>) would look for the index in
|
|
<code class="docutils literal"><span class="pre">~/.indexes-email/xapiandb/</span></code>.</p>
|
|
<p>Using multiple configuration directories and <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF">configuration
|
|
options</a> allows you to tailor
|
|
multiple configurations and indexes to handle whatever subset of the
|
|
available data you wish to make searchable.</p>
|
|
</li>
|
|
</ol>
|
|
<p>The size of the index is determined by the size of the set of documents,
|
|
but the ratio can vary a lot. For a typical mixed set of documents, the
|
|
index size will often be close to the data set size. In specific cases
|
|
(a set of compressed mbox files for example), the index can become much
|
|
bigger than the documents. It may also be much smaller if the documents
|
|
contain a lot of images or other non-indexed data (an extreme example
|
|
being a set of mp3 files where only the tags would be indexed).</p>
|
|
<p>Of course, images, sound and video do not increase the index size, which
|
|
means that in most cases, the space used by the index will be negligible
|
|
against the total amount of data on the computer.</p>
|
|
<p>The index data directory (<code class="docutils literal"><span class="pre">xapiandb</span></code>) only contains data that can be
|
|
completely rebuilt by an index run (as long as the original documents
|
|
exist), and it can always be destroyed safely.</p>
|
|
<div class="section" id="xap-index-formats">
|
|
<h4>XAP index formats<a class="headerlink" href="#xap-index-formats" title="Permalink to this headline">¶</a></h4>
|
|
<p>XAP versions usually support several formats for index storage. A given
|
|
major XAP version will have a current format, used to create new
|
|
indexes, and will also support the format from the previous major
|
|
version.</p>
|
|
<p>XAP will not convert automatically an existing index from the older
|
|
format to the newer one. If you want to upgrade to the new format, or if
|
|
a very old index needs to be converted because its format is not
|
|
supported any more, you will have to explicitly delete the old index
|
|
(typically <code class="docutils literal"><span class="pre">~/.recoll/xapiandb</span></code>), then run a normal indexing command.
|
|
Using <code class="docutils literal"><span class="pre">recollindex</span></code> option <code class="docutils literal"><span class="pre">-z</span></code> would not work in this situation.</p>
|
|
</div>
|
|
<div class="section" id="security-aspects">
|
|
<h4>Security aspects<a class="headerlink" href="#security-aspects" title="Permalink to this headline">¶</a></h4>
|
|
<p>The RCL index does not hold complete copies of the indexed documents (it
|
|
almost does after version 1.24). But it does hold enough data to allow
|
|
for an almost complete reconstruction. If confidential data is indexed,
|
|
access to the database directory should be restricted.</p>
|
|
<p>RCL will create the configuration directory with a mode of 0700 (access
|
|
by owner only). As the index data directory is by default a
|
|
sub-directory of the configuration directory, this should result in
|
|
appropriate protection.</p>
|
|
<p>If you use another setup, you should think of the kind of protection you
|
|
need for your index, set the directory and files access modes
|
|
appropriately, and also maybe adjust the <code class="docutils literal"><span class="pre">umask</span></code> used during index
|
|
updates.</p>
|
|
</div>
|
|
<div class="section" id="special-considerations-for-big-indexes">
|
|
<h4>Special considerations for big indexes<a class="headerlink" href="#special-considerations-for-big-indexes" title="Permalink to this headline">¶</a></h4>
|
|
<p>This only needs concern you if your index is going to be bigger than
|
|
around 5 GBytes. Beyond 10 GBytes, it becomes a serious issue. Most
|
|
people have much smaller indexes. For reference, 5 GBytes would be
|
|
around 2000 bibles, a lot of text. If you have a huge text dataset
|
|
(remember: images don’t count, the text content of PDFs is typically
|
|
less than 5% of the file size), read on.</p>
|
|
<p>The amount of writing performed by Xapian during index creation is not
|
|
linear with the index size (it is somewhere between linear and
|
|
quadratic). For big indexes this becomes a performance issue, and may
|
|
even be an SSD disk wear issue.</p>
|
|
<p>The problem can be mitigated by observing the following rules:</p>
|
|
<ul class="simple">
|
|
<li>Partition the data set and create several indexes of reasonable size
|
|
rather than a huge one. These indexes can then be queried in parallel
|
|
(using the RCL external indexes facility), or merged using
|
|
<code class="docutils literal"><span class="pre">xapian-compact</span></code>.</li>
|
|
<li>Have a lot of RAM available and set the <code class="docutils literal"><span class="pre">idxflushmb</span></code> RCL
|
|
configuration parameter as high as you can without swapping
|
|
(experimentation will be needed). 200 would be a minimum in this
|
|
context.</li>
|
|
<li>Use Xapian 1.4.10 or newer, as this version brought a significant
|
|
improvement in the amount of writes.</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="index-configuration">
|
|
<h3><a class="toc-backref" href="#id12">Index configuration</a><a class="headerlink" href="#index-configuration" title="Permalink to this headline">¶</a></h3>
|
|
<p>Variables set inside the <a class="reference external" href="#RCL.INSTALL.CONFIG">RCL configuration
|
|
files</a> control which areas of the file system
|
|
are indexed, and how files are processed. These variables can be set
|
|
either by editing the text files or by using the <a class="reference external" href="#RCL.INDEXING.CONFIG.GUI">dialogs in the
|
|
<code class="docutils literal"><span class="pre">recoll</span></code> GUI</a>.</p>
|
|
<p>The first time you start <code class="docutils literal"><span class="pre">recoll</span></code>, you will be asked whether or not
|
|
you would like it to build the index. If you want to adjust the
|
|
configuration before indexing, just click Cancel at this point, which
|
|
will get you into the configuration interface. If you exit at this
|
|
point, <code class="docutils literal"><span class="pre">recoll</span></code> will have created a <code class="docutils literal"><span class="pre">~/.recoll</span></code> directory containing
|
|
empty configuration files, which you can edit by hand.</p>
|
|
<p>The configuration is documented inside the <a class="reference external" href="#RCL.INSTALL.CONFIG">installation
|
|
chapter</a> of this document, or in the recoll.conf(5)
|
|
man page, but the most current information will most likely be the
|
|
comments inside the sample file. The most immediately useful variable is
|
|
probably <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.TOPDIRS"><code class="docutils literal"><span class="pre">topdirs</span></code></a>, which
|
|
determines what subtrees and files get indexed.</p>
|
|
<p>The applications needed to index file types other than text, HTML or
|
|
email (e.g. pdf, postscript, ms-word...) are described in the <a class="reference external" href="#RCL.INSTALL.EXTERNAL">external
|
|
packages section.</a></p>
|
|
<p>As of Recoll 1.18 there are two incompatible types of Recoll indexes,
|
|
depending on the treatment of character case and diacritics. A <a class="reference external" href="#RCL.INDEXING.CONFIG.SENS">further
|
|
section</a> describes the two types in more
|
|
detail.</p>
|
|
<div class="section" id="multiple-indexes">
|
|
<h4>Multiple indexes<a class="headerlink" href="#multiple-indexes" title="Permalink to this headline">¶</a></h4>
|
|
<p>Multiple RCL indexes can be created by using several configuration
|
|
directories which are typically set to index different areas of the file
|
|
system. A specific index can be selected for updating or searching,
|
|
using the RECOLL_CONFDIR environment variable or the <code class="docutils literal"><span class="pre">-c</span></code> option to
|
|
<code class="docutils literal"><span class="pre">recoll</span></code> and <code class="docutils literal"><span class="pre">recollindex</span></code>.</p>
|
|
<p>Index configuration parameters can be set either by using a text editor
|
|
on the files, or, for most parameters, by using the <code class="docutils literal"><span class="pre">recoll</span></code> index
|
|
configuration GUI. In the latter case, the configuration directory for
|
|
which parameters are modified is the one which was selected by
|
|
RECOLL_CONFDIR or the <code class="docutils literal"><span class="pre">-c</span></code> parameter, and there is no way to switch
|
|
configurations within the GUI.</p>
|
|
<p>As a reminder from a previous section, a <code class="docutils literal"><span class="pre">recollindex</span></code> program
|
|
instance can only update one specific index, and it will only use
|
|
parameters from a single configuration (no parameters are ever shared
|
|
between configurations when indexing). All the query methods
|
|
(<code class="docutils literal"><span class="pre">recoll</span></code>, <code class="docutils literal"><span class="pre">recollq</span></code>, the Python API, etc.) operate with a main
|
|
configuration, from which both configuration and index data are used,
|
|
but can also query data from multiple additional indexes. Only the index
|
|
data from the latter is used, their configuration parameters are
|
|
ignored.</p>
|
|
<p>When searching, the current main index (defined by RECOLL_CONFDIR or
|
|
<code class="docutils literal"><span class="pre">-c</span></code>) is always active. If this is undesirable, you can set up your
|
|
base configuration to index an empty directory.</p>
|
|
<p>If a set of multiple indexes are to be used together for searches, some
|
|
configuration parameters must be consistent among the set. These are
|
|
parameters which need to be the same when indexing and searching. As the
|
|
parameters come from the main configuration when searching, they need to
|
|
be compatible with what was set when creating the other indexes (which
|
|
came from their respective configuration directories).</p>
|
|
<p>Most importantly, all indexes to be queried concurrently must have the
|
|
same option concerning character case and diacritics stripping, but
|
|
there are other constraints. Most of the relevant parameters are
|
|
described in the <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.TERMS">linked
|
|
section</a>.</p>
|
|
<p>The different search interfaces (GUI, command line, ...) have different
|
|
methods to define the set of indexes to be used, see the appropriate
|
|
section.</p>
|
|
<p>At the moment, using multiple configurations implies a small level of
|
|
command line usage. Additional configuration directories (beyond
|
|
<code class="docutils literal"><span class="pre">~/.recoll</span></code>) must be created by hand (<code class="docutils literal"><span class="pre">mkdir</span></code> or such), the GUI will
|
|
not do it. This is to avoid mistakenly creating additional directories
|
|
when an argument is mistyped. Also, the GUI or the indexer must be
|
|
launched with a specific option or environment to work on the right
|
|
configuration.</p>
|
|
<p>To be more practical, here follow a few examples of the commands needed
|
|
to create, configure, update, and query an additional index.</p>
|
|
<p>Initially creating the configuration and index:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">mkdir</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">my</span><span class="o">/</span><span class="n">new</span><span class="o">/</span><span class="n">config</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Configuring the new index can be done from the <code class="docutils literal"><span class="pre">recoll</span></code> GUI, launched
|
|
from the command line to pass the <code class="docutils literal"><span class="pre">-c</span></code> option (you could create a
|
|
desktop file to do it for you), and then using the GUI index
|
|
configuration tool to set up the index.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">recoll</span> <span class="o">-</span><span class="n">c</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">my</span><span class="o">/</span><span class="n">new</span><span class="o">/</span><span class="n">config</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Alternatively, you can just start a text editor on the main
|
|
configuration file <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF"><code class="docutils literal"><span class="pre">recoll.conf</span></code></a>.</p>
|
|
<p>Creating and updating the index can be done from the command line:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">recollindex</span> <span class="o">-</span><span class="n">c</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">my</span><span class="o">/</span><span class="n">new</span><span class="o">/</span><span class="n">config</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>or from the File menu of a GUI launched with the same option
|
|
(<code class="docutils literal"><span class="pre">recoll</span></code>, see above).</p>
|
|
<p>The same GUI would also let you set up batch indexing for the new index.
|
|
Real time indexing can only be set up from the GUI for the default index
|
|
(the menu entry will be inactive if the GUI was started with a
|
|
non-default <code class="docutils literal"><span class="pre">-c</span></code> option).</p>
|
|
<p>The new index can be queried alone with</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">recoll</span> <span class="o">-</span><span class="n">c</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">my</span><span class="o">/</span><span class="n">new</span><span class="o">/</span><span class="n">config</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Or, in parallel with the default index, by starting <code class="docutils literal"><span class="pre">recoll</span></code> without a
|
|
<code class="docutils literal"><span class="pre">-c</span></code> option, and using the Preferences > External Index Dialog menu.</p>
|
|
</div>
|
|
<div class="section" id="index-case-and-diacritics-sensitivity">
|
|
<h4>Index case and diacritics sensitivity<a class="headerlink" href="#index-case-and-diacritics-sensitivity" title="Permalink to this headline">¶</a></h4>
|
|
<p>As of RCL version 1.18 you have a choice of building an index with terms
|
|
stripped of character case and diacritics, or one with raw terms. For a
|
|
source term of <code class="docutils literal"><span class="pre">Résumé</span></code>, the former will store <code class="docutils literal"><span class="pre">resume</span></code>, the latter
|
|
<code class="docutils literal"><span class="pre">Résumé</span></code>.</p>
|
|
<p>Each type of index allows performing searches insensitive to case and
|
|
diacritics: with a raw index, the user entry will be expanded to match
|
|
all case and diacritics variations present in the index. With a stripped
|
|
index, the search term will be stripped before searching.</p>
|
|
<p>A raw index allows for another possibility which a stripped index cannot
|
|
offer: using case and diacritics to discriminate between terms,
|
|
returning different results when searching for <code class="docutils literal"><span class="pre">US</span></code> and <code class="docutils literal"><span class="pre">us</span></code> or
|
|
<code class="docutils literal"><span class="pre">resume</span></code> and <code class="docutils literal"><span class="pre">résumé</span></code>. Read the <a class="reference external" href="#RCL.SEARCH.CASEDIAC">section about search case and
|
|
diacritics sensitivity</a> for more details.</p>
|
|
<p>The type of index to be created is controlled by the <code class="docutils literal"><span class="pre">indexStripChars</span></code>
|
|
configuration variable which can only be changed by editing the
|
|
configuration file. Any change implies an index reset (not automated by
|
|
RCL), and all indexes in a search must be set in the same way (again,
|
|
not checked by RCL).</p>
|
|
<p>If the <code class="docutils literal"><span class="pre">indexStripChars</span></code> is not set, RCL 1.18 creates a stripped index
|
|
by default, for compatibility with previous versions.</p>
|
|
<p>As a cost for added capability, a raw index will be slightly bigger than
|
|
a stripped one (around 10%). Also, searches will be more complex, so
|
|
probably slightly slower, and the feature is still young, so that a
|
|
certain amount of weirdness cannot be excluded.</p>
|
|
<p>One of the most adverse consequences of using a raw index is that some
|
|
phrase and proximity searches may become impossible: because each term
|
|
needs to be expanded, and all combinations searched for, the
|
|
multiplicative expansion may become unmanageable.</p>
|
|
</div>
|
|
<div class="section" id="indexing-threads-configuration">
|
|
<h4>Indexing threads configuration<a class="headerlink" href="#indexing-threads-configuration" title="Permalink to this headline">¶</a></h4>
|
|
<p>The RCL indexing process <code class="docutils literal"><span class="pre">recollindex</span></code> can use multiple threads to
|
|
speed up indexing on multiprocessor systems. The work done to index
|
|
files is divided in several stages and some of the stages can be
|
|
executed by multiple threads. The stages are:</p>
|
|
<ol class="arabic simple">
|
|
<li>File system walking: this is always performed by the main thread.</li>
|
|
<li>File conversion and data extraction.</li>
|
|
<li>Text processing (splitting, stemming, etc.).</li>
|
|
<li>XAP index update.</li>
|
|
</ol>
|
|
<p>You can also read a <a class="reference external" href="http://www.recoll.org/idxthreads/threadingRecoll.html">longer
|
|
document</a>
|
|
about the transformation of RCL indexing to multithreading.</p>
|
|
<p>The threads configuration is controlled by two configuration file
|
|
parameters.</p>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">thrQSizes</span></code></dt>
|
|
<dd>This variable defines the job input queues configuration. There are
|
|
three possible queues for stages 2, 3 and 4, and this parameter
|
|
should give the queue depth for each stage (three integer values).
|
|
If a value of -1 is used for a given stage, no queue is used, and
|
|
the thread will go on performing the next stage. In practice, deep
|
|
queues have not been shown to increase performance. A value of 0 for
|
|
the first queue tells RCL to perform autoconfiguration (no need for
|
|
anything else in this case, thrTCounts is not used) - this is the
|
|
default configuration.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">thrTCounts</span></code></dt>
|
|
<dd><p class="first">This defines the number of threads used for each stage. If a value
|
|
of -1 is used for one of the queue depths, the corresponding thread
|
|
count is ignored. It makes no sense to use a value other than 1 for
|
|
the last stage because updating the XAP index is necessarily
|
|
single-threaded (and protected by a mutex).</p>
|
|
<p><strong>Note</strong></p>
|
|
<p class="last">If the first value in <code class="docutils literal"><span class="pre">thrQSizes</span></code> is 0, <code class="docutils literal"><span class="pre">thrTCounts</span></code> is ignored.</p>
|
|
</dd>
|
|
</dl>
|
|
<p>The following example would use three queues (of depth 2), and 4 threads
|
|
for converting source documents, 2 for processing their text, and one to
|
|
update the index. This was tested to be the best configuration on the
|
|
test system (quadri-processor with multiple disks).</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">thrQSizes</span> <span class="o">=</span> <span class="mi">2</span> <span class="mi">2</span> <span class="mi">2</span>
|
|
<span class="n">thrTCounts</span> <span class="o">=</span> <span class="mi">4</span> <span class="mi">2</span> <span class="mi">1</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>The following example would use a single queue, and the complete
|
|
processing for each document would be performed by a single thread
|
|
(several documents will still be processed in parallel in most cases).
|
|
The threads will use mutual exclusion when entering the index update
|
|
stage. In practice the performance would be close to the preceding case
|
|
in general, but worse in certain cases (e.g. a Zip archive would be
|
|
processed purely sequentially), so the previous approach is preferred.
|
|
YMMV... The 2 last values for thrTCounts are ignored.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">thrQSizes</span> <span class="o">=</span> <span class="mi">2</span> <span class="o">-</span><span class="mi">1</span> <span class="o">-</span><span class="mi">1</span>
|
|
<span class="n">thrTCounts</span> <span class="o">=</span> <span class="mi">6</span> <span class="mi">1</span> <span class="mi">1</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>The following example would disable multithreading. Indexing will be
|
|
performed by a single thread.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">thrQSizes</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span> <span class="o">-</span><span class="mi">1</span> <span class="o">-</span><span class="mi">1</span>
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="the-index-configuration-gui">
|
|
<h4>The index configuration GUI<a class="headerlink" href="#the-index-configuration-gui" title="Permalink to this headline">¶</a></h4>
|
|
<p>Most parameters for a given index configuration can be set from a
|
|
<code class="docutils literal"><span class="pre">recoll</span></code> GUI running on this configuration (either as default, or by
|
|
setting RECOLL_CONFDIR or the <code class="docutils literal"><span class="pre">-c</span></code> option.)</p>
|
|
<p>The interface is started from the Preferences > Index Configuration menu
|
|
entry. It is divided in four tabs, Global parameters, Local parameters,
|
|
Web history (which is explained in the next section) and Search
|
|
parameters.</p>
|
|
<p>The Global parameters tab allows setting global variables, like the
|
|
lists of top directories, skipped paths, or stemming languages.</p>
|
|
<p>The Local parameters tab allows setting variables that can be redefined
|
|
for subdirectories. This second tab has an initially empty list of
|
|
customisation directories, to which you can add. The variables are then
|
|
set for the currently selected directory (or at the top level if the
|
|
empty line is selected).</p>
|
|
<p>The Search parameters section defines parameters which are used at query
|
|
time, but are global to an index and affect all search tools, not only
|
|
the GUI.</p>
|
|
<p>The meaning for most entries in the interface is self-evident and
|
|
documented by a <code class="docutils literal"><span class="pre">ToolTip</span></code> popup on the text label. For more detail,
|
|
you will need to refer to the <a class="reference external" href="#RCL.INSTALL.CONFIG">configuration
|
|
section</a> of this guide.</p>
|
|
<p>The configuration tool normally respects the comments and most of the
|
|
formatting inside the configuration file, so that it is quite possible
|
|
to use it on hand-edited files, which you might nevertheless want to
|
|
backup first...</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="indexing-the-web-pages-which-you-wisit">
|
|
<h3><a class="toc-backref" href="#id13">Indexing the WEB pages which you visit.</a><a class="headerlink" href="#indexing-the-web-pages-which-you-wisit" title="Permalink to this headline">¶</a></h3>
|
|
<p>With the help of a Firefox extension, RCL can index the Internet pages
|
|
that you visit. The extension has a long history: it was initially
|
|
designed for the Beagle indexer, then adapted to RCL and the Firefox XUL
|
|
API. A new version of the addon has been written to work with the
|
|
WebExtensions API, which is the only one supported after Firefox version
|
|
57.</p>
|
|
<p>The extension works by copying visited WEB pages to an indexing queue
|
|
directory, which RCL then processes, indexing the data, storing it into
|
|
a local cache, then removing the file from the queue.</p>
|
|
<p>Because the WebExtensions API introduces more constraints to what
|
|
extensions can do, the new version works with one more step: the files
|
|
are first created in the browser default downloads location (typically
|
|
<code class="docutils literal"><span class="pre">$HOME/Downloads</span></code> ), then moved by a script in the old queue location.
|
|
The script is automatically executed by the RCL indexer versions 1.23.5
|
|
and newer. It could conceivably be executed independently to make the
|
|
new browser extension compatible with an older RCL version (the script
|
|
is named <code class="docutils literal"><span class="pre">recoll-we-move-files.py</span></code>).</p>
|
|
<blockquote>
|
|
<div><p><strong>Note</strong></p>
|
|
<p>For the WebExtensions-based version to work, it is necessary to set
|
|
the <code class="docutils literal"><span class="pre">webdownloadsdir</span></code> value in the configuration if it was changed
|
|
from the default <code class="docutils literal"><span class="pre">$HOME/Downloads</span></code> in the browser preferences.</p>
|
|
</div></blockquote>
|
|
<p>The visited WEB pages indexing feature can be enabled on the RCL side
|
|
from the GUI Index configuration panel, or by editing the configuration
|
|
file (set <code class="docutils literal"><span class="pre">processwebqueue</span></code> to 1).</p>
|
|
<p>A current pointer to the extension can be found, along with up-to-date
|
|
instructions, on the <a class="reference external" href="https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory">Recoll wiki</a>.</p>
|
|
<p>A copy of the indexed WEB pages is retained by Recoll in a local cache
|
|
(from which previews can be fetched). The cache size can be adjusted
|
|
from the Index configuration / Web history panel. Once the maximum size
|
|
is reached, old pages are purged - both from the cache and the index -
|
|
to make room for new ones, so you need to explicitly archive in some
|
|
other place the pages that you want to keep indefinitely.</p>
|
|
</div>
|
|
<div class="section" id="extended-attributes-data">
|
|
<h3><a class="toc-backref" href="#id14">Extended attributes data</a><a class="headerlink" href="#extended-attributes-data" title="Permalink to this headline">¶</a></h3>
|
|
<p>User extended attributes are named pieces of information that most
|
|
modern file systems can attach to any file.</p>
|
|
<p>RCL versions 1.19 and later process extended attributes as document
|
|
fields by default. For older versions, this has to be activated at build
|
|
time.</p>
|
|
<p>A <a class="reference external" href="http://www.freedesktop.org/wiki/CommonExtendedAttributes">freedesktop
|
|
standard</a>
|
|
defines a few special attributes, which are handled as such by RCL:</p>
|
|
<dl class="docutils">
|
|
<dt>mime_type</dt>
|
|
<dd>If set, this overrides any other determination of the file MIME
|
|
type.</dd>
|
|
<dt>charset</dt>
|
|
<dd>If set, this defines the file character set (mostly useful for plain
|
|
text files).</dd>
|
|
</dl>
|
|
<p>By default, other attributes are handled as RCL fields. On Linux, the
|
|
<code class="docutils literal"><span class="pre">user</span></code> prefix is removed from the name. This can be configured more
|
|
precisely inside the <a class="reference external" href="#RCL.INSTALL.CONFIG.FIELDS"><code class="docutils literal"><span class="pre">fields</span></code> configuration
|
|
file</a>.</p>
|
|
</div>
|
|
<div class="section" id="importing-external-tags">
|
|
<h3><a class="toc-backref" href="#id15">Importing external tags</a><a class="headerlink" href="#importing-external-tags" title="Permalink to this headline">¶</a></h3>
|
|
<p>During indexing, it is possible to import metadata for each file by
|
|
executing commands. For example, this could extract user tag data for
|
|
the file and store it in a field for indexing.</p>
|
|
<p>See the <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.METADATACMDS">section about the <code class="docutils literal"><span class="pre">metadatacmds</span></code>
|
|
field</a> in the main
|
|
configuration chapter for a description of the configuration syntax.</p>
|
|
<p>As an example, if you would want RCL to use tags managed by tmsu, you
|
|
would add the following to the configuration file:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span>[/some/area/of/the/fs]
|
|
metadatacmds = ; tags = tmsu tags %f
|
|
</pre></div>
|
|
</div>
|
|
<blockquote>
|
|
<div><p><strong>Note</strong></p>
|
|
<p>Depending on the tmsu version, you may need/want to add options like
|
|
<code class="docutils literal"><span class="pre">--database=/some/db</span></code>.</p>
|
|
</div></blockquote>
|
|
<p>You may want to restrict this processing to a subset of the directory
|
|
tree, because it may slow down indexing a bit
|
|
(<code class="docutils literal"><span class="pre">[/some/area/of/the/fs]</span></code>).</p>
|
|
<p>Note the initial semi-colon after the equal sign.</p>
|
|
<p>In the example above, the output of <code class="docutils literal"><span class="pre">tmsu</span></code> is used to set a field
|
|
named <code class="docutils literal"><span class="pre">tags</span></code>. The field name is arbitrary and could be <code class="docutils literal"><span class="pre">tmsu</span></code> or
|
|
<code class="docutils literal"><span class="pre">myfield</span></code> just the same, but <code class="docutils literal"><span class="pre">tags</span></code> is an alias for the standard RCL
|
|
<code class="docutils literal"><span class="pre">keywords</span></code> field, and the <code class="docutils literal"><span class="pre">tmsu</span></code> output will just augment its
|
|
contents. This will avoid the need to extend the <a class="reference external" href="#RCL.PROGRAM.FIELDS">field
|
|
configuration</a>.</p>
|
|
<p>Once re-indexing is performed (you’ll need to force the file reindexing,
|
|
RCL will not detect the need by itself), you will be able to search from
|
|
the query language, through any of its aliases:
|
|
<code class="docutils literal"><span class="pre">tags:some/alternate/values</span></code> or <code class="docutils literal"><span class="pre">tags:all,these,values</span></code> (the compact
|
|
field search syntax is supported for recoll 1.20 and later. For older
|
|
versions, you would need to repeat the <code class="docutils literal"><span class="pre">tags:</span></code> specifier for each
|
|
term, e.g. <code class="docutils literal"><span class="pre">tags:some</span> <span class="pre">OR</span> <span class="pre">tags:alternate</span></code>).</p>
|
|
<p>You should be aware that tag changes will not be detected by the
|
|
indexer if the file itself did not change. One possible workaround would
|
|
be to update the file <code class="docutils literal"><span class="pre">ctime</span></code> when you modify the tags, which would be
|
|
consistent with how extended attributes function. A pair of <code class="docutils literal"><span class="pre">chmod</span></code>
|
|
commands could accomplish this, or a <code class="docutils literal"><span class="pre">touch</span> <span class="pre">-a</span></code>. Alternatively, just
|
|
couple the tag update with a <code class="docutils literal"><span class="pre">recollindex</span> <span class="pre">-e</span> <span class="pre">-i</span> <span class="pre">filename</span></code>.</p>
|
|
</div>
|
|
<div class="section" id="the-pdf-input-handler">
|
|
<h3><a class="toc-backref" href="#id16">The PDF input handler</a><a class="headerlink" href="#the-pdf-input-handler" title="Permalink to this headline">¶</a></h3>
|
|
<p>The PDF format is very important for scientific and technical
|
|
documentation, and document archival. It has extensive facilities for
|
|
storing metadata along with the document, and these facilities are
|
|
actually used in the real world.</p>
|
|
<p>In consequence, the <code class="docutils literal"><span class="pre">rclpdf.py</span></code> PDF input handler has more complex
|
|
capabilities than most others, and it is also more configurable.
|
|
Specifically, <code class="docutils literal"><span class="pre">rclpdf.py</span></code> can automatically use tesseract to perform
|
|
OCR if the document text is empty, it can be configured to extract
|
|
specific metadata tags from an XMP packet, and to extract PDF
|
|
attachments.</p>
|
|
<div class="section" id="ocr-with-tesseract">
|
|
<h4>OCR with Tesseract<a class="headerlink" href="#ocr-with-tesseract" title="Permalink to this headline">¶</a></h4>
|
|
<p>If both tesseract and <code class="docutils literal"><span class="pre">pdftoppm</span></code> (generally from the poppler-utils
|
|
package) are installed, the PDF handler may attempt OCR on PDF files
|
|
with no text content. This is controlled by the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.PDFOCR">pdfocr</a> configuration
|
|
variable, which is false by default because OCR is very slow.</p>
|
|
<p>The choice of language is very important for successful OCR. Recoll has
|
|
currently no way to determine this from the document itself. You can set
|
|
the language to use through the contents of a <code class="docutils literal"><span class="pre">.ocrpdflang</span></code> text file
|
|
in the same directory as the PDF document, or through the
|
|
RECOLL_TESSERACT_LANG environment variable, or through the contents of
|
|
an <code class="docutils literal"><span class="pre">ocrpdf</span></code> text file inside the configuration directory. If none of
|
|
the above are used, RCL will try to guess the language from the NLS
|
|
environment.</p>
|
|
</div>
|
|
<div class="section" id="xmp-fields-extraction">
|
|
<h4>XMP fields extraction<a class="headerlink" href="#xmp-fields-extraction" title="Permalink to this headline">¶</a></h4>
|
|
<p>The <code class="docutils literal"><span class="pre">rclpdf.py</span></code> script in RCL version 1.23.2 and later can extract XMP
|
|
metadata fields by executing the <code class="docutils literal"><span class="pre">pdfinfo</span></code> command (usually found with
|
|
poppler-utils). This is controlled by the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETA">pdfextrameta</a>
|
|
configuration variable, which specifies which tags to extract and,
|
|
possibly, how to rename them.</p>
|
|
<p>The <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.PDFEXTRAMETAFIX">pdfextrametafix</a>
|
|
variable can be used to designate a file with Python code to edit the
|
|
metadata fields (available for RCL 1.23.3 and later. 1.23.2 has
|
|
equivalent code inside the handler script). Example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">sys</span>
|
|
<span class="kn">import</span> <span class="nn">re</span>
|
|
|
|
<span class="k">class</span> <span class="nc">MetaFixer</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
|
|
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
|
|
<span class="k">pass</span>
|
|
|
|
<span class="k">def</span> <span class="nf">metafix</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">nm</span><span class="p">,</span> <span class="n">txt</span><span class="p">):</span>
|
|
<span class="k">if</span> <span class="n">nm</span> <span class="o">==</span> <span class="s1">'bibtex:pages'</span><span class="p">:</span>
|
|
<span class="n">txt</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="sa">r</span><span class="s1">'--'</span><span class="p">,</span> <span class="s1">'-'</span><span class="p">,</span> <span class="n">txt</span><span class="p">)</span>
|
|
<span class="k">elif</span> <span class="n">nm</span> <span class="o">==</span> <span class="s1">'someothername'</span><span class="p">:</span>
|
|
<span class="c1"># do something else</span>
|
|
<span class="k">pass</span>
|
|
<span class="k">elif</span> <span class="n">nm</span> <span class="o">==</span> <span class="s1">'stillanother'</span><span class="p">:</span>
|
|
<span class="c1"># etc.</span>
|
|
<span class="k">pass</span>
|
|
|
|
<span class="k">return</span> <span class="n">txt</span>
|
|
<span class="k">def</span> <span class="nf">wrapup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">metaheaders</span><span class="p">):</span>
|
|
<span class="k">pass</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>If the ‘metafix()’ method is defined, it is called for each metadata
|
|
field. A new MetaFixer object is created for each PDF document (so the
|
|
object can keep state for, for example, eliminating duplicate values).
|
|
If the ‘wrapup()’ method is defined, it is called at the end of XMP
|
|
fields processing with the whole metadata as parameter, as an array of
|
|
‘(nm, val)’ pairs, allowing an alternate approach for editing or
|
|
adding/deleting fields.</p>
|
|
</div>
|
|
<div class="section" id="pdf-attachment-indexing">
|
|
<h4>PDF attachment indexing<a class="headerlink" href="#pdf-attachment-indexing" title="Permalink to this headline">¶</a></h4>
|
|
<p>If pdftk is installed, and if the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.PDFATTACH">pdfattach</a> configuration
|
|
variable is set, the PDF input handler will try to extract PDF
|
|
attachments for indexing as sub-documents of the PDF file. This is
|
|
disabled by default, because it slows down PDF indexing a bit even if
|
|
not one attachment is ever found (PDF attachments are uncommon in my
|
|
experience).</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="periodic-indexing">
|
|
<h3><a class="toc-backref" href="#id17">Periodic indexing</a><a class="headerlink" href="#periodic-indexing" title="Permalink to this headline">¶</a></h3>
|
|
<div class="section" id="running-indexing">
|
|
<h4>Running indexing<a class="headerlink" href="#running-indexing" title="Permalink to this headline">¶</a></h4>
|
|
<p>Indexing is always performed by the <code class="docutils literal"><span class="pre">recollindex</span></code> program, which can
|
|
be started either from the command line or from the File menu in the
|
|
<code class="docutils literal"><span class="pre">recoll</span></code> GUI program. When started from the GUI, the indexing will run
|
|
on the same configuration <code class="docutils literal"><span class="pre">recoll</span></code> was started on. When started from
|
|
the command line, <code class="docutils literal"><span class="pre">recollindex</span></code> will use the RECOLL_CONFDIR variable
|
|
or accept a <code class="docutils literal"><span class="pre">-c</span></code> confdir option to specify a non-default configuration
|
|
directory.</p>
|
|
<p>If the <code class="docutils literal"><span class="pre">recoll</span></code> program finds no index when it starts, it will
|
|
automatically start indexing (except if canceled).</p>
|
|
<p>The <code class="docutils literal"><span class="pre">recollindex</span></code> indexing process can be interrupted by sending an
|
|
interrupt (Ctrl-C, SIGINT) or terminate (SIGTERM) signal. Some time may
|
|
elapse before the process exits, because it needs to properly flush and
|
|
close the index. This can also be done from the <code class="docutils literal"><span class="pre">recoll</span></code> GUI File >
|
|
Stop Indexing menu entry.</p>
|
|
<p>After such an interruption, the index will be somewhat inconsistent
|
|
because some operations which are normally performed at the end of the
|
|
indexing pass will have been skipped (for example, the stemming and
|
|
spelling databases will be nonexistent or out of date). You just need to
|
|
restart indexing at a later time to restore consistency. The indexing
|
|
will restart at the interruption point (the full file tree will be
|
|
traversed, but files that were indexed up to the interruption and for
|
|
which the index is still up to date will not need to be reindexed).</p>
|
|
<p><code class="docutils literal"><span class="pre">recollindex</span></code> has a number of other options which are described in its
|
|
man page. Only a few will be described here.</p>
|
|
<p>Option <code class="docutils literal"><span class="pre">-z</span></code> will reset the index when starting. This is almost the
|
|
same as destroying the index files (the nuance is that the Xapian format
|
|
version will not be changed).</p>
|
|
<p>Option <code class="docutils literal"><span class="pre">-Z</span></code> will force the update of all documents without resetting
|
|
the index first. This will not have the “clean start” aspect of <code class="docutils literal"><span class="pre">-z</span></code>,
|
|
but the advantage is that the index will remain available for querying
|
|
while it is rebuilt, which can be a significant advantage if it is very
|
|
big (some installations need days for a full index rebuild).</p>
|
|
<p>Option <code class="docutils literal"><span class="pre">-k</span></code> will force retrying files which previously failed to be
|
|
indexed, for example because of a missing helper program.</p>
|
|
<p>Of special interest also, maybe, are the <code class="docutils literal"><span class="pre">-i</span></code> and <code class="docutils literal"><span class="pre">-f</span></code> options.
|
|
<code class="docutils literal"><span class="pre">-i</span></code> allows indexing an explicit list of files (given as command line
|
|
parameters or read on <code class="docutils literal"><span class="pre">stdin</span></code>). <code class="docutils literal"><span class="pre">-f</span></code> tells <code class="docutils literal"><span class="pre">recollindex</span></code> to ignore
|
|
file selection parameters from the configuration. Together, these
|
|
options allow building a custom file selection process for some area of
|
|
the file system, by adding the top directory to the <code class="docutils literal"><span class="pre">skippedPaths</span></code>
|
|
list and using an appropriate file selection method to build the file
|
|
list to be fed to <code class="docutils literal"><span class="pre">recollindex</span></code> <code class="docutils literal"><span class="pre">-if</span></code>. Trivial example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">find</span> <span class="o">.</span> <span class="o">-</span><span class="n">name</span> <span class="n">indexable</span><span class="o">.</span><span class="n">txt</span> <span class="o">-</span><span class="nb">print</span> <span class="o">|</span> <span class="n">recollindex</span> <span class="o">-</span><span class="k">if</span>
|
|
</pre></div>
|
|
</div>
|
|
<p><code class="docutils literal"><span class="pre">recollindex</span></code> <code class="docutils literal"><span class="pre">-i</span></code> will not descend into subdirectories specified as
|
|
parameters, but just add them as index entries. It is up to the external
|
|
file selection method to build the complete file list.</p>
|
|
</div>
|
|
<div class="section" id="using-cron-to-automate-indexing">
|
|
<h4>Using <code class="docutils literal"><span class="pre">cron</span></code> to automate indexing<a class="headerlink" href="#using-cron-to-automate-indexing" title="Permalink to this headline">¶</a></h4>
|
|
<p>The most common way to set up indexing is to have a cron task execute it
|
|
every night. For example the following <code class="docutils literal"><span class="pre">crontab</span></code> entry would do it
|
|
every day at 3:30AM (supposing <code class="docutils literal"><span class="pre">recollindex</span></code> is in your PATH):</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="mi">30</span> <span class="mi">3</span> <span class="o">*</span> <span class="o">*</span> <span class="o">*</span> <span class="n">recollindex</span> <span class="o">></span> <span class="o">/</span><span class="n">some</span><span class="o">/</span><span class="n">tmp</span><span class="o">/</span><span class="nb">dir</span><span class="o">/</span><span class="n">recolltrace</span> <span class="mi">2</span><span class="o">>&</span><span class="mi">1</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Or, using <code class="docutils literal"><span class="pre">anacron</span></code>:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="mi">1</span> <span class="mi">15</span> <span class="n">su</span> <span class="n">mylogin</span> <span class="o">-</span><span class="n">c</span> <span class="s2">"recollindex recollindex > /tmp/rcltraceme 2>&1"</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>As of version 1.17 the RCL GUI has dialogs to manage <code class="docutils literal"><span class="pre">crontab</span></code> entries
|
|
for <code class="docutils literal"><span class="pre">recollindex</span></code>. You can reach them from the Preferences > Indexing
|
|
Schedule menu. They only work with the good old <code class="docutils literal"><span class="pre">cron</span></code>, and do not
|
|
give access to all features of <code class="docutils literal"><span class="pre">cron</span></code> scheduling.</p>
|
|
<p>The usual command to edit your <code class="docutils literal"><span class="pre">crontab</span></code> is <code class="docutils literal"><span class="pre">crontab</span></code> <code class="docutils literal"><span class="pre">-e</span></code> (which
|
|
will usually start the <code class="docutils literal"><span class="pre">vi</span></code> editor to edit the file). You may have
|
|
more sophisticated tools available on your system.</p>
|
|
<p>Please be aware that there may be differences between your usual
|
|
interactive command line environment and the one seen by crontab
|
|
commands. Especially the PATH variable may be of concern. Please check
|
|
the crontab manual pages about possible issues.</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="real-time-indexing">
|
|
<h3><a class="toc-backref" href="#id18">Real time indexing</a><a class="headerlink" href="#real-time-indexing" title="Permalink to this headline">¶</a></h3>
|
|
<p>Real time monitoring/indexing is performed by starting the
|
|
<code class="docutils literal"><span class="pre">recollindex</span></code> <code class="docutils literal"><span class="pre">-m</span></code> command. With this option, <code class="docutils literal"><span class="pre">recollindex</span></code> will
|
|
detach from the terminal and become a daemon, permanently monitoring
|
|
file changes and updating the index.</p>
|
|
<p>While it is convenient that data is indexed in real time, repeated
|
|
indexing can generate a significant load on the system when files such
|
|
as email folders change. Also, monitoring large file trees by itself
|
|
significantly taxes system resources. You probably do not want to enable
|
|
it if your system is short on resources. Periodic indexing is adequate
|
|
in most cases.</p>
|
|
<p>As of RCL 1.24, you can set the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.MONITORDIRS">monitordirs</a>
|
|
configuration variable to specify that only a subset of your indexed
|
|
files will be monitored for instant indexing. In this situation, an
|
|
incremental pass on the full tree can be triggered by either restarting
|
|
the indexer, or just running <code class="docutils literal"><span class="pre">recollindex</span></code>, which will notify the
|
|
running process. The <code class="docutils literal"><span class="pre">recoll</span></code> GUI also has a menu entry for this.</p>
|
|
<div class="section" id="real-time-indexing-automatic-daemon-start">
|
|
<h4>Real time indexing: automatic daemon start<a class="headerlink" href="#real-time-indexing-automatic-daemon-start" title="Permalink to this headline">¶</a></h4>
|
|
<p>Under KDE, Gnome and some other desktop environments, the daemon can
|
|
be automatically started when you log in, by creating a desktop file inside
|
|
the <code class="docutils literal"><span class="pre">~/.config/autostart</span></code> directory. This can be done for you by the
|
|
RCL GUI. Use the Preferences->Indexing Schedule menu.</p>
|
|
<p>With older X11 setups, starting the daemon is normally performed as part
|
|
of the user session script.</p>
|
|
<p>The <code class="docutils literal"><span class="pre">rclmon.sh</span></code> script can be used to easily start and stop the
|
|
daemon. It can be found in the <code class="docutils literal"><span class="pre">examples</span></code> directory (typically
|
|
<code class="docutils literal"><span class="pre">/usr/local/[share/]recoll/examples</span></code>).</p>
|
|
<p>For example, my out of fashion xdm-based session has a <code class="docutils literal"><span class="pre">.xsession</span></code>
|
|
script with the following lines at the end:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span>recollconf=$HOME/.recoll-home
|
|
recolldata=/usr/local/share/recoll
|
|
RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh start
|
|
|
|
fvwm
|
|
</pre></div>
|
|
</div>
|
|
<p>The indexing daemon gets started, then the window manager, for which the
|
|
session waits.</p>
|
|
<p>By default the indexing daemon will monitor the state of the X11
|
|
session, and exit when it finishes; it is not necessary to kill it
|
|
explicitly. (The X11 server monitoring can be disabled with option
|
|
<code class="docutils literal"><span class="pre">-x</span></code> to <code class="docutils literal"><span class="pre">recollindex</span></code>).</p>
|
|
<p>If you use the daemon completely out of an X11 session, you need to add
|
|
option <code class="docutils literal"><span class="pre">-x</span></code> to disable X11 session monitoring (else the daemon will
|
|
not start).</p>
|
|
</div>
|
|
<div class="section" id="real-time-indexing-miscellaneous-details">
|
|
<h4>Real time indexing: miscellaneous details<a class="headerlink" href="#real-time-indexing-miscellaneous-details" title="Permalink to this headline">¶</a></h4>
|
|
<p>By default, the messages from the indexing daemon will be sent to the
|
|
same file as those from the interactive commands (<code class="docutils literal"><span class="pre">logfilename</span></code>). You
|
|
may want to change this by setting the <code class="docutils literal"><span class="pre">daemlogfilename</span></code> and
|
|
<code class="docutils literal"><span class="pre">daemloglevel</span></code> configuration parameters. Also the log file will only
|
|
be truncated when the daemon starts. If the daemon runs permanently, the
|
|
log file may grow quite big, depending on the log level.</p>
|
|
<p>When building RCL, the real time indexing support can be customised
|
|
during package <a class="reference external" href="#RCL.INSTALL.BUILDING">configuration</a> with the
|
|
<code class="docutils literal"><span class="pre">--with[out]-fam</span></code> or <code class="docutils literal"><span class="pre">--with[out]-inotify</span></code> options. The default is
|
|
currently to include inotify monitoring on systems that support it, and,
|
|
as of RCL 1.17, gamin support on FreeBSD.</p>
|
|
<blockquote>
|
|
<div><p><strong>Note</strong></p>
|
|
<p>On Linux systems, monitoring a big tree may need increasing the
|
|
resources available to inotify, which are normally defined in
|
|
<code class="docutils literal"><span class="pre">/etc/sysctl.conf</span></code>.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="c1">### inotify</span>
|
|
<span class="c1">#</span>
|
|
<span class="c1"># cat /proc/sys/fs/inotify/max_queued_events - 16384</span>
|
|
<span class="c1"># cat /proc/sys/fs/inotify/max_user_instances - 128</span>
|
|
<span class="c1"># cat /proc/sys/fs/inotify/max_user_watches - 16384</span>
|
|
<span class="c1">#</span>
|
|
<span class="c1"># -- Change to:</span>
|
|
<span class="c1">#</span>
|
|
<span class="n">fs</span><span class="o">.</span><span class="n">inotify</span><span class="o">.</span><span class="n">max_queued_events</span><span class="o">=</span><span class="mi">32768</span>
|
|
<span class="n">fs</span><span class="o">.</span><span class="n">inotify</span><span class="o">.</span><span class="n">max_user_instances</span><span class="o">=</span><span class="mi">256</span>
|
|
<span class="n">fs</span><span class="o">.</span><span class="n">inotify</span><span class="o">.</span><span class="n">max_user_watches</span><span class="o">=</span><span class="mi">32768</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Especially, you will need to trim your tree or adjust the
|
|
<code class="docutils literal"><span class="pre">max_user_watches</span></code> value if indexing exits with a message about
|
|
errno <code class="docutils literal"><span class="pre">ENOSPC</span></code> (28) from <code class="docutils literal"><span class="pre">inotify_add_watch</span></code>.</p>
|
|
<p><strong>Note</strong></p>
|
|
<p>When using the real time monitor, it may happen that some files need
|
|
to be indexed, but change so often that they impose an excessive
|
|
load for the system.</p>
|
|
<p>RCL provides a configuration option to specify the minimum time
|
|
before which a file, specified by a wildcard pattern, cannot be
|
|
reindexed. See the <code class="docutils literal"><span class="pre">mondelaypatterns</span></code> parameter in the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.MISC">configuration section</a>.</p>
|
|
</div></blockquote>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="searching">
|
|
<h2><a class="toc-backref" href="#id19">Searching</a><a class="headerlink" href="#searching" title="Permalink to this headline">¶</a></h2>
|
|
<div class="section" id="searching-with-the-qt-graphical-user-interface">
|
|
<h3><a class="toc-backref" href="#id20">Searching with the Qt graphical user interface</a><a class="headerlink" href="#searching-with-the-qt-graphical-user-interface" title="Permalink to this headline">¶</a></h3>
|
|
<p>The <code class="docutils literal"><span class="pre">recoll</span></code> program provides the main user interface for searching.
|
|
It is based on the Qt library.</p>
|
|
<p><code class="docutils literal"><span class="pre">recoll</span></code> has two search modes:</p>
|
|
<ul class="simple">
|
|
<li>Simple search (the default, on the main screen) has a single entry
|
|
field where you can enter multiple words.</li>
|
|
<li>Advanced search (a panel accessed through the Tools menu or the
|
|
toolbox bar icon) has multiple entry fields, which you may use to
|
|
build a logical condition, with additional filtering on file type,
|
|
location in the file system, modification date, and size.</li>
|
|
</ul>
|
|
<p>In most cases, you can enter the terms as you think of them, even if they
|
|
contain embedded punctuation or other non-textual characters. For
|
|
example, RCL can handle things like email addresses, or arbitrary cut
|
|
and paste from another text window, punctuation and all.</p>
|
|
<p>The main case where you should enter text differently from how it is
|
|
printed is for east-asian languages (Chinese, Japanese, Korean). Words
|
|
composed of single or multiple characters should be entered separated by
|
|
white space in this case (they would typically be printed without white
|
|
space).</p>
|
|
<p>Some searches can be quite complex, and you may want to re-use them
|
|
later, perhaps with some tweaking. RCL versions 1.21 and later can save
|
|
and restore searches, using XML files. See <a class="reference external" href="#RCL.SEARCH.SAVING">Saving and restoring
|
|
queries</a>.</p>
|
|
<div class="section" id="simple-search">
|
|
<h4>Simple search<a class="headerlink" href="#simple-search" title="Permalink to this headline">¶</a></h4>
|
|
<p>Start the <code class="docutils literal"><span class="pre">recoll</span></code> program.</p>
|
|
<p>Possibly choose a search mode: Any term, All terms, File name or Query
|
|
language.</p>
|
|
<p>Enter search term(s) in the text field at the top of the window.</p>
|
|
<p>Click the Search button or hit the Enter key to start the search.</p>
|
|
<p>The initial default search mode is Query language. Without special
|
|
directives, this will look for documents containing all of the search
|
|
terms (the ones with more terms will get better scores), just like the
|
|
All terms mode. Any term will search for documents where at least one of
|
|
the terms appears.</p>
|
|
<p>The Query Language features are described in <a class="reference external" href="#RCL.SEARCH.LANG">a separate
|
|
section</a>.</p>
|
|
<p>All search modes allow terms to be expanded with wildcard characters
|
|
(<code class="docutils literal"><span class="pre">*</span></code>, <code class="docutils literal"><span class="pre">?</span></code>, <code class="docutils literal"><span class="pre">[]</span></code>). See the <a class="reference external" href="#RCL.SEARCH.WILDCARDS">section about
|
|
wildcards</a> for more details.</p>
|
|
<p>The File name search mode will specifically look for file names. The
|
|
point of having a separate file name search is that wild card expansion
|
|
can be performed more efficiently on a small subset of the index
|
|
(allowing wild cards on the left of terms without excessive penalty).
|
|
Things to know:</p>
|
|
<ul class="simple">
|
|
<li>White space in the entry should match white space in the file name,
|
|
and is not treated specially.</li>
|
|
<li>The search is insensitive to character case and accents,
|
|
independently of the type of index.</li>
|
|
<li>An entry without any wild card character and not capitalized will be
|
|
prepended and appended with ‘*’ (ie: etc -> *etc*, but Etc ->
|
|
etc).</li>
|
|
<li>If you have a big index (many files), excessively generic fragments
|
|
may result in inefficient searches.</li>
|
|
</ul>
|
|
<p>In all modes except File name, you can search for exact phrases
|
|
(adjacent words in a given order) by enclosing the input inside double
|
|
quotes. Ex: <code class="docutils literal"><span class="pre">"virtual</span> <span class="pre">reality"</span></code>.</p>
|
|
<p>When using a stripped index (the default), character case has no
|
|
influence on search, except that you can disable stem expansion for any
|
|
term by capitalizing it. Ie: a search for <code class="docutils literal"><span class="pre">floor</span></code> will also normally
|
|
look for <code class="docutils literal"><span class="pre">flooring</span></code>, <code class="docutils literal"><span class="pre">floored</span></code>, etc., but a search for <code class="docutils literal"><span class="pre">Floor</span></code>
|
|
will only look for <code class="docutils literal"><span class="pre">floor</span></code>, in any character case. Stemming can also
|
|
be disabled globally in the preferences. When using a raw index, <a class="reference external" href="#RCL.SEARCH.CASEDIAC">the
|
|
rules are a bit more complicated</a>.</p>
|
|
<p>RCL remembers the last few searches that you performed. You can directly
|
|
access the search history by clicking the clock button on the right of
|
|
the search entry, while the latter is empty. Otherwise, the history is
|
|
used for entry completion (see next). Only the search texts are
|
|
remembered, not the mode (all/any/file name).</p>
|
|
<p>While text is entered in the search area, <code class="docutils literal"><span class="pre">recoll</span></code> will display
|
|
possible completions, filtered from the history and the index search
|
|
terms. This can be disabled with a GUI Preferences option.</p>
|
|
<p>Double-clicking on a word in the result list or a preview window will
|
|
insert it into the simple search entry field.</p>
|
|
<p>You can cut and paste any text into an All terms or Any term search
|
|
field, punctuation, newlines and all - except for wildcard characters
|
|
(single <code class="docutils literal"><span class="pre">?</span></code> characters are ok). RCL will process it and produce a
|
|
meaningful search. This is what most differentiates this mode from the
|
|
Query Language mode, where you have to care about the syntax.</p>
|
|
<p>You can use the <a class="reference external" href="#RCL.SEARCH.GUI.COMPLEX">Tools > Advanced search</a>
|
|
dialog for more complex searches.</p>
|
|
</div>
|
|
<div class="section" id="the-default-result-list">
|
|
<h4>The default result list<a class="headerlink" href="#the-default-result-list" title="Permalink to this headline">¶</a></h4>
|
|
<p>After starting a search, a list of results will instantly be displayed
|
|
in the main list window.</p>
|
|
<p>By default, the document list is presented in order of relevance (how
|
|
well the system estimates that the document matches the query). You can
|
|
sort the result by ascending or descending date by using the vertical
|
|
arrows in the toolbar.</p>
|
|
<p>Clicking on the <code class="docutils literal"><span class="pre">Preview</span></code> link for an entry will open an internal
|
|
preview window for the document. Further <code class="docutils literal"><span class="pre">Preview</span></code> clicks for the same
|
|
search will open tabs in the existing preview window. You can use
|
|
Shift+Click to force the creation of another preview window, which may
|
|
be useful to view the documents side by side. (You can also browse
|
|
successive results in a single preview window by typing
|
|
Shift+ArrowUp/Down in the window).</p>
|
|
<p>Clicking the <code class="docutils literal"><span class="pre">Open</span></code> link will start an external viewer for the
|
|
document. By default, RCL lets the desktop choose the appropriate
|
|
application for most document types (there is a short list of
|
|
exceptions, see further). If you prefer to completely customize the
|
|
choice of applications, you can uncheck the Use desktop preferences
|
|
option in the GUI preferences dialog, and click the Choose editor
|
|
applications button to adjust the predefined RCL choices. The tool
|
|
accepts multiple selections of MIME types (e.g. to set up the editor for
|
|
the dozens of office file types).</p>
|
|
<p>Even when Use desktop preferences is checked, there is a small list of
|
|
exceptions, for MIME types where the RCL choice should override the
|
|
desktop one. These are applications which are well integrated with RCL,
|
|
especially evince for viewing PDF and Postscript files because of its
|
|
support for opening the document at a specific page and passing a search
|
|
string as an argument. Of course, you can edit the list (in the GUI
|
|
preferences) if you would prefer to lose the functionality and use the
|
|
standard desktop tool.</p>
|
|
<p>You may also change the choice of applications by editing the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.MIMEVIEW"><code class="docutils literal"><span class="pre">mimeview</span></code></a> configuration file if
|
|
you find this more convenient.</p>
|
|
<p>Each result entry also has a right-click menu with an Open With entry.
|
|
This lets you choose an application from the list of those which
|
|
registered with the desktop for the document MIME type.</p>
|
|
<p>The <code class="docutils literal"><span class="pre">Preview</span></code> and <code class="docutils literal"><span class="pre">Open</span></code> edit links may not be present for all
|
|
entries, meaning that RCL has no configured way to preview a given file
|
|
type (which was indexed by name only), or no configured external editor
|
|
for the file type. This can sometimes be adjusted simply by tweaking the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.MIMEMAP"><code class="docutils literal"><span class="pre">mimemap</span></code></a> and
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.MIMEVIEW"><code class="docutils literal"><span class="pre">mimeview</span></code></a> configuration files (the
|
|
latter can be modified with the user preferences dialog).</p>
|
|
<p>The format of the result list entries is entirely configurable by using
|
|
the preference dialog to <a class="reference external" href="#RCL.SEARCH.GUI.CUSTOM.RESLIST">edit an HTML
|
|
fragment</a>.</p>
|
|
<p>You can click on the <code class="docutils literal"><span class="pre">Query</span> <span class="pre">details</span></code> link at the top of the results
|
|
page to see the query actually performed, after stem expansion and other
|
|
processing.</p>
|
|
<p>Double-clicking on any word inside the result list or a preview window
|
|
will insert it into the simple search text.</p>
|
|
<p>The result list is divided into pages (the size of which you can change
|
|
in the preferences). Use the arrow buttons in the toolbar or the links
|
|
at the bottom of the page to browse the results.</p>
|
|
<div class="section" id="no-results-the-spelling-suggestions">
|
|
<h5>No results: the spelling suggestions<a class="headerlink" href="#no-results-the-spelling-suggestions" title="Permalink to this headline">¶</a></h5>
|
|
<p>When a search yields no result, and if the aspell dictionary is
|
|
configured, RCL will try to check for misspellings among the query
|
|
terms, and will propose lists of replacements. Clicking on one of the
|
|
suggestions will replace the word and restart the search. You can hold
|
|
any of the modifier keys (Ctrl, Shift, etc.) while clicking if you would
|
|
rather stay on the suggestion screen because several terms need
|
|
replacement.</p>
|
|
</div>
|
|
<div class="section" id="the-result-list-right-click-menu">
|
|
<h5>The result list right-click menu<a class="headerlink" href="#the-result-list-right-click-menu" title="Permalink to this headline">¶</a></h5>
|
|
<p>Apart from the preview and edit links, you can display a pop-up menu by
|
|
right-clicking over a paragraph in the result list. This menu has the
|
|
following entries:</p>
|
|
<ul class="simple">
|
|
<li>Preview</li>
|
|
<li>Open</li>
|
|
<li>Open With</li>
|
|
<li>Run Script</li>
|
|
<li>Copy File Name</li>
|
|
<li>Copy Url</li>
|
|
<li>Save to File</li>
|
|
<li>Find similar</li>
|
|
<li>Preview Parent document</li>
|
|
<li>Open Parent document</li>
|
|
<li>Open Snippets Window</li>
|
|
</ul>
|
|
<p>The Preview and Open entries do the same thing as the corresponding
|
|
links.</p>
|
|
<p>Open With lets you open the document with one of the applications
|
|
claiming to be able to handle its MIME type (the information comes from
|
|
the <code class="docutils literal"><span class="pre">.desktop</span></code> files in <code class="docutils literal"><span class="pre">/usr/share/applications</span></code>).</p>
|
|
<p>Run Script allows starting an arbitrary command on the result file. It
|
|
will only appear for results which are top-level files. See
|
|
<a class="reference external" href="#RCL.SEARCH.GUI.RUNSCRIPT">further</a> for a more detailed description.</p>
|
|
<p>The Copy File Name and Copy Url entries copy the relevant data to the clipboard,
|
|
for later pasting.</p>
|
|
<p>Save to File allows saving the contents of a result document to a chosen
|
|
file. This entry will only appear if the document does not correspond to
|
|
an existing file, but is a subdocument inside such a file (ie: an email
|
|
attachment). It is especially useful to extract attachments with no
|
|
associated editor.</p>
|
|
<p>The Open/Preview Parent document entries allow working with the higher
|
|
level document (e.g. the email message an attachment comes from). RCL is
|
|
sometimes not totally accurate as to what it can or can’t do in this
|
|
area. For example the Parent entry will also appear for an email which
|
|
is part of an mbox folder file, but you can’t actually visualize the
|
|
mbox (there will be an error dialog if you try).</p>
|
|
<p>If the document is a top-level file, Open Parent will start the default
|
|
file manager on the enclosing filesystem directory.</p>
|
|
<p>The Find similar entry will select a number of relevant terms from the
|
|
current document and enter them into the simple search field. You can
|
|
then start a simple search, with a good chance of finding documents
|
|
related to the current result. I can’t remember a single instance where
|
|
this function was actually useful to me...</p>
|
|
<p>The Open Snippets Window entry will only appear for documents which
|
|
support page breaks (typically PDF, Postscript, DVI). The snippets
|
|
window lists extracts from the document, taken around search term
|
|
occurrences, along with the corresponding page number, as links which
|
|
can be used to start the native viewer on the appropriate page. If the
|
|
viewer supports it, its search function will also be primed with one of
|
|
the search terms.</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="the-result-table">
|
|
<h4>The result table<a class="headerlink" href="#the-result-table" title="Permalink to this headline">¶</a></h4>
|
|
<p>In RCL 1.15 and newer, the results can be displayed in spreadsheet-like
|
|
fashion. You can switch to this presentation by clicking the table-like
|
|
icon in the toolbar (this is a toggle, click again to restore the list).</p>
|
|
<p>Clicking on the column headers will allow sorting by the values in the
|
|
column. You can click again to invert the order, and use the header
|
|
right-click menu to reset sorting to the default relevance order (you
|
|
can also use the sort-by-date arrows to do this).</p>
|
|
<p>Both the list and the table display the same underlying results. The
|
|
sort order set from the table is still active if you switch back to the
|
|
list mode. You can click twice on a date sort arrow to reset it from
|
|
there.</p>
|
|
<p>The header right-click menu allows adding or deleting columns. The
|
|
columns can be resized, and their order can be changed (by dragging).
|
|
All the changes are recorded when you quit <code class="docutils literal"><span class="pre">recoll</span></code>.</p>
|
|
<p>Hovering over a table row will update the detail area at the bottom of
|
|
the window with the corresponding values. You can click the row to
|
|
freeze the display. The bottom area is equivalent to a result list
|
|
paragraph, with links for starting a preview or a native application,
|
|
and an equivalent right-click menu. Typing Esc (the Escape key) will
|
|
unfreeze the display.</p>
|
|
</div>
|
|
<div class="section" id="running-arbitrary-commands-on-result-files-1-20-and-later">
|
|
<h4>Running arbitrary commands on result files (1.20 and later)<a class="headerlink" href="#running-arbitrary-commands-on-result-files-1-20-and-later" title="Permalink to this headline">¶</a></h4>
|
|
<p>Apart from the Open and Open With operations, which allow starting an
|
|
application on a result document (or a temporary copy), based on its
|
|
MIME type, it is also possible to run arbitrary commands on results
|
|
which are top-level files, using the Run Script entry in the results
|
|
pop-up menu.</p>
|
|
<p>The commands which will appear in the Run Script submenu must be defined
|
|
by <code class="docutils literal"><span class="pre">.desktop</span></code> files inside the <code class="docutils literal"><span class="pre">scripts</span></code> subdirectory of the current
|
|
configuration directory.</p>
|
|
<p>Here follows an example of a <code class="docutils literal"><span class="pre">.desktop</span></code> file, which could be named for
|
|
example, <code class="docutils literal"><span class="pre">~/.recoll/scripts/myscript.desktop</span></code> (the exact file name
|
|
inside the directory is irrelevant):</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">Desktop</span> <span class="n">Entry</span><span class="p">]</span>
|
|
<span class="n">Type</span><span class="o">=</span><span class="n">Application</span>
|
|
<span class="n">Name</span><span class="o">=</span><span class="n">MyFirstScript</span>
|
|
<span class="n">Exec</span><span class="o">=/</span><span class="n">home</span><span class="o">/</span><span class="n">me</span><span class="o">/</span><span class="nb">bin</span><span class="o">/</span><span class="n">tryscript</span> <span class="o">%</span><span class="n">F</span>
|
|
<span class="n">MimeType</span><span class="o">=*/*</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>The <code class="docutils literal"><span class="pre">Name</span></code> attribute defines the label which will appear inside the
|
|
Run Script menu. The <code class="docutils literal"><span class="pre">Exec</span></code> attribute defines the program to be run,
|
|
which does not need to actually be a script, of course. The <code class="docutils literal"><span class="pre">MimeType</span></code>
|
|
attribute is not used, but needs to exist.</p>
|
|
<p>The commands defined this way can also be used from links inside the
|
|
<a class="reference external" href="#RCL.SEARCH.GUI.CUSTOM.RESLIST.PARA">result paragraph</a>.</p>
|
|
<p>As an example, it might make sense to write a script which would move
|
|
the document to the trash and purge it from the RCL index.</p>
|
|
</div>
|
|
<div class="section" id="displaying-thumbnails">
|
|
<h4>Displaying thumbnails<a class="headerlink" href="#displaying-thumbnails" title="Permalink to this headline">¶</a></h4>
|
|
<p>The default format for the result list entries and the detail area of
|
|
the result table display an icon for each result document. The icon is
|
|
either a generic one determined from the MIME type, or a thumbnail of
|
|
the document appearance. Thumbnails are only displayed if found in the
|
|
standard freedesktop location, where they would typically have been
|
|
created by a file manager.</p>
|
|
<p>Recoll has no capability to create thumbnails. A relatively simple trick
|
|
is to use the Open parent document/folder entry in the result list popup
|
|
menu. This should open a file manager window on the containing
|
|
directory, which should in turn create the thumbnails (depending on your
|
|
settings). Restarting the search should then display the thumbnails.</p>
|
|
<p>There are also <a class="reference external" href="&FAQS;ResultsThumbnails.wiki">some pointers about thumbnail
|
|
generation</a> on the RCL wiki.</p>
|
|
</div>
|
|
<div class="section" id="the-preview-window">
|
|
<h4>The preview window<a class="headerlink" href="#the-preview-window" title="Permalink to this headline">¶</a></h4>
|
|
<p>The preview window opens when you first click a <code class="docutils literal"><span class="pre">Preview</span></code> link inside
|
|
the result list.</p>
|
|
<p>Subsequent preview requests for a given search open new tabs in the
|
|
existing window (except if you hold the Shift key while clicking which
|
|
will open a new window for side by side viewing).</p>
|
|
<p>Starting another search and requesting a preview will create a new
|
|
preview window. The old one stays open until you close it.</p>
|
|
<p>You can close a preview tab by typing Ctrl-W (Ctrl + W) in the window.
|
|
Closing the last tab for a window will also close the window.</p>
|
|
<p>Of course you can also close a preview window by using the window
|
|
manager button in the top of the frame.</p>
|
|
<p>You can display successive or previous documents from the result list
|
|
inside a preview tab by typing Shift+Down or Shift+Up (Down and Up are
|
|
the arrow keys).</p>
|
|
<p>A right-click menu in the text area allows switching between displaying
|
|
the main text or the contents of fields associated to the document (ie:
|
|
author, abstract, etc.). This is especially useful in cases where the
|
|
term match did not occur in the main text but in one of the fields. In
|
|
the case of images, you can switch between three displays: the image
|
|
itself, the image metadata as extracted by <code class="docutils literal"><span class="pre">exiftool</span></code> and the fields,
|
|
which is the metadata stored in the index.</p>
|
|
<p>You can print the current preview window contents by typing Ctrl-P (Ctrl
|
|
+ P) in the window text.</p>
|
|
<div class="section" id="searching-inside-the-preview">
|
|
<h5>Searching inside the preview<a class="headerlink" href="#searching-inside-the-preview" title="Permalink to this headline">¶</a></h5>
|
|
<p>The preview window has an internal search capability, mostly controlled
|
|
by the panel at the bottom of the window, which works in two modes: as a
|
|
classical editor incremental search, where we look for the text entered
|
|
in the entry zone, or as a way to walk the matches between the document
|
|
and the RCL query that found it.</p>
|
|
<dl class="docutils">
|
|
<dt>Incremental text search</dt>
|
|
<dd><p class="first">The preview tabs have an internal incremental search function. You
|
|
initiate the search either by typing a / (slash) or Ctrl-F inside the
|
|
text area or by clicking into the Search for: text field and
|
|
entering the search string. You can then use the Next and Previous
|
|
buttons to find the next/previous occurrence. You can also type F3
|
|
inside the text area to get to the next occurrence.</p>
|
|
<p class="last">If you have a search string entered and you use Ctrl-Up/Ctrl-Down to
|
|
browse the results, the search is initiated for each successive
|
|
document. If the string is found, the cursor will be positioned at
|
|
the first occurrence of the search string.</p>
|
|
</dd>
|
|
<dt>Walking the match lists</dt>
|
|
<dd>If the entry area is empty when you click the Next or Previous
|
|
buttons, the editor will be scrolled to show the next match to any
|
|
search term (the next highlighted zone). If you select a search
|
|
group from the dropdown list and click Next or Previous, the match
|
|
list for this group will be walked. This is not the same as a text
|
|
search, because the occurrences will include non-exact matches (as
|
|
caused by stemming or wildcards). The search will revert to the text
|
|
mode as soon as you edit the entry area.</dd>
|
|
</dl>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="the-query-fragments-window">
|
|
<h4>The Query Fragments window<a class="headerlink" href="#the-query-fragments-window" title="Permalink to this headline">¶</a></h4>
|
|
<p>Selecting the Tools > Query Fragments menu entry will open a window with
|
|
radio- and check-buttons which can be used to activate query language
|
|
fragments for filtering the current query. This can be useful if you
|
|
have frequent reusable selectors, for example, filtering on alternate
|
|
directories, or searching just one category of files, not covered by the
|
|
standard category selectors.</p>
|
|
<p>The contents of the window are entirely customizable, and defined by the
|
|
contents of the <code class="docutils literal"><span class="pre">fragbuts.xml</span></code> file inside the configuration
|
|
directory. The sample file distributed with RCL (which you should be
|
|
able to find under <code class="docutils literal"><span class="pre">/usr/share/recoll/examples/fragbuts.xml</span></code>),
|
|
contains an example which filters the results from the WEB history.</p>
|
|
<p>Here follows an example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><?xml version="1.0" encoding="UTF-8"?>
|
|
|
|
<fragbuts version="1.0">
|
|
|
|
<radiobuttons>
|
|
|
|
<fragbut>
|
|
<label>Include Web Results</label>
|
|
<frag></frag>
|
|
</fragbut>
|
|
|
|
<fragbut>
|
|
<label>Exclude Web Results</label>
|
|
<frag>-rclbes:BGL</frag>
|
|
</fragbut>
|
|
|
|
<fragbut>
|
|
<label>Only Web Results</label>
|
|
<frag>rclbes:BGL</frag>
|
|
</fragbut>
|
|
|
|
</radiobuttons>
|
|
|
|
<buttons>
|
|
|
|
<fragbut>
|
|
<label>Year 2010</label>
|
|
<frag>date:2010-01-01/2010-12-31</frag>
|
|
</fragbut>
|
|
|
|
<fragbut>
|
|
<label>My Great Directory Only</label>
|
|
<frag>dir:/my/great/directory</frag>
|
|
</fragbut>
|
|
|
|
</buttons>
|
|
</fragbuts>
|
|
</pre></div>
|
|
</div>
|
|
<p>Each <code class="docutils literal"><span class="pre">radiobuttons</span></code> or <code class="docutils literal"><span class="pre">buttons</span></code> section defines a line of
|
|
checkbuttons or radiobuttons inside the window. Any number of buttons
|
|
can be selected, but the radiobuttons in a line are exclusive.</p>
|
|
<p>Each <code class="docutils literal"><span class="pre">fragbut</span></code> section defines the label for a button, and the Query
|
|
Language fragment which will be added (as an AND filter) before
|
|
performing the query if the button is active.</p>
|
|
<p>This feature is new in RCL 1.20, and will probably be refined depending
|
|
on user feedback.</p>
|
|
</div>
|
|
<div class="section" id="complex-advanced-search">
|
|
<h4>Complex/advanced search<a class="headerlink" href="#complex-advanced-search" title="Permalink to this headline">¶</a></h4>
|
|
<p>The advanced search dialog helps you build more complex queries without
|
|
memorizing the search language constructs. It can be opened through the
|
|
Tools menu or through the main toolbar.</p>
|
|
<p>RCL keeps a history of searches. See <a class="reference external" href="#RCL.SEARCH.GUI.COMPLEX.HISTORY">Advanced search
|
|
history</a>.</p>
|
|
<p>The dialog has two tabs:</p>
|
|
<ol class="arabic simple">
|
|
<li>The first tab lets you specify terms to search for, and permits
|
|
specifying multiple clauses which are combined to build the search.</li>
|
|
<li>The second tab lets you filter the results according to file size, date
|
|
of modification, MIME type, or location.</li>
|
|
</ol>
|
|
<p>Click on the Start Search button in the advanced search dialog, or type
|
|
Enter in any text field to start the search. The button in the main
|
|
window always performs a simple search.</p>
|
|
<p>Click on the <code class="docutils literal"><span class="pre">Show</span> <span class="pre">query</span> <span class="pre">details</span></code> link at the top of the result page
|
|
to see the query expansion.</p>
|
|
<div class="section" id="avanced-search-the-find-tab">
|
|
<h5>Advanced search: the “find” tab<a class="headerlink" href="#avanced-search-the-find-tab" title="Permalink to this headline">¶</a></h5>
|
|
<p>This part of the dialog lets you construct a query by combining
|
|
multiple clauses of different types. Each entry field is configurable
|
|
for the following modes:</p>
|
|
<ul class="simple">
|
|
<li>All terms.</li>
|
|
<li>Any term.</li>
|
|
<li>None of the terms.</li>
|
|
<li>Phrase (exact terms in order within an adjustable window).</li>
|
|
<li>Proximity (terms in any order within an adjustable window).</li>
|
|
<li>Filename search.</li>
|
|
</ul>
|
|
<p>Additional entry fields can be created by clicking the Add clause
|
|
button.</p>
|
|
<p>When searching, the non-empty clauses will be combined either with an
|
|
AND or an OR conjunction, depending on the choice made on the left (All
|
|
clauses or Any clause).</p>
|
|
<p>Entries of all types except “Phrase” and “Near” accept a mix of single
|
|
words and phrases enclosed in double quotes. Stemming and wildcard
|
|
expansion will be performed as for simple search.</p>
|
|
<p><strong>Phrases and Proximity searches.</strong></p>
|
|
<p>These two clauses work in similar ways, with the difference that
|
|
proximity searches do not impose an order on the words. In both cases,
|
|
an adjustable number (slack) of non-matched words may be accepted
|
|
between the searched ones (use the counter on the left to adjust this
|
|
count). For phrases, the default count is zero (exact match). For
|
|
proximity it is ten (meaning that two search terms would be matched if
|
|
found within a window of twelve words). Examples: a phrase search for
|
|
<code class="docutils literal"><span class="pre">quick</span> <span class="pre">fox</span></code> with a slack of 0 will match <code class="docutils literal"><span class="pre">quick</span> <span class="pre">fox</span></code> but not
|
|
<code class="docutils literal"><span class="pre">quick</span> <span class="pre">brown</span> <span class="pre">fox</span></code>. With a slack of 1 it will match the latter, but not
|
|
<code class="docutils literal"><span class="pre">fox</span> <span class="pre">quick</span></code>. A proximity search for <code class="docutils literal"><span class="pre">quick</span> <span class="pre">fox</span></code> with the default
|
|
slack will match the latter, and also
|
|
<code class="docutils literal"><span class="pre">a</span> <span class="pre">fox</span> <span class="pre">is</span> <span class="pre">a</span> <span class="pre">cunning</span> <span class="pre">and</span> <span class="pre">quick</span> <span class="pre">animal</span></code>.</p>
|
|
</div>
|
|
<div class="section" id="avanced-search-the-filter-tab">
|
|
<h5>Advanced search: the “filter” tab<a class="headerlink" href="#avanced-search-the-filter-tab" title="Permalink to this headline">¶</a></h5>
|
|
<p>This part of the dialog has several sections which allow filtering the
|
|
results of a search according to a number of criteria:</p>
|
|
<ul>
|
|
<li><p class="first">The first section allows filtering by dates of last modification. You
|
|
can specify both a minimum and a maximum date. The initial values are
|
|
set according to the oldest and newest documents found in the index.</p>
|
|
</li>
|
|
<li><p class="first">The next section allows filtering the results by file size. There are
|
|
two entries for minimum and maximum size. Enter decimal numbers. You
|
|
can use suffix multipliers: <code class="docutils literal"><span class="pre">k/K</span></code>, <code class="docutils literal"><span class="pre">m/M</span></code>, <code class="docutils literal"><span class="pre">g/G</span></code>, <code class="docutils literal"><span class="pre">t/T</span></code> for
|
|
1E3, 1E6, 1E9, 1E12 respectively.</p>
|
|
</li>
|
|
<li><p class="first">The next section allows filtering the results by their MIME types, or
|
|
MIME categories (ie: media/text/message/etc.).</p>
|
|
<p>You can transfer the types between two boxes, to define which will be
|
|
included or excluded by the search.</p>
|
|
<p>The state of the file type selection can be saved as the default (the
|
|
file type filter will not be activated at program start-up, but the
|
|
lists will be in the restored state).</p>
|
|
</li>
|
|
<li><p class="first">The bottom section allows restricting the search results to a
|
|
sub-tree of the indexed area. You can use the Invert checkbox to
|
|
search for files not in the sub-tree instead. If you use directory
|
|
filtering often and on big subsets of the file system, you may think
|
|
of setting up multiple indexes instead, as the performance may be
|
|
better.</p>
|
|
<p>You can use relative/partial paths for filtering. Ie, entering
|
|
<code class="docutils literal"><span class="pre">dirA/dirB</span></code> would match either <code class="docutils literal"><span class="pre">/dir1/dirA/dirB/myfile1</span></code> or
|
|
<code class="docutils literal"><span class="pre">/dir2/dirA/dirB/someother/myfile2</span></code>.</p>
|
|
</li>
|
|
</ul>
|
|
</div>
|
|
<div class="section" id="avanced-search-history">
|
|
<h5>Advanced search history<a class="headerlink" href="#avanced-search-history" title="Permalink to this headline">¶</a></h5>
|
|
<p>The advanced search tool memorizes the last 100 searches performed. You
|
|
can walk the saved searches by using the up and down arrow keys while
|
|
the keyboard focus belongs to the advanced search dialog.</p>
|
|
<p>The complex search history can be erased, along with the one for simple
|
|
search, by selecting the File > Erase Search History menu entry.</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="the-term-explorer-tool">
|
|
<h4>The term explorer tool<a class="headerlink" href="#the-term-explorer-tool" title="Permalink to this headline">¶</a></h4>
|
|
<p>RCL automatically manages the expansion of search terms to their
|
|
derivatives (ie: plural/singular, verb inflections). But there are other
|
|
cases where the exact search term is not known. For example, you may not
|
|
remember the exact spelling, or only know the beginning of the name.</p>
|
|
<p>The search will only propose replacement terms with spelling variations
|
|
when no matching documents were found. In some cases, both proper
|
|
spellings and misspellings are present in the index, and it may be
|
|
interesting to look for them explicitly.</p>
|
|
<p>The term explorer tool (started from the toolbar icon or from the Term
|
|
explorer entry of the Tools menu) can be used to search the full index
|
|
terms list. It has four modes of operation:</p>
|
|
<dl class="docutils">
|
|
<dt>Wildcard</dt>
|
|
<dd>In this mode of operation, you can enter a search string with
|
|
shell-like wildcards (*, ?, []). ie: xapi* would display all index
|
|
terms beginning with xapi. (More about wildcards
|
|
<a class="reference external" href="#RCL.SEARCH.WILDCARDS">here</a> ).</dd>
|
|
<dt>Regular expression</dt>
|
|
<dd>This mode will accept a regular expression as input. Example:
|
|
word[0-9]+. The expression is implicitly anchored at the beginning.
|
|
Ie: press will match pression but not expression. You can use
|
|
.*press to match the latter, but be aware that this will cause a
|
|
full index term list scan, which can be quite long.</dd>
|
|
<dt>Stem expansion</dt>
|
|
<dd>This mode will perform the usual stem expansion normally done as
|
|
part of user input processing. As such it is probably mostly useful to
|
|
demonstrate the process.</dd>
|
|
<dt>Spelling/Phonetic</dt>
|
|
<dd>In this mode, you enter the term as you think it is spelled, and RCL
|
|
will do its best to find index terms that sound like your entry.
|
|
This mode uses the Aspell spelling application, which must be
|
|
installed on your system for things to work (if your documents
|
|
contain non-ascii characters, RCL needs an aspell version newer than
|
|
0.60 for UTF-8 support). The language which is used to build the
|
|
dictionary out of the index terms (which is done at the end of an
|
|
indexing pass) is the one defined by your NLS environment. Weird
|
|
things will probably happen if languages are mixed up.</dd>
|
|
</dl>
|
|
<p>Note that in cases where RCL does not know the beginning of the string
|
|
to search for (ie a wildcard expression like *coll), the expansion can
|
|
take quite a long time because the full index term list will have to be
|
|
processed. The expansion is currently limited to 10000 results for
|
|
wildcards and regular expressions. It is possible to change the limit in
|
|
the configuration file.</p>
|
|
<p>Double-clicking on a term in the result list will insert it into the
|
|
simple search entry field. You can also cut/paste between the result
|
|
list and any entry field (the end of lines will be taken care of).</p>
|
|
</div>
|
|
<div class="section" id="id2">
|
|
<h4>Multiple indexes<a class="headerlink" href="#id2" title="Permalink to this headline">¶</a></h4>
|
|
<p>See the <a class="reference external" href="#RCL.INDEXING.CONFIG.MULTIPLE">section describing the use of multiple
|
|
indexes</a> for generalities. Only the
|
|
aspects concerning the <code class="docutils literal"><span class="pre">recoll</span></code> GUI are described here.</p>
|
|
<p>A <code class="docutils literal"><span class="pre">recoll</span></code> program instance is always associated with a specific
|
|
index, which is the one to be updated when requested from the File menu,
|
|
but it can use any number of RCL indexes for searching. The external
|
|
indexes can be selected through the external indexes tab in the
|
|
preferences dialog.</p>
|
|
<p>Index selection is performed in two phases. A set of all usable indexes
|
|
must first be defined, and then the subset of indexes to be used for
|
|
searching. These parameters are retained across program executions
|
|
(they are kept separately for each RCL configuration). The set of all
|
|
indexes is usually quite stable, while the active ones might typically
|
|
be adjusted quite frequently.</p>
|
|
<p>The main index (defined by RECOLL_CONFDIR) is always active. If this is
|
|
undesirable, you can set up your base configuration to index an empty
|
|
directory.</p>
|
|
<p>When adding a new index to the set, you can select either a RCL
|
|
configuration directory, or directly a Xapian index directory. In the first
|
|
case, the Xapian index directory will be obtained from the selected
|
|
configuration.</p>
|
|
<p>As building the set of all indexes can be a little tedious when done
|
|
through the user interface, you can use the RECOLL_EXTRA_DBS
|
|
environment variable to provide an initial set. This might typically be
|
|
set up by a system administrator so that every user does not have to do
|
|
it. The variable should define a colon-separated list of index
|
|
directories, ie:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">export</span> <span class="n">RECOLL_EXTRA_DBS</span><span class="o">=/</span><span class="n">some</span><span class="o">/</span><span class="n">place</span><span class="o">/</span><span class="n">xapiandb</span><span class="p">:</span><span class="o">/</span><span class="n">some</span><span class="o">/</span><span class="n">other</span><span class="o">/</span><span class="n">db</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Another environment variable, RECOLL_ACTIVE_EXTRA_DBS allows adding
|
|
to the active list of indexes. This variable was suggested and
|
|
implemented by a RCL user. It is mostly useful if you use scripts to
|
|
mount external volumes with RCL indexes. By using RECOLL_EXTRA_DBS and
|
|
RECOLL_ACTIVE_EXTRA_DBS, you can add and activate the index for the
|
|
mounted volume when starting <code class="docutils literal"><span class="pre">recoll</span></code>.</p>
|
|
<p>RECOLL_ACTIVE_EXTRA_DBS is available for RCL versions 1.17.2 and
|
|
later. A change was made in the same update so that <code class="docutils literal"><span class="pre">recoll</span></code> will
|
|
automatically deactivate unreachable indexes when starting up.</p>
|
|
</div>
|
|
<div class="section" id="document-history">
|
|
<h4>Document history<a class="headerlink" href="#document-history" title="Permalink to this headline">¶</a></h4>
|
|
<p>Documents that you actually view (with the internal preview or an
|
|
external tool) are entered into the document history, which is
|
|
remembered.</p>
|
|
<p>You can display the history list by using the Tools/Doc History menu
|
|
entry.</p>
|
|
<p>You can erase the document history by using the Erase document history
|
|
entry in the File menu.</p>
|
|
</div>
|
|
<div class="section" id="sorting-search-results-and-collapsing-duplicates">
|
|
<h4>Sorting search results and collapsing duplicates<a class="headerlink" href="#sorting-search-results-and-collapsing-duplicates" title="Permalink to this headline">¶</a></h4>
|
|
<p>The documents in a result list are normally sorted in order of
|
|
relevance. It is possible to specify a different sort order, either by
|
|
using the vertical arrows in the GUI toolbox to sort by date, or
|
|
switching to the result table display and clicking on any header. The
|
|
sort order chosen inside the result table remains active if you switch
|
|
back to the result list, until you click one of the vertical arrows,
|
|
until both are unchecked (you are back to sort by relevance).</p>
|
|
<p>Sort parameters are remembered between program invocations, but result
|
|
sorting is normally always inactive when the program starts. It is
|
|
possible to keep the sorting activation state between program
|
|
invocations by checking the Remember sort activation state option in the
|
|
preferences.</p>
|
|
<p>It is also possible to hide duplicate entries inside the result list
|
|
(documents with the exact same contents as the displayed one). The test
|
|
of identity is based on an MD5 hash of the document container, not only
|
|
of the text contents (so that ie, a text document with an image added
|
|
will not be a duplicate of the text only). Duplicates hiding is
|
|
controlled by an entry in the GUI configuration dialog, and is off by
|
|
default.</p>
|
|
<p>As of release 1.19, when a result document does have undisplayed
|
|
duplicates, a <code class="docutils literal"><span class="pre">Dups</span></code> link will be shown with the result list entry.
|
|
Clicking the link will display the paths (URLs + ipaths) for the
|
|
duplicate entries.</p>
|
|
</div>
|
|
<div class="section" id="search-tips-shortcuts">
|
|
<h4>Search tips, shortcuts<a class="headerlink" href="#search-tips-shortcuts" title="Permalink to this headline">¶</a></h4>
|
|
<div class="section" id="terms-and-search-expansion">
|
|
<h5>Terms and search expansion<a class="headerlink" href="#terms-and-search-expansion" title="Permalink to this headline">¶</a></h5>
|
|
<p><strong>Term completion.</strong></p>
|
|
<p>Typing Esc Space in the simple search entry field while entering a word
|
|
will either complete the current word if its beginning matches a unique
|
|
term in the index, or open a window to propose a list of completions.</p>
|
|
<p><strong>Picking up new terms from result or preview text.</strong></p>
|
|
<p>Double-clicking on a word in the result list or in a preview window will
|
|
copy it to the simple search entry field.</p>
|
|
<p><strong>Wildcards.</strong></p>
|
|
<p>Wildcards can be used inside search terms in all forms of searches.
|
|
<a class="reference external" href="#RCL.SEARCH.WILDCARDS">More about wildcards</a>.</p>
|
|
<p><strong>Automatic suffixes.</strong></p>
|
|
<p>Words like <code class="docutils literal"><span class="pre">odt</span></code> or <code class="docutils literal"><span class="pre">ods</span></code> can be automatically turned into query
|
|
language <code class="docutils literal"><span class="pre">ext:xxx</span></code> clauses. This can be enabled in the Search
|
|
preferences panel in the GUI.</p>
|
|
<p><strong>Disabling stem expansion.</strong></p>
|
|
<p>Entering a capitalized word in any search field will prevent stem
|
|
expansion (no search for <code class="docutils literal"><span class="pre">gardening</span></code> if you enter <code class="docutils literal"><span class="pre">Garden</span></code> instead
|
|
of <code class="docutils literal"><span class="pre">garden</span></code>). This is the only case where character case should make a
|
|
difference for a RCL search. You can also disable stem expansion or
|
|
change the stemming language in the preferences.</p>
|
|
<p><strong>Finding related documents.</strong></p>
|
|
<p>Selecting the Find similar documents entry in the result list paragraph
|
|
right-click menu will select a set of “interesting” terms from the
|
|
current result, and insert them into the simple search entry field. You
|
|
can then possibly edit the list and start a search to find documents
|
|
which may be related to the current result.</p>
|
|
<p><strong>File names.</strong></p>
|
|
<p>File names are added as terms during indexing, and you can specify them
|
|
as ordinary terms in normal search fields (RCL used to index all
|
|
directories in the file path as terms. This has been abandoned as it did
|
|
not seem really useful). Alternatively, you can use the specific file
|
|
name search which will <em>only</em> look for file names, and may be faster
|
|
than the generic search especially when using wildcards.</p>
|
|
</div>
|
|
<div class="section" id="working-with-phrases-and-proximity">
|
|
<h5>Working with phrases and proximity<a class="headerlink" href="#working-with-phrases-and-proximity" title="Permalink to this headline">¶</a></h5>
|
|
<p><strong>Phrases and Proximity searches.</strong></p>
|
|
<p>A phrase can be looked for by enclosing it in double quotes. Example:
|
|
<code class="docutils literal"><span class="pre">"user</span> <span class="pre">manual"</span></code> will look only for occurrences of <code class="docutils literal"><span class="pre">user</span></code> immediately
|
|
followed by <code class="docutils literal"><span class="pre">manual</span></code>. You can use the This phrase field of the
|
|
advanced search dialog to the same effect. Phrases can be entered along with
|
|
simple terms in all simple or advanced search entry fields (except This
|
|
exact phrase).</p>
|
|
<p><strong>AutoPhrases.</strong></p>
|
|
<p>This option can be set in the preferences dialog. If it is set, a phrase
|
|
will be automatically built and added to simple searches when looking
|
|
for <code class="docutils literal"><span class="pre">Any</span> <span class="pre">terms</span></code>. This will not radically change the results, but will
|
|
give a relevance boost to the results where the search terms appear as a
|
|
phrase. Ie: searching for <code class="docutils literal"><span class="pre">virtual</span> <span class="pre">reality</span></code> will still find all
|
|
documents where either <code class="docutils literal"><span class="pre">virtual</span></code> or <code class="docutils literal"><span class="pre">reality</span></code> or both appear, but
|
|
those which contain <code class="docutils literal"><span class="pre">virtual</span> <span class="pre">reality</span></code> should appear sooner in the
|
|
list.</p>
|
|
<p>Phrase searches can strongly slow down a query if most of the terms in
|
|
the phrase are common. This is why the <code class="docutils literal"><span class="pre">autophrase</span></code> option is off by
|
|
default for RCL versions before 1.17. As of version 1.17, <code class="docutils literal"><span class="pre">autophrase</span></code>
|
|
is on by default, but very common terms will be removed from the
|
|
constructed phrase. The removal threshold can be adjusted from the
|
|
search preferences.</p>
|
|
<p><strong>Phrases and abbreviations.</strong></p>
|
|
<p>As of RCL version 1.17, dotted abbreviations like <code class="docutils literal"><span class="pre">I.B.M.</span></code> are also
|
|
automatically indexed as a word without the dots: <code class="docutils literal"><span class="pre">IBM</span></code>. Searching for
|
|
the word inside a phrase (ie: <code class="docutils literal"><span class="pre">"the</span> <span class="pre">IBM</span> <span class="pre">company"</span></code>) will only match the
|
|
dotted abbreviation if you increase the phrase slack (using the advanced
|
|
search panel control, or the <code class="docutils literal"><span class="pre">o</span></code> query language modifier). Literal
|
|
occurrences of the word will be matched normally.</p>
|
|
</div>
|
|
<div class="section" id="others">
|
|
<h5>Others<a class="headerlink" href="#others" title="Permalink to this headline">¶</a></h5>
|
|
<p><strong>Using fields.</strong></p>
|
|
<p>You can use the <a class="reference external" href="#RCL.SEARCH.LANG">query language</a> and field
|
|
specifications to only search certain parts of documents. This can be
|
|
especially helpful with email, for example only searching emails from a
|
|
specific originator: <code class="docutils literal"><span class="pre">search</span> <span class="pre">tips</span> <span class="pre">from:helpfulgui</span></code></p>
|
|
<p><strong>Adjusting the result table columns.</strong></p>
|
|
<p>When displaying results in table mode, you can use a right click on the
|
|
table headers to activate a pop-up menu which will let you adjust what
|
|
columns are displayed. You can drag the column headers to adjust their
|
|
order. You can click them to sort by the field displayed in the column.
|
|
You can also save the result list in CSV format.</p>
|
|
<p><strong>Changing the GUI geometry.</strong></p>
|
|
<p>It is possible to configure the GUI in wide form factor by dragging the
|
|
toolbars to one of the sides (their location is remembered between
|
|
sessions), and moving the category filters to a menu (can be set in the
|
|
Preferences > GUI configuration > User interface panel).</p>
|
|
<p><strong>Query explanation.</strong></p>
|
|
<p>You can get an exact description of what the query looked for, including
|
|
stem expansion, and Boolean operators used, by clicking on the result
|
|
list header.</p>
|
|
<p><strong>Advanced search history.</strong></p>
|
|
<p>As of RCL 1.18, you can display any of the last 100 complex searches
|
|
performed by using the up and down arrow keys while the advanced search
|
|
panel is active.</p>
|
|
<p><strong>Browsing the result list inside a preview window.</strong></p>
|
|
<p>Entering Shift-Down or Shift-Up (Shift + an arrow key) in a preview
|
|
window will display the next or the previous document from the result
|
|
list. Any secondary search currently active will be executed on the new
|
|
document.</p>
|
|
<p><strong>Scrolling the result list from the keyboard.</strong></p>
|
|
<p>You can use PageUp and PageDown to scroll the result list, Shift+Home to
|
|
go back to the first page. These work even while the focus is in the
|
|
search entry.</p>
|
|
<p><strong>Result table: moving the focus to the table.</strong></p>
|
|
<p>You can use Ctrl-r to move the focus from the search entry to the table,
|
|
and then use the arrow keys to change the current row. Ctrl-Shift-s
|
|
returns to the search.</p>
|
|
<p><strong>Result table: open / preview.</strong></p>
|
|
<p>With the focus in the result table, you can use Ctrl-o to open the
|
|
document from the current row, Ctrl-Shift-o to open the document and
|
|
close <code class="docutils literal"><span class="pre">recoll</span></code>, Ctrl-d to preview the document.</p>
|
|
<p><strong>Editing a new search while the focus is not in the search entry.</strong></p>
|
|
<p>You can use the Ctrl-Shift-S shortcut to return the cursor to the search
|
|
entry (and select the current search text), while the focus is anywhere
|
|
in the main window.</p>
|
|
<p><strong>Forced opening of a preview window.</strong></p>
|
|
<p>You can use Shift+Click on a result list <code class="docutils literal"><span class="pre">Preview</span></code> link to force the
|
|
creation of a preview window instead of a new tab in the existing one.</p>
|
|
<p><strong>Closing previews.</strong></p>
|
|
<p>Entering Ctrl-W in a tab will close it (and, for the last tab, close the
|
|
preview window). Entering Esc will close the preview window and all its
|
|
tabs.</p>
|
|
<p><strong>Printing previews.</strong></p>
|
|
<p>Entering Ctrl-P in a preview window will print the currently displayed
|
|
text.</p>
|
|
<p><strong>Quitting.</strong></p>
|
|
<p>Entering Ctrl-Q almost anywhere will close the application.</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="saving-and-restoring-queries-1-21-and-later">
|
|
<h4>Saving and restoring queries (1.21 and later)<a class="headerlink" href="#saving-and-restoring-queries-1-21-and-later" title="Permalink to this headline">¶</a></h4>
|
|
<p>Both simple and advanced query dialogs save recent history, but the
|
|
amount is limited: old queries will eventually be forgotten. Also,
|
|
important queries may be difficult to find among others. This is why
|
|
both types of queries can also be explicitly saved to files, from the
|
|
GUI menus: File > Save last query / Load last query</p>
|
|
<p>The default location for saved queries is a subdirectory of the current
|
|
configuration directory, but saved queries are ordinary files and can be
|
|
written or moved anywhere.</p>
|
|
<p>Some of the saved query parameters are part of the preferences (e.g.
|
|
<code class="docutils literal"><span class="pre">autophrase</span></code> or the active external indexes), and may differ when the
|
|
query is loaded from the time it was saved. In this case, RCL will warn
|
|
of the differences, but will not change the user preferences.</p>
|
|
</div>
|
|
<div class="section" id="customizing-the-search-interface">
|
|
<h4>Customizing the search interface<a class="headerlink" href="#customizing-the-search-interface" title="Permalink to this headline">¶</a></h4>
|
|
<p>You can customize some aspects of the search interface by using the GUI
|
|
configuration entry in the Preferences menu.</p>
|
|
<p>There are several tabs in the dialog, dealing with the interface itself,
|
|
the parameters used for searching and returning results, and what
|
|
indexes are searched.</p>
|
|
<p><strong>User interface parameters:</strong></p>
|
|
<ul>
|
|
<li><p class="first">Highlight color for query terms: Terms from the user query are
|
|
highlighted in the result list samples and the preview window. The
|
|
color can be chosen here. Any Qt color string should work (ie
|
|
<code class="docutils literal"><span class="pre">red</span></code>, <code class="docutils literal"><span class="pre">#ff0000</span></code>). The default is <code class="docutils literal"><span class="pre">blue</span></code>.</p>
|
|
</li>
|
|
<li><p class="first">Style sheet: The name of a Qt style sheet text file which is applied
|
|
to the whole Recoll application on startup. The default value is
|
|
empty, but there is a skeleton style sheet (<code class="docutils literal"><span class="pre">recoll.qss</span></code>) inside
|
|
the <code class="docutils literal"><span class="pre">/usr/share/recoll/examples</span></code> directory. Using a style sheet,
|
|
you can change most <code class="docutils literal"><span class="pre">recoll</span></code> graphical parameters: colors, fonts,
|
|
etc. See the sample file for a few simple examples.</p>
|
|
<p>You should be aware that parameters (e.g.: the background color) set
|
|
inside the RCL GUI style sheet will override global system
|
|
preferences, with possible strange side effects: for example if you
|
|
set the foreground to a light color and the background to a dark one
|
|
in the desktop preferences, but only the background is set inside the
|
|
RCL style sheet, and it is light too, then text will appear
|
|
light-on-light inside the RCL GUI.</p>
|
|
</li>
|
|
<li><p class="first">Maximum text size highlighted for preview: Inserting highlights on
|
|
search term inside the text before inserting it in the preview window
|
|
involves quite a lot of processing, and can be disabled over the
|
|
given text size to speed up loading.</p>
|
|
</li>
|
|
<li><p class="first">Prefer HTML to plain text for preview: if set, Recoll will display
|
|
HTML as such inside the preview window. If this causes problems with
|
|
the Qt HTML display, you can uncheck it to display the plain text
|
|
version instead.</p>
|
|
</li>
|
|
<li><p class="first">Activate links in preview: if set, Recoll will turn HTTP links found
|
|
inside plain text into proper HTML anchors, and clicking a link
|
|
inside a preview window will start the default browser on the link
|
|
target.</p>
|
|
</li>
|
|
<li><p class="first">Plain text to HTML line style: when displaying plain text inside the
|
|
preview window, RCL tries to preserve some of the original text line
|
|
breaks and indentation. It can either use PRE HTML tags, which will
|
|
well preserve the indentation but will force horizontal scrolling for
|
|
long lines, or use BR tags to break at the original line breaks,
|
|
which will let the editor introduce other line breaks according to
|
|
the window width, but will lose some of the original indentation. The
|
|
third option has been available in recent releases and is probably
|
|
now the best one: use PRE tags with line wrapping.</p>
|
|
</li>
|
|
<li><p class="first">Choose editor application: this opens a dialog which allows you to
|
|
select the application to be used to open each MIME type. The default
|
|
is to use the <code class="docutils literal"><span class="pre">xdg-open</span></code> utility, but you can use this dialog to
|
|
override it, setting exceptions for MIME types that will still be
|
|
opened according to RCL preferences. This is useful for passing
|
|
parameters like page numbers or search strings to applications that
|
|
support them (e.g. evince). This cannot be done with <code class="docutils literal"><span class="pre">xdg-open</span></code>
|
|
which only supports passing one parameter.</p>
|
|
</li>
|
|
<li><p class="first">Disable Qt autocompletion in search entry: this will disable the
|
|
completion popup. It will only appear, and display the full history,
|
|
either if you enter only white space in the search area, or if you
|
|
click the clock button on the right of the area.</p>
|
|
</li>
|
|
<li><p class="first">Document filter choice style: this will let you choose if the
|
|
document categories are displayed as a list or a set of buttons, or a
|
|
menu.</p>
|
|
</li>
|
|
<li><p class="first">Start with simple search mode: this lets you choose the value of the
|
|
simple search type on program startup. Either a fixed value (e.g.
|
|
<code class="docutils literal"><span class="pre">Query</span> <span class="pre">Language</span></code>), or the value in use when the program last exited.</p>
|
|
</li>
|
|
<li><p class="first">Start with advanced search dialog open: If you use this dialog
|
|
frequently, checking the entries will get it to open when recoll
|
|
starts.</p>
|
|
</li>
|
|
<li><p class="first">Remember sort activation state: if set, Recoll will remember the sort
|
|
tool state between invocations. It normally starts with sorting
|
|
disabled.</p>
|
|
</li>
|
|
</ul>
|
|
<p><strong>Result list parameters:</strong></p>
|
|
<ul class="simple">
|
|
<li>Number of results in a result page</li>
|
|
<li>Result list font: There is quite a lot of information shown in the
|
|
result list, and you may want to customize the font and/or font size.
|
|
The rest of the fonts used by RCL are determined by your generic Qt
|
|
config (try the <code class="docutils literal"><span class="pre">qtconfig</span></code> command).</li>
|
|
<li>Edit result list paragraph format string: allows you to change the
|
|
presentation of each result list entry. See the <a class="reference external" href="#RCL.SEARCH.GUI.CUSTOM.RESLIST">result list
|
|
customisation section</a>.</li>
|
|
<li>Edit result page HTML header insert: allows you to define text
|
|
inserted at the end of the result page HTML header. More detail in
|
|
the <a class="reference external" href="#RCL.SEARCH.GUI.CUSTOM.RESLIST">result list customisation
|
|
section.</a></li>
|
|
<li>Date format: allows specifying the format used for displaying dates
|
|
inside the result list. This should be specified as an strftime()
|
|
string (man strftime).</li>
|
|
<li>Abstract snippet separator: for synthetic abstracts built from index
|
|
data, which are usually made of several snippets from different parts
|
|
of the document, this defines the snippet separator, an ellipsis by
|
|
default.</li>
|
|
</ul>
|
|
<p><strong>Search parameters:</strong></p>
|
|
<ul class="simple">
|
|
<li>Hide duplicate results: decides if result list entries are shown for
|
|
identical documents found in different places.</li>
|
|
<li>Stemming language: stemming obviously depends on the document’s
|
|
language. This listbox will let you choose among the stemming
|
|
databases which were built during indexing (this is set in the <a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF">main
|
|
configuration file</a>), or later
|
|
added with <code class="docutils literal"><span class="pre">recollindex</span> <span class="pre">-s</span></code> (See the recollindex manual). Stemming
|
|
languages which are dynamically added will be deleted at the next
|
|
indexing pass unless they are also added in the configuration file.</li>
|
|
<li>Automatically add phrase to simple searches: a phrase will be
|
|
automatically built and added to simple searches when looking for
|
|
<code class="docutils literal"><span class="pre">Any</span> <span class="pre">terms</span></code>. This will give a relevance boost to the results where
|
|
the search terms appear as a phrase (consecutive and in order).</li>
|
|
<li>Autophrase term frequency threshold percentage: very frequent terms
|
|
should not be included in automatic phrase searches for performance
|
|
reasons. The parameter defines the cutoff percentage (percentage of
|
|
the documents where the term appears).</li>
|
|
<li>Replace abstracts from documents: this decides if we should
|
|
synthesize and display an abstract in place of an explicit abstract
|
|
found within the document itself.</li>
|
|
<li>Dynamically build abstracts: this decides if RCL tries to build
|
|
document abstracts (lists of <em>snippets</em>) when displaying the result
|
|
list. Abstracts are constructed by taking context from the document
|
|
information, around the search terms.</li>
|
|
<li>Synthetic abstract size: adjust to taste...</li>
|
|
<li>Synthetic abstract context words: how many words should be displayed
|
|
around each term occurrence.</li>
|
|
<li>Query language magic file name suffixes: a list of words which
|
|
automatically get turned into <code class="docutils literal"><span class="pre">ext:xxx</span></code> file name suffix clauses
|
|
when starting a query language query (e.g.: <code class="docutils literal"><span class="pre">doc</span> <span class="pre">xls</span> <span class="pre">xlsx...</span></code>).
|
|
This will save some typing for people who use file types a lot when
|
|
querying.</li>
|
|
</ul>
|
|
<p><strong>External indexes:</strong></p>
|
|
<p>This panel will let you browse for additional indexes that you may want
|
|
to search. External indexes are designated by their database directory
|
|
(ie: <code class="docutils literal"><span class="pre">/home/someothergui/.recoll/xapiandb</span></code>,
|
|
<code class="docutils literal"><span class="pre">/usr/local/recollglobal/xapiandb</span></code>).</p>
|
|
<p>Once entered, the indexes will appear in the External indexes list, and
|
|
you can choose which ones you want to use at any moment by checking or
|
|
unchecking their entries.</p>
|
|
<p>Your main database (the one the current configuration indexes to), is
|
|
always implicitly active. If this is not desirable, you can set up your
|
|
configuration so that it indexes, for example, an empty directory. An
|
|
alternative indexer may also need to implement a way of purging the
|
|
index from stale data.</p>
|
|
<div class="section" id="the-result-list-format">
|
|
<h5>The result list format<a class="headerlink" href="#the-result-list-format" title="Permalink to this headline">¶</a></h5>
|
|
<p>Newer versions of Recoll (from 1.17) normally use WebKit HTML widgets
|
|
for the result list and the <a class="reference external" href="#RCL.SEARCH.GUI.RESULTLIST.MENU.SNIPPETS">snippets
|
|
window</a> (this may be
|
|
disabled at build time). Total customisation is possible with full
|
|
support for CSS and Javascript. Conversely, there are limits to what you
|
|
can do with the older Qt QTextBrowser, but still, it is possible to
|
|
decide what data each result will contain, and how it will be displayed.</p>
|
|
<p>The result list presentation can be exhaustively customized by adjusting
|
|
two elements:</p>
|
|
<ul class="simple">
|
|
<li>The paragraph format</li>
|
|
<li>HTML code inside the header section. For versions 1.21 and later,
|
|
this is also used for the <a class="reference external" href="#RCL.SEARCH.GUI.RESULTLIST.MENU.SNIPPETS">snippets
|
|
window</a></li>
|
|
</ul>
|
|
<p>The paragraph format and the header fragment can be edited from the
|
|
Result list tab of the GUI configuration.</p>
|
|
<p>The header fragment is used both for the result list and the snippets
|
|
window. The snippets list is a table and has a <code class="docutils literal"><span class="pre">snippets</span></code> class
|
|
attribute. Each paragraph in the result list is a table, with class
|
|
<code class="docutils literal"><span class="pre">respar</span></code>, but this can be changed by editing the paragraph format.</p>
|
|
<p>There are a few examples on the <a class="reference external" href="http://www.recoll.org/custom.html">page about customising the result
|
|
list</a> on the RCL web site.</p>
|
|
<div class="section" id="the-paragraph-format">
|
|
<h6>The paragraph format<a class="headerlink" href="#the-paragraph-format" title="Permalink to this headline">¶</a></h6>
|
|
<p>This is an arbitrary HTML string where the following printf-like <code class="docutils literal"><span class="pre">%</span></code>
|
|
substitutions will be performed:</p>
|
|
<ul>
|
|
<li><p class="first"><strong>%A.</strong></p>
|
|
<p>Abstract</p>
|
|
</li>
|
|
<li><p class="first"><strong>%D.</strong></p>
|
|
<p>Date</p>
|
|
</li>
|
|
<li><p class="first"><strong>%I.</strong></p>
|
|
<p>Icon image name. This is normally determined from the MIME type. The
|
|
associations are defined inside the <a class="reference external" href="#RCL.INSTALL.CONFIG.MIMECONF"><code class="docutils literal"><span class="pre">mimeconf</span></code> configuration
|
|
file</a>. If a thumbnail for the file
|
|
is found at the standard Freedesktop location, this will be displayed
|
|
instead.</p>
|
|
</li>
|
|
<li><p class="first"><strong>%K.</strong></p>
|
|
<p>Keywords (if any)</p>
|
|
</li>
|
|
<li><p class="first"><strong>%L.</strong></p>
|
|
<p>Precooked Preview, Edit, and possibly Snippets links</p>
|
|
</li>
|
|
<li><p class="first"><strong>%M.</strong></p>
|
|
<p>MIME type</p>
|
|
</li>
|
|
<li><p class="first"><strong>%N.</strong></p>
|
|
<p>result Number inside the result page</p>
|
|
</li>
|
|
<li><p class="first"><strong>%P.</strong></p>
|
|
<p>Parent folder Url. In the case of an embedded document, this is the
|
|
parent folder for the top level container file.</p>
|
|
</li>
|
|
<li><p class="first"><strong>%R.</strong></p>
|
|
<p>Relevance percentage</p>
|
|
</li>
|
|
<li><p class="first"><strong>%S.</strong></p>
|
|
<p>Size information</p>
|
|
</li>
|
|
<li><p class="first"><strong>%T.</strong></p>
|
|
<p>Title or Filename if not set.</p>
|
|
</li>
|
|
<li><p class="first"><strong>%t.</strong></p>
|
|
<p>Title or empty.</p>
|
|
</li>
|
|
<li><p class="first"><strong>%(filename).</strong></p>
|
|
<p>File name.</p>
|
|
</li>
|
|
<li><p class="first"><strong>%U.</strong></p>
|
|
<p>Url</p>
|
|
</li>
|
|
</ul>
|
|
<p>The format of the Preview, Edit, and Snippets links is
|
|
<code class="docutils literal"><span class="pre"><a</span> <span class="pre">href="P%N"></span></code>, <code class="docutils literal"><span class="pre"><a</span> <span class="pre">href="E%N"></span></code> and <code class="docutils literal"><span class="pre"><a</span> <span class="pre">href="A%N"></span></code> where
|
|
docnum (%N) expands to the document number inside the result page.</p>
|
|
<p>A link target defined as <code class="docutils literal"><span class="pre">"F%N"</span></code> will open the document corresponding
|
|
to the <code class="docutils literal"><span class="pre">%P</span></code> parent folder expansion, usually creating a file manager
|
|
window on the folder where the container file resides. E.g.:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o"><</span><span class="n">a</span> <span class="n">href</span><span class="o">=</span><span class="s2">"F%N"</span><span class="o">>%</span><span class="n">P</span><span class="o"></</span><span class="n">a</span><span class="o">></span>
|
|
</pre></div>
|
|
</div>
|
|
<p>A link target defined as <code class="docutils literal"><span class="pre">R%N|scriptname</span></code> will run the corresponding
|
|
script on the result file (if the document is embedded, the script will
|
|
be started on the top-level parent). See the <a class="reference external" href="#RCL.SEARCH.GUI.RUNSCRIPT">section about defining
|
|
scripts</a>.</p>
|
|
<p>In addition to the predefined values above, all strings like
|
|
<code class="docutils literal"><span class="pre">%(fieldname)</span></code> will be replaced by the value of the field named
|
|
<code class="docutils literal"><span class="pre">fieldname</span></code> for this document. Only stored fields can be accessed in
|
|
this way, the value of indexed but not stored fields is not known at
|
|
this point in the search process (see <a class="reference external" href="#RCL.PROGRAM.FIELDS">field
|
|
configuration</a>). There are currently very few
|
|
fields stored by default, apart from the values above (only <code class="docutils literal"><span class="pre">author</span></code>
|
|
and <code class="docutils literal"><span class="pre">filename</span></code>), so this feature will need some custom local
|
|
configuration to be useful. An example candidate would be the
|
|
<code class="docutils literal"><span class="pre">recipient</span></code> field which is generated by the message input handlers.</p>
|
|
<p>The default value for the paragraph format string is:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="s2">"<table class=</span><span class="se">\"</span><span class="s2">respar</span><span class="se">\"</span><span class="s2">></span><span class="se">\n</span><span class="s2">"</span>
|
|
<span class="s2">"<tr></span><span class="se">\n</span><span class="s2">"</span>
|
|
<span class="s2">"<td><a href='%U'><img src='%I' width='64'></a></td></span><span class="se">\n</span><span class="s2">"</span>
|
|
<span class="s2">"<td>%L &nbsp;<i>%S</i> &nbsp;&nbsp;<b>%T</b><br></span><span class="se">\n</span><span class="s2">"</span>
|
|
<span class="s2">"<span style='white-space:nowrap'><i>%M</i>&nbsp;%D</span>&nbsp;&nbsp;&nbsp; <i>%U</i>&nbsp;</span><span class="si">%i</span><span class="s2"><br></span><span class="se">\n</span><span class="s2">"</span>
|
|
<span class="s2">"%A %K</td></span><span class="se">\n</span><span class="s2">"</span>
|
|
<span class="s2">"</tr></table></span><span class="se">\n</span><span class="s2">"</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>You may, for example, try the following for a more web-like experience:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o"><</span><span class="n">u</span><span class="o">><</span><span class="n">b</span><span class="o">><</span><span class="n">a</span> <span class="n">href</span><span class="o">=</span><span class="s2">"P%N"</span><span class="o">>%</span><span class="n">T</span><span class="o"></</span><span class="n">a</span><span class="o">></</span><span class="n">b</span><span class="o">></</span><span class="n">u</span><span class="o">><</span><span class="n">br</span><span class="o">></span>
|
|
<span class="o">%</span><span class="n">A</span><span class="o"><</span><span class="n">font</span> <span class="n">color</span><span class="o">=</span><span class="c1">#008000>%U - %S</font> - %L</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Note that the P%N link in the above paragraph makes the title a preview
|
|
link. Or the clean looking:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o"><</span><span class="n">img</span> <span class="n">src</span><span class="o">=</span><span class="s2">"%I"</span> <span class="n">align</span><span class="o">=</span><span class="s2">"left"</span><span class="o">>%</span><span class="n">L</span> <span class="o"><</span><span class="n">font</span> <span class="n">color</span><span class="o">=</span><span class="s2">"#900000"</span><span class="o">>%</span><span class="n">R</span><span class="o"></</span><span class="n">font</span><span class="o">></span>
|
|
<span class="o">&</span><span class="n">nbsp</span><span class="p">;</span><span class="o">&</span><span class="n">nbsp</span><span class="p">;</span><span class="o"><</span><span class="n">b</span><span class="o">>%</span><span class="n">T</span><span class="o"></</span><span class="n">b</span><span class="o">><</span><span class="n">br</span><span class="o">>%</span><span class="n">S</span><span class="o">&</span><span class="n">nbsp</span><span class="p">;</span>
|
|
<span class="o"><</span><span class="n">font</span> <span class="n">color</span><span class="o">=</span><span class="s2">"#808080"</span><span class="o">><</span><span class="n">i</span><span class="o">>%</span><span class="n">U</span><span class="o"></</span><span class="n">i</span><span class="o">></</span><span class="n">font</span><span class="o">></span>
|
|
<span class="o"><</span><span class="n">table</span> <span class="n">bgcolor</span><span class="o">=</span><span class="s2">"#e0e0e0"</span><span class="o">></span>
|
|
<span class="o"><</span><span class="n">tr</span><span class="o">><</span><span class="n">td</span><span class="o">><</span><span class="n">div</span><span class="o">>%</span><span class="n">A</span><span class="o"></</span><span class="n">div</span><span class="o">></</span><span class="n">td</span><span class="o">></</span><span class="n">tr</span><span class="o">></span>
|
|
<span class="o"></</span><span class="n">table</span><span class="o">>%</span><span class="n">K</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>These samples, and some others are <a class="reference external" href="http://www.recoll.org/custom.html">on the web site, with pictures to
|
|
show how they look.</a></p>
|
|
<p>It is also possible to <a class="reference external" href="#RCL.SEARCH.GUI.CUSTOM.ABSSEP">define the value of the snippet separator inside
|
|
the abstract section</a>.</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="searching-with-the-kde-kio-slave">
|
|
<h3><a class="toc-backref" href="#id21">Searching with the KDE KIO slave</a><a class="headerlink" href="#searching-with-the-kde-kio-slave" title="Permalink to this headline">¶</a></h3>
|
|
<div class="section" id="what-s-this">
|
|
<h4>What’s this<a class="headerlink" href="#what-s-this" title="Permalink to this headline">¶</a></h4>
|
|
<p>The Recoll KIO slave allows performing a Recoll search by entering an
|
|
appropriate URL in a KDE open dialog, or with an HTML-based interface
|
|
displayed in <code class="docutils literal"><span class="pre">Konqueror</span></code>.</p>
|
|
<p>The HTML-based interface is similar to the Qt-based interface, but
|
|
slightly less powerful for now. Its advantage is that you can perform
|
|
your search while staying fully within the KDE framework: drag and drop
|
|
from the result list works normally and you have your normal choice of
|
|
applications for opening files.</p>
|
|
<p>The alternative interface uses a directory view of search results. Due
|
|
to limitations in the current KIO slave interface, it is currently not
|
|
obviously useful (to me).</p>
|
|
<p>The interface is described in more detail inside a help file which you
|
|
can access by entering <code class="docutils literal"><span class="pre">recoll:/</span></code> inside the <code class="docutils literal"><span class="pre">konqueror</span></code> URL line
|
|
(this works only if the recoll KIO slave has been previously installed).</p>
|
|
<p>The instructions for building this module are located in the source
|
|
tree. See: <code class="docutils literal"><span class="pre">kde/kio/recoll/00README.txt</span></code>. Some Linux distributions do
|
|
package the kio-recoll module, so check before diving into the build
|
|
process, maybe it’s already out there ready for one-click installation.</p>
|
|
</div>
|
|
<div class="section" id="searchable-documents">
|
|
<h4>Searchable documents<a class="headerlink" href="#searchable-documents" title="Permalink to this headline">¶</a></h4>
|
|
<p>As a sample application, the Recoll KIO slave could allow preparing a set
|
|
of HTML documents (for example a manual) so that they become their own
|
|
search interface inside <code class="docutils literal"><span class="pre">konqueror</span></code>.</p>
|
|
<p>This can be done by either explicitly inserting
|
|
<code class="docutils literal"><span class="pre"><a</span> <span class="pre">href="recoll://..."></span></code> links around some document areas, or
|
|
automatically by adding a very small javascript program to the
|
|
documents, like the following example, which would initiate a search by
|
|
double-clicking any term:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o"><</span><span class="n">script</span> <span class="n">language</span><span class="o">=</span><span class="s2">"JavaScript"</span><span class="o">></span>
|
|
<span class="n">function</span> <span class="n">recollsearch</span><span class="p">()</span> <span class="p">{</span>
|
|
<span class="n">var</span> <span class="n">t</span> <span class="o">=</span> <span class="n">document</span><span class="o">.</span><span class="n">getSelection</span><span class="p">();</span>
|
|
<span class="n">window</span><span class="o">.</span><span class="n">location</span><span class="o">.</span><span class="n">href</span> <span class="o">=</span> <span class="s1">'recoll://search/query?qtp=a&p=0&q='</span> <span class="o">+</span>
|
|
<span class="n">encodeURIComponent</span><span class="p">(</span><span class="n">t</span><span class="p">);</span>
|
|
<span class="p">}</span>
|
|
<span class="o"></</span><span class="n">script</span><span class="o">></span>
|
|
<span class="o">....</span>
|
|
<span class="o"><</span><span class="n">body</span> <span class="n">ondblclick</span><span class="o">=</span><span class="s2">"recollsearch()"</span><span class="o">></span>
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="searching-on-the-command-line">
|
|
<h3><a class="toc-backref" href="#id22">Searching on the command line</a><a class="headerlink" href="#searching-on-the-command-line" title="Permalink to this headline">¶</a></h3>
|
|
<p>There are several ways to obtain search results as a text stream,
|
|
without a graphical interface:</p>
|
|
<ul class="simple">
|
|
<li>By passing option <code class="docutils literal"><span class="pre">-t</span></code> to the <code class="docutils literal"><span class="pre">recoll</span></code> program, or by calling it
|
|
as <code class="docutils literal"><span class="pre">recollq</span></code> (through a link).</li>
|
|
<li>By using the <code class="docutils literal"><span class="pre">recollq</span></code> program.</li>
|
|
<li>By writing a custom Python program, using the <a class="reference external" href="#RCL.PROGRAM.PYTHONAPI">Recoll Python
|
|
API</a>.</li>
|
|
</ul>
|
|
<p>The first two methods work in the same way and accept/need the same
|
|
arguments (except for the additional <code class="docutils literal"><span class="pre">-t</span></code> to <code class="docutils literal"><span class="pre">recoll</span></code>). The query to
|
|
be executed is specified as command line arguments.</p>
|
|
<p><code class="docutils literal"><span class="pre">recollq</span></code> is not built by default. You can use the <code class="docutils literal"><span class="pre">Makefile</span></code> in the
|
|
<code class="docutils literal"><span class="pre">query</span></code> directory to build it. This is a very simple program, and if
|
|
you can program a little C++, you may find it useful to tailor its
|
|
output format to your needs. Note that recollq is only really useful on
|
|
systems where the Qt libraries (or even the X11 ones) are not available.
|
|
Otherwise, just use <code class="docutils literal"><span class="pre">recoll</span> <span class="pre">-t</span></code>, which takes the exact same parameters
|
|
and options which are described for <code class="docutils literal"><span class="pre">recollq</span></code>.</p>
|
|
<p><code class="docutils literal"><span class="pre">recollq</span></code> has a man page (not installed by default, look in the
|
|
<code class="docutils literal"><span class="pre">doc/man</span></code> directory). The Usage string is as follows:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span>recollq: usage:
|
|
-P: Show the date span for all the documents present in the index
|
|
[-o|-a|-f] [-q] <query string>
|
|
Runs a recoll query and displays result lines.
|
|
Default: will interpret the argument(s) as a xesam query string
|
|
query may be like:
|
|
implicit AND, Exclusion, field spec: t1 -t2 title:t3
|
|
OR has priority: t1 OR t2 t3 OR t4 means (t1 OR t2) AND (t3 OR t4)
|
|
Phrase: "t1 t2" (needs additional quoting on cmd line)
|
|
-o Emulate the GUI simple search in ANY TERM mode
|
|
-a Emulate the GUI simple search in ALL TERMS mode
|
|
-f Emulate the GUI simple search in filename mode
|
|
-q is just ignored (compatibility with the recoll GUI command line)
|
|
Common options:
|
|
-c <configdir> : specify config directory, overriding $RECOLL_CONFDIR
|
|
-d also dump file contents
|
|
-n [first-]<cnt> define the result slice. The default value for [first]
|
|
is 0. Without the option, the default max count is 2000.
|
|
Use n=0 for no limit
|
|
-b : basic. Just output urls, no mime types or titles
|
|
-Q : no result lines, just the processed query and result count
|
|
-m : dump the whole document meta[] array for each result
|
|
-A : output the document abstracts
|
|
-S fld : sort by field <fld>
|
|
-s stemlang : set stemming language to use (must exist in index...)
|
|
Use -s "" to turn off stem expansion
|
|
-D : sort descending
|
|
-i <dbdir> : additional index, several can be given
|
|
-e use url encoding (%xx) for urls
|
|
-F <field name list> : output exactly these fields for each result.
|
|
The field values are encoded in base64, output in one line and
|
|
separated by one space character. This is the recommended format
|
|
for use by other programs. Use a normal query with option -m to
|
|
see the field names.
|
|
</pre></div>
|
|
</div>
|
|
<p>Sample execution:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">recollq</span> <span class="s1">'ilur -nautique mime:text/html'</span>
|
|
<span class="n">Recoll</span> <span class="n">query</span><span class="p">:</span> <span class="p">((((</span><span class="n">ilur</span><span class="p">:(</span><span class="n">wqf</span><span class="o">=</span><span class="mi">11</span><span class="p">)</span> <span class="n">OR</span> <span class="n">ilurs</span><span class="p">)</span> <span class="n">AND_NOT</span> <span class="p">(</span><span class="n">nautique</span><span class="p">:(</span><span class="n">wqf</span><span class="o">=</span><span class="mi">11</span><span class="p">)</span>
|
|
<span class="n">OR</span> <span class="n">nautiques</span> <span class="n">OR</span> <span class="n">nautiqu</span> <span class="n">OR</span> <span class="n">nautiquement</span><span class="p">))</span> <span class="n">FILTER</span> <span class="n">Ttext</span><span class="o">/</span><span class="n">html</span><span class="p">))</span>
|
|
<span class="mi">4</span> <span class="n">results</span>
|
|
<span class="n">text</span><span class="o">/</span><span class="n">html</span> <span class="p">[</span><span class="n">file</span><span class="p">:</span><span class="o">///</span><span class="n">Users</span><span class="o">/</span><span class="n">uncrypted</span><span class="o">-</span><span class="n">dockes</span><span class="o">/</span><span class="n">projets</span><span class="o">/</span><span class="n">bateaux</span><span class="o">/</span><span class="n">ilur</span><span class="o">/</span><span class="n">comptes</span><span class="o">.</span><span class="n">html</span><span class="p">]</span> <span class="p">[</span><span class="n">comptes</span><span class="o">.</span><span class="n">html</span><span class="p">]</span> <span class="mi">18593</span> <span class="nb">bytes</span>
|
|
<span class="n">text</span><span class="o">/</span><span class="n">html</span> <span class="p">[</span><span class="n">file</span><span class="p">:</span><span class="o">///</span><span class="n">Users</span><span class="o">/</span><span class="n">uncrypted</span><span class="o">-</span><span class="n">dockes</span><span class="o">/</span><span class="n">projets</span><span class="o">/</span><span class="n">nautique</span><span class="o">/</span><span class="n">webnautique</span><span class="o">/</span><span class="n">articles</span><span class="o">/</span><span class="n">ilur1</span><span class="o">/</span><span class="n">index</span><span class="o">.</span><span class="n">html</span><span class="p">]</span> <span class="p">[</span><span class="n">Constructio</span><span class="o">...</span>
|
|
<span class="n">text</span><span class="o">/</span><span class="n">html</span> <span class="p">[</span><span class="n">file</span><span class="p">:</span><span class="o">///</span><span class="n">Users</span><span class="o">/</span><span class="n">uncrypted</span><span class="o">-</span><span class="n">dockes</span><span class="o">/</span><span class="n">projets</span><span class="o">/</span><span class="n">pagepers</span><span class="o">/</span><span class="n">index</span><span class="o">.</span><span class="n">html</span><span class="p">]</span> <span class="p">[</span><span class="n">psxtcl</span><span class="o">/</span><span class="n">writemime</span><span class="o">/</span><span class="n">recoll</span><span class="p">]</span><span class="o">...</span>
|
|
<span class="n">text</span><span class="o">/</span><span class="n">html</span> <span class="p">[</span><span class="n">file</span><span class="p">:</span><span class="o">///</span><span class="n">Users</span><span class="o">/</span><span class="n">uncrypted</span><span class="o">-</span><span class="n">dockes</span><span class="o">/</span><span class="n">projets</span><span class="o">/</span><span class="n">bateaux</span><span class="o">/</span><span class="n">ilur</span><span class="o">/</span><span class="n">factEtCie</span><span class="o">/</span><span class="n">recu</span><span class="o">-</span><span class="n">chasse</span><span class="o">-</span><span class="n">maree</span><span class="o">....</span>
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="using-synonyms-1-22">
|
|
<h3><a class="toc-backref" href="#id23">Using Synonyms (1.22)</a><a class="headerlink" href="#using-synonyms-1-22" title="Permalink to this headline">¶</a></h3>
|
|
<p><strong>Term synonyms.</strong></p>
|
|
<p>There are a number of ways to use term synonyms for searching text:</p>
|
|
<ul class="simple">
|
|
<li>At index creation time, they can be used to alter the indexed terms,
|
|
either increasing or decreasing their number, by expanding the
|
|
original terms to all synonyms, or by reducing all synonym terms to a
|
|
canonical one.</li>
|
|
<li>At query time, they can be used to match texts containing terms which
|
|
are synonyms of the ones specified by the user, either by expanding
|
|
the query for all synonyms, or by reducing the user entry to
|
|
canonical terms (the latter only works if the corresponding
|
|
processing has been performed while creating the index).</li>
|
|
</ul>
|
|
<p>Recoll only uses synonyms at query time. A user query term which is part of a
|
|
synonym group will be optionally expanded into an <code class="docutils literal"><span class="pre">OR</span></code> query for all
|
|
terms in the group.</p>
|
|
<p>Synonym groups are defined inside ordinary text files. Each line in the
|
|
file defines a group.</p>
|
|
<p>Example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">hi</span> <span class="n">hello</span> <span class="s2">"good morning"</span>
|
|
|
|
<span class="c1"># not sure about "au revoir" though. Is this english ?</span>
|
|
<span class="n">bye</span> <span class="n">goodbye</span> <span class="s2">"see you"</span> \
|
|
<span class="s2">"au revoir"</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>As usual, lines beginning with a <code class="docutils literal"><span class="pre">#</span></code> are comments, empty lines are
|
|
ignored, and lines can be continued by ending them with a backslash.</p>
|
|
<p>Multi-word synonyms are supported, but be aware that these will generate
|
|
phrase queries, which may degrade performance and will disable stemming
|
|
expansion for the phrase terms.</p>
|
|
<p>The synonyms file can be specified in the Search parameters tab of the
|
|
GUI configuration Preferences menu entry, or as an option for
|
|
command-line searches.</p>
|
|
<p>Once the file is defined, the use of synonyms can be enabled or disabled
|
|
directly from the Preferences menu.</p>
|
|
<p>The synonyms are searched for matches with user terms after the latter
|
|
are stem-expanded, but the contents of the synonyms file itself are not
|
|
subjected to stem expansion. This means that a match will not be found
|
|
if the form present in the synonyms file is not present anywhere in the
|
|
document set.</p>
|
|
<p>The synonyms function is probably not going to help you find your
|
|
letters to Mr. Smith. It is best used for domain-specific searches. For
|
|
example, it was initially suggested by a user performing searches among
|
|
historical documents: the synonyms file would contain nicknames and
|
|
aliases for each of the persons of interest.</p>
|
|
</div>
|
|
<div class="section" id="path-translations">
|
|
<h3><a class="toc-backref" href="#id24">Path translations</a><a class="headerlink" href="#path-translations" title="Permalink to this headline">¶</a></h3>
|
|
<p>In some cases, the document paths stored inside the index do not match
|
|
the actual ones, so that document previews and accesses will fail. This
|
|
can occur in a number of circumstances:</p>
|
|
<ul class="simple">
|
|
<li>When using multiple indexes it is a relatively common occurrence that
|
|
some will actually reside on a remote volume, for example mounted via
|
|
NFS. In this case, the paths used to access the documents on the
|
|
local machine are not necessarily the same as the ones used while
|
|
indexing on the remote machine. For example, <code class="docutils literal"><span class="pre">/home/me</span></code> may have
|
|
been used as a <code class="docutils literal"><span class="pre">topdirs</span></code> element while indexing, but the directory
|
|
might be mounted as <code class="docutils literal"><span class="pre">/net/server/home/me</span></code> on the local machine.</li>
|
|
<li>The case may also occur with removable disks. It is perfectly
|
|
possible to configure an index to live with the documents on the
|
|
removable disk, but it may happen that the disk is not mounted at the
|
|
same place so that the document paths from the index are invalid.</li>
|
|
<li>As a last example, one could imagine that a big directory has been
|
|
moved, but that it is currently inconvenient to run the indexer.</li>
|
|
</ul>
|
|
<p>Recoll has a facility for rewriting access paths when extracting the data
|
|
from the index. The translations can be defined for the main index and
|
|
for any additional query index.</p>
|
|
<p>The path translation facility will be useful whenever the document
|
|
paths seen by the indexer are not the same as the ones which should be
|
|
used at query time.</p>
|
|
<p>In the above NFS example, RCL could be instructed to rewrite any
|
|
<code class="docutils literal"><span class="pre">file:///home/me</span></code> URL from the index to
|
|
<code class="docutils literal"><span class="pre">file:///net/server/home/me</span></code>, allowing accesses from the client.</p>
|
|
<p>The translations are defined in the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.PTRANS"><code class="docutils literal"><span class="pre">ptrans</span></code></a> configuration file, which
|
|
can be edited by hand or from the GUI external indexes configuration
|
|
dialog: Preferences > External index dialog, then click the Paths
|
|
translations button on the right below the index list.</p>
|
|
<blockquote>
|
|
<div><p><strong>Note</strong></p>
|
|
<p>Due to a current bug, the GUI must be restarted after changing the
|
|
<code class="docutils literal"><span class="pre">ptrans</span></code> values (even when they were changed from the GUI).</p>
|
|
</div></blockquote>
|
|
</div>
|
|
<div class="section" id="the-query-language">
|
|
<h3><a class="toc-backref" href="#id25">The query language</a><a class="headerlink" href="#the-query-language" title="Permalink to this headline">¶</a></h3>
|
|
<p>The query language processor is activated in the GUI simple search entry
|
|
when the search mode selector is set to Query Language. It can also be
|
|
used with the KIO slave or the command line search. It broadly has the
|
|
same capabilities as the complex search interface in the GUI.</p>
|
|
<p>The language was based on the now defunct
|
|
<a class="reference external" href="http://www.xesam.org/main/XesamUserSearchLanguage95">Xesam</a> user
|
|
search language specification.</p>
|
|
<p>If the results of a query language search puzzle you and you doubt what
|
|
has been actually searched for, you can use the GUI <code class="docutils literal"><span class="pre">Show</span> <span class="pre">Query</span></code> link
|
|
at the top of the result list to check the exact query which was finally
|
|
executed by Xapian.</p>
|
|
<p>Here follows a sample request that we are going to explain:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">author</span><span class="p">:</span><span class="s2">"john doe"</span> <span class="n">Beatles</span> <span class="n">OR</span> <span class="n">Lennon</span> <span class="n">Live</span> <span class="n">OR</span> <span class="n">Unplugged</span> <span class="o">-</span><span class="n">potatoes</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>This would search for all documents with John Doe appearing as a phrase
|
|
in the author field (exactly what this is would depend on the document
|
|
type, i.e. the <code class="docutils literal"><span class="pre">From:</span></code> header, for an email message), and containing
|
|
either beatles or lennon and either live or unplugged but not potatoes
|
|
(in any part of the document).</p>
|
|
<p>An element is composed of an optional field specification, and a value,
|
|
separated by a colon (the field separator is the last colon in the
|
|
element). Examples: Eugenie, author:balzac, dc:title:grandet,
|
|
dc:title:"eugenie grandet"</p>
|
|
<p>The colon, if present, means “contains”. Xesam defines other relations,
|
|
which are mostly unsupported for now (except in special cases, described
|
|
further down).</p>
|
|
<p>All elements in the search entry are normally combined with an implicit
|
|
AND. It is possible to specify that elements be OR’ed instead, as in
|
|
Beatles <code class="docutils literal"><span class="pre">OR</span></code> Lennon. The <code class="docutils literal"><span class="pre">OR</span></code> must be entered literally (capitals),
|
|
and it has priority over the AND associations: word1 word2 <code class="docutils literal"><span class="pre">OR</span></code> word3
|
|
means word1 AND (word2 <code class="docutils literal"><span class="pre">OR</span></code> word3) not (word1 AND word2) <code class="docutils literal"><span class="pre">OR</span></code> word3.</p>
|
|
<p>RCL versions 1.21 and later allow using parentheses to group elements,
|
|
which will sometimes make things clearer, and may allow expressing
|
|
combinations which would have been difficult otherwise.</p>
|
|
<p>An element preceded by a <code class="docutils literal"><span class="pre">-</span></code> specifies a term that should <em>not</em>
|
|
appear.</p>
|
|
<p>As usual, words inside quotes define a phrase (the order of words is
|
|
significant), so that title:"prejudice pride" is not the same as
|
|
title:prejudice title:pride, and is unlikely to find a result.</p>
|
|
<p>Words inside phrases and capitalized words are not stem-expanded.
|
|
Wildcards may be used anywhere inside a term. Specifying a wild-card on
|
|
the left of a term can produce a very slow search (or even an incorrect
|
|
one if the expansion is truncated because of excessive size). Also see
|
|
<a class="reference external" href="#RCL.SEARCH.WILDCARDS">More about wildcards</a>.</p>
|
|
<p>To save you some typing, recent RCL versions (1.20 and later) interpret
|
|
a comma-separated list of terms as an AND list inside the field. Use
|
|
slash characters (‘/’) for an OR list. No white space is allowed. So</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">author</span><span class="p">:</span><span class="n">john</span><span class="p">,</span><span class="n">lennon</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>will search for documents with <code class="docutils literal"><span class="pre">john</span></code> and <code class="docutils literal"><span class="pre">lennon</span></code> inside the
|
|
<code class="docutils literal"><span class="pre">author</span></code> field (in any order), and</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">author</span><span class="p">:</span><span class="n">john</span><span class="o">/</span><span class="n">ringo</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>would search for <code class="docutils literal"><span class="pre">john</span></code> or <code class="docutils literal"><span class="pre">ringo</span></code>.</p>
|
|
<p>Modifiers can be set on a double-quote value, for example to specify a
|
|
proximity search (unordered). See <a class="reference external" href="#RCL.SEARCH.LANG.MODIFIERS">the modifier
|
|
section</a>. No space must separate the
|
|
final double-quote and the modifiers value, e.g. "two one"po10</p>
|
|
<p>RCL currently manages the following default fields:</p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">title</span></code>, <code class="docutils literal"><span class="pre">subject</span></code> or <code class="docutils literal"><span class="pre">caption</span></code> are synonyms which specify data
|
|
to be searched for in the document title or subject.</li>
|
|
<li><code class="docutils literal"><span class="pre">author</span></code> or <code class="docutils literal"><span class="pre">from</span></code> for searching the document originators.</li>
|
|
<li><code class="docutils literal"><span class="pre">recipient</span></code> or <code class="docutils literal"><span class="pre">to</span></code> for searching the document recipients.</li>
|
|
<li><code class="docutils literal"><span class="pre">keyword</span></code> for searching the document-specified keywords (few
|
|
documents actually have any).</li>
|
|
<li><code class="docutils literal"><span class="pre">filename</span></code> for the document’s file name. This is not necessarily
|
|
set for all documents: internal documents contained inside a compound
|
|
one (for example an EPUB section) do not inherit the container file
|
|
name any more, this was replaced by an explicit field (see next).
|
|
Sub-documents can still have a specific <code class="docutils literal"><span class="pre">filename</span></code>, if it is
|
|
implied by the document format, for example the attachment file name
|
|
for an email attachment.</li>
|
|
<li><code class="docutils literal"><span class="pre">containerfilename</span></code>. This is set for all documents, both top-level
|
|
and contained sub-documents, and is always the name of the filesystem
|
|
directory entry which contains the data. The terms from this field
|
|
can only be matched by an explicit field specification (as opposed to
|
|
terms from <code class="docutils literal"><span class="pre">filename</span></code> which are also indexed as general document
|
|
content). This avoids getting matches for all the sub-documents when
|
|
searching for the container file name.</li>
|
|
<li><code class="docutils literal"><span class="pre">ext</span></code> specifies the file name extension (Ex: <code class="docutils literal"><span class="pre">ext:html</span></code>)</li>
|
|
</ul>
|
|
<p>RCL 1.20 and later have a way to specify aliases for the field names,
|
|
which will save typing, for example by aliasing <code class="docutils literal"><span class="pre">filename</span></code> to fn or
|
|
<code class="docutils literal"><span class="pre">containerfilename</span></code> to cfn. See the <a class="reference external" href="#RCL.INSTALL.CONFIG.FIELDS">section about the <code class="docutils literal"><span class="pre">fields</span></code>
|
|
file</a>.</p>
|
|
<p>The document input handlers used while indexing have the possibility to
|
|
create other fields with arbitrary names, and aliases may be defined in
|
|
the configuration, so that the exact field search possibilities may be
|
|
different for you if someone took care of the customisation.</p>
|
|
<p>The field syntax also supports a few field-like, but special, criteria:</p>
|
|
<ul>
|
|
<li><p class="first"><code class="docutils literal"><span class="pre">dir</span></code> for filtering the results on file location (Ex:
|
|
<code class="docutils literal"><span class="pre">dir:/home/me/somedir</span></code>). <code class="docutils literal"><span class="pre">-dir</span></code> also works to find results not in
|
|
the specified directory (release >= 1.15.8). Tilde expansion will be
|
|
performed as usual (except for a bug in versions 1.19 to 1.19.11p1).
|
|
Wildcards will be expanded, but please <a class="reference external" href="#RCL.SEARCH.WILDCARDS.PATH">have a
|
|
look</a> at an important limitation of
|
|
wildcards in path filters.</p>
|
|
<p>Relative paths also make sense, for example, <code class="docutils literal"><span class="pre">dir:share/doc</span></code> would
|
|
match either <code class="docutils literal"><span class="pre">/usr/share/doc</span></code> or <code class="docutils literal"><span class="pre">/usr/local/share/doc</span></code></p>
|
|
<p>Several <code class="docutils literal"><span class="pre">dir</span></code> clauses can be specified, both positive and negative.
|
|
For example the following makes sense:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="nb">dir</span><span class="p">:</span><span class="n">recoll</span> <span class="nb">dir</span><span class="p">:</span><span class="n">src</span> <span class="o">-</span><span class="nb">dir</span><span class="p">:</span><span class="n">utils</span> <span class="o">-</span><span class="nb">dir</span><span class="p">:</span><span class="n">common</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>This would select results which have both <code class="docutils literal"><span class="pre">recoll</span></code> and <code class="docutils literal"><span class="pre">src</span></code> in
|
|
the path (in any order), and which have neither <code class="docutils literal"><span class="pre">utils</span></code> nor
|
|
<code class="docutils literal"><span class="pre">common</span></code>.</p>
|
|
<p>You can also use <code class="docutils literal"><span class="pre">OR</span></code> conjunctions with <code class="docutils literal"><span class="pre">dir:</span></code> clauses.</p>
|
|
<p>A special aspect of <code class="docutils literal"><span class="pre">dir</span></code> clauses is that the values in the index
|
|
are not transcoded to UTF-8, and never lower-cased or unaccented, but
|
|
stored as binary. This means that you need to enter the values in the
|
|
exact lower or upper case, and that searches for names with
|
|
diacritics may sometimes be impossible because of character set
|
|
conversion issues. Non-ASCII UNIX file paths are an unending source
|
|
of trouble and are best avoided.</p>
|
|
<p>You need to use double-quotes around the path value if it contains
|
|
space characters.</p>
|
|
</li>
|
|
<li><p class="first"><code class="docutils literal"><span class="pre">size</span></code> for filtering the results on file size. Example:
|
|
<code class="docutils literal"><span class="pre">size&lt;10000</span></code>. You can use <code class="docutils literal"><span class="pre">&lt;</span></code>, <code class="docutils literal"><span class="pre">&gt;</span></code> or <code class="docutils literal"><span class="pre">=</span></code> as operators. You
|
|
can specify a range like the following: <code class="docutils literal"><span class="pre">size&gt;100</span> <span class="pre">size&lt;1000</span></code>. The
|
|
usual <code class="docutils literal"><span class="pre">k/K,</span> <span class="pre">m/M,</span> <span class="pre">g/G,</span> <span class="pre">t/T</span></code> can be used as (decimal) multipliers.
|
|
Ex: <code class="docutils literal"><span class="pre">size>1k</span></code> to search for files bigger than 1000 bytes.</p>
|
|
</li>
|
|
<li><p class="first"><code class="docutils literal"><span class="pre">date</span></code> for searching or filtering on dates. The syntax for the
|
|
argument is based on the ISO8601 standard for dates and time
|
|
intervals. Only dates are supported, no times. The general syntax is
|
|
2 elements separated by a <code class="docutils literal"><span class="pre">/</span></code> character. Each element can be a date
|
|
or a period of time. Periods are specified as
|
|
<code class="docutils literal"><span class="pre">P</span></code>n<code class="docutils literal"><span class="pre">Y</span></code>n<code class="docutils literal"><span class="pre">M</span></code>n<code class="docutils literal"><span class="pre">D</span></code>. The n numbers are the respective
|
|
numbers of years, months or days, any of which may be missing. Dates
|
|
are specified as YYYY-MM-DD. The days and months parts may be
|
|
missing. If the <code class="docutils literal"><span class="pre">/</span></code> is present but an element is missing, the
|
|
missing element is interpreted as the lowest or highest date in the
|
|
index. Examples:</p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">2001-03-01/2002-05-01</span></code> the basic syntax for an interval of
|
|
dates.</li>
|
|
<li><code class="docutils literal"><span class="pre">2001-03-01/P1Y2M</span></code> the same specified with a period.</li>
|
|
<li><code class="docutils literal"><span class="pre">2001/</span></code> from the beginning of 2001 to the latest date in the
|
|
index.</li>
|
|
<li><code class="docutils literal"><span class="pre">2001</span></code> the whole year of 2001</li>
|
|
<li><code class="docutils literal"><span class="pre">P2D/</span></code> means 2 days ago up to now if there are no documents with
|
|
dates in the future.</li>
|
|
<li><code class="docutils literal"><span class="pre">/2003</span></code> all documents from 2003 or older.</li>
|
|
</ul>
|
|
<p>Periods can also be specified with lowercase letters (e.g. p2y).</p>
|
|
</li>
|
|
<li><p class="first"><code class="docutils literal"><span class="pre">mime</span></code> or <code class="docutils literal"><span class="pre">format</span></code> for specifying the MIME type. These clauses
|
|
are processed besides the normal Boolean logic of the search.
|
|
Multiple values will be OR’ed (instead of the normal AND). You can
|
|
specify types to be excluded, with the usual <code class="docutils literal"><span class="pre">-</span></code>, and use
|
|
wildcards. Example: <code class="docutils literal"><span class="pre">mime:text/*</span> <span class="pre">-mime:text/plain</span></code>. Specifying an
|
|
explicit boolean operator before a <code class="docutils literal"><span class="pre">mime</span></code> specification is not
|
|
supported and will produce strange results.</p>
|
|
</li>
|
|
<li><p class="first"><code class="docutils literal"><span class="pre">type</span></code> or <code class="docutils literal"><span class="pre">rclcat</span></code> for specifying the category (as in
|
|
text/media/presentation/etc.). The classification of MIME types in
|
|
categories is defined in the RCL configuration (<code class="docutils literal"><span class="pre">mimeconf</span></code>), and
|
|
can be modified or extended. The default category names are those
|
|
which permit filtering results in the main GUI screen. Categories are
|
|
OR’ed like MIME types above, and can be negated with <code class="docutils literal"><span class="pre">-</span></code>.</p>
|
|
<blockquote>
|
|
<div><p><strong>Note</strong></p>
|
|
<p><code class="docutils literal"><span class="pre">mime</span></code>, <code class="docutils literal"><span class="pre">rclcat</span></code>, <code class="docutils literal"><span class="pre">size</span></code> and <code class="docutils literal"><span class="pre">date</span></code> criteria always affect
|
|
the whole query (they are applied as a final filter), even if set
|
|
with other terms inside parentheses.</p>
|
|
<p><strong>Note</strong></p>
|
|
<p><code class="docutils literal"><span class="pre">mime</span></code> (or the equivalent <code class="docutils literal"><span class="pre">rclcat</span></code>) is the <em>only</em> field with an
|
|
<code class="docutils literal"><span class="pre">OR</span></code> default. You do need to use <code class="docutils literal"><span class="pre">OR</span></code> with <code class="docutils literal"><span class="pre">ext</span></code> terms for
|
|
example.</p>
|
|
</div></blockquote>
|
|
</li>
|
|
</ul>
|
|
<div class="section" id="range-clauses">
|
|
<h4>Range clauses<a class="headerlink" href="#range-clauses" title="Permalink to this headline">¶</a></h4>
|
|
<p>RCL 1.24 and later support range clauses on fields which have been
|
|
configured to support it. No default field uses them currently, so this
|
|
paragraph is only interesting if you modified the fields configuration
|
|
and possibly use a custom input handler.</p>
|
|
<p>A range clause looks like one of the following:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">myfield</span><span class="p">:</span><span class="n">small</span><span class="o">..</span><span class="n">big</span>
|
|
<span class="n">myfield</span><span class="p">:</span><span class="n">small</span><span class="o">..</span>
|
|
<span class="n">myfield</span><span class="p">:</span><span class="o">..</span><span class="n">big</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>The nature of the clause is indicated by the two dots <code class="docutils literal"><span class="pre">..</span></code>, and the
|
|
effect is to filter the results for which the myfield value is in the
|
|
possibly open-ended interval.</p>
|
|
<p>See the section about the <a class="reference external" href="#RCL.INSTALL.CONFIG.FIELDS"><code class="docutils literal"><span class="pre">fields</span></code> configuration
|
|
file</a> for the details of configuring a
|
|
field for range searches (list them in the [values] section).</p>
|
|
</div>
|
|
<div class="section" id="modifiers">
|
|
<h4>Modifiers<a class="headerlink" href="#modifiers" title="Permalink to this headline">¶</a></h4>
|
|
<p>Some characters are recognized as search modifiers when found
|
|
immediately after the closing double quote of a phrase, as in
|
|
<code class="docutils literal"><span class="pre">"some</span> <span class="pre">term"modifierchars</span></code>. The actual “phrase” can be a single term
|
|
of course. Supported modifiers:</p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">l</span></code> can be used to turn off stemming (mostly makes sense with <code class="docutils literal"><span class="pre">p</span></code>
|
|
because stemming is off by default for phrases).</li>
|
|
<li><code class="docutils literal"><span class="pre">s</span></code> can be used to turn off synonym expansion, if a synonyms file
|
|
is in place (only for RCL 1.22 and later).</li>
|
|
<li><code class="docutils literal"><span class="pre">o</span></code> can be used to specify a “slack” for phrase and proximity
|
|
searches: the number of additional terms that may be found between
|
|
the specified ones. If <code class="docutils literal"><span class="pre">o</span></code> is followed by an integer number, this
|
|
is the slack, else the default is 10.</li>
|
|
<li><code class="docutils literal"><span class="pre">p</span></code> can be used to turn the default phrase search into a proximity
|
|
one (unordered). Example: <code class="docutils literal"><span class="pre">"order</span> <span class="pre">any</span> <span class="pre">in"p</span></code></li>
|
|
<li><code class="docutils literal"><span class="pre">C</span></code> will turn on case sensitivity (if the index supports it).</li>
|
|
<li><code class="docutils literal"><span class="pre">D</span></code> will turn on diacritics sensitivity (if the index supports it).</li>
|
|
<li>A weight can be specified for a query element by specifying a decimal
|
|
value at the start of the modifiers. Example: <code class="docutils literal"><span class="pre">"Important"2.5</span></code>.</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="search-case-and-diacritics-sensitivity">
|
|
<h3><a class="toc-backref" href="#id26">Search case and diacritics sensitivity</a><a class="headerlink" href="#search-case-and-diacritics-sensitivity" title="Permalink to this headline">¶</a></h3>
|
|
<p>For RCL versions 1.18 and later, and <em>when working with a raw index</em>
|
|
(not the default), searches can be sensitive to character case and
|
|
diacritics. How this happens is controlled by configuration variables
|
|
and what search data is entered.</p>
|
|
<p>The general default is that searches entered without upper-case or
|
|
accented characters are insensitive to case and diacritics. An entry of
|
|
<code class="docutils literal"><span class="pre">resume</span></code> will match any of <code class="docutils literal"><span class="pre">Resume</span></code>, <code class="docutils literal"><span class="pre">RESUME</span></code>, <code class="docutils literal"><span class="pre">résumé</span></code>,
|
|
<code class="docutils literal"><span class="pre">Résumé</span></code> etc.</p>
|
|
<p>Two configuration variables can automate switching on sensitivity (they
|
|
were documented but actually did nothing until RCL 1.22):</p>
|
|
<dl class="docutils">
|
|
<dt>autodiacsens</dt>
|
|
<dd>If this is set, search sensitivity to diacritics will be turned on
|
|
as soon as an accented character exists in a search term. When the
|
|
variable is set to true, <code class="docutils literal"><span class="pre">resume</span></code> will start a
|
|
diacritics-insensitive search, but <code class="docutils literal"><span class="pre">résumé</span></code> will be matched
|
|
exactly. The default value is <em>false</em>.</dd>
|
|
<dt>autocasesens</dt>
|
|
<dd>If this is set, search sensitivity to character case will be turned
|
|
on as soon as an upper-case character exists in a search term
|
|
<em>except for the first one</em>. When the variable is set to true, <code class="docutils literal"><span class="pre">us</span></code>
|
|
or <code class="docutils literal"><span class="pre">Us</span></code> will start a case-insensitive search, but <code class="docutils literal"><span class="pre">US</span></code>
|
|
will be matched exactly. The default value is <em>true</em> (contrary to
|
|
<code class="docutils literal"><span class="pre">autodiacsens</span></code>).</dd>
|
|
</dl>
|
|
<p>As in the past, capitalizing the first letter of a word will turn off
|
|
its stem expansion and have no effect on case-sensitivity.</p>
|
|
<p>You can also explicitly activate case and diacritics sensitivity by
|
|
using modifiers with the query language. <code class="docutils literal"><span class="pre">C</span></code> will make the term
|
|
case-sensitive, and <code class="docutils literal"><span class="pre">D</span></code> will make it diacritics-sensitive. Examples:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="s2">"us"</span><span class="n">C</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>will search for the term <code class="docutils literal"><span class="pre">us</span></code> exactly (<code class="docutils literal"><span class="pre">Us</span></code> will not be a match).</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="s2">"resume"</span><span class="n">D</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>will search for the term <code class="docutils literal"><span class="pre">resume</span></code> exactly (<code class="docutils literal"><span class="pre">résumé</span></code> will not be a
|
|
match).</p>
|
|
<p>When either case or diacritics sensitivity is activated, stem expansion
|
|
is turned off. Having both does not make much sense.</p>
|
|
</div>
|
|
<div class="section" id="anchored-searches-and-wildcards">
|
|
<h3><a class="toc-backref" href="#id27">Anchored searches and wildcards</a><a class="headerlink" href="#anchored-searches-and-wildcards" title="Permalink to this headline">¶</a></h3>
|
|
<p>Some special characters are interpreted by RCL in search strings to
|
|
expand or specialize the search. Wildcards expand a root term in
|
|
controlled ways. Anchor characters can restrict a search to succeed only
|
|
if the match is found at or near the beginning of the document or one of
|
|
its fields.</p>
|
|
<div class="section" id="more-about-wildcards">
|
|
<h4>More about wildcards<a class="headerlink" href="#more-about-wildcards" title="Permalink to this headline">¶</a></h4>
|
|
<p>All words entered in RCL search fields will be processed for wildcard
|
|
expansion before the request is finally executed.</p>
|
|
<p>The wildcard characters are:</p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">*</span></code> which matches 0 or more characters.</li>
|
|
<li><code class="docutils literal"><span class="pre">?</span></code> which matches a single character.</li>
|
|
<li><code class="docutils literal"><span class="pre">[]</span></code> which allow defining sets of characters to be matched (ex:
|
|
<code class="docutils literal"><span class="pre">[</span></code><code class="docutils literal"><span class="pre">abc</span></code><code class="docutils literal"><span class="pre">]</span></code> matches a single character which may be ‘a’ or
|
|
‘b’ or ‘c’, <code class="docutils literal"><span class="pre">[</span></code><code class="docutils literal"><span class="pre">0-9</span></code><code class="docutils literal"><span class="pre">]</span></code> matches any digit).</li>
|
|
</ul>
|
|
<p>You should be aware of a few things when using wildcards.</p>
|
|
<ul class="simple">
|
|
<li>Using a wildcard character at the beginning of a word can make for a
|
|
slow search because RCL will have to scan the whole index term list
|
|
to find the matches. However, this is much less a problem for field
|
|
searches, and queries like author:*@domain.com can sometimes be very
|
|
useful.</li>
|
|
<li>For RCL version 1.18 only, when working with a raw index (preserving
|
|
character case and diacritics), the literal part of a wildcard
|
|
expression will be matched exactly for case and diacritics. This is
|
|
not true any more for versions 1.19 and later.</li>
|
|
<li>Using a <code class="docutils literal"><span class="pre">*</span></code> at the end of a word can produce more matches than you
|
|
would think, and strange search results. You can use the <a class="reference external" href="#RCL.SEARCH.GUI.TERMEXPLORER">term
|
|
explorer</a> tool to check what
|
|
completions exist for a given term. You can also see exactly what
|
|
search was performed by clicking on the link at the top of the result
|
|
list. In general, for natural language terms, stem expansion will
|
|
produce better results than an ending <code class="docutils literal"><span class="pre">*</span></code> (stem expansion is turned
|
|
off when any wildcard character appears in the term).</li>
|
|
</ul>
|
|
<div class="section" id="wildcards-and-path-filtering">
|
|
<h5>Wildcards and path filtering<a class="headerlink" href="#wildcards-and-path-filtering" title="Permalink to this headline">¶</a></h5>
|
|
<p>Due to the way that RCL processes wildcards inside <code class="docutils literal"><span class="pre">dir</span></code> path
|
|
filtering clauses, they will have a multiplicative effect on the query
|
|
size. A clause containing wildcards in several path elements, like, for
|
|
example, <code class="docutils literal"><span class="pre">dir:</span></code>/home/me/*/*/docdir, will almost certainly fail if
|
|
your indexed tree is of any realistic size.</p>
|
|
<p>Depending on the case, you may be able to work around the issue by
|
|
specifying the path elements more narrowly, with a constant prefix, or
|
|
by using 2 separate <code class="docutils literal"><span class="pre">dir:</span></code> clauses instead of multiple wildcards, as
|
|
in <code class="docutils literal"><span class="pre">dir:</span></code>/home/me <code class="docutils literal"><span class="pre">dir:</span></code>docdir. The latter query is not equivalent
|
|
to the initial one because it does not specify a number of directory
|
|
levels, but that’s the best we can do (and it may be actually more
|
|
useful in some cases).</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="anchored-searches">
|
|
<h4>Anchored searches<a class="headerlink" href="#anchored-searches" title="Permalink to this headline">¶</a></h4>
|
|
<p>Two characters are used to specify that a search hit should occur at the
|
|
beginning or at the end of the text. <code class="docutils literal"><span class="pre">^</span></code> at the beginning of a term or
|
|
phrase constrains the search to happen at the start, <code class="docutils literal"><span class="pre">$</span></code> at the end
|
|
forces it to happen at the end.</p>
|
|
<p>As this function is implemented as a phrase search it is possible to
|
|
specify a maximum distance at which the hit should occur, either through
|
|
the controls of the advanced search panel, or using the query language,
|
|
for example, as in:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="s2">"^someterm"</span><span class="n">o10</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>which would force <code class="docutils literal"><span class="pre">someterm</span></code> to be found within 10 terms of the start
|
|
of the text. This can be combined with a field search as in
|
|
<code class="docutils literal"><span class="pre">somefield:"^someterm"o10</span></code> or <code class="docutils literal"><span class="pre">somefield:someterm$</span></code>.</p>
|
|
<p>This feature can also be used with an actual phrase search, but in this
|
|
case, the distance applies to the whole phrase and anchor, so that, for
|
|
example, <code class="docutils literal"><span class="pre">bla</span> <span class="pre">bla</span> <span class="pre">my</span> <span class="pre">unexpected</span> <span class="pre">term</span></code> at the beginning of the text
|
|
would be a match for <code class="docutils literal"><span class="pre">"^my</span> <span class="pre">term"o5</span></code>.</p>
|
|
<p>Anchored searches can be very useful for searches inside somewhat
|
|
structured documents like scientific articles, in case explicit metadata
|
|
has not been supplied (a most frequent case), for example for looking
|
|
for matches inside the abstract or the list of authors (which occur at
|
|
the top of the document).</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="desktop-integration">
|
|
<h3><a class="toc-backref" href="#id28">Desktop integration</a><a class="headerlink" href="#desktop-integration" title="Permalink to this headline">¶</a></h3>
|
|
<p>Being independent of the desktop type has its drawbacks: RCL desktop
|
|
integration is minimal. However there are a few tools available:</p>
|
|
<ul class="simple">
|
|
<li>The KDE KIO Slave was described in a <a class="reference external" href="#RCL.SEARCH.KIO">previous
|
|
section</a>.</li>
|
|
<li>If you use a recent version of Ubuntu Linux, you may find the <a class="reference external" href="&FAQS;UnityLens">Ubuntu
|
|
Unity Lens</a> module useful.</li>
|
|
<li>There is also an independently developed <a class="reference external" href="http://kde-apps.org/content/show.php/recollrunner?content=128203">Krunner
|
|
plugin</a>.</li>
|
|
</ul>
|
|
<p>Here follow a few other things that may help.</p>
|
|
<div class="section" id="hotkeying-recoll">
|
|
<h4>Hotkeying recoll<a class="headerlink" href="#hotkeying-recoll" title="Permalink to this headline">¶</a></h4>
|
|
<p>It is surprisingly convenient to be able to show or hide the RCL GUI
|
|
with a single keystroke. Recoll comes with a small Python script, based
|
|
on the libwnck window manager interface library, which will allow you to
|
|
do just this. The detailed instructions are on <a class="reference external" href="&FAQS;HotRecoll">this wiki
|
|
page</a>.</p>
|
|
</div>
|
|
<div class="section" id="the-kde-kicker-recoll-applet">
|
|
<h4>The KDE Kicker Recoll applet<a class="headerlink" href="#the-kde-kicker-recoll-applet" title="Permalink to this headline">¶</a></h4>
|
|
<p>This is probably obsolete now. Anyway:</p>
|
|
<p>The RCL source tree contains the source code to the recoll_applet, a
|
|
small application derived from the find_applet. This can be used to add
|
|
a small RCL launcher to the KDE panel.</p>
|
|
<p>The applet is not automatically built with the main RCL programs, nor is
|
|
it included with the main source distribution (because the KDE build
|
|
boilerplate makes it relatively big). You can download its source from
|
|
the recoll.org download page. Use the omnipotent
|
|
<code class="docutils literal"><span class="pre">configure;make;make</span> <span class="pre">install</span></code> incantation to build and install.</p>
|
|
<p>You can then add the applet to the panel by right-clicking the panel and
|
|
choosing the Add applet entry.</p>
|
|
<p>The recoll_applet has a small text window where you can type a RCL
|
|
query (in query language form), and an icon which can be used to
|
|
restrict the search to certain types of files. It is quite primitive,
|
|
and launches a new recoll GUI instance every time (even if it is already
|
|
running). You may find it useful anyway.</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="removable-volumes">
|
|
<h2><a class="toc-backref" href="#id29">Removable volumes</a><a class="headerlink" href="#removable-volumes" title="Permalink to this headline">¶</a></h2>
|
|
<p>RCL used to have no support for indexing removable volumes (portable
|
|
disks, USB keys, etc.). Recent versions have improved the situation and
|
|
support indexing removable volumes in two different ways:</p>
|
|
<ul class="simple">
|
|
<li>By storing a volume index on the volume itself (RCL 1.24).</li>
|
|
<li>By indexing the volume in the main, fixed, index, and ensuring that
|
|
the volume data is not purged if the indexing runs while the volume
|
|
is not mounted (RCL 1.25.2).</li>
|
|
</ul>
|
|
<div class="section" id="indexing-removable-volumes-in-the-main-index">
|
|
<h3><a class="toc-backref" href="#id30">Indexing removable volumes in the main index</a><a class="headerlink" href="#indexing-removable-volumes-in-the-main-index" title="Permalink to this headline">¶</a></h3>
|
|
<p>As of version 1.25.2, RCL has a simple way to ensure that the index data
|
|
for an absent volume will not be purged: the volume mount point must be
|
|
a member of the <code class="docutils literal"><span class="pre">topdirs</span></code> list, and the mount directory must be empty
|
|
(when the volume is not mounted). If <code class="docutils literal"><span class="pre">recollindex</span></code> finds that one of
|
|
the <code class="docutils literal"><span class="pre">topdirs</span></code> is empty when starting up, any existing data for the
|
|
tree will be preserved by the indexing pass (no purge for this area).</p>
|
|
</div>
|
|
<div class="section" id="self-contained-volumes">
|
|
<h3><a class="toc-backref" href="#id31">Self contained volumes</a><a class="headerlink" href="#self-contained-volumes" title="Permalink to this headline">¶</a></h3>
|
|
<p>As of RCL 1.24, it has become easy to build self-contained datasets
|
|
including a RCL configuration directory and index together with the
|
|
indexed documents, and to move such a dataset around (for example
|
|
copying it to a USB drive), without having to adjust the configuration
|
|
for querying the index.</p>
|
|
<blockquote>
|
|
<div><p><strong>Note</strong></p>
|
|
<p>This is a query-time feature only. The index must only be updated in
|
|
its original location. If an update is necessary in a different
|
|
location, the index must be reset.</p>
|
|
</div></blockquote>
|
|
<p>To make a long story short, here follows a script to create a RCL
|
|
configuration and index under a given directory (given as single
|
|
parameter). The resulting data set (files + recoll directory) can later
|
|
be moved to a CDROM or thumb drive. Longer explanations come after
|
|
the script.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span>#!/bin/sh
|
|
|
|
fatal()
|
|
{
|
|
echo $*;exit 1
|
|
}
|
|
usage()
|
|
{
|
|
fatal "Usage: init-recoll-volume.sh <top-directory>"
|
|
}
|
|
|
|
test $# = 1 || usage
|
|
topdir=$1
|
|
test -d "$topdir" || fatal $topdir should be a directory
|
|
|
|
confdir="$topdir/recoll-config"
|
|
test ! -d "$confdir" || fatal $confdir should not exist
|
|
|
|
mkdir "$confdir"
|
|
cd "$topdir"
|
|
topdir=`pwd`
|
|
cd "$confdir"
|
|
confdir=`pwd`
|
|
|
|
(echo topdirs = '"'$topdir'"'; \
|
|
echo orgidxconfdir = $topdir/recoll-config) > "$confdir/recoll.conf"
|
|
|
|
recollindex -c "$confdir"
|
|
</pre></div>
|
|
</div>
|
|
<p>The examples below will assume that you have a dataset under
|
|
<code class="docutils literal"><span class="pre">/home/me/mydata/</span></code>, with the index configuration and data stored
|
|
inside <code class="docutils literal"><span class="pre">/home/me/mydata/recoll-confdir</span></code>.</p>
|
|
<p>In order to be able to run queries after the dataset has been moved, you
|
|
must ensure the following:</p>
|
|
<ul class="simple">
|
|
<li>The main configuration file must define the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.ORGIDXCONFDIR">orgidxconfdir</a>
|
|
variable to be the original location of the configuration directory
|
|
(<code class="docutils literal"><span class="pre">orgidxconfdir=/home/me/mydata/recoll-confdir</span></code> must be set inside
|
|
<code class="docutils literal"><span class="pre">/home/me/mydata/recoll-confdir/recoll.conf</span></code> in the example above).</li>
|
|
<li>The configuration directory must exist with the documents, somewhere
|
|
under the directory which will be moved. E.g. if you are moving
|
|
<code class="docutils literal"><span class="pre">/home/me/mydata</span></code> around, the configuration directory must exist
|
|
somewhere below this point, for example
|
|
<code class="docutils literal"><span class="pre">/home/me/mydata/recoll-confdir</span></code>, or
|
|
<code class="docutils literal"><span class="pre">/home/me/mydata/sub/recoll-confdir</span></code>.</li>
|
|
<li>You should keep the default locations for the index elements (they
|
|
are relative to the configuration directory by default). Only the
|
|
paths referring to the documents themselves (e.g. <code class="docutils literal"><span class="pre">topdirs</span></code> values)
|
|
should be absolute (in general, they are only used when indexing
|
|
anyway).</li>
|
|
</ul>
|
|
<p>Only the first point needs an explicit user action, the RCL defaults are
|
|
compatible with the second one, and the third is natural.</p>
|
|
<p>If, after the move, the configuration directory needs to be copied out
|
|
of the dataset (for example because the thumb drive is too slow), you
|
|
can set the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.RECOLLCONF.CURIDXCONFDIR">curidxconfdir</a>
|
|
variable inside the copied configuration to define the location of the
|
|
moved one. For example if <code class="docutils literal"><span class="pre">/home/me/mydata</span></code> is now mounted onto
|
|
<code class="docutils literal"><span class="pre">/media/me/somelabel</span></code>, but the configuration directory and index have
|
|
been copied to <code class="docutils literal"><span class="pre">/tmp/tempconfig</span></code>, you would set <code class="docutils literal"><span class="pre">curidxconfdir</span></code> to
|
|
<code class="docutils literal"><span class="pre">/media/me/somelabel/recoll-confdir</span></code> inside
|
|
<code class="docutils literal"><span class="pre">/tmp/tempconfig/recoll.conf</span></code>. <code class="docutils literal"><span class="pre">orgidxconfdir</span></code> would still be
|
|
<code class="docutils literal"><span class="pre">/home/me/mydata/recoll-confdir</span></code> in the original and the copy.</p>
|
|
<p>If you are regularly copying the configuration out of the dataset, it
|
|
will be useful to write a script to automate the procedure. This can’t
|
|
really be done inside RCL because there are probably many possible
|
|
variants. One example would be to copy the configuration to make it
|
|
writable, but keep the index data on the medium because it is too big -
|
|
in this case, the script would also need to set <code class="docutils literal"><span class="pre">dbdir</span></code> in the copied
|
|
configuration.</p>
|
|
<p>The same set of modifications (RCL 1.24) has also made it possible to
|
|
run queries from a readonly configuration directory (with slightly
|
|
reduced function of course, such as not recording the query history).</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="programming-interface">
|
|
<h2><a class="toc-backref" href="#id32">Programming interface</a><a class="headerlink" href="#programming-interface" title="Permalink to this headline">¶</a></h2>
|
|
<p>RCL has an Application Programming Interface, usable both for indexing
|
|
and searching, currently accessible from the Python language.</p>
|
|
<p>Another less radical way to extend the application is to write input
|
|
handlers for new types of documents.</p>
|
|
<p>The processing of metadata attributes for documents (<code class="docutils literal"><span class="pre">fields</span></code>) is
|
|
highly configurable.</p>
|
|
<div class="section" id="writing-a-document-input-handler">
|
|
<h3><a class="toc-backref" href="#id33">Writing a document input handler</a><a class="headerlink" href="#writing-a-document-input-handler" title="Permalink to this headline">¶</a></h3>
|
|
<blockquote>
|
|
<div><p><strong>Note</strong></p>
|
|
<p>The small programs or pieces of code which handle the processing of
|
|
the different document types for RCL used to be called <code class="docutils literal"><span class="pre">filters</span></code>,
|
|
which is still reflected in the name of the directory which holds
|
|
them and many configuration variables. They were named this way
|
|
because one of their primary functions is to filter out the
|
|
formatting directives and keep the text content. However these
|
|
modules may have other behaviours, and the term <code class="docutils literal"><span class="pre">input</span> <span class="pre">handler</span></code> is
|
|
now progressively substituted in the documentation. <code class="docutils literal"><span class="pre">filter</span></code> is
|
|
still used in many places though.</p>
|
|
</div></blockquote>
|
|
<p>RCL input handlers cooperate to translate from the multitude of input
|
|
document formats, simple ones such as opendocument or acrobat, or compound ones
|
|
such as Zip or Email, into the final RCL indexing input format, which is
|
|
plain text (in many cases the processing pipeline has an intermediary
|
|
HTML step, which may be used for better previewing presentation). Most
|
|
input handlers are executable programs or scripts. A few handlers are
|
|
coded in C++ and live inside <code class="docutils literal"><span class="pre">recollindex</span></code>. This latter kind will not
|
|
be described here.</p>
|
|
<p>There are currently (since version 1.13) two kinds of external
|
|
executable input handlers:</p>
|
|
<ul class="simple">
|
|
<li>Simple <code class="docutils literal"><span class="pre">exec</span></code> handlers run once and exit. They can be bare programs
|
|
like <code class="docutils literal"><span class="pre">antiword</span></code>, or scripts using other programs. They are very
|
|
simple to write, because they just need to print the converted
|
|
document to the standard output. Their output can be plain text or
|
|
HTML. HTML is usually preferred because it can store metadata fields
|
|
and it allows preserving some of the formatting for the GUI preview.
|
|
However, these handlers have limitations:<ul>
|
|
<li>They can only process one document per file.</li>
|
|
<li>The output MIME type must be known and fixed.</li>
|
|
<li>The character encoding, if relevant, must be known and fixed (or
|
|
possibly just depending on location).</li>
|
|
</ul>
|
|
</li>
|
|
<li>Multiple <code class="docutils literal"><span class="pre">execm</span></code> handlers can process multiple files (sparing the
|
|
process startup time which can be very significant), or multiple
|
|
documents per file (e.g.: for archives or multi-chapter
|
|
publications). They communicate with the indexer through a simple
|
|
protocol, but are nevertheless a bit more complicated than the older
|
|
kind. Most of the new handlers are written in Python (exception:
|
|
<code class="docutils literal"><span class="pre">rclimg</span></code> which is written in Perl because <code class="docutils literal"><span class="pre">exiftool</span></code> has no real
|
|
Python equivalent). The Python handlers use common modules to factor
|
|
out the boilerplate, which can make them very simple in favorable
|
|
cases. The subdocuments output by these handlers can be directly
|
|
indexable (text or HTML), or they can be other simple or compound
|
|
documents that will need to be processed by another handler.</li>
|
|
</ul>
|
|
<p>In both cases, handlers deal with regular file system files, and can
|
|
process either a single document, or a linear list of documents in each
|
|
file. RCL is responsible for performing up-to-date checks, dealing with
|
|
more complex embedding and other upper level issues.</p>
|
|
<p>A simple handler returning a document in <code class="docutils literal"><span class="pre">text/plain</span></code> format can
|
|
transfer no metadata to the indexer. Generic metadata, like document
|
|
size or modification date, will be gathered and stored by the indexer.</p>
|
|
<p>Handlers that produce <code class="docutils literal"><span class="pre">text/html</span></code> format can return an arbitrary
|
|
amount of metadata inside HTML <code class="docutils literal"><span class="pre">meta</span></code> tags. These will be processed
|
|
according to the directives found in the <a class="reference external" href="#RCL.PROGRAM.FIELDS"><code class="docutils literal"><span class="pre">fields</span></code> configuration
|
|
file</a>.</p>
|
|
<p>The handlers that can handle multiple documents per file return a single
|
|
piece of data to identify each document inside the file. This piece of
|
|
data, called an <code class="docutils literal"><span class="pre">ipath</span></code>, will be sent back by RCL to extract the
|
|
document at query time, for previewing, or for creating a temporary file
|
|
to be opened by a viewer. These handlers can also return metadata either
|
|
as HTML <code class="docutils literal"><span class="pre">meta</span></code> tags, or as named data through the communication
|
|
protocol.</p>
|
|
<p>The following section describes the simple handlers, and the next one
|
|
gives a few explanations about the <code class="docutils literal"><span class="pre">execm</span></code> ones. You could conceivably
|
|
write a simple handler with only the elements in the manual. This will
|
|
not be the case for the other ones, for which you will have to look at
|
|
the code.</p>
|
|
<div class="section" id="simple-input-handlers">
|
|
<h4>Simple input handlers<a class="headerlink" href="#simple-input-handlers" title="Permalink to this headline">¶</a></h4>
|
|
<p>RCL simple handlers are usually shell-scripts, but this is in no way
|
|
necessary. Extracting the text from the native format is the difficult
|
|
part. Outputting the format expected by RCL is trivial. Happily enough,
|
|
most document formats have translators or text extractors which can be
|
|
called from the handler. In some cases the output of the translating
|
|
program is completely appropriate, and no intermediate shell-script is
|
|
needed.</p>
|
|
<p>Input handlers are called with a single argument which is the source
|
|
file name. They should output the result to stdout.</p>
|
|
<p>When writing a handler, you should decide if it will output plain text
|
|
or HTML. Plain text is simpler, but you will not be able to add metadata
|
|
or vary the output character encoding (this will be defined in a
|
|
configuration file). Additionally, some formatting may be easier to
|
|
preserve when previewing HTML. Actually the deciding factor is metadata:
|
|
RCL has a way to <a class="reference external" href="#RCL.PROGRAM.FILTERS.HTML">extract metadata from the HTML header and use it for
|
|
field searches</a>.</p>
|
|
<p>The RECOLL_FILTER_FORPREVIEW environment variable (values <code class="docutils literal"><span class="pre">yes</span></code>,
|
|
<code class="docutils literal"><span class="pre">no</span></code>) tells the handler if the operation is for indexing or
|
|
previewing. Some handlers use this to output a slightly different
|
|
format, for example stripping uninteresting repeated keywords (e.g.
|
|
<code class="docutils literal"><span class="pre">Subject:</span></code> for email) when indexing. This is not essential.</p>
|
|
<p>You should look at one of the simple handlers, for example <code class="docutils literal"><span class="pre">rclps</span></code> for
|
|
a starting point.</p>
|
|
<p>Don’t forget to make your handler executable before testing!</p>
|
|
</div>
|
|
<div class="section" id="multiple-handlers">
|
|
<h4>“Multiple” handlers<a class="headerlink" href="#multiple-handlers" title="Permalink to this headline">¶</a></h4>
|
|
<p>If you can program and want to write an <code class="docutils literal"><span class="pre">execm</span></code> handler, it should not
|
|
be too difficult to make sense of one of the existing handlers.</p>
|
|
<p>The existing handlers differ in the amount of helper code which they are
|
|
using:</p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">rclimg</span></code> is written in Perl and handles the execm protocol all by
|
|
itself (showing how trivial it is).</li>
|
|
<li>All the Python handlers share at least the <code class="docutils literal"><span class="pre">rclexecm.py</span></code> module,
|
|
which handles the communication. Have a look at, for example,
|
|
<code class="docutils literal"><span class="pre">rclzip</span></code> for a handler which uses <code class="docutils literal"><span class="pre">rclexecm.py</span></code> directly.</li>
|
|
<li>Most Python handlers which process single-document files by executing
|
|
another command are further abstracted by using the <code class="docutils literal"><span class="pre">rclexec1.py</span></code>
|
|
module. See for example <code class="docutils literal"><span class="pre">rclrtf.py</span></code> for a simple one, or
|
|
<code class="docutils literal"><span class="pre">rcldoc.py</span></code> for a slightly more complicated one (possibly executing
|
|
several commands).</li>
|
|
<li>Handlers which extract text from an XML document by using an XSLT
|
|
style sheet are now executed inside <code class="docutils literal"><span class="pre">recollindex</span></code>, with only the
|
|
style sheet stored in the <code class="docutils literal"><span class="pre">filters/</span></code> directory. These can use a
|
|
single style sheet (e.g. <code class="docutils literal"><span class="pre">abiword.xsl</span></code>), or two sheets for the data
|
|
and metadata (e.g. <code class="docutils literal"><span class="pre">opendoc-body.xsl</span></code> and <code class="docutils literal"><span class="pre">opendoc-meta.xsl</span></code>).
|
|
The <code class="docutils literal"><span class="pre">mimeconf</span></code> configuration file defines how the sheets are used,
|
|
have a look. Before the C++ import, the xsl-based handlers used a
|
|
common module <code class="docutils literal"><span class="pre">rclgenxslt.py</span></code>, it is still around but unused. The
|
|
handler for OpenXML presentations is still the Python version because
|
|
the format did not fit with what the C++ code does. It would be a
|
|
good base for another similar issue.</li>
|
|
</ul>
|
|
<p>There is a sample trivial handler based on <code class="docutils literal"><span class="pre">rclexecm.py</span></code>, with many
|
|
comments, not actually used by RCL. It would index a text file as one
|
|
document per line. Look for <code class="docutils literal"><span class="pre">rcltxtlines.py</span></code> in the <code class="docutils literal"><span class="pre">src/filters</span></code>
|
|
directory in the online RCL <a class="reference external" href="https://opensourceprojects.eu/p/recoll1/">Git
|
|
repository</a> (the sample is not
|
|
in the distributed release at the moment).</p>
|
|
<p>You can also have a look at the slightly more complex <code class="docutils literal"><span class="pre">rclzip</span></code> which
|
|
uses Zip file paths as identifiers (<code class="docutils literal"><span class="pre">ipath</span></code>).</p>
|
|
<p><code class="docutils literal"><span class="pre">execm</span></code> handlers sometimes need to make a choice for the nature of the
|
|
<code class="docutils literal"><span class="pre">ipath</span></code> elements that they use in communication with the indexer. Here
|
|
are a few guidelines:</p>
|
|
<ul class="simple">
|
|
<li>Use ASCII or UTF-8 (if the identifier is an integer print it, for
|
|
example, like printf %d would do).</li>
|
|
<li>If at all possible, the data should make some kind of sense when
|
|
printed to a log file to help with debugging.</li>
|
|
<li>RCL uses a colon (<code class="docutils literal"><span class="pre">:</span></code>) as a separator to store a complex path
|
|
internally (for deeper embedding). Colons inside the <code class="docutils literal"><span class="pre">ipath</span></code>
|
|
elements output by a handler will be escaped, but would be a bad
|
|
choice as a handler-specific separator (mostly, again, for debugging
|
|
issues).</li>
|
|
</ul>
|
|
<p>In any case, the main goal is that it should be easy for the handler to
|
|
extract the target document, given the file name and the <code class="docutils literal"><span class="pre">ipath</span></code>
|
|
element.</p>
|
|
<p><code class="docutils literal"><span class="pre">execm</span></code> handlers will also produce a document with a null <code class="docutils literal"><span class="pre">ipath</span></code>
|
|
element. Depending on the type of document, this may have some
|
|
associated data (e.g. the body of an email message), or none (typical
|
|
for an archive file). If it is empty, this document will be useful
|
|
anyway for some operations, as the parent of the actual data documents.</p>
|
|
</div>
|
|
<div class="section" id="telling-rcl-about-the-handler">
|
|
<h4>Telling RCL about the handler<a class="headerlink" href="#telling-rcl-about-the-handler" title="Permalink to this headline">¶</a></h4>
|
|
<p>There are two elements that link a file to the handler which should
|
|
process it: the association of file to MIME type and the association of
|
|
a MIME type with a handler.</p>
|
|
<p>The association of files to MIME types is mostly based on name suffixes.
|
|
The types are defined inside the <a class="reference external" href="#RCL.INSTALL.CONFIG.MIMEMAP"><code class="docutils literal"><span class="pre">mimemap</span></code>
|
|
file</a>. Example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o">.</span><span class="n">doc</span> <span class="o">=</span> <span class="n">application</span><span class="o">/</span><span class="n">msword</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>If no suffix association is found for the file name, RCL will try to
|
|
execute a system command (typically <code class="docutils literal"><span class="pre">file</span> <span class="pre">-i</span></code> or <code class="docutils literal"><span class="pre">xdg-mime</span></code>) to
|
|
determine a MIME type.</p>
|
|
<p>The second element is the association of MIME types to handlers in the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.MIMECONF"><code class="docutils literal"><span class="pre">mimeconf</span></code> file</a>. A sample will
|
|
probably be better than a long explanation:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">index</span><span class="p">]</span>
|
|
<span class="n">application</span><span class="o">/</span><span class="n">msword</span> <span class="o">=</span> <span class="n">exec</span> <span class="n">antiword</span> <span class="o">-</span><span class="n">t</span> <span class="o">-</span><span class="n">i</span> <span class="mi">1</span> <span class="o">-</span><span class="n">m</span> <span class="n">UTF</span><span class="o">-</span><span class="mi">8</span><span class="p">;</span>\
|
|
<span class="n">mimetype</span> <span class="o">=</span> <span class="n">text</span><span class="o">/</span><span class="n">plain</span> <span class="p">;</span> <span class="n">charset</span><span class="o">=</span><span class="n">utf</span><span class="o">-</span><span class="mi">8</span>
|
|
|
|
<span class="n">application</span><span class="o">/</span><span class="n">ogg</span> <span class="o">=</span> <span class="n">exec</span> <span class="n">rclogg</span>
|
|
|
|
<span class="n">text</span><span class="o">/</span><span class="n">rtf</span> <span class="o">=</span> <span class="n">exec</span> <span class="n">unrtf</span> <span class="o">--</span><span class="n">nopict</span> <span class="o">--</span><span class="n">html</span><span class="p">;</span> <span class="n">charset</span><span class="o">=</span><span class="n">iso</span><span class="o">-</span><span class="mi">8859</span><span class="o">-</span><span class="mi">1</span><span class="p">;</span> <span class="n">mimetype</span><span class="o">=</span><span class="n">text</span><span class="o">/</span><span class="n">html</span>
|
|
|
|
<span class="n">application</span><span class="o">/</span><span class="n">x</span><span class="o">-</span><span class="n">chm</span> <span class="o">=</span> <span class="n">execm</span> <span class="n">rclchm</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>The fragment specifies that:</p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">application/msword</span></code> files are processed by executing the
|
|
<code class="docutils literal"><span class="pre">antiword</span></code> program, which outputs <code class="docutils literal"><span class="pre">text/plain</span></code> encoded in
|
|
<code class="docutils literal"><span class="pre">utf-8</span></code>.</li>
|
|
<li><code class="docutils literal"><span class="pre">application/ogg</span></code> files are processed by the <code class="docutils literal"><span class="pre">rclogg</span></code> script,
|
|
with default output type (<code class="docutils literal"><span class="pre">text/html</span></code>, with encoding specified in
|
|
the header, or <code class="docutils literal"><span class="pre">utf-8</span></code> by default).</li>
|
|
<li><code class="docutils literal"><span class="pre">text/rtf</span></code> is processed by <code class="docutils literal"><span class="pre">unrtf</span></code>, which outputs <code class="docutils literal"><span class="pre">text/html</span></code>.
|
|
The <code class="docutils literal"><span class="pre">iso-8859-1</span></code> encoding is specified because it is not the
|
|
<code class="docutils literal"><span class="pre">utf-8</span></code> default, and not output by <code class="docutils literal"><span class="pre">unrtf</span></code> in the HTML header
|
|
section.</li>
|
|
<li><code class="docutils literal"><span class="pre">application/x-chm</span></code> is processed by a persistent handler. This is
|
|
determined by the <code class="docutils literal"><span class="pre">execm</span></code> keyword.</li>
|
|
</ul>
|
|
</div>
|
|
<div class="section" id="input-handler-output">
|
|
<h4>Input handler output<a class="headerlink" href="#input-handler-output" title="Permalink to this headline">¶</a></h4>
|
|
<p>Both the simple and persistent input handlers can return any MIME type
|
|
to Recoll, which will further process the data according to the MIME
|
|
configuration.</p>
|
|
<p>Most input filters produce either <code class="docutils literal"><span class="pre">text/plain</span></code> or
|
|
<code class="docutils literal"><span class="pre">text/html</span></code> data. There are exceptions, for example, filters which
|
|
process archive files (<code class="docutils literal"><span class="pre">zip</span></code>, <code class="docutils literal"><span class="pre">tar</span></code>, etc.) will usually return the
|
|
documents as they are found, without processing them further.</p>
|
|
<p>There is nothing to say about <code class="docutils literal"><span class="pre">text/plain</span></code> output, except that its
|
|
character encoding should be consistent with what is specified in the
|
|
<code class="docutils literal"><span class="pre">mimeconf</span></code> file.</p>
|
|
<p>For filters producing HTML, the output could be very minimal like the
|
|
following example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o"><</span><span class="n">html</span><span class="o">></span>
|
|
<span class="o"><</span><span class="n">head</span><span class="o">></span>
|
|
<span class="o"><</span><span class="n">meta</span> <span class="n">http</span><span class="o">-</span><span class="n">equiv</span><span class="o">=</span><span class="s2">"Content-Type"</span> <span class="n">content</span><span class="o">=</span><span class="s2">"text/html;charset=UTF-8"</span><span class="o">></span>
|
|
<span class="o"></</span><span class="n">head</span><span class="o">></span>
|
|
<span class="o"><</span><span class="n">body</span><span class="o">></span>
|
|
<span class="n">Some</span> <span class="n">text</span> <span class="n">content</span>
|
|
<span class="o"></</span><span class="n">body</span><span class="o">></span>
|
|
<span class="o"></</span><span class="n">html</span><span class="o">></span>
|
|
</pre></div>
|
|
</div>
|
|
<p>You should take care to escape some characters inside the text by
|
|
transforming them into appropriate entities. At the very minimum,
|
|
“<code class="docutils literal"><span class="pre">&amp;</span></code>” should be transformed into “<code class="docutils literal"><span class="pre">&amp;amp;</span></code>”, “<code class="docutils literal"><span class="pre">&lt;</span></code>” should be
|
|
transformed into “<code class="docutils literal"><span class="pre">&amp;lt;</span></code>”. This is not always properly done by
|
|
external helper programs which output HTML, and of course never by those
|
|
which output plain text.</p>
|
|
<p>When encapsulating plain text in an HTML body, the display of a preview
|
|
may be improved by enclosing the text inside <code class="docutils literal"><span class="pre">&lt;pre&gt;</span></code> tags.</p>
|
|
<p>The character set needs to be specified in the header. It does not need
|
|
to be UTF-8 (RCL will take care of translating it), but it must be
|
|
accurate for good results.</p>
|
|
<p>RCL will process <code class="docutils literal"><span class="pre">meta</span></code> tags inside the header as possible document
|
|
field candidates. Document fields can be processed by the indexer in
|
|
different ways, for searching or displaying inside query results. This
|
|
is described in a <a class="reference external" href="#RCL.PROGRAM.FIELDS">following section.</a></p>
|
|
<p>By default, the indexer will process the standard header fields if they
|
|
are present: <code class="docutils literal"><span class="pre">title</span></code>, <code class="docutils literal"><span class="pre">meta/description</span></code>, and <code class="docutils literal"><span class="pre">meta/keywords</span></code> are
|
|
both indexed and stored for query-time display.</p>
|
|
<p>A predefined non-standard <code class="docutils literal"><span class="pre">meta</span></code> tag will also be processed by RCL
|
|
without further configuration: if a <code class="docutils literal"><span class="pre">date</span></code> tag is present and has the
|
|
right format, it will be used as the document date (for display and
|
|
sorting), in preference to the file modification date. The date format
|
|
should be as follows:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o">&lt;</span><span class="n">meta</span> <span class="n">name</span><span class="o">=</span><span class="s2">"date"</span> <span class="n">content</span><span class="o">=</span><span class="s2">"YYYY-mm-dd HH:MM:SS"</span><span class="o">&gt;</span>
|
|
<span class="ow">or</span>
|
|
<span class="o">&lt;</span><span class="n">meta</span> <span class="n">name</span><span class="o">=</span><span class="s2">"date"</span> <span class="n">content</span><span class="o">=</span><span class="s2">"YYYY-mm-ddTHH:MM:SS"</span><span class="o">&gt;</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o">&lt;</span><span class="n">meta</span> <span class="n">name</span><span class="o">=</span><span class="s2">"date"</span> <span class="n">content</span><span class="o">=</span><span class="s2">"2013-02-24 17:50:00"</span><span class="o">&gt;</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Input handlers also have the possibility to “invent” field names. These
|
|
should also be output as meta tags:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o">&lt;</span><span class="n">meta</span> <span class="n">name</span><span class="o">=</span><span class="s2">"somefield"</span> <span class="n">content</span><span class="o">=</span><span class="s2">"Some textual data"</span> <span class="o">/&gt;</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>You can embed HTML markup inside the content of custom fields, for
|
|
improving the display inside result lists. In this case, add a (wildly
|
|
non-standard) <code class="docutils literal"><span class="pre">markup</span></code> attribute to tell RCL that the value is HTML
|
|
and should not be escaped for display.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o">&lt;</span><span class="n">meta</span> <span class="n">name</span><span class="o">=</span><span class="s2">"somefield"</span> <span class="n">markup</span><span class="o">=</span><span class="s2">"html"</span> <span class="n">content</span><span class="o">=</span><span class="s2">"Some &lt;i&gt;textual&lt;/i&gt; data"</span> <span class="o">/&gt;</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>As written above, the processing of fields is described in a <a class="reference external" href="#RCL.PROGRAM.FIELDS">further
|
|
section</a>.</p>
|
|
<p>Persistent filters can use another, probably simpler, method to produce
|
|
metadata, by calling the <code class="docutils literal"><span class="pre">setfield()</span></code> helper method. This avoids the
|
|
necessity to produce HTML, and any issue with HTML quoting. See, for
|
|
example, <code class="docutils literal"><span class="pre">rclaudio</span></code> in RCL 1.23 and later for an example of a handler
|
|
which outputs <code class="docutils literal"><span class="pre">text/plain</span></code> and uses <code class="docutils literal"><span class="pre">setfield()</span></code> to produce
|
|
metadata.</p>
|
|
</div>
|
|
<div class="section" id="page-numbers">
|
|
<h4>Page numbers<a class="headerlink" href="#page-numbers" title="Permalink to this headline">¶</a></h4>
|
|
<p>The indexer will interpret <code class="docutils literal"><span class="pre">^L</span></code> characters in the handler output as
|
|
indicating page breaks, and will record them. At query time, this allows
|
|
starting a viewer on the right page for a hit or a snippet. Currently,
|
|
only the PDF, Postscript and DVI handlers generate page breaks.</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="field-data-processing">
|
|
<h3><a class="toc-backref" href="#id34">Field data processing</a><a class="headerlink" href="#field-data-processing" title="Permalink to this headline">¶</a></h3>
|
|
<p><code class="docutils literal"><span class="pre">Fields</span></code> are named pieces of information in or about documents, like
|
|
<code class="docutils literal"><span class="pre">title</span></code>, <code class="docutils literal"><span class="pre">author</span></code>, <code class="docutils literal"><span class="pre">abstract</span></code>.</p>
|
|
<p>The field values for documents can appear in several ways during
|
|
indexing: either output by input handlers as <code class="docutils literal"><span class="pre">meta</span></code> fields in the HTML
|
|
header section, or extracted from file extended attributes, or added as
|
|
attributes of the <code class="docutils literal"><span class="pre">Doc</span></code> object when using the API, or again
|
|
synthesized internally by RCL.</p>
|
|
<p>The RCL query language allows searching for text in a specific field.</p>
|
|
<p>RCL defines a number of default fields. Additional ones can be output by
|
|
handlers, and described in the <code class="docutils literal"><span class="pre">fields</span></code> configuration file.</p>
|
|
<p>Fields can be:</p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">indexed</span></code>, meaning that their terms are separately stored in
|
|
inverted lists (with a specific prefix), and that a field-specific
|
|
search is possible.</li>
|
|
<li><code class="docutils literal"><span class="pre">stored</span></code>, meaning that their value is recorded in the index data
|
|
record for the document, and can be returned and displayed with
|
|
search results.</li>
|
|
</ul>
|
|
<p>A field can be either or both indexed and stored. This and other aspects
|
|
of field handling are defined inside the <code class="docutils literal"><span class="pre">fields</span></code> configuration file.</p>
|
|
<p>Some fields may also be designated as supporting range queries, meaning
|
|
that the results may be selected for an interval of their values. See the
|
|
<a class="reference external" href="#RCL.INSTALL.CONFIG.FIELDS">configuration section</a> for more details.</p>
|
|
<p>The sequence of events for field processing is as follows:</p>
|
|
<ul class="simple">
|
|
<li>During indexing, <code class="docutils literal"><span class="pre">recollindex</span></code> scans all <code class="docutils literal"><span class="pre">meta</span></code> fields in HTML
|
|
documents (most document types are transformed into HTML at some
|
|
point). It compares the name for each element to the configuration
|
|
defining what should be done with fields (the <code class="docutils literal"><span class="pre">fields</span></code> file)</li>
|
|
<li>If the name for the <code class="docutils literal"><span class="pre">meta</span></code> element matches one for a field that
|
|
should be indexed, the contents are processed and the terms are
|
|
entered into the index with the prefix defined in the <code class="docutils literal"><span class="pre">fields</span></code>
|
|
file.</li>
|
|
<li>If the name for the <code class="docutils literal"><span class="pre">meta</span></code> element matches one for a field that
|
|
should be stored, the content of the element is stored with the
|
|
document data record, from which it can be extracted and displayed at
|
|
query time.</li>
|
|
<li>At query time, if a field search is performed, the index prefix is
|
|
computed and the match is only performed against appropriately
|
|
prefixed terms in the index.</li>
|
|
<li>At query time, the field can be displayed inside the result list by
|
|
using the appropriate directive in the definition of the <a class="reference external" href="#RCL.SEARCH.GUI.CUSTOM.RESLIST">result list
|
|
paragraph format</a>. All fields are
|
|
displayed on the fields screen of the preview window (which you can
|
|
reach through the right-click menu). This is independent of the fact
|
|
that the search which produced the results used the field or not.</li>
|
|
</ul>
|
|
<p>You can find more information in the <a class="reference external" href="#RCL.INSTALL.CONFIG.FIELDS">section about the <code class="docutils literal"><span class="pre">fields</span></code>
|
|
file</a>, or in comments inside the file.</p>
|
|
<p>You can also have a look at the <a class="reference external" href="https://www.lesbonscomptes.com/recoll/faqsandhowtos/HandleCustomField">example in the FAQs
|
|
area</a>, detailing how one could add a <em>page
|
|
count</em> field to pdf documents for displaying inside result lists.</p>
|
|
</div>
|
|
<div class="section" id="python-api">
|
|
<h3><a class="toc-backref" href="#id35">Python API</a><a class="headerlink" href="#python-api" title="Permalink to this headline">¶</a></h3>
|
|
<div class="section" id="id3">
|
|
<h4>Introduction<a class="headerlink" href="#id3" title="Permalink to this headline">¶</a></h4>
|
|
<p>The RCL Python programming interface can be used both for searching and
|
|
for creating/updating an index. Bindings exist for Python2 and Python3.</p>
|
|
<p>The search interface is used in a number of active projects: the RCL
|
|
Gnome Shell Search Provider, the RCL Web UI, and the upmpdcli UPnP Media
|
|
Server, in addition to many small scripts.</p>
|
|
<p>The index update section of the API may be used to create and update RCL
|
|
indexes on specific configurations (separate from the ones created by
|
|
<code class="docutils literal"><span class="pre">recollindex</span></code>). The resulting databases can be queried alone, or in
|
|
conjunction with regular ones, through the GUI or any of the query
|
|
interfaces.</p>
|
|
<p>The search API is modeled along the Python database API specification.
|
|
There were two major changes along RCL versions:</p>
|
|
<ul class="simple">
|
|
<li>The basis for the RCL API changed from Python database API version
|
|
1.0 (RCL versions up to 1.18.1), to version 2.0 (RCL 1.18.2 and
|
|
later).</li>
|
|
<li>The <code class="docutils literal"><span class="pre">recoll</span></code> module became a package (with an internal <code class="docutils literal"><span class="pre">recoll</span></code>
|
|
module) as of RCL version 1.19, in order to add more functions. For
|
|
existing code, this only changes the way the interface must be
|
|
imported.</li>
|
|
</ul>
|
|
<p>We will describe the new API and package structure here. A paragraph at
|
|
the end of this section will explain a few differences and ways to write
|
|
code compatible with both versions.</p>
|
|
<p>The <code class="docutils literal"><span class="pre">recoll</span></code> package now contains two modules:</p>
|
|
<ul class="simple">
|
|
<li>The <code class="docutils literal"><span class="pre">recoll</span></code> module contains functions and classes used to query
|
|
(or update) the index.</li>
|
|
<li>The <code class="docutils literal"><span class="pre">rclextract</span></code> module contains functions and classes used at
|
|
query time to access document data.</li>
|
|
</ul>
|
|
<p>There is a good chance that your system repository has packages for the
|
|
Recoll Python API, sometimes in a package separate from the main one
|
|
(maybe named something like python-recoll). Else refer to the <a class="reference external" href="#RCL.INSTALL.BUILDING">Building
|
|
from source chapter</a>.</p>
|
|
<p>As an introduction, the following small sample will run a query and list
|
|
the title and url for each of the results. It would work with RCL 1.19
|
|
and later. The <code class="docutils literal"><span class="pre">python/samples</span></code> source directory contains several
|
|
examples of Python programming with RCL, exercising the extension more
|
|
completely, and especially its data extraction features.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="ch">#!/usr/bin/env python</span>
|
|
|
|
<span class="kn">from</span> <span class="nn">recoll</span> <span class="k">import</span> <span class="n">recoll</span>
|
|
|
|
<span class="n">db</span> <span class="o">=</span> <span class="n">recoll</span><span class="o">.</span><span class="n">connect</span><span class="p">()</span>
|
|
<span class="n">query</span> <span class="o">=</span> <span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">()</span>
|
|
<span class="n">nres</span> <span class="o">=</span> <span class="n">query</span><span class="o">.</span><span class="n">execute</span><span class="p">(</span><span class="s2">"some query"</span><span class="p">)</span>
|
|
<span class="n">results</span> <span class="o">=</span> <span class="n">query</span><span class="o">.</span><span class="n">fetchmany</span><span class="p">(</span><span class="mi">20</span><span class="p">)</span>
|
|
<span class="k">for</span> <span class="n">doc</span> <span class="ow">in</span> <span class="n">results</span><span class="p">:</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2"> </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="p">(</span><span class="n">doc</span><span class="o">.</span><span class="n">url</span><span class="p">,</span> <span class="n">doc</span><span class="o">.</span><span class="n">title</span><span class="p">))</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>You can also take a look at the source for the <a class="reference external" href="https://opensourceprojects.eu/p/recollwebui/code/ci/78ddb20787b2a894b5e4661a8d5502c4511cf71e/tree/">Recoll
|
|
WebUI</a>,
|
|
the <a class="reference external" href="https://opensourceprojects.eu/p/upmpdcli/code/ci/c8c8e75bd181ad9db2df14da05934e53ca867a06/tree/src/mediaserver/cdplugins/uprcl/uprclfolders.py">upmpdcli local media
|
|
server</a>,
|
|
or the <a class="reference external" href="https://opensourceprojects.eu/p/recollgssp/code/ci/3f120108e099f9d687306c0be61593994326d52d/tree/gssp-recoll.py">Gnome Shell Search
|
|
Provider</a>.</p>
|
|
</div>
|
|
<div class="section" id="interface-elements">
|
|
<h4>Interface elements<a class="headerlink" href="#interface-elements" title="Permalink to this headline">¶</a></h4>
|
|
<p>A few elements in the interface are specific and need an
|
|
explanation.</p>
|
|
<dl class="docutils">
|
|
<dt>ipath</dt>
|
|
<dd>This data value (set as a field in the Doc object) is stored, along
|
|
with the URL, but not indexed by RCL. Its contents are not
|
|
interpreted by the index layer, and its use is up to the
|
|
application. For example, the RCL file system indexer uses the
|
|
<code class="docutils literal"><span class="pre">ipath</span></code> to store the part of the document access path internal to
|
|
(possibly imbricated) container documents. <code class="docutils literal"><span class="pre">ipath</span></code> in this case is
|
|
a vector of access elements (e.g, the first part could be a path
|
|
inside a zip file to an archive member which happens to be an mbox
|
|
file, the second element would be the message sequential number
|
|
inside the mbox etc.). <code class="docutils literal"><span class="pre">url</span></code> and <code class="docutils literal"><span class="pre">ipath</span></code> are returned in every
|
|
search result and define the access to the original document.
|
|
<code class="docutils literal"><span class="pre">ipath</span></code> is empty for top-level document/files (e.g. a PDF document
|
|
which is a filesystem file). The RCL GUI knows about the structure
|
|
of the <code class="docutils literal"><span class="pre">ipath</span></code> values used by the filesystem indexer, and uses it
|
|
for such functions as opening the parent of a given document.</dd>
|
|
<dt>udi</dt>
|
|
<dd>An <code class="docutils literal"><span class="pre">udi</span></code> (unique document identifier) identifies a document.
|
|
Because of limitations inside the index engine, it is restricted in
|
|
length (to 200 bytes), which is why a regular URI cannot be used.
|
|
The structure and contents of the <code class="docutils literal"><span class="pre">udi</span></code> are defined by the
|
|
application and opaque to the index engine. For example, the
|
|
internal file system indexer uses the complete document path (file
|
|
path + internal path), truncated to length, the suppressed part
|
|
being replaced by a hash value. The <code class="docutils literal"><span class="pre">udi</span></code> is not explicit in the
|
|
query interface (it is used “under the hood” by the <code class="docutils literal"><span class="pre">rclextract</span></code>
|
|
module), but it is an explicit element of the update interface.</dd>
|
|
<dt>parent_udi</dt>
|
|
<dd>If this attribute is set on a document when entering it in the
|
|
index, it designates its physical container document. In a
|
|
multilevel hierarchy, this may not be the immediate parent.
|
|
<code class="docutils literal"><span class="pre">parent_udi</span></code> is optional, but its use by an indexer may simplify
|
|
index maintenance, as RCL will automatically delete all children
|
|
defined by <code class="docutils literal"><span class="pre">parent_udi</span> <span class="pre">==</span> <span class="pre">udi</span></code> when the document designated by
|
|
<code class="docutils literal"><span class="pre">udi</span></code> is destroyed. For example, if a <code class="docutils literal"><span class="pre">Zip</span></code> archive contains entries
|
|
which are themselves containers, like <code class="docutils literal"><span class="pre">mbox</span></code> files, all the
|
|
subdocuments inside the <code class="docutils literal"><span class="pre">Zip</span></code> file (mbox, messages, message
|
|
attachments, etc.) would have the same <code class="docutils literal"><span class="pre">parent_udi</span></code>, matching the
|
|
<code class="docutils literal"><span class="pre">udi</span></code> for the <code class="docutils literal"><span class="pre">Zip</span></code> file, and all would be destroyed when the
|
|
<code class="docutils literal"><span class="pre">Zip</span></code> file (identified by its <code class="docutils literal"><span class="pre">udi</span></code>) is removed from the index.
|
|
The standard filesystem indexer uses <code class="docutils literal"><span class="pre">parent_udi</span></code>.</dd>
|
|
<dt>Stored and indexed fields</dt>
|
|
<dd>The <a class="reference external" href="#RCL.INSTALL.CONFIG.FIELDS"><code class="docutils literal"><span class="pre">fields</span></code> file</a> inside the RCL
|
|
configuration defines which document fields are either <code class="docutils literal"><span class="pre">indexed</span></code>
|
|
(searchable), <code class="docutils literal"><span class="pre">stored</span></code> (retrievable with search results), or both.
|
|
Apart from a few standard/internal fields, only the <code class="docutils literal"><span class="pre">stored</span></code>
|
|
fields are retrievable through the Python search interface.</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="python-search-interface">
|
|
<h4>Python search interface<a class="headerlink" href="#python-search-interface" title="Permalink to this headline">¶</a></h4>
|
|
<div class="section" id="the-recoll-module">
|
|
<h5>The recoll module<a class="headerlink" href="#the-recoll-module" title="Permalink to this headline">¶</a></h5>
|
|
<p>The <code class="docutils literal"><span class="pre">connect()</span></code> function connects to one or several RCL index(es) and
|
|
returns a <code class="docutils literal"><span class="pre">Db</span></code> object.</p>
|
|
<p>This call initializes the recoll module, and it should always be
|
|
performed before any other call or object creation.</p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">confdir</span></code> may specify a configuration directory. The usual defaults
|
|
apply.</li>
|
|
<li><code class="docutils literal"><span class="pre">extra_dbs</span></code> is a list of additional indexes (Xapian directories).</li>
|
|
<li><code class="docutils literal"><span class="pre">writable</span></code> decides if we can index new data through this
|
|
connection.</li>
|
|
</ul>
|
|
<p>A Db object is created by a <code class="docutils literal"><span class="pre">connect()</span></code> call and holds a connection to
|
|
a Recoll index.</p>
|
|
<dl class="docutils">
|
|
<dt>Db.close()</dt>
|
|
<dd>Closes the connection. You can’t do anything with the <code class="docutils literal"><span class="pre">Db</span></code> object
|
|
after this.</dd>
|
|
<dt>Db.query(), Db.cursor()</dt>
|
|
<dd>These aliases return a blank <code class="docutils literal"><span class="pre">Query</span></code> object for this index.</dd>
|
|
<dt>Db.setAbstractParams(maxchars, contextwords)</dt>
|
|
<dd>Set the parameters used to build snippets (sets of keywords in
|
|
context text fragments). <code class="docutils literal"><span class="pre">maxchars</span></code> defines the maximum total size
|
|
of the abstract. <code class="docutils literal"><span class="pre">contextwords</span></code> defines how many terms are shown
|
|
around the keyword.</dd>
|
|
</dl>
|
|
<p>Db.termMatch(match_type, expr, field='', maxlen=-1, casesens=False,
|
|
diacsens=False, lang='english')</p>
|
|
<blockquote>
|
|
<div>Expand an expression against the index term list. Performs the basic
|
|
function from the GUI term explorer tool. <code class="docutils literal"><span class="pre">match_type</span></code> can be
|
|
either of <code class="docutils literal"><span class="pre">wildcard</span></code>, <code class="docutils literal"><span class="pre">regexp</span></code> or <code class="docutils literal"><span class="pre">stem</span></code>. Returns a list of
|
|
terms expanded from the input expression.</div></blockquote>
|
|
<p>A <code class="docutils literal"><span class="pre">Query</span></code> object (equivalent to a cursor in the Python DB API) is
|
|
created by a <code class="docutils literal"><span class="pre">Db.query()</span></code> call. It is used to execute index searches.</p>
|
|
<dl class="docutils">
|
|
<dt>Query.sortby(fieldname, ascending=True)</dt>
|
|
<dd>Sort results by fieldname, in ascending or descending order. Must be
|
|
called before executing the search.</dd>
|
|
</dl>
|
|
<p>Query.execute(query_string, stemming=1, stemlang="english",
|
|
fetchtext=False)</p>
|
|
<blockquote>
|
|
<div>Starts a search for query_string, a RCL search language string. If
|
|
the index stores the document texts and <code class="docutils literal"><span class="pre">fetchtext</span></code> is True, store
|
|
the document extracted text in <code class="docutils literal"><span class="pre">doc.text</span></code>.</div></blockquote>
|
|
<dl class="docutils">
|
|
<dt>Query.executesd(SearchData, fetchtext=False)</dt>
|
|
<dd>Starts a search for the query defined by the SearchData object. If
|
|
the index stores the document texts and <code class="docutils literal"><span class="pre">fetchtext</span></code> is True, store
|
|
the document extracted text in <code class="docutils literal"><span class="pre">doc.text</span></code>.</dd>
|
|
<dt>Query.fetchmany(size=query.arraysize)</dt>
|
|
<dd>Fetches the next <code class="docutils literal"><span class="pre">Doc</span></code> objects in the current search results, and
|
|
returns them as an array of the required size, which is by default
|
|
the value of the <code class="docutils literal"><span class="pre">arraysize</span></code> data member.</dd>
|
|
<dt>Query.fetchone()</dt>
|
|
<dd>Fetches the next <code class="docutils literal"><span class="pre">Doc</span></code> object from the current search results.
|
|
Generates a StopIteration exception if there are no results left.</dd>
|
|
<dt>Query.close()</dt>
|
|
<dd>Closes the query. The object is unusable after the call.</dd>
|
|
<dt>Query.scroll(value, mode='relative')</dt>
|
|
<dd>Adjusts the position in the current result set. <code class="docutils literal"><span class="pre">mode</span></code> can be
|
|
<code class="docutils literal"><span class="pre">relative</span></code> or <code class="docutils literal"><span class="pre">absolute</span></code>.</dd>
|
|
<dt>Query.getgroups()</dt>
|
|
<dd>Retrieves the expanded query terms as a list of pairs. Meaningful
|
|
only after executexx. In each pair, the first entry is a list of user
|
|
terms (of size one for simple terms, or more for group and phrase
|
|
clauses), the second a list of query terms as derived from the user
|
|
terms and used in the Xapian Query.</dd>
|
|
<dt>Query.getxquery()</dt>
|
|
<dd>Return the Xapian query description as a Unicode string. Meaningful
|
|
only after executexx.</dd>
|
|
<dt>Query.highlight(text, ishtml = 0, methods = object)</dt>
|
|
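<dd>Will insert <code class="docutils literal"><span class="pre">&lt;span</span> <span class="pre">class="rclmatch"&gt;</span></code>, <code class="docutils literal"><span class="pre">&lt;/span&gt;</span></code> tags around the match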
|
|
areas in the input text and return the modified text. <code class="docutils literal"><span class="pre">ishtml</span></code> can
|
|
be set to indicate that the input text is HTML and that HTML special
|
|
characters should not be escaped. <code class="docutils literal"><span class="pre">methods</span></code> if set should be an
|
|
object with methods startMatch(i) and endMatch() which will be
|
|
called for each match and should return a begin and end tag</dd>
|
|
<dt>Query.makedocabstract(doc, methods = object)</dt>
|
|
<dd>Create a snippets abstract for <code class="docutils literal"><span class="pre">doc</span></code> (a <code class="docutils literal"><span class="pre">Doc</span></code> object) by
|
|
selecting text around the match terms. If methods is set, will also
|
|
perform highlighting. See the highlight method.</dd>
|
|
<dt>Query.__iter__() and Query.next()</dt>
|
|
<dd>So that things like <code class="docutils literal"><span class="pre">for</span> <span class="pre">doc</span> <span class="pre">in</span> <span class="pre">query:</span></code> will work.</dd>
|
|
<dt>Query.arraysize</dt>
|
|
<dd>Default number of records processed by fetchmany (r/w).</dd>
|
|
<dt>Query.rowcount</dt>
|
|
<dd>Number of records returned by the last execute.</dd>
|
|
<dt>Query.rownumber</dt>
|
|
<dd>Next index to be fetched from results. Normally increments after
|
|
each fetchone() call, but can be set/reset before the call to effect
|
|
seeking (equivalent to using <code class="docutils literal"><span class="pre">scroll()</span></code>). Starts at 0.</dd>
|
|
</dl>
|
|
<p>A <code class="docutils literal"><span class="pre">Doc</span></code> object contains index data for a given document. The data is
|
|
extracted from the index when searching, or set by the indexer program
|
|
when updating. The Doc object has many attributes to be read or set by
|
|
its user. It mostly matches the Rcl::Doc C++ object. Some of the
|
|
attributes are predefined, but, especially when indexing, others can be
|
|
set, the name of which will be processed as field names by the indexing
|
|
configuration. Inputs can be specified as Unicode or strings. Outputs
|
|
are Unicode objects. All dates are specified as Unix timestamps, printed
|
|
as strings. Please refer to the <code class="docutils literal"><span class="pre">rcldb/rcldoc.cpp</span></code> C++ file for a full
|
|
description of the predefined attributes. Here follows a short list.</p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">url</span></code> the document URL but see also <code class="docutils literal"><span class="pre">getbinurl()</span></code></li>
|
|
<li><code class="docutils literal"><span class="pre">ipath</span></code> the document <code class="docutils literal"><span class="pre">ipath</span></code> for embedded documents.</li>
|
|
<li><code class="docutils literal"><span class="pre">fbytes,</span> <span class="pre">dbytes</span></code> the document file and text sizes.</li>
|
|
<li><code class="docutils literal"><span class="pre">fmtime,</span> <span class="pre">dmtime</span></code> the document file and document times.</li>
|
|
<li><code class="docutils literal"><span class="pre">xdocid</span></code> the document Xapian document ID. This is useful if you
|
|
want to access the document through a direct Xapian operation.</li>
|
|
<li><code class="docutils literal"><span class="pre">mtype</span></code> the document MIME type.</li>
|
|
<li>Fields stored by default: <code class="docutils literal"><span class="pre">author</span></code>, <code class="docutils literal"><span class="pre">filename</span></code>, <code class="docutils literal"><span class="pre">keywords</span></code>,
|
|
<code class="docutils literal"><span class="pre">recipient</span></code></li>
|
|
</ul>
|
|
<p>At query time, only the fields that are defined as <code class="docutils literal"><span class="pre">stored</span></code> either by
|
|
default or in the <code class="docutils literal"><span class="pre">fields</span></code> configuration file will be meaningful in
|
|
the <code class="docutils literal"><span class="pre">Doc</span></code> object. The document processed text may be present or not,
|
|
depending if the index stores the text at all, and if it does, on the
|
|
<code class="docutils literal"><span class="pre">fetchtext</span></code> query execute option. See also the <code class="docutils literal"><span class="pre">rclextract</span></code> module
|
|
for accessing document contents.</p>
|
|
<dl class="docutils">
|
|
<dt>get(key), [] operator</dt>
|
|
<dd>Retrieve the named document attribute. You can also use
|
|
<code class="docutils literal"><span class="pre">getattr(doc,</span> <span class="pre">key)</span></code> or <code class="docutils literal"><span class="pre">doc.key</span></code>.</dd>
|
|
<dt>doc.key = value</dt>
|
|
<dd>Set the named document attribute. You can also use
|
|
<code class="docutils literal"><span class="pre">setattr(doc,</span> <span class="pre">key,</span> <span class="pre">value)</span></code>.</dd>
|
|
<dt>getbinurl()</dt>
|
|
<dd>Retrieve the URL in byte array format (no transcoding), for use as
|
|
parameter to a system call.</dd>
|
|
<dt>setbinurl(url)</dt>
|
|
<dd>Set the URL in byte array format (no transcoding).</dd>
|
|
<dt>items()</dt>
|
|
<dd>Return a dictionary of doc object keys/values</dd>
|
|
<dt>keys()</dt>
|
|
<dd>list of doc object keys (attribute names).</dd>
|
|
</dl>
|
|
<p>A <code class="docutils literal"><span class="pre">SearchData</span></code> object allows building a query by combining clauses,
|
|
for execution by <code class="docutils literal"><span class="pre">Query.executesd()</span></code>. It can be used in replacement of
|
|
the query language approach. The interface is going to change a little,
|
|
so no detailed doc for now...</p>
|
|
<p>addclause(type='and'|'or'|'excl'|'phrase'|'near'|'sub',
|
|
qstring=string, slack=0, field='', stemming=1, subSearch=SearchData)</p>
|
|
</div>
|
|
<div class="section" id="the-rclextract-module">
|
|
<h5>The rclextract module<a class="headerlink" href="#the-rclextract-module" title="Permalink to this headline">¶</a></h5>
|
|
<p>Prior to RCL 1.25, index queries could not provide document content
|
|
because it was never stored. RCL 1.25 and later usually store the
|
|
document text, which can be optionally retrieved when running a query
|
|
(see <code class="docutils literal"><span class="pre">query.execute()</span></code> above - the result is always plain text).</p>
|
|
<p>The <code class="docutils literal"><span class="pre">rclextract</span></code> module can give access to the original document and
|
|
to the document text content (if not stored by the index, or to access
|
|
an HTML version of the text). Accessing the original document is
|
|
particularly useful if it is embedded (e.g. an email attachment).</p>
|
|
<p>You need to import the <code class="docutils literal"><span class="pre">recoll</span></code> module before the <code class="docutils literal"><span class="pre">rclextract</span></code>
|
|
module.</p>
|
|
<dl class="docutils">
|
|
<dt>Extractor(doc)</dt>
|
|
<dd>An <code class="docutils literal"><span class="pre">Extractor</span></code> object is built from a <code class="docutils literal"><span class="pre">Doc</span></code> object, output from
|
|
a query.</dd>
|
|
<dt>Extractor.textextract(ipath)</dt>
|
|
<dd><p class="first">Extract document defined by ipath and return a <code class="docutils literal"><span class="pre">Doc</span></code> object. The
|
|
<code class="docutils literal"><span class="pre">doc.text</span></code> field has the document text converted to either
|
|
text/plain or text/html according to <code class="docutils literal"><span class="pre">doc.mimetype</span></code>. The typical
|
|
use would be as follows:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">recoll</span> <span class="k">import</span> <span class="n">recoll</span><span class="p">,</span> <span class="n">rclextract</span>
|
|
|
|
<span class="n">qdoc</span> <span class="o">=</span> <span class="n">query</span><span class="o">.</span><span class="n">fetchone</span><span class="p">()</span>
|
|
<span class="n">extractor</span> <span class="o">=</span> <span class="n">rclextract</span><span class="o">.</span><span class="n">Extractor</span><span class="p">(</span><span class="n">qdoc</span><span class="p">)</span>
|
|
<span class="n">doc</span> <span class="o">=</span> <span class="n">extractor</span><span class="o">.</span><span class="n">textextract</span><span class="p">(</span><span class="n">qdoc</span><span class="o">.</span><span class="n">ipath</span><span class="p">)</span>
|
|
<span class="c1"># use doc.text, e.g. for previewing</span>
|
|
</pre></div>
|
|
</div>
|
|
<p class="last">Passing <code class="docutils literal"><span class="pre">qdoc.ipath</span></code> to <code class="docutils literal"><span class="pre">textextract()</span></code> is redundant, but
|
|
reflects the fact that the <code class="docutils literal"><span class="pre">Extractor</span></code> object actually has the
|
|
capability to access the other entries in a compound document.</p>
|
|
</dd>
|
|
<dt>Extractor.idoctofile(ipath, targetmtype, outfile='')</dt>
|
|
<dd><p class="first">Extracts document into an output file, which can be given explicitly
|
|
or will be created as a temporary file to be deleted by the caller.
|
|
Typical use:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">recoll</span> <span class="k">import</span> <span class="n">recoll</span><span class="p">,</span> <span class="n">rclextract</span>
|
|
|
|
<span class="n">qdoc</span> <span class="o">=</span> <span class="n">query</span><span class="o">.</span><span class="n">fetchone</span><span class="p">()</span>
|
|
<span class="n">extractor</span> <span class="o">=</span> <span class="n">rclextract</span><span class="o">.</span><span class="n">Extractor</span><span class="p">(</span><span class="n">qdoc</span><span class="p">)</span>
|
|
<span class="n">filename</span> <span class="o">=</span> <span class="n">extractor</span><span class="o">.</span><span class="n">idoctofile</span><span class="p">(</span><span class="n">qdoc</span><span class="o">.</span><span class="n">ipath</span><span class="p">,</span> <span class="n">qdoc</span><span class="o">.</span><span class="n">mimetype</span><span class="p">)</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>In all cases the output is a copy, even if the requested document is
|
|
a regular system file, which may be wasteful in some cases. If you
|
|
want to avoid this, you can test for a simple file document as
|
|
follows:</p>
|
|
<div class="last highlight-default"><div class="highlight"><pre><span></span><span class="ow">not</span> <span class="n">doc</span><span class="o">.</span><span class="n">ipath</span> <span class="ow">and</span> <span class="p">(</span><span class="ow">not</span> <span class="s2">"rclbes"</span> <span class="ow">in</span> <span class="n">doc</span><span class="o">.</span><span class="n">keys</span><span class="p">()</span> <span class="ow">or</span> <span class="n">doc</span><span class="p">[</span><span class="s2">"rclbes"</span><span class="p">]</span> <span class="o">==</span> <span class="s2">"FS"</span><span class="p">)</span>
|
|
</pre></div>
|
|
</div>
|
|
</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="search-api-usage-example">
|
|
<h5>Search API usage example<a class="headerlink" href="#search-api-usage-example" title="Permalink to this headline">¶</a></h5>
|
|
<p>The following sample would query the index with a user language string.
|
|
See the <code class="docutils literal"><span class="pre">python/samples</span></code> directory inside the RCL source for other
|
|
examples. The <code class="docutils literal"><span class="pre">recollgui</span></code> subdirectory has a very embryonic GUI which
|
|
demonstrates the highlighting and data extraction functions.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="ch">#!/usr/bin/env python</span>
|
|
|
|
<span class="kn">from</span> <span class="nn">recoll</span> <span class="k">import</span> <span class="n">recoll</span>
|
|
|
|
<span class="n">db</span> <span class="o">=</span> <span class="n">recoll</span><span class="o">.</span><span class="n">connect</span><span class="p">()</span>
|
|
<span class="n">db</span><span class="o">.</span><span class="n">setAbstractParams</span><span class="p">(</span><span class="n">maxchars</span><span class="o">=</span><span class="mi">80</span><span class="p">,</span> <span class="n">contextwords</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
|
|
|
|
<span class="n">query</span> <span class="o">=</span> <span class="n">db</span><span class="o">.</span><span class="n">query</span><span class="p">()</span>
|
|
<span class="n">nres</span> <span class="o">=</span> <span class="n">query</span><span class="o">.</span><span class="n">execute</span><span class="p">(</span><span class="s2">"some user question"</span><span class="p">)</span>
|
|
<span class="nb">print</span> <span class="s2">"Result count: "</span><span class="p">,</span> <span class="n">nres</span>
|
|
<span class="k">if</span> <span class="n">nres</span> <span class="o">></span> <span class="mi">5</span><span class="p">:</span>
|
|
<span class="n">nres</span> <span class="o">=</span> <span class="mi">5</span>
|
|
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">nres</span><span class="p">):</span>
|
|
<span class="n">doc</span> <span class="o">=</span> <span class="n">query</span><span class="o">.</span><span class="n">fetchone</span><span class="p">()</span>
|
|
<span class="nb">print</span> <span class="s2">"Result #</span><span class="si">%d</span><span class="s2">"</span> <span class="o">%</span> <span class="p">(</span><span class="n">query</span><span class="o">.</span><span class="n">rownumber</span><span class="p">,)</span>
|
|
<span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="p">(</span><span class="s2">"title"</span><span class="p">,</span> <span class="s2">"size"</span><span class="p">):</span>
|
|
<span class="nb">print</span> <span class="n">k</span><span class="p">,</span> <span class="s2">":"</span><span class="p">,</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">doc</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">'utf-8'</span><span class="p">)</span>
|
|
<span class="nb">abs</span> <span class="o">=</span> <span class="n">db</span><span class="o">.</span><span class="n">makeDocAbstract</span><span class="p">(</span><span class="n">doc</span><span class="p">,</span> <span class="n">query</span><span class="p">)</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">'utf-8'</span><span class="p">)</span>
|
|
<span class="nb">print</span> <span class="nb">abs</span>
|
|
<span class="nb">print</span>
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="creating-python-external-indexers">
|
|
<h4>Creating Python external indexers<a class="headerlink" href="#creating-python-external-indexers" title="Permalink to this headline">¶</a></h4>
|
|
<p>The update API can be used to create an index from data which is not
|
|
accessible to the regular RCL indexer, or structured to present
|
|
difficulties to the RCL input handlers.</p>
|
|
<p>An indexer created using this API will have equivalent work to do as
|
|
the Recoll file system indexer: look for modified documents, extract
|
|
their text, call the API for indexing it, take care of purging the index
|
|
out of data from documents which do not exist in the document store any
|
|
more.</p>
|
|
<p>The data for such an external indexer should be stored in an index
|
|
separate from any used by the RCL internal file system indexer. The
|
|
reason is that the main document indexer purge pass (removal of deleted
|
|
documents) would also remove all the documents belonging to the external
|
|
indexer, as they were not seen during the filesystem walk. The main
|
|
indexer documents would also probably be a problem for the external
|
|
indexer’s own purge operation.</p>
|
|
<p>While there would be ways to enable multiple foreign indexers to
|
|
cooperate on a single index, it is just simpler to use separate ones,
|
|
and use the multiple index access capabilities of the query interface,
|
|
if needed.</p>
|
|
<p>There are two parts in the update interface:</p>
|
|
<ul class="simple">
|
|
<li>Methods inside the <code class="docutils literal"><span class="pre">recoll</span></code> module allow inserting data into the
|
|
index, to make it accessible by the normal query interface.</li>
|
|
<li>An interface based on script execution is defined to allow either
|
|
the GUI or the <code class="docutils literal"><span class="pre">rclextract</span></code> module to access original document data
|
|
for previewing or editing.</li>
|
|
</ul>
|
|
<div class="section" id="python-update-interface">
|
|
<h5>Python update interface<a class="headerlink" href="#python-update-interface" title="Permalink to this headline">¶</a></h5>
|
|
<p>The update methods are part of the <code class="docutils literal"><span class="pre">recoll</span></code> module described above.
|
|
The connect() method is used with a <code class="docutils literal"><span class="pre">writable=True</span></code> parameter to
|
|
obtain a writable <code class="docutils literal"><span class="pre">Db</span></code> object. The following <code class="docutils literal"><span class="pre">Db</span></code> object methods are
|
|
then available.</p>
|
|
<dl class="docutils">
|
|
<dt>addOrUpdate(udi, doc, parent_udi=None)</dt>
|
|
<dd>Add or update index data for a given document. The <code class="docutils literal"><span class="pre">udi</span></code> string
|
|
must define a unique id for the document. It is an opaque interface
|
|
element and not interpreted inside Recoll. <code class="docutils literal"><span class="pre">doc</span></code> is a <code class="docutils literal"><span class="pre">Doc</span></code>
|
|
object, created from the data to be indexed (the main text should be
|
|
in <code class="docutils literal"><span class="pre">doc.text</span></code>). If <code class="docutils literal"><span class="pre">parent_udi</span></code> is set, this is a unique
|
|
identifier for the top-level container (e.g. for the filesystem
|
|
indexer, this would be the one which is an actual file).</dd>
|
|
<dt>delete(udi)</dt>
|
|
<dd>Purge index from all data for <code class="docutils literal"><span class="pre">udi</span></code>, and all documents (if any)
|
|
which have a matching <code class="docutils literal"><span class="pre">parent_udi</span></code>.</dd>
|
|
<dt>needUpdate(udi, sig)</dt>
|
|
<dd><p class="first">Test if the index needs to be updated for the document identified by
|
|
<code class="docutils literal"><span class="pre">udi</span></code>. If this call is to be used, the <code class="docutils literal"><span class="pre">doc.sig</span></code> field should
|
|
contain a signature value when calling <code class="docutils literal"><span class="pre">addOrUpdate()</span></code>. The
|
|
<code class="docutils literal"><span class="pre">needUpdate()</span></code> call then compares its parameter value with the
|
|
stored <code class="docutils literal"><span class="pre">sig</span></code> for <code class="docutils literal"><span class="pre">udi</span></code>. <code class="docutils literal"><span class="pre">sig</span></code> is an opaque value, compared as
|
|
a string.</p>
|
|
<p>The filesystem indexer uses a concatenation of the decimal string
|
|
values for file size and update time, but a hash of the contents
|
|
could also be used.</p>
|
|
<p>As a side effect, if the return value is false (the index is up to
|
|
date), the call will set the existence flag for the document (and
|
|
any subdocument defined by its <code class="docutils literal"><span class="pre">parent_udi</span></code>), so that a later
|
|
<code class="docutils literal"><span class="pre">purge()</span></code> call will preserve them.</p>
|
|
<p class="last">The use of <code class="docutils literal"><span class="pre">needUpdate()</span></code> and <code class="docutils literal"><span class="pre">purge()</span></code> is optional, and the
|
|
indexer may use another method for checking the need to reindex or
|
|
to delete stale entries.</p>
|
|
</dd>
|
|
<dt>purge()</dt>
|
|
<dd>Delete all documents that were not touched during the just finished
|
|
indexing pass (since open-for-write). These are the documents for
|
|
which the needUpdate() call was not performed, indicating that they no
|
|
longer exist in the primary storage system.</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="query-data-access-for-external-indexers-1-23">
|
|
<h5>Query data access for external indexers (1.23)<a class="headerlink" href="#query-data-access-for-external-indexers-1-23" title="Permalink to this headline">¶</a></h5>
|
|
<p>RCL has internal methods to access document data for its internal
|
|
(filesystem) indexer. An external indexer needs to provide data access
|
|
methods if it needs integration with the GUI (e.g. preview function), or
|
|
support for the <code class="docutils literal"><span class="pre">rclextract</span></code> module.</p>
|
|
<p>The index data and the access method are linked by the <code class="docutils literal"><span class="pre">rclbes</span></code>
|
|
(recoll backend storage) <code class="docutils literal"><span class="pre">Doc</span></code> field. You should set this to a short
|
|
string value identifying your indexer (e.g. the filesystem indexer uses
|
|
either “FS” or an empty value, the Web history indexer uses “BGL”).</p>
|
|
<p>The link is actually performed inside a <code class="docutils literal"><span class="pre">backends</span></code> configuration file
|
|
(stored in the configuration directory). This defines commands to
|
|
execute to access data from the specified indexer. Example, for the mbox
|
|
indexing sample found in the Recoll source (which sets
|
|
<code class="docutils literal"><span class="pre">rclbes="MBOX"</span></code>):</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">MBOX</span><span class="p">]</span>
|
|
<span class="n">fetch</span> <span class="o">=</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">recoll</span><span class="o">/</span><span class="n">src</span><span class="o">/</span><span class="n">python</span><span class="o">/</span><span class="n">samples</span><span class="o">/</span><span class="n">rclmbox</span><span class="o">.</span><span class="n">py</span> <span class="n">fetch</span>
|
|
<span class="n">makesig</span> <span class="o">=</span> <span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">recoll</span><span class="o">/</span><span class="n">src</span><span class="o">/</span><span class="n">python</span><span class="o">/</span><span class="n">samples</span><span class="o">/</span><span class="n">rclmbox</span><span class="o">.</span><span class="n">py</span> <span class="n">makesig</span>
|
|
</pre></div>
|
|
</div>
|
|
<p><code class="docutils literal"><span class="pre">fetch</span></code> and <code class="docutils literal"><span class="pre">makesig</span></code> define two commands to execute to respectively
|
|
retrieve the document text and compute the document signature (the
|
|
example implementation uses the same script with different first
|
|
parameters to perform both operations).</p>
|
|
<p>The scripts are called with three additional arguments: <code class="docutils literal"><span class="pre">udi</span></code>,
|
|
<code class="docutils literal"><span class="pre">url</span></code>, <code class="docutils literal"><span class="pre">ipath</span></code>, stored with the document when it was indexed, and
|
|
may use any or all to perform the requested operation. The caller
|
|
expects the result data on <code class="docutils literal"><span class="pre">stdout</span></code>.</p>
|
|
</div>
|
|
<div class="section" id="external-indexer-samples">
|
|
<h5>External indexer samples<a class="headerlink" href="#external-indexer-samples" title="Permalink to this headline">¶</a></h5>
|
|
<p>The Recoll source tree has two samples of external indexers in the
|
|
<code class="docutils literal"><span class="pre">src/python/samples</span></code> directory. The more interesting one is
|
|
<code class="docutils literal"><span class="pre">rclmbox.py</span></code> which indexes a directory containing <code class="docutils literal"><span class="pre">mbox</span></code> folder
|
|
files. It exercises most features in the update interface, and has a
|
|
data access interface.</p>
|
|
<p>See the comments inside the file for more information.</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="package-compatibility-with-the-previous-version">
|
|
<h4>Package compatibility with the previous version<a class="headerlink" href="#package-compatibility-with-the-previous-version" title="Permalink to this headline">¶</a></h4>
|
|
<p>The following code fragments can be used to ensure that code can run
|
|
with both the old and the new API (as long as it does not use the new
|
|
abilities of the new API of course).</p>
|
|
<p>Adapting to the new package structure:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="k">try</span><span class="p">:</span>
|
|
<span class="kn">from</span> <span class="nn">recoll</span> <span class="k">import</span> <span class="n">recoll</span>
|
|
<span class="kn">from</span> <span class="nn">recoll</span> <span class="k">import</span> <span class="n">rclextract</span>
|
|
<span class="n">hasextract</span> <span class="o">=</span> <span class="kc">True</span>
|
|
<span class="k">except</span><span class="p">:</span>
|
|
<span class="kn">import</span> <span class="nn">recoll</span>
|
|
<span class="n">hasextract</span> <span class="o">=</span> <span class="kc">False</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Adapting to the change of nature of the <code class="docutils literal"><span class="pre">next</span></code> <code class="docutils literal"><span class="pre">Query</span></code> member. The
|
|
same test can be used to choose to use the <code class="docutils literal"><span class="pre">scroll()</span></code> method (new) or
|
|
set the <code class="docutils literal"><span class="pre">next</span></code> value (old).</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">rownum</span> <span class="o">=</span> <span class="n">query</span><span class="o">.</span><span class="n">next</span> <span class="k">if</span> <span class="nb">type</span><span class="p">(</span><span class="n">query</span><span class="o">.</span><span class="n">next</span><span class="p">)</span> <span class="o">==</span> <span class="nb">int</span> <span class="k">else</span> \
|
|
<span class="n">query</span><span class="o">.</span><span class="n">rownumber</span>
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="installation-and-configuration">
|
|
<h2><a class="toc-backref" href="#id36">Installation and configuration</a><a class="headerlink" href="#installation-and-configuration" title="Permalink to this headline">¶</a></h2>
|
|
<div class="section" id="installing-a-binary-copy">
|
|
<h3><a class="toc-backref" href="#id37">Installing a binary copy</a><a class="headerlink" href="#installing-a-binary-copy" title="Permalink to this headline">¶</a></h3>
|
|
<p>RCL binary copies are always distributed as regular packages for your
|
|
system. They can be obtained either through the system’s normal software
|
|
distribution framework (e.g. Debian/Ubuntu apt, FreeBSD ports, etc.), or
|
|
from some type of “backports” repository providing versions newer than
|
|
the standard ones, or found on the RCL WEB site in some cases. The most
|
|
up-to-date information about Recoll packages can usually be found on the
|
|
<a class="reference external" href="http://www.recoll.org/download.html">Recoll WEB site downloads page</a>.</p>
|
|
<p>There used to exist another form of binary install, as pre-compiled
|
|
source trees, but these are just less convenient than the packages and
|
|
don’t exist any more.</p>
|
|
<p>The package management tools will usually automatically deal with hard
|
|
dependencies for packages obtained from a proper package repository. You
|
|
will have to deal with them by hand for downloaded packages (for
|
|
example, when <code class="docutils literal"><span class="pre">dpkg</span></code> complains about missing dependencies).</p>
|
|
<p>In all cases, you will have to check or install <a class="reference internal" href="#supporting-packages">supporting
|
|
applications</a> for the file types that you want
|
|
to index beyond those that are natively processed by RCL (text, HTML,
|
|
email files, and a few others).</p>
|
|
<p>You should also maybe have a look at the <a class="reference external" href="#RCL.INSTALL.CONFIG">configuration
|
|
section</a> (but this may not be necessary for a
|
|
quick test with default parameters). Most parameters can be more
|
|
conveniently set from the GUI interface.</p>
|
|
</div>
|
|
<div class="section" id="supporting-packages">
|
|
<h3><a class="toc-backref" href="#id38">Supporting packages</a><a class="headerlink" href="#supporting-packages" title="Permalink to this headline">¶</a></h3>
|
|
<blockquote>
|
|
<div><p><strong>Note</strong></p>
|
|
<p>The WIN installation of RCL is self-contained, and only needs Python
|
|
2.7 to be externally installed. WIN users can skip this section.</p>
|
|
</div></blockquote>
|
|
<p>RCL uses external applications to index some file types. You need to
|
|
install them for the file types that you wish to have indexed (these are
|
|
run-time optional dependencies. None is needed for building or running
|
|
RCL except for indexing their specific file type).</p>
|
|
<p>After an indexing pass, the commands that were found missing can be
|
|
displayed from the <code class="docutils literal"><span class="pre">recoll</span></code> File menu. The list is stored in the
|
|
<code class="docutils literal"><span class="pre">missing</span></code> text file inside the configuration directory.</p>
|
|
<p>A list of common file types which need external commands follows. Many
|
|
of the handlers need the <code class="docutils literal"><span class="pre">iconv</span></code> command, which is not always listed
|
|
as a dependency.</p>
|
|
<p>Please note that, due to the relatively dynamic nature of this
|
|
information, the most up to date version is now kept on RCLAPPS along
|
|
with links to the home pages or best source/patches pages, and misc
|
|
tips. The list below is not updated often and may be quite stale.</p>
|
|
<p>For many Linux distributions, most of the commands listed can be
|
|
installed from the package repositories. However, the packages are
|
|
sometimes outdated, or not the best version for RCL, so you should take
|
|
a look at RCLAPPS if a file type is important to you.</p>
|
|
<p>As of RCL release 1.14, a number of XML-based formats that were handled
|
|
by ad hoc handler code now use the <code class="docutils literal"><span class="pre">xsltproc</span></code> command, which usually
|
|
comes with libxslt. These are: abiword, fb2 (ebooks), kword, openoffice,
|
|
svg.</p>
|
|
<p>Now for the list:</p>
|
|
<ul class="simple">
|
|
<li>Openoffice files need <code class="docutils literal"><span class="pre">unzip</span></code> and <code class="docutils literal"><span class="pre">xsltproc</span></code>.</li>
|
|
<li>PDF files need <code class="docutils literal"><span class="pre">pdftotext</span></code> which is part of Poppler (usually comes
|
|
with the <code class="docutils literal"><span class="pre">poppler-utils</span></code> package). Avoid the original one from
|
|
Xpdf.</li>
|
|
<li>Postscript files need <code class="docutils literal"><span class="pre">pstotext</span></code>. The original version has an issue
|
|
with shell characters in file names, which is corrected in recent
|
|
packages. See RCLAPPS for more detail.</li>
|
|
<li>MS Word needs <code class="docutils literal"><span class="pre">antiword</span></code>. It is also useful to have <code class="docutils literal"><span class="pre">wvWare</span></code>
|
|
installed as it may be used as a fallback for some files which
|
|
<code class="docutils literal"><span class="pre">antiword</span></code> does not handle.</li>
|
|
<li>MS Excel and PowerPoint are processed by internal <code class="docutils literal"><span class="pre">Python</span></code>
|
|
handlers.</li>
|
|
<li>MS Open XML (docx) needs <code class="docutils literal"><span class="pre">xsltproc</span></code>.</li>
|
|
<li>Wordperfect files need <code class="docutils literal"><span class="pre">wpd2html</span></code> from the libwpd (or libwpd-tools
|
|
on Ubuntu) package.</li>
|
|
<li>RTF files need <code class="docutils literal"><span class="pre">unrtf</span></code>, which, in its older versions, has much
|
|
trouble with non-western character sets. Many Linux distributions
|
|
carry outdated <code class="docutils literal"><span class="pre">unrtf</span></code> versions. Check RCLAPPS for details.</li>
|
|
<li>TeX files need <code class="docutils literal"><span class="pre">untex</span></code> or <code class="docutils literal"><span class="pre">detex</span></code>. Check RCLAPPS for sources if
|
|
it’s not packaged for your distribution.</li>
|
|
<li>dvi files need <code class="docutils literal"><span class="pre">dvips</span></code>.</li>
|
|
<li>djvu files need <code class="docutils literal"><span class="pre">djvutxt</span></code> and <code class="docutils literal"><span class="pre">djvused</span></code> from the DjVuLibre
|
|
package.</li>
|
|
<li>Audio files: RCL releases 1.14 and later use a single Python handler
|
|
based on mutagen for all audio file types.</li>
|
|
<li>Pictures: RCL uses the Exiftool Perl package to extract tag
|
|
information. Most image file formats are supported. Note that there
|
|
may not be much interest in indexing the technical tags (image size,
|
|
aperture, etc.). This is only of interest if you store personal tags
|
|
or textual descriptions inside the image files.</li>
|
|
<li>chm: files in Microsoft help format need Python and the pychm module
|
|
(which needs chmlib).</li>
|
|
<li>ICS: up to RCL 1.13, iCalendar files need Python and the icalendar
|
|
module. icalendar is not needed for newer versions, which use
|
|
internal code.</li>
|
|
<li>Zip archives need Python (and the standard zipfile module).</li>
|
|
<li>Rar archives need Python, the rarfile Python module and the <code class="docutils literal"><span class="pre">unrar</span></code>
|
|
utility.</li>
|
|
<li>Midi karaoke files need Python and the <a class="reference external" href="http://pypi.python.org/pypi/midi/0.2.1">Midi
|
|
module</a>.</li>
|
|
<li>Konqueror webarchive format needs Python (uses the tarfile module).</li>
|
|
<li>Mimehtml web archive format (support based on the email handler,
|
|
which introduces some mild weirdness, but still usable).</li>
|
|
</ul>
|
|
<p>Text, HTML, email folders, and Scribus files are processed internally.
|
|
Lyx is used to index Lyx files. Many handlers need <code class="docutils literal"><span class="pre">iconv</span></code> and the
|
|
standard <code class="docutils literal"><span class="pre">sed</span></code> and <code class="docutils literal"><span class="pre">awk</span></code>.</p>
|
|
</div>
|
|
<div class="section" id="building-from-source">
|
|
<h3><a class="toc-backref" href="#id39">Building from source</a><a class="headerlink" href="#building-from-source" title="Permalink to this headline">¶</a></h3>
|
|
<div class="section" id="prerequisites">
|
|
<h4>Prerequisites<a class="headerlink" href="#prerequisites" title="Permalink to this headline">¶</a></h4>
|
|
<p>The following prerequisites are described in broad terms and not as
|
|
specific package names (which will depend on the exact platform). The
|
|
dependencies should be available as packages on most common Unix
|
|
derivatives, and it should be quite uncommon that you would have to
|
|
build one of them.</p>
|
|
<p>The shopping list:</p>
|
|
<ul>
|
|
<li><p class="first">The <code class="docutils literal"><span class="pre">autoconf</span></code>, <code class="docutils literal"><span class="pre">automake</span></code> and <code class="docutils literal"><span class="pre">libtool</span></code> triad. Only
|
|
<code class="docutils literal"><span class="pre">autoconf</span></code> is needed for RCL 1.21 and earlier.</p>
|
|
</li>
|
|
<li><p class="first">C++ compiler. Recent versions require C++11 compatibility (1.23 and
|
|
later).</p>
|
|
</li>
|
|
<li><p class="first"><code class="docutils literal"><span class="pre">bison</span></code> command (for RCL 1.21 and later).</p>
|
|
</li>
|
|
<li><p class="first"><code class="docutils literal"><span class="pre">xsltproc</span></code> command. For building the documentation (for RCL 1.21
|
|
and later). This sometimes comes with the libxslt package. And also
|
|
the Docbook XML and style sheet files.</p>
|
|
</li>
|
|
<li><p class="first">Development files for <a class="reference external" href="http://www.xapian.org">Xapian core</a>.</p>
|
|
<blockquote>
|
|
<div><p><strong>Important</strong></p>
|
|
<p>If you are building Xapian for an older CPU (before Pentium 4 or
|
|
Athlon 64), you need to add the <code class="docutils literal"><span class="pre">--disable-sse</span></code> flag to the
|
|
configure command. Otherwise all Xapian applications will crash with an
|
|
<code class="docutils literal"><span class="pre">illegal</span> <span class="pre">instruction</span></code> error.</p>
|
|
</div></blockquote>
|
|
</li>
|
|
<li><p class="first">Development files for <a class="reference external" href="http://qt-project.org/downloads">Qt 4 or Qt
|
|
5</a>. RCL 1.15.9 was the last
|
|
version to support Qt 3. If you do not want to install or build the
|
|
Qt Webkit module, RCL has a configuration option to disable its use
|
|
(see further in the configuration section).</p>
|
|
</li>
|
|
<li><p class="first">Development files for X11 and zlib.</p>
|
|
</li>
|
|
<li><p class="first">Development files for Python (or use <code class="docutils literal"><span class="pre">--disable-python-module</span></code>).</p>
|
|
</li>
|
|
<li><p class="first">You may also need
|
|
<a class="reference external" href="http://www.gnu.org/software/libiconv/">libiconv</a>. On Linux
|
|
systems, the iconv interface is part of libc and you should not need
|
|
to do anything special.</p>
|
|
</li>
|
|
</ul>
|
|
<p>Check the <a class="reference external" href="http://www.recoll.org/download.html">RCL download page</a>
|
|
for up to date version information.</p>
|
|
</div>
|
|
<div class="section" id="building">
|
|
<h4>Building<a class="headerlink" href="#building" title="Permalink to this headline">¶</a></h4>
|
|
<p>RCL has been built on Linux, FreeBSD, Mac OS X, and Solaris; most
|
|
versions after 2005 should be ok, maybe some older ones too (Solaris 8
|
|
is ok). If you build on another system, and need to modify things, <a class="reference external" href="mailto:jfd%40recoll.org">I
|
|
would very much welcome patches</a>.</p>
|
|
<p><strong>Configure options:</strong></p>
|
|
<ul class="simple">
|
|
<li><code class="docutils literal"><span class="pre">--without-aspell</span></code> will disable the code for phonetic matching of
|
|
search terms.</li>
|
|
<li><code class="docutils literal"><span class="pre">--with-fam</span></code> or <code class="docutils literal"><span class="pre">--with-inotify</span></code> will enable the code for real
|
|
time indexing. Inotify support is enabled by default on recent Linux
|
|
systems.</li>
|
|
<li><code class="docutils literal"><span class="pre">--with-qzeitgeist</span></code> will enable sending Zeitgeist events about the
|
|
visited search results, and needs the qzeitgeist package.</li>
|
|
<li><code class="docutils literal"><span class="pre">--disable-webkit</span></code> is available from version 1.17 to implement the
|
|
result list with a Qt QTextBrowser instead of a WebKit widget if you
|
|
do not or can’t depend on the latter.</li>
|
|
<li><code class="docutils literal"><span class="pre">--disable-idxthreads</span></code> is available from version 1.19 to suppress
|
|
multithreading inside the indexing process. You can also use the
|
|
run-time configuration to restrict <code class="docutils literal"><span class="pre">recollindex</span></code> to using a single
|
|
thread, but the compile-time option may disable a few more unused
|
|
locks. This only applies to the use of multithreading for the core
|
|
index processing (data input). The RCL monitor mode always uses at
|
|
least two threads of execution.</li>
|
|
<li><code class="docutils literal"><span class="pre">--disable-python-module</span></code> will avoid building the Python module.</li>
|
|
<li><code class="docutils literal"><span class="pre">--disable-xattr</span></code> will prevent fetching data from file extended
|
|
attributes. Beyond a few standard attributes, fetching extended
|
|
attributes data can only be useful if some application stores data in
|
|
there, and also needs some simple configuration (see comments in the
|
|
<code class="docutils literal"><span class="pre">fields</span></code> configuration file).</li>
|
|
<li><code class="docutils literal"><span class="pre">--enable-camelcase</span></code> will enable splitting camelCase words. This is
|
|
not enabled by default as it has the unfortunate side-effect of
|
|
making some phrase searches quite confusing: e.g. <code class="docutils literal"><span class="pre">"MySQL</span> <span class="pre">manual"</span></code>
|
|
would be matched by <code class="docutils literal"><span class="pre">"MySQL</span> <span class="pre">manual"</span></code> and <code class="docutils literal"><span class="pre">"my</span> <span class="pre">sql</span> <span class="pre">manual"</span></code> but
|
|
not <code class="docutils literal"><span class="pre">"mysql</span> <span class="pre">manual"</span></code> (only inside phrase searches).</li>
|
|
<li><code class="docutils literal"><span class="pre">--with-file-command</span></code> Specify the version of the ‘file’ command to
|
|
use (ie: --with-file-command=/usr/local/bin/file). Can be useful to
|
|
enable the gnu version on systems where the native one is bad.</li>
|
|
<li><code class="docutils literal"><span class="pre">--disable-qtgui</span></code> Disable the Qt interface. Will allow building the
|
|
indexer and the command line search program in absence of a Qt
|
|
environment.</li>
|
|
<li><code class="docutils literal"><span class="pre">--disable-x11mon</span></code> Disable X11 connection monitoring inside
|
|
recollindex. Together with --disable-qtgui, this allows building
|
|
recoll without Qt and X11.</li>
|
|
<li><code class="docutils literal"><span class="pre">--disable-userdoc</span></code> will avoid building the user manual. This
|
|
avoids having to install the Docbook XML/XSL files and the TeX
|
|
toolchain used for translating the manual to PDF.</li>
|
|
<li><code class="docutils literal"><span class="pre">--disable-pic</span></code> (RCL versions up to 1.21 only) will compile RCL
|
|
with position-dependent code. This is incompatible with building the
|
|
KIO or the Python or PHP extensions, but might yield very marginally
|
|
faster code.</li>
|
|
<li>Of course the usual autoconf <code class="docutils literal"><span class="pre">configure</span></code> options, like <code class="docutils literal"><span class="pre">--prefix</span></code>
|
|
apply.</li>
|
|
</ul>
|
|
<p>Normal procedure (for source extracted from a tar distribution):</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">recoll</span><span class="o">-</span><span class="n">xxx</span>
|
|
<span class="o">./</span><span class="n">configure</span>
|
|
<span class="n">make</span>
|
|
<span class="p">(</span><span class="n">practices</span> <span class="n">usual</span> <span class="n">hardship</span><span class="o">-</span><span class="n">repelling</span> <span class="n">invocations</span><span class="p">)</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>When building from source cloned from the git repository, you also need
|
|
to install autoconf, automake, and libtool and you must execute
|
|
<code class="docutils literal"><span class="pre">sh</span> <span class="pre">autogen.sh</span></code> in the top source directory before running
|
|
<code class="docutils literal"><span class="pre">configure</span></code>.</p>
|
|
</div>
|
|
<div class="section" id="installing">
|
|
<h4>Installing<a class="headerlink" href="#installing" title="Permalink to this headline">¶</a></h4>
|
|
<p>Use <code class="docutils literal"><span class="pre">make</span> <span class="pre">install</span></code> in the root of the source tree. This will copy the
|
|
commands to <code class="docutils literal"><span class="pre">prefix/bin</span></code> and the sample configuration files, scripts
|
|
and other shared data to <code class="docutils literal"><span class="pre">prefix/share/recoll</span></code>.</p>
|
|
</div>
|
|
<div class="section" id="python-api-package">
|
|
<h4>Python API package<a class="headerlink" href="#python-api-package" title="Permalink to this headline">¶</a></h4>
|
|
<p>The Python interface can be found in the source tree, under the
|
|
<code class="docutils literal"><span class="pre">python/recoll</span></code> directory.</p>
|
|
<p>As of RCL 1.19, the module can be compiled for Python3.</p>
|
|
<p>The normal RCL build procedure (see above) installs the API package for
|
|
the default system version (python) along with the main code. The
|
|
package for other Python versions (e.g. python3 if the system default is
|
|
python2) must be explicitly built and installed.</p>
|
|
<p>The <code class="docutils literal"><span class="pre">python/recoll/</span></code> directory contains the usual <code class="docutils literal"><span class="pre">setup.py</span></code>. After
|
|
configuring and building the main RCL code, you can use the script to
|
|
build and install the Python module:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">recoll</span><span class="o">-</span><span class="n">xxx</span><span class="o">/</span><span class="n">python</span><span class="o">/</span><span class="n">recoll</span>
|
|
<span class="n">pythonX</span> <span class="n">setup</span><span class="o">.</span><span class="n">py</span> <span class="n">build</span>
|
|
<span class="n">sudo</span> <span class="n">pythonX</span> <span class="n">setup</span><span class="o">.</span><span class="n">py</span> <span class="n">install</span>
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="building-on-solaris">
|
|
<h4>Building on Solaris<a class="headerlink" href="#building-on-solaris" title="Permalink to this headline">¶</a></h4>
|
|
<p>We did not test building the GUI on Solaris for recent versions. You
|
|
will need at least Qt 4.4. There are some hints on <a class="reference external" href="http://www.recoll.org/download-1.14.html">an old web site
|
|
page</a>, they may still be
|
|
valid.</p>
|
|
<p>Someone did test the 1.19 indexer and Python module build, they do work,
|
|
with a few minor glitches. Be sure to use GNU <code class="docutils literal"><span class="pre">make</span></code> and <code class="docutils literal"><span class="pre">install</span></code>.</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="configuration-overview">
|
|
<h3><a class="toc-backref" href="#id40">Configuration overview</a><a class="headerlink" href="#configuration-overview" title="Permalink to this headline">¶</a></h3>
|
|
<p>Most of the parameters specific to the <code class="docutils literal"><span class="pre">recoll</span></code> GUI are set through
|
|
the Preferences menu and stored in the standard Qt place
|
|
(<code class="docutils literal"><span class="pre">$HOME/.config/Recoll.org/recoll.conf</span></code>). You probably do not want to
|
|
edit this by hand.</p>
|
|
<p>RCL indexing options are set inside text configuration files located in
|
|
a configuration directory. There can be several such directories, each
|
|
of which defines the parameters for one index.</p>
|
|
<p>The configuration files can be edited by hand or through the Index
|
|
configuration dialog (Preferences menu). The GUI tool will try to
|
|
respect your formatting and comments as much as possible, so it is quite
|
|
possible to use both approaches on the same configuration.</p>
|
|
<p>The most accurate documentation for the configuration parameters is
|
|
given by comments inside the default files, and we will just give a
|
|
general overview here.</p>
|
|
<p>For each index, there are at least two sets of configuration files.
|
|
System-wide configuration files are kept in a directory named like
|
|
<code class="docutils literal"><span class="pre">/usr/share/recoll/examples</span></code>, and define default values, shared by all
|
|
indexes. For each index, a parallel set of files defines the customized
|
|
parameters.</p>
|
|
<p>The default location of the customized configuration is the <code class="docutils literal"><span class="pre">.recoll</span></code>
|
|
directory in your home. Most people will only use this directory.</p>
|
|
<p>This location can be changed, or others can be added with the
|
|
RECOLL_CONFDIR environment variable or the <code class="docutils literal"><span class="pre">-c</span></code> option parameter to
|
|
<code class="docutils literal"><span class="pre">recoll</span></code> and <code class="docutils literal"><span class="pre">recollindex</span></code>.</p>
|
|
<p>In addition (as of RCL version 1.19.7), it is possible to specify two
|
|
additional configuration directories which will be stacked before and
|
|
after the user configuration directory. These are defined by the
|
|
RECOLL_CONFTOP and RECOLL_CONFMID environment variables. Values from
|
|
configuration files inside the top directory will override user ones,
|
|
values from configuration files inside the middle directory will
|
|
override system ones and be overridden by user ones. These two variables
|
|
may be of use to applications which augment RCL functionality, and need
|
|
to add configuration data without disturbing the user’s files. Please
|
|
note that the two, currently single, values will probably be interpreted
|
|
as colon-separated lists in the future: do not use colon characters
|
|
inside the directory paths.</p>
|
|
<p>If the <code class="docutils literal"><span class="pre">.recoll</span></code> directory does not exist when <code class="docutils literal"><span class="pre">recoll</span></code> or
|
|
<code class="docutils literal"><span class="pre">recollindex</span></code> are started, it will be created with a set of empty
|
|
configuration files. <code class="docutils literal"><span class="pre">recoll</span></code> will give you a chance to edit the
|
|
configuration file before starting indexing. <code class="docutils literal"><span class="pre">recollindex</span></code> will
|
|
proceed immediately. To avoid mistakes, the automatic directory creation
|
|
will only occur for the default location, not if <code class="docutils literal"><span class="pre">-c</span></code> or
|
|
RECOLL_CONFDIR were used (in the latter cases, you will have to create
|
|
the directory).</p>
|
|
<p>All configuration files share the same format. For example, a short
|
|
extract of the main configuration file might look as follows:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="c1"># Space-separated list of files and directories to index.</span>
|
|
<span class="n">topdirs</span> <span class="o">=</span> <span class="o">~/</span><span class="n">docs</span> <span class="o">/</span><span class="n">usr</span><span class="o">/</span><span class="n">share</span><span class="o">/</span><span class="n">doc</span>
|
|
|
|
<span class="p">[</span><span class="o">~/</span><span class="n">somedirectory</span><span class="o">-</span><span class="k">with</span><span class="o">-</span><span class="n">utf8</span><span class="o">-</span><span class="n">txt</span><span class="o">-</span><span class="n">files</span><span class="p">]</span>
|
|
<span class="n">defaultcharset</span> <span class="o">=</span> <span class="n">utf</span><span class="o">-</span><span class="mi">8</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>There are three kinds of lines:</p>
|
|
<ul class="simple">
|
|
<li>Comment (starts with <em>#</em>) or empty.</li>
|
|
<li>Parameter assignment (<em>name = value</em>).</li>
|
|
<li>Section definition ([<em>somedirname</em>]).</li>
|
|
</ul>
|
|
<p>Long lines can be broken by ending each incomplete part with a backslash
|
|
(<code class="docutils literal"><span class="pre">\</span></code>).</p>
|
|
<p>Depending on the type of configuration file, section definitions either
|
|
separate groups of parameters or allow redefining some parameters for a
|
|
directory sub-tree. They stay in effect until another section
|
|
definition, or the end of file, is encountered. Some of the parameters
|
|
used for indexing are looked up hierarchically from the current
|
|
directory location upwards. Not all parameters can be meaningfully
|
|
redefined, this is specified for each in the next section.</p>
|
|
<blockquote>
|
|
<div><p><strong>Important</strong></p>
|
|
<p>Global parameters <em>must not</em> be defined in a directory subsection,
|
|
else they will not be found at all by the RCL code, which looks for
|
|
them at the top level (e.g. <code class="docutils literal"><span class="pre">skippedPaths</span></code>).</p>
|
|
</div></blockquote>
|
|
<p>When found at the beginning of a file path, the tilde character (~) is
|
|
expanded to the name of the user’s home directory, as a shell would do.</p>
|
|
<p>Some parameters are lists of strings. White space is used for
|
|
separation. List elements with embedded spaces can be quoted using
|
|
double-quotes. Double quotes inside these elements can be escaped with a
|
|
backslash.</p>
|
|
<p>No value inside a configuration file can contain a newline character.
|
|
Long lines can be continued by escaping the physical newline with
|
|
backslash, even inside quoted strings.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">astringlist</span> <span class="o">=</span> <span class="s2">"some string </span><span class="se">\</span>
|
|
<span class="s2">with spaces"</span>
|
|
<span class="n">thesame</span> <span class="o">=</span> <span class="s2">"some string with spaces"</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Parameters which are not part of string lists can’t be quoted, and
|
|
leading and trailing space characters are stripped before the value is
|
|
used.</p>
|
|
<p><strong>Encoding issues.</strong></p>
|
|
<p>Most of the configuration parameters are plain ASCII. Two particular
|
|
sets of values may cause encoding issues:</p>
|
|
<ul class="simple">
|
|
<li>File path parameters may contain non-ascii characters and should use
|
|
the exact same byte values as found in the file system directory.
|
|
Usually, this means that the configuration file should use the system
|
|
default locale encoding.</li>
|
|
<li>The unac_except_trans parameter should be encoded in UTF-8. If your
|
|
system locale is not UTF-8, and you need to also specify non-ascii
|
|
file paths, this poses a difficulty because common text editors
|
|
cannot handle multiple encodings in a single file. In this relatively
|
|
unlikely case, you can edit the configuration file as two separate
|
|
text files with appropriate encodings, and concatenate them to create
|
|
the complete configuration.</li>
|
|
</ul>
|
|
<div class="section" id="environment-variables">
|
|
<h4>Environment variables<a class="headerlink" href="#environment-variables" title="Permalink to this headline">¶</a></h4>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">RECOLL_CONFDIR</span></code></dt>
|
|
<dd>Defines the main configuration directory.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">RECOLL_TMPDIR,</span> <span class="pre">TMPDIR</span></code></dt>
|
|
<dd>Locations for temporary files, in this order of priority. The
|
|
default if none of these is set is to use <code class="docutils literal"><span class="pre">/tmp</span></code>. Big temporary
|
|
files may be created during indexing, mostly for decompressing, and
|
|
also for processing, e.g. email attachments.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">RECOLL_CONFTOP,</span> <span class="pre">RECOLL_CONFMID</span></code></dt>
|
|
<dd>Allow adding configuration directories with priorities below and
|
|
above the user directory (see the Configuration overview
|
|
section for details).</dd>
|
|
<dt><code class="docutils literal"><span class="pre">RECOLL_EXTRA_DBS,</span> <span class="pre">RECOLL_ACTIVE_EXTRA_DBS</span></code></dt>
|
|
<dd>Help for setting up external indexes. See <a class="reference external" href="#RCL.SEARCH.GUI.MULTIDB">this
|
|
paragraph</a> for explanations.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">RECOLL_DATADIR</span></code></dt>
|
|
<dd>Defines replacement for the default location of Recoll data files,
|
|
normally found in, e.g., <code class="docutils literal"><span class="pre">/usr/share/recoll</span></code>.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">RECOLL_FILTERSDIR</span></code></dt>
|
|
<dd>Defines replacement for the default location of Recoll filters,
|
|
normally found in, e.g., <code class="docutils literal"><span class="pre">/usr/share/recoll/filters</span></code>.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">ASPELL_PROG</span></code></dt>
|
|
<dd><code class="docutils literal"><span class="pre">aspell</span></code> program to use for creating the spelling dictionary. The
|
|
result has to be compatible with the <code class="docutils literal"><span class="pre">libaspell</span></code> which RCL is
|
|
using.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">VARNAME</span></code></dt>
|
|
<dd>Blabla</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="recoll-main-configuration-file-recoll-conf">
|
|
<h4>Recoll main configuration file, recoll.conf<a class="headerlink" href="#recoll-main-configuration-file-recoll-conf" title="Permalink to this headline">¶</a></h4>
|
|
<div class="section" id="parameters-affecting-what-documents-we-index">
|
|
<h5>Parameters affecting what documents we index<a class="headerlink" href="#parameters-affecting-what-documents-we-index" title="Permalink to this headline">¶</a></h5>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">topdirs</span></code></dt>
|
|
<dd>Space-separated list of files or directories to recursively index.
|
|
Default to ~ (indexes $HOME). You can use symbolic links in the
|
|
list, they will be followed, independently of the value of the
|
|
followLinks variable.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">monitordirs</span></code></dt>
|
|
<dd>Space-separated list of files or directories to monitor for updates.
|
|
When running the real-time indexer, this allows monitoring only a
|
|
subset of the whole indexed area. The elements must be included in
|
|
the tree defined by the ‘topdirs’ members.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">skippedNames</span></code></dt>
|
|
<dd>Files and directories which should be ignored. White space separated
|
|
list of wildcard patterns (simple ones, not paths, must contain no /
|
|
), which will be tested against file and directory names. The list
|
|
in the default configuration does not exclude hidden directories
|
|
(names beginning with a dot), which means that it may index quite a
|
|
few things that you do not want. On the other hand, email user
|
|
agents like Thunderbird usually store messages in hidden
|
|
directories, and you probably want this indexed. One possible
|
|
solution is to have “.*” in “skippedNames”, and add things like
|
|
“~/.thunderbird” “~/.evolution” to “topdirs”. Not even the file
|
|
names are indexed for patterns in this list, see the
|
|
“noContentSuffixes” variable for an alternative approach which
|
|
indexes the file names. Can be redefined for any subtree.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">skippedNames-</span></code></dt>
|
|
<dd>List of name endings to remove from the default skippedNames list.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">skippedNames+</span></code></dt>
|
|
<dd>List of name endings to add to the default skippedNames list.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">noContentSuffixes</span></code></dt>
|
|
<dd>List of name endings (not necessarily dot-separated suffixes) for
|
|
which we don’t try MIME type identification, and don’t uncompress or
|
|
index content. Only the names will be indexed. This complements the
|
|
now obsoleted recoll_noindex list from the mimemap file, which will
|
|
go away in a future release (the move from mimemap to recoll.conf
|
|
allows editing the list through the GUI). This is different from
|
|
skippedNames because these are name ending matches only (not
|
|
wildcard patterns), and the file name itself gets indexed normally.
|
|
This can be redefined for subdirectories.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">noContentSuffixes-</span></code></dt>
|
|
<dd>List of name endings to remove from the default noContentSuffixes
|
|
list.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">noContentSuffixes+</span></code></dt>
|
|
<dd>List of name endings to add to the default noContentSuffixes list.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">skippedPaths</span></code></dt>
|
|
<dd>Absolute paths we should not go into. Space-separated list of
|
|
wildcard expressions for absolute filesystem paths. Must be defined
|
|
at the top level of the configuration file, not in a subsection. Can
|
|
contain files and directories. The database and configuration
|
|
directories will automatically be added. The expressions are matched
|
|
using ‘fnmatch(3)’ with the FNM_PATHNAME flag set by default. This
|
|
means that ‘/’ characters must be matched explicitly. You can set
|
|
‘skippedPathsFnmPathname’ to 0 to disable the use of FNM_PATHNAME
|
|
(meaning that ‘/*/dir3’ will match ‘/dir1/dir2/dir3’). The default
|
|
value contains the usual mount point for removable media to remind
|
|
you that it is a bad idea to have Recoll work on these (esp. with
|
|
the monitor: media gets indexed on mount, all data gets erased on
|
|
unmount). Explicitly adding ‘/media/xxx’ to the ‘topdirs’ variable
|
|
will override this.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">skippedPathsFnmPathname</span></code></dt>
|
|
<dd>Set to 0 to override use of FNM_PATHNAME for matching skipped
|
|
paths.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">nowalkfn</span></code></dt>
|
|
<dd>File name which will cause its parent directory to be skipped. Any
|
|
directory containing a file with this name will be skipped as if it
|
|
was part of the skippedPaths list. Ex: .recoll-noindex</dd>
|
|
<dt><code class="docutils literal"><span class="pre">daemSkippedPaths</span></code></dt>
|
|
<dd>skippedPaths equivalent specific to real time indexing. This enables
|
|
having parts of the tree which are initially indexed but not
|
|
monitored. If daemSkippedPaths is not set, the daemon uses
|
|
skippedPaths.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">zipUseSkippedNames</span></code></dt>
|
|
<dd>Use skippedNames inside Zip archives. Fetched directly by the rclzip
|
|
handler. Skip the patterns defined by skippedNames inside Zip
|
|
archives. Can be redefined for subdirectories. See
|
|
<a class="reference external" href="https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html">https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html</a></dd>
|
|
<dt><code class="docutils literal"><span class="pre">zipSkippedNames</span></code></dt>
|
|
<dd>Space-separated list of wildcard expressions for names that should
|
|
be ignored inside zip archives. This is used directly by the zip
|
|
handler. If zipUseSkippedNames is not set, zipSkippedNames defines
|
|
the patterns to be skipped inside archives. If zipUseSkippedNames is
|
|
set, the two lists are concatenated and used. Can be redefined for
|
|
subdirectories. See
|
|
<a class="reference external" href="https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html">https://www.lesbonscomptes.com/recoll/faqsandhowtos/FilteringOutZipArchiveMembers.html</a></dd>
|
|
<dt><code class="docutils literal"><span class="pre">followLinks</span></code></dt>
|
|
<dd>Follow symbolic links during indexing. The default is to ignore
|
|
symbolic links to avoid multiple indexing of linked files. No effort
|
|
is made to avoid duplication when this option is set to true. This
|
|
option can be set individually for each of the ‘topdirs’ members by
|
|
using sections. It can not be changed below the ‘topdirs’ level.
|
|
Links in the ‘topdirs’ list itself are always followed.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">indexedmimetypes</span></code></dt>
|
|
<dd>Restrictive list of indexed mime types. Normally not set (in which
|
|
case all supported types are indexed). If it is set, only the types
|
|
from the list will have their contents indexed. The names will be
|
|
indexed anyway if indexallfilenames is set (default). MIME type
|
|
names should be taken from the mimemap file (the values may be
|
|
different from xdg-mime or file -i output in some cases). Can be
|
|
redefined for subtrees.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">excludedmimetypes</span></code></dt>
|
|
<dd>List of excluded MIME types. Lets you exclude some types from
|
|
indexing. MIME type names should be taken from the mimemap file (the
|
|
values may be different from xdg-mime or file -i output in some
|
|
cases). Can be redefined for subtrees.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">nomd5types</span></code></dt>
|
|
<dd>Don’t compute md5 for these types. md5 checksums are used only for
|
|
deduplicating results, and can be very expensive to compute on
|
|
multimedia or other big files. This list lets you turn off md5
|
|
computation for selected types. It is global (no redefinition for
|
|
subtrees). At the moment, it only has an effect for external
|
|
handlers (exec and execm). The file types can be specified by
|
|
listing either MIME types (e.g. audio/mpeg) or handler names (e.g.
|
|
rclaudio).</dd>
|
|
<dt><code class="docutils literal"><span class="pre">compressedfilemaxkbs</span></code></dt>
|
|
<dd>Size limit for compressed files. We need to decompress these in a
|
|
temporary directory for identification, which can be wasteful in
|
|
some cases. Limit the waste. Negative means no limit. 0 results in
|
|
no processing of any compressed file. Default 50 MB.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">textfilemaxmbs</span></code></dt>
|
|
<dd>Size limit for text files. Mostly for skipping monster logs. Default
|
|
20 MB.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">indexallfilenames</span></code></dt>
|
|
<dd>Index the file names of unprocessed files. Index the names of files
|
|
the contents of which we don’t index because of an excluded or
|
|
unsupported MIME type.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">usesystemfilecommand</span></code></dt>
|
|
<dd>Use a system command for file MIME type guessing as a final step in
|
|
file type identification. This is generally useful, but will usually
|
|
cause the indexing of many bogus ‘text’ files. See
|
|
‘systemfilecommand’ for the command used.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">systemfilecommand</span></code></dt>
|
|
<dd>Command used to guess MIME types if the internal methods fail. This
|
|
should be a “file -i” workalike. The file path will be added as a
|
|
last parameter to the command line. ‘xdg-mime’ works better than the
|
|
traditional ‘file’ command, and is now the configured default (with
|
|
a hard-coded fallback to ‘file’).</dd>
|
|
<dt><code class="docutils literal"><span class="pre">processwebqueue</span></code></dt>
|
|
<dd>Decide if we process the Web queue. The queue is a directory where
|
|
the Recoll Web browser plugins create the copies of visited pages.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">textfilepagekbs</span></code></dt>
|
|
<dd>Page size for text files. If this is set, text/plain files will be
|
|
divided into documents of approximately this size. Will reduce
|
|
memory usage at index time and help with loading data in the preview
|
|
window at query time. Particularly useful with very big files, such
|
|
as application or system logs. Also see textfilemaxmbs and
|
|
compressedfilemaxkbs.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">membermaxkbs</span></code></dt>
|
|
<dd>Size limit for archive members. This is passed to the filters in the
|
|
environment as RECOLL_FILTER_MAXMEMBERKB.</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="parameters-affecting-how-we-generate-terms-and-organize-the-index">
|
|
<h5>Parameters affecting how we generate terms and organize the index<a class="headerlink" href="#parameters-affecting-how-we-generate-terms-and-organize-the-index" title="Permalink to this headline">¶</a></h5>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">indexStripChars</span></code></dt>
|
|
<dd>Decide if we store character case and diacritics in the index. If we
|
|
do, searches sensitive to case and diacritics can be performed, but
|
|
the index will be bigger, and some marginal weirdness may sometimes
|
|
occur. The default is a stripped index. When using multiple indexes
|
|
for a search, this parameter must be defined identically for all.
|
|
Changing the value implies an index reset.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">indexStoreDocText</span></code></dt>
|
|
<dd>Decide if we store the documents’ text content in the index. Storing
|
|
the text allows extracting snippets from it at query time, instead
|
|
of building them from index position data. Newer Xapian index
|
|
formats have rendered our use of positions list unacceptably slow in
|
|
some cases. The last Xapian index format with good performance for
|
|
the old method is Chert, which is default for 1.2, still supported
|
|
but not default in 1.4 and will be dropped in 1.6. The stored
|
|
document text is translated from its original format to UTF-8 plain
|
|
text, but not stripped of upper-case, diacritics, or punctuation
|
|
signs. Storing it increases the index size by 10-20% typically, but
|
|
also allows for nicer snippets, so it may be worth enabling it even
|
|
if not strictly needed for performance if you can afford the space.
|
|
The variable only has an effect when creating an index, meaning that
|
|
the xapiandb directory must not exist yet. Its exact effect depends
|
|
on the Xapian version. For Xapian 1.4, if the variable is set to 0,
|
|
the Chert format will be used, and the text will not be stored. If
|
|
the variable is 1, Glass will be used, and the text stored. For
|
|
Xapian 1.2, and for versions 1.5 and newer, the index format
|
|
is always the default, but the variable controls if the text is
|
|
stored or not, and the abstract generation method. With Xapian 1.5
|
|
and later, and the variable set to 0, abstract generation may be
|
|
very slow, but this setting may still be useful to save space if you
|
|
do not use abstract generation at all.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">nonumbers</span></code></dt>
|
|
<dd>Decides if terms will be generated for numbers. For example “123”,
|
|
“1.5e6”, 192.168.1.4, would not be indexed if nonumbers is set
|
|
(“value123” would still be). Numbers are often quite interesting to
|
|
search for, and this should probably not be set except for special
|
|
situations, ie, scientific documents with huge amounts of numbers in
|
|
them, where setting nonumbers will reduce the index size. This can
|
|
only be set for a whole index, not for a subtree.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">dehyphenate</span></code></dt>
|
|
<dd>Determines if we index ‘coworker’ also when the input is
|
|
‘co-worker’. This is new in version 1.22, and on by default. Setting
|
|
the variable to off allows restoring the previous behaviour.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">backslashasletter</span></code></dt>
|
|
<dd>Process backslash as a normal letter. This may make sense for people
|
|
wanting to index TeX commands as such but is not of much general
|
|
use.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">maxtermlength</span></code></dt>
|
|
<dd>Maximum term length. Words longer than this will be discarded. The
|
|
default is 40 and used to be hard-coded, but it can now be adjusted.
|
|
You need an index reset if you change the value.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">nocjk</span></code></dt>
|
|
<dd>Decides if specific East Asian (Chinese Korean Japanese)
|
|
characters/word splitting is turned off. This will save a small
|
|
amount of CPU if you have no CJK documents. If your document base
|
|
does include such text but you are not interested in searching it,
|
|
setting nocjk may be a significant time and space saver.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">cjkngramlen</span></code></dt>
|
|
<dd>This lets you adjust the size of n-grams used for indexing CJK text.
|
|
The default value of 2 is probably appropriate in most cases. A
|
|
value of 3 would allow more precision and efficiency on longer
|
|
words, but the index will be approximately twice as large.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">indexstemminglanguages</span></code></dt>
|
|
<dd>Languages for which to create stemming expansion data. Stemmer names
|
|
can be found by executing ‘recollindex -l’, or this can also be set
|
|
from a list in the GUI.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">defaultcharset</span></code></dt>
|
|
<dd>Default character set. This is used for files which do not contain a
|
|
character set definition (e.g.: text/plain). Values found inside
|
|
files, e.g. a ‘charset’ tag in HTML documents, will override it. If
|
|
this is not set, the default character set is the one defined by the
|
|
NLS environment ($LC_ALL, $LC_CTYPE, $LANG), or ultimately
|
|
iso-8859-1 (cp-1252 in fact). If for some reason you want a general
|
|
default which does not match your LANG and is not 8859-1, use this
|
|
variable. This can be redefined for any sub-directory.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">unac_except_trans</span></code></dt>
|
|
<dd>A list of characters, encoded in UTF-8, which should be handled
|
|
specially when converting text to unaccented lowercase. For example,
|
|
in Swedish, the letter a with diaeresis has full alphabet
|
|
citizenship and should not be turned into an a. Each element in the
|
|
space-separated list has the special character as first element and
|
|
the translation following. The handling of both the lowercase and
|
|
upper-case versions of a character should be specified, as
|
|
membership in the list will turn off both standard accent and case
|
|
processing. The value is global and affects both indexing and
|
|
querying. Examples: Swedish: unac_except_trans = ää Ää öö Öö üü Üü
|
|
ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl åå Åå . German: unac_except_trans
|
|
= ää Ää öö Öö üü Üü ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl . In French, you
|
|
probably want to decompose oe and ae and nobody would type a German
|
|
ß: unac_except_trans = ßss œoe Œoe æae Æae ﬀff ﬁfi ﬂfl . The
|
|
default for all until someone protests follows. These decompositions
|
|
are not performed by unac, but it is unlikely that someone would
|
|
type the composed forms in a search. unac_except_trans = ßss œoe
|
|
Œoe æae Æae ﬀff ﬁfi ﬂfl</dd>
|
|
<dt><code class="docutils literal"><span class="pre">maildefcharset</span></code></dt>
|
|
<dd>Overrides the default character set for email messages which don’t
|
|
specify one. This is mainly useful for readpst (libpst) dumps, which
|
|
are utf-8 but do not say so.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">localfields</span></code></dt>
|
|
<dd>Set fields on all files (usually of a specific fs area). Syntax is
|
|
the usual: name = value ; attr1 = val1 ; [...]. value is empty so
|
|
this needs an initial semi-colon. This is useful, e.g., for setting
|
|
the rclaptg field for application selection inside mimeview.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">testmodifusemtime</span></code></dt>
|
|
<dd>Use mtime instead of ctime to test if a file has been modified. The
|
|
time is used in addition to the size, which is always used. Setting
|
|
this can reduce re-indexing on systems where extended attributes are
|
|
used (by some other application), but not indexed, because changing
|
|
extended attributes only affects ctime. Notes: - This may prevent
|
|
detection of change in some marginal file rename cases (the target
|
|
would need to have the same size and mtime). - You should probably
|
|
also set noxattrfields to 1 in this case, except if you still prefer
|
|
to perform xattr indexing, for example if the local file update
|
|
pattern makes it of value (as in general, there is a risk for pure
|
|
extended attributes updates without file modification to go
|
|
undetected). Perform a full index reset after changing this.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">noxattrfields</span></code></dt>
|
|
<dd>Disable extended attributes conversion to metadata fields. This
|
|
probably needs to be set if testmodifusemtime is set.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">metadatacmds</span></code></dt>
|
|
<dd>Define commands to gather external metadata, e.g. tmsu tags. There
|
|
can be several entries, separated by semi-colons, each defining
|
|
which field name the data goes into and the command to use. Don’t
|
|
forget the initial semi-colon. All the field names must be
|
|
different. You can use aliases in the “fields” file if necessary. As
|
|
a not too pretty hack conceded to convenience, any field name
|
|
beginning with “rclmulti” will be taken as an indication that the
|
|
command returns multiple field values inside a text blob formatted
|
|
as a recoll configuration file (“fieldname = fieldvalue” lines). The
|
|
rclmultixx name will be ignored, and field names and values will be
|
|
parsed from the data. Example: metadatacmds = ; tags = tmsu tags %f;
|
|
rclmulti1 = cmdOutputsConf %f</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="parameters-affecting-where-and-how-we-store-things">
|
|
<h5>Parameters affecting where and how we store things<a class="headerlink" href="#parameters-affecting-where-and-how-we-store-things" title="Permalink to this headline">¶</a></h5>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">cachedir</span></code></dt>
|
|
<dd>Top directory for Recoll data. Recoll data directories are normally
|
|
located relative to the configuration directory (e.g.
|
|
~/.recoll/xapiandb, ~/.recoll/mboxcache). If ‘cachedir’ is set, the
|
|
directories are stored under the specified value instead (e.g. if
|
|
cachedir is ~/.cache/recoll, the default dbdir would be
|
|
~/.cache/recoll/xapiandb). This affects dbdir, webcachedir,
|
|
mboxcachedir, aspellDicDir, which can still be individually
|
|
specified to override cachedir. Note that if you have multiple
|
|
configurations, each must have a different cachedir, there is no
|
|
automatic computation of a subpath under cachedir.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">maxfsoccuppc</span></code></dt>
|
|
<dd>Maximum file system occupation over which we stop indexing. The
|
|
value is a percentage, corresponding to what the “Capacity” df
|
|
output column shows. The default value is 0, meaning no checking.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">dbdir</span></code></dt>
|
|
<dd>Xapian database directory location. This will be created on first
|
|
indexing. If the value is not an absolute path, it will be
|
|
interpreted as relative to cachedir if set, or the configuration
|
|
directory (-c argument or $RECOLL_CONFDIR). If nothing is
|
|
specified, the default is then ~/.recoll/xapiandb/</dd>
|
|
<dt><code class="docutils literal"><span class="pre">idxstatusfile</span></code></dt>
|
|
<dd>Name of the scratch file where the indexer process updates its
|
|
status. Default: idxstatus.txt inside the configuration directory.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">mboxcachedir</span></code></dt>
|
|
<dd>Directory location for storing mbox message offsets cache files.
|
|
This is normally ‘mboxcache’ under cachedir if set, or else under
|
|
the configuration directory, but it may be useful to share a
|
|
directory between different configurations.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">mboxcacheminmbs</span></code></dt>
|
|
<dd>Minimum mbox file size over which we cache the offsets. There is
|
|
really no sense in caching offsets for small files. The default is 5
|
|
MB.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">webcachedir</span></code></dt>
|
|
<dd>Directory where we store the archived web pages. This is only used
|
|
by the web history indexing code. Default: cachedir/webcache if
|
|
cachedir is set, else $RECOLL_CONFDIR/webcache</dd>
|
|
<dt><code class="docutils literal"><span class="pre">webcachemaxmbs</span></code></dt>
|
|
<dd>Maximum size in MB of the Web archive. This is only used by the web
|
|
history indexing code. Default: 40 MB. Reducing the size will not
|
|
physically truncate the file.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">webqueuedir</span></code></dt>
|
|
<dd>The path to the Web indexing queue. This used to be hard-coded in
|
|
the old plugin as ~/.recollweb/ToIndex so there would be no need or
|
|
possibility to change it, but the WebExtensions plugin now downloads
|
|
the files to the user Downloads directory, and a script moves them
|
|
to webqueuedir. The script reads this value from the config so it
|
|
has become possible to change it.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">webdownloadsdir</span></code></dt>
|
|
<dd>The path to browser downloads directory. This is where the new
|
|
browser add-on extension has to create the files. They are then
|
|
moved by a script to webqueuedir.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">aspellDicDir</span></code></dt>
|
|
<dd>Aspell dictionary storage directory location. The aspell dictionary
|
|
(aspdict.(lang).rws) is normally stored in the directory specified
|
|
by cachedir if set, or under the configuration directory.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">filtersdir</span></code></dt>
|
|
<dd>Directory location for executable input handlers. If
|
|
RECOLL_FILTERSDIR is set in the environment, we use it instead.
|
|
Defaults to $prefix/share/recoll/filters. Can be redefined for
|
|
subdirectories.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">iconsdir</span></code></dt>
|
|
<dd>Directory location for icons. The only reason to change this would
|
|
be if you want to change the icons displayed in the result list.
|
|
Defaults to $prefix/share/recoll/images</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="parameters-affecting-indexing-performance-and-resource-usage">
|
|
<h5>Parameters affecting indexing performance and resource usage<a class="headerlink" href="#parameters-affecting-indexing-performance-and-resource-usage" title="Permalink to this headline">¶</a></h5>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">idxflushmb</span></code></dt>
|
|
<dd>Threshold (megabytes of new data) where we flush from memory to disk
|
|
index. Setting this allows some control over memory usage by the
|
|
indexer process. A value of 0 means no explicit flushing, which lets
|
|
Xapian perform its own thing, meaning flushing every
|
|
$XAPIAN_FLUSH_THRESHOLD documents created, modified or deleted: as
|
|
memory usage depends on average document size, not only document
|
|
count, the Xapian approach is not very useful, and you should let
|
|
Recoll manage the flushes. The program compiled value is 0. The
|
|
configured default value (from this file) is now 50 MB, and should
|
|
be ok in many cases. You can set it as low as 10 to conserve memory,
|
|
but if you are looking for maximum speed, you may want to experiment
|
|
with values between 20 and 200. In my experience, values beyond this
|
|
are always counterproductive. If you find otherwise, please drop me
|
|
a note.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">filtermaxseconds</span></code></dt>
|
|
<dd>Maximum external filter execution time in seconds. Default 1200
|
|
(20mn). Set to 0 for no limit. This is mainly to avoid infinite
|
|
loops in postscript files (loop.ps)</dd>
|
|
<dt><code class="docutils literal"><span class="pre">filtermaxmbytes</span></code></dt>
|
|
<dd>Maximum virtual memory space for filter processes
|
|
(setrlimit(RLIMIT_AS)), in megabytes. Note that this includes any
|
|
mapped libs (there is no reliable Linux way to limit the data space
|
|
only), so we need to be a bit generous here. Anything over 2000 will
|
|
be ignored on 32-bit machines.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">thrQSizes</span></code></dt>
|
|
<dd>Stage input queues configuration. There are three internal queues in
|
|
the indexing pipeline stages (file data extraction, terms
|
|
generation, index update). This parameter defines the queue depths
|
|
for each stage (three integer values). If a value of -1 is given for
|
|
a given stage, no queue is used, and the thread will go on
|
|
performing the next stage. In practice, deep queues have not been
|
|
shown to increase performance. Default: a value of 0 for the first
|
|
queue tells Recoll to perform autoconfiguration based on the
|
|
detected number of CPUs (no need for the two other values in this
|
|
case). Use thrQSizes = -1 -1 -1 to disable multithreading entirely.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">thrTCounts</span></code></dt>
|
|
<dd>Number of threads used for each indexing stage. The three stages
|
|
are: file data extraction, terms generation, index update. The use
|
|
of the counts is also controlled by some special values in
|
|
thrQSizes: if the first queue depth is 0, all counts are ignored
|
|
(autoconfigured); if a value of -1 is used for a queue depth, the
|
|
corresponding thread count is ignored. It makes no sense to use a
|
|
value other than 1 for the last stage because updating the Xapian
|
|
index is necessarily single-threaded (and protected by a mutex).</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="miscellaneous-parameters">
|
|
<h5>Miscellaneous parameters<a class="headerlink" href="#miscellaneous-parameters" title="Permalink to this headline">¶</a></h5>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">loglevel</span></code></dt>
|
|
<dd>Log file verbosity 1-6. A value of 2 will print only errors and
|
|
warnings. 3 will print information like document updates, 4 is quite
|
|
verbose and 6 very verbose.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">logfilename</span></code></dt>
|
|
<dd>Log file destination. Use ‘stderr’ (default) to write to the
|
|
console.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">idxloglevel</span></code></dt>
|
|
<dd>Override loglevel for the indexer.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">idxlogfilename</span></code></dt>
|
|
<dd>Override logfilename for the indexer.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">daemloglevel</span></code></dt>
|
|
<dd>Override loglevel for the indexer in real time mode. The default is
|
|
to use the idx... values if set, else the log... values.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">daemlogfilename</span></code></dt>
|
|
<dd>Override logfilename for the indexer in real time mode. The default
|
|
is to use the idx... values if set, else the log... values.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">orgidxconfdir</span></code></dt>
|
|
<dd>Original location of the configuration directory. This is used
|
|
exclusively for movable datasets. Locating the configuration
|
|
directory inside the directory tree makes it possible to provide
|
|
automatic query time path translations once the data set has moved
|
|
(for example, because it has been mounted on another location).</dd>
|
|
<dt><code class="docutils literal"><span class="pre">curidxconfdir</span></code></dt>
|
|
<dd>Current location of the configuration directory. Complement
|
|
orgidxconfdir for movable datasets. This should be used if the
|
|
configuration directory has been copied from the dataset to another
|
|
location, either because the dataset is readonly and an r/w copy is
|
|
desired, or for performance reasons. This records the original moved
|
|
location before copy, to allow path translation computations. For
|
|
example if a dataset originally indexed as ‘/home/me/mydata/config’
|
|
has been mounted to ‘/media/me/mydata’, and the GUI is running from
|
|
a copied configuration, orgidxconfdir would be
|
|
‘/home/me/mydata/config’, and curidxconfdir (as set in the copied
|
|
configuration) would be ‘/media/me/mydata/config’.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">idxrundir</span></code></dt>
|
|
<dd>Indexing process current directory. The input handlers sometimes
|
|
leave temporary files in the current directory, so it makes sense to
|
|
have recollindex chdir to some temporary directory. If the value is
|
|
empty, the current directory is not changed. If the value is
|
|
(literal) tmp, we use the temporary directory as set by the
|
|
environment (RECOLL_TMPDIR else TMPDIR else /tmp). If the value is
|
|
an absolute path to a directory, we go there.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">checkneedretryindexscript</span></code></dt>
|
|
<dd>Script used to heuristically check if we need to retry indexing
|
|
files which previously failed. The default script checks the
|
|
modified dates on /usr/bin and /usr/local/bin. A relative path will
|
|
be looked up in the filters dirs, then in the path. Use an absolute
|
|
path to do otherwise.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">recollhelperpath</span></code></dt>
|
|
<dd>Additional places to search for helper executables. This is only
|
|
used on Windows for now.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">idxabsmlen</span></code></dt>
|
|
<dd>Length of abstracts we store while indexing. Recoll stores an
|
|
abstract for each indexed file. The text can come from an actual
|
|
‘abstract’ section in the document or will just be the beginning of
|
|
the document. It is stored in the index so that it can be displayed
|
|
inside the result lists without decoding the original file. The
|
|
idxabsmlen parameter defines the size of the stored abstract. The
|
|
default value is 250 bytes. The search interface gives you the
|
|
choice to display this stored text or a synthetic abstract built by
|
|
extracting text around the search terms. If you always prefer the
|
|
synthetic abstract, you can reduce this value and save a little
|
|
space.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">idxmetastoredlen</span></code></dt>
|
|
<dd>Truncation length of stored metadata fields. This does not affect
|
|
indexing (the whole field is processed anyway), just the amount of
|
|
data stored in the index for the purpose of displaying fields inside
|
|
result lists or previews. The default value is 150 bytes which may
|
|
be too low if you have custom fields.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">idxtexttruncatelen</span></code></dt>
|
|
<dd>Truncation length for all document texts. Only index the beginning
|
|
of documents. This is not recommended except if you are sure that
|
|
the interesting keywords are at the top and have severe disk space
|
|
issues.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">aspellLanguage</span></code></dt>
|
|
<dd>Language definitions to use when creating the aspell dictionary. The
|
|
value must match a set of aspell language definition files. You can
|
|
type “aspell dicts” to see a list. The default if this is not set is
|
|
to use the NLS environment to guess the value.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">aspellAddCreateParam</span></code></dt>
|
|
<dd>Additional option and parameter to aspell dictionary creation
|
|
command. Some aspell packages may need an additional option (e.g. on
|
|
Debian Jessie: --local-data-dir=/usr/lib/aspell). See Debian bug
|
|
772415.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">aspellKeepStderr</span></code></dt>
|
|
<dd>Set this to have a look at aspell dictionary creation errors. There
|
|
are always many, so this is mostly for debugging.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">noaspell</span></code></dt>
|
|
<dd>Disable aspell use. The aspell dictionary generation takes time, and
|
|
some combinations of aspell version, language, and local terms,
|
|
result in aspell crashing, so it sometimes makes sense to just
|
|
disable the thing.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">monauxinterval</span></code></dt>
|
|
<dd>Auxiliary database update interval. The real time indexer only
|
|
updates the auxiliary databases (stemdb, aspell) periodically,
|
|
because it would be too costly to do it for every document change.
|
|
The default period is one hour.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">monixinterval</span></code></dt>
|
|
<dd>Minimum interval (seconds) between processings of the indexing
|
|
queue. The real time indexer does not process each event when it
|
|
comes in, but lets the queue accumulate, to diminish overhead and to
|
|
aggregate multiple events affecting the same file. Default 30 S.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">mondelaypatterns</span></code></dt>
|
|
<dd>Timing parameters for the real time indexing. Definitions for files
|
|
which get a longer delay before reindexing is allowed. This is for
|
|
fast-changing files, that should only be reindexed once in a while.
|
|
A list of wildcardPattern:seconds pairs. The patterns are matched
|
|
with fnmatch(pattern, path, 0). You can quote entries containing
|
|
white space with double quotes (quote the whole entry, not the
|
|
pattern). The default is empty. Example: mondelaypatterns =
|
|
*.log:20 "*with spaces.*:30"</dd>
|
|
<dt><code class="docutils literal"><span class="pre">monioniceclass</span></code></dt>
|
|
<dd>ionice class for the real time indexing process. On platforms where
|
|
this is supported. The default value is 3.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">monioniceclassdata</span></code></dt>
|
|
<dd>ionice class parameter for the real time indexing process. On
|
|
platforms where this is supported. The default is empty.</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="query-time-parameters-no-impact-on-the-index">
|
|
<h5>Query-time parameters (no impact on the index)<a class="headerlink" href="#query-time-parameters-no-impact-on-the-index" title="Permalink to this headline">¶</a></h5>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">autodiacsens</span></code></dt>
|
|
<dd>auto-trigger diacritics sensitivity (raw index only). If the index
|
|
is not stripped, decide if we automatically trigger diacritics
|
|
sensitivity if the search term has accented characters (not in
|
|
unac_except_trans). Else you need to use the query language and
|
|
the “D” modifier to specify diacritics sensitivity. Default is no.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">autocasesens</span></code></dt>
|
|
<dd>auto-trigger case sensitivity (raw index only). If the index is not
|
|
stripped (see indexStripChars), decide if we automatically trigger
|
|
character case sensitivity if the search term has upper-case
|
|
characters in any but the first position. Else you need to use the
|
|
query language and the “C” modifier to specify character-case
|
|
sensitivity. Default is yes.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">maxTermExpand</span></code></dt>
|
|
<dd>Maximum query expansion count for a single term (e.g.: when using
|
|
wildcards). This only affects queries, not indexing. We used to not
|
|
limit this at all (except for filenames where the limit was too low
|
|
at 1000), but it is unreasonable with a big index. Default 10000.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">maxXapianClauses</span></code></dt>
|
|
<dd>Maximum number of clauses we add to a single Xapian query. This only
|
|
affects queries, not indexing. In some cases, the result of term
|
|
expansion can be multiplicative, and we want to avoid eating all the
|
|
memory. Default 50000.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">snippetMaxPosWalk</span></code></dt>
|
|
<dd>Maximum number of positions we walk while populating a snippet for
|
|
the result list. The default of 1,000,000 may be insufficient for
|
|
very big documents; the consequence would be snippets with possibly
|
|
meaning-altering missing words.</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="parameters-for-the-pdf-input-script">
|
|
<h5>Parameters for the PDF input script<a class="headerlink" href="#parameters-for-the-pdf-input-script" title="Permalink to this headline">¶</a></h5>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">pdfocr</span></code></dt>
|
|
<dd>Attempt OCR of PDF files with no text content if both tesseract and
|
|
pdftoppm are installed. The default is off because OCR is so very
|
|
slow.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">pdfocrlang</span></code></dt>
|
|
<dd>Language to assume for PDF OCR. This is very important for having a
|
|
reasonable rate of errors with tesseract. This can also be set
|
|
through a configuration variable or directory-local parameters. See
|
|
the rclpdf.py script.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">pdfattach</span></code></dt>
|
|
<dd>Enable PDF attachment extraction by executing pdftk (if available).
|
|
This is normally disabled, because it does slow down PDF indexing a
|
|
bit even if not one attachment is ever found.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">pdfextrameta</span></code></dt>
|
|
<dd>Extract text from selected XMP metadata tags. This is a
|
|
space-separated list of qualified XMP tag names. Each element can
|
|
also include a translation to a Recoll field name, separated by a
|
|
‘|’ character. If the second element is absent, the tag name is
|
|
used as the Recoll field name. You will also need to add
|
|
specifications to the ‘fields’ file to direct processing of the
|
|
extracted data.</dd>
|
|
<dt><code class="docutils literal"><span class="pre">pdfextrametafix</span></code></dt>
|
|
<dd>Define name of XMP field editing script. This defines the name of a
|
|
script to be loaded for editing XMP field values. The script should
|
|
define a ‘MetaFixer’ class with a metafix() method which will be
|
|
called with the qualified tag name and value of each selected field,
|
|
for editing or erasing. A new instance is created for each document,
|
|
so that the object can keep state for, e.g. eliminating duplicate
|
|
values.</dd>
|
|
</dl>
|
|
</div>
|
|
<div class="section" id="parameters-set-for-specific-locations">
|
|
<h5>Parameters set for specific locations<a class="headerlink" href="#parameters-set-for-specific-locations" title="Permalink to this headline">¶</a></h5>
|
|
<dl class="docutils">
|
|
<dt><code class="docutils literal"><span class="pre">mhmboxquirks</span></code></dt>
|
|
<dd>Enable thunderbird/mozilla-seamonkey mbox format quirks. Set this for
|
|
the directory where the email mbox files are stored.</dd>
|
|
</dl>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="the-fields-file">
|
|
<h4>The fields file<a class="headerlink" href="#the-fields-file" title="Permalink to this headline">¶</a></h4>
|
|
<p>This file contains information about dynamic fields handling in Recoll.
|
|
Some very basic fields have hard-wired behaviour, and, mostly, you
|
|
should not change the original data inside the <code class="docutils literal"><span class="pre">fields</span></code> file. But you
|
|
can create custom fields fitting your data and handle them just like
|
|
they were native ones.</p>
|
|
<p>The <code class="docutils literal"><span class="pre">fields</span></code> file has several sections, which each define an aspect of
|
|
fields processing. Quite often, you’ll have to modify several sections
|
|
to obtain the desired behaviour.</p>
|
|
<p>We will only give a short description here, you should refer to the
|
|
comments inside the default file for more detailed information.</p>
|
|
<p>Field names should be lowercase alphabetic ASCII.</p>
|
|
<dl class="docutils">
|
|
<dt>[prefixes]</dt>
|
|
<dd>A field becomes indexed (searchable) by having a prefix defined in
|
|
this section. There is a more complete explanation of what prefixes
|
|
are in use by a standard recoll installation. In a nutshell:
|
|
extension prefixes should be all caps, begin with XY, and short.
|
|
E.g. XYMFLD.</dd>
|
|
<dt>[values]</dt>
|
|
<dd>Fields listed in this section will be stored as Xapian <code class="docutils literal"><span class="pre">values</span></code>
|
|
inside the index. This makes them available for range queries,
|
|
allowing to filter results according to the field value. This
|
|
feature currently supports string and integer data. See the comments
|
|
in the file for more detail.</dd>
|
|
<dt>[stored]</dt>
|
|
<dd>A field becomes stored (displayable inside results) by having its
|
|
name listed in this section (typically with an empty value).</dd>
|
|
<dt>[aliases]</dt>
|
|
<dd>This section defines lists of synonyms for the canonical names used
|
|
inside the <code class="docutils literal"><span class="pre">[prefixes]</span></code> and <code class="docutils literal"><span class="pre">[stored]</span></code> sections.</dd>
|
|
<dt>[queryaliases]</dt>
|
|
<dd>This section also defines aliases for the canonical field names, with
|
|
the difference that the substitution will only be used at query
|
|
time, avoiding any possibility that the value would pick up random
|
|
metadata from documents.</dd>
|
|
<dt>handler-specific sections</dt>
|
|
<dd>Some input handlers may need specific configuration for handling
|
|
fields. Only the email message handler currently has such a section
|
|
(named <code class="docutils literal"><span class="pre">[mail]</span></code>). It allows indexing arbitrary email headers in
|
|
addition to the ones indexed by default. Other such sections may
|
|
appear in the future.</dd>
|
|
</dl>
|
|
<p>Here follows a small example of a personal <code class="docutils literal"><span class="pre">fields</span></code> file. This would
|
|
extract a specific email header and use it as a searchable field, with
|
|
data displayable inside result lists. (Side note: as the email handler
|
|
does no decoding on the values, only plain ascii headers can be indexed,
|
|
and only the first occurrence will be used for headers that occur
|
|
several times).</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">prefixes</span><span class="p">]</span>
|
|
<span class="c1"># Index mailmytag contents (with the given prefix)</span>
|
|
<span class="n">mailmytag</span> <span class="o">=</span> <span class="n">XMTAG</span>
|
|
|
|
<span class="p">[</span><span class="n">stored</span><span class="p">]</span>
|
|
<span class="c1"># Store mailmytag inside the document data record (so that it can be</span>
|
|
<span class="c1"># displayed - as %(mailmytag) - in result lists).</span>
|
|
<span class="n">mailmytag</span> <span class="o">=</span>
|
|
|
|
<span class="p">[</span><span class="n">queryaliases</span><span class="p">]</span>
|
|
<span class="n">filename</span> <span class="o">=</span> <span class="n">fn</span>
|
|
<span class="n">containerfilename</span> <span class="o">=</span> <span class="n">cfn</span>
|
|
|
|
<span class="p">[</span><span class="n">mail</span><span class="p">]</span>
|
|
<span class="c1"># Extract the X-My-Tag mail header, and use it internally with the</span>
|
|
<span class="c1"># mailmytag field name</span>
|
|
<span class="n">x</span><span class="o">-</span><span class="n">my</span><span class="o">-</span><span class="n">tag</span> <span class="o">=</span> <span class="n">mailmytag</span>
|
|
</pre></div>
|
|
</div>
|
|
<div class="section" id="extended-attributes-in-the-fields-file">
|
|
<h5>Extended attributes in the fields file<a class="headerlink" href="#extended-attributes-in-the-fields-file" title="Permalink to this headline">¶</a></h5>
|
|
<p>Recoll versions 1.19 and later process user extended file attributes as
|
|
document fields by default.</p>
|
|
<p>Attributes are processed as fields of the same name, after removing the
|
|
<code class="docutils literal"><span class="pre">user</span></code> prefix on Linux.</p>
|
|
<p>The <code class="docutils literal"><span class="pre">[xattrtofields]</span></code> section of the <code class="docutils literal"><span class="pre">fields</span></code> file allows specifying
|
|
translations from extended attribute names to Recoll field names. An empty
|
|
translation disables use of the corresponding attribute data.</p>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="the-mimemap-file">
|
|
<h4>The mimemap file<a class="headerlink" href="#the-mimemap-file" title="Permalink to this headline">¶</a></h4>
|
|
<p><code class="docutils literal"><span class="pre">mimemap</span></code> specifies the file name extension to MIME type mappings.</p>
|
|
<p>For file names without an extension, or with an unknown one, a system
|
|
command (<code class="docutils literal"><span class="pre">file</span></code> <code class="docutils literal"><span class="pre">-i</span></code>, or <code class="docutils literal"><span class="pre">xdg-mime</span></code>) will be executed to determine
|
|
the MIME type (this can be switched off, or the command changed inside
|
|
the main configuration file).</p>
|
|
<p>All extension values in <code class="docutils literal"><span class="pre">mimemap</span></code> must be entered in lower case. File
|
|
name extensions are lower-cased for comparison during indexing, meaning
|
|
that an upper case <code class="docutils literal"><span class="pre">mimemap</span></code> entry will never be matched.</p>
|
|
<p>The mappings can be specified on a per-subtree basis, which may be
|
|
useful in some cases. For instance, Okular notes have a <code class="docutils literal"><span class="pre">.xml</span></code> extension
|
|
but should be handled specially, which is possible because they are
|
|
usually all located in one place. Example:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="o">~/.</span><span class="n">kde</span><span class="o">/</span><span class="n">share</span><span class="o">/</span><span class="n">apps</span><span class="o">/</span><span class="n">okular</span><span class="o">/</span><span class="n">docdata</span><span class="p">]</span>
|
|
<span class="o">.</span><span class="n">xml</span> <span class="o">=</span> <span class="n">application</span><span class="o">/</span><span class="n">x</span><span class="o">-</span><span class="n">okular</span><span class="o">-</span><span class="n">notes</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>The <code class="docutils literal"><span class="pre">recoll_noindex</span></code> <code class="docutils literal"><span class="pre">mimemap</span></code> variable has been moved to
|
|
<code class="docutils literal"><span class="pre">recoll.conf</span></code> and renamed to <code class="docutils literal"><span class="pre">noContentSuffixes</span></code>, while keeping the
|
|
same function, as of RCL version 1.21. For older RCL versions, see the
|
|
documentation for <code class="docutils literal"><span class="pre">noContentSuffixes</span></code> but use <code class="docutils literal"><span class="pre">recoll_noindex</span></code> in
|
|
<code class="docutils literal"><span class="pre">mimemap</span></code>.</p>
|
|
</div>
|
|
<div class="section" id="the-mimeconf-file">
|
|
<h4>The mimeconf file<a class="headerlink" href="#the-mimeconf-file" title="Permalink to this headline">¶</a></h4>
|
|
<p>The main purpose of the <code class="docutils literal"><span class="pre">mimeconf</span></code> file is to specify how the
|
|
different MIME types are handled for indexing. This is done in the
|
|
<code class="docutils literal"><span class="pre">[index]</span></code> section, which should not be modified casually. See the
|
|
comments in the file.</p>
|
|
<p>The file also contains other definitions which affect the query language
|
|
and the GUI, and which, in retrospect, should have been stored
|
|
elsewhere.</p>
|
|
<p>The <code class="docutils literal"><span class="pre">[icons]</span></code> section allows you to change the icons which are
|
|
displayed by the <code class="docutils literal"><span class="pre">recoll</span></code> GUI in the result lists (the values are the
|
|
basenames of the <code class="docutils literal"><span class="pre">png</span></code> images inside the <code class="docutils literal"><span class="pre">iconsdir</span></code> directory (which
|
|
is itself defined in <code class="docutils literal"><span class="pre">recoll.conf</span></code>)).</p>
|
|
<p>The <code class="docutils literal"><span class="pre">[categories]</span></code> section defines the groupings of MIME types into
|
|
<code class="docutils literal"><span class="pre">categories</span></code> as used when adding an <code class="docutils literal"><span class="pre">rclcat</span></code> clause to a <a class="reference external" href="#RCL.SEARCH.LANG">query
|
|
language</a> query. <code class="docutils literal"><span class="pre">rclcat</span></code> clauses are also used
|
|
by the default <code class="docutils literal"><span class="pre">guifilters</span></code> buttons in the GUI (see next).</p>
|
|
<p>The filter controls appear at the top of the <code class="docutils literal"><span class="pre">recoll</span></code> GUI, either as
|
|
checkboxes just above the result list, or as a drop-down list in the tool area.</p>
|
|
<p>By default, they are labeled: <code class="docutils literal"><span class="pre">media</span></code>, <code class="docutils literal"><span class="pre">message</span></code>, <code class="docutils literal"><span class="pre">other</span></code>,
|
|
<code class="docutils literal"><span class="pre">presentation</span></code>, <code class="docutils literal"><span class="pre">spreadsheet</span></code> and <code class="docutils literal"><span class="pre">text</span></code>, and each maps to a
|
|
document category. This is determined in the <code class="docutils literal"><span class="pre">[guifilters]</span></code> section,
|
|
where each control is defined by a variable naming a query language
|
|
fragment.</p>
|
|
<p>A simple example will hopefully make things clearer.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">guifilters</span><span class="p">]</span>
|
|
|
|
<span class="n">Big</span> <span class="n">Books</span> <span class="o">=</span> <span class="nb">dir</span><span class="p">:</span><span class="s2">"~/My Books"</span> <span class="n">size</span><span class="o">&gt;</span><span class="mi">10</span><span class="n">K</span>
|
|
<span class="n">My</span> <span class="n">Docs</span> <span class="o">=</span> <span class="nb">dir</span><span class="p">:</span><span class="s2">"~/My Documents"</span>
|
|
<span class="n">Small</span> <span class="n">Books</span> <span class="o">=</span> <span class="nb">dir</span><span class="p">:</span><span class="s2">"~/My Books"</span> <span class="n">size</span><span class="o">&lt;</span><span class="mi">10</span><span class="n">K</span>
|
|
<span class="n">System</span> <span class="n">Docs</span> <span class="o">=</span> <span class="nb">dir</span><span class="p">:</span><span class="o">/</span><span class="n">usr</span><span class="o">/</span><span class="n">share</span><span class="o">/</span><span class="n">doc</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>The above definition would create four filter checkboxes, labelled
|
|
<code class="docutils literal"><span class="pre">Big</span> <span class="pre">Books</span></code>, <code class="docutils literal"><span class="pre">My</span> <span class="pre">Docs</span></code>, etc.</p>
|
|
<p>The text after the equal sign must be a valid query language fragment,
|
|
and, when the button is checked, it will be combined with the rest of
|
|
the query with an AND conjunction.</p>
|
|
<p>Any name text before a colon character will be erased in the display,
|
|
but used for sorting. You can use this to display the checkboxes in any
|
|
order you like. For example, the following would do exactly the same as
|
|
above, but ordering the checkboxes in the reverse order.</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">guifilters</span><span class="p">]</span>
|
|
|
|
<span class="n">d</span><span class="p">:</span><span class="n">Big</span> <span class="n">Books</span> <span class="o">=</span> <span class="nb">dir</span><span class="p">:</span><span class="s2">"~/My Books"</span> <span class="n">size</span><span class="o">&gt;</span><span class="mi">10</span><span class="n">K</span>
|
|
<span class="n">c</span><span class="p">:</span><span class="n">My</span> <span class="n">Docs</span> <span class="o">=</span> <span class="nb">dir</span><span class="p">:</span><span class="s2">"~/My Documents"</span>
|
|
<span class="n">b</span><span class="p">:</span><span class="n">Small</span> <span class="n">Books</span> <span class="o">=</span> <span class="nb">dir</span><span class="p">:</span><span class="s2">"~/My Books"</span> <span class="n">size</span><span class="o">&lt;</span><span class="mi">10</span><span class="n">K</span>
|
|
<span class="n">a</span><span class="p">:</span><span class="n">System</span> <span class="n">Docs</span> <span class="o">=</span> <span class="nb">dir</span><span class="p">:</span><span class="o">/</span><span class="n">usr</span><span class="o">/</span><span class="n">share</span><span class="o">/</span><span class="n">doc</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>As you may have guessed, the default <code class="docutils literal"><span class="pre">[guifilters]</span></code> section looks
|
|
like:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">guifilters</span><span class="p">]</span>
|
|
<span class="n">text</span> <span class="o">=</span> <span class="n">rclcat</span><span class="p">:</span><span class="n">text</span>
|
|
<span class="n">spreadsheet</span> <span class="o">=</span> <span class="n">rclcat</span><span class="p">:</span><span class="n">spreadsheet</span>
|
|
<span class="n">presentation</span> <span class="o">=</span> <span class="n">rclcat</span><span class="p">:</span><span class="n">presentation</span>
|
|
<span class="n">media</span> <span class="o">=</span> <span class="n">rclcat</span><span class="p">:</span><span class="n">media</span>
|
|
<span class="n">message</span> <span class="o">=</span> <span class="n">rclcat</span><span class="p">:</span><span class="n">message</span>
|
|
<span class="n">other</span> <span class="o">=</span> <span class="n">rclcat</span><span class="p">:</span><span class="n">other</span>
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="the-mimeview-file">
|
|
<h4>The mimeview file<a class="headerlink" href="#the-mimeview-file" title="Permalink to this headline">¶</a></h4>
|
|
<p><code class="docutils literal"><span class="pre">mimeview</span></code> specifies which programs are started when you click on an
|
|
Open link in a result list. For example, HTML is normally displayed using
|
|
firefox, but you may prefer Konqueror, your openoffice.org program might
|
|
be named <code class="docutils literal"><span class="pre">ooffice</span></code> instead of <code class="docutils literal"><span class="pre">openoffice</span></code> etc.</p>
|
|
<p>Changes to this file can be done by direct editing, or through the
|
|
<code class="docutils literal"><span class="pre">recoll</span></code> GUI preferences dialog.</p>
|
|
<p>If “Use desktop preferences to choose document editor” is checked in the
|
|
RCL GUI preferences, all <code class="docutils literal"><span class="pre">mimeview</span></code> entries will be ignored except the
|
|
one labelled <code class="docutils literal"><span class="pre">application/x-all</span></code> (which is set to use <code class="docutils literal"><span class="pre">xdg-open</span></code> by
|
|
default).</p>
|
|
<p>In this case, the <code class="docutils literal"><span class="pre">xallexcepts</span></code> top level variable defines a list of
|
|
MIME type exceptions which will be processed according to the local
|
|
entries instead of being passed to the desktop. This is so that specific
|
|
RCL options such as a page number or a search string can be passed to
|
|
applications that support them, such as the evince viewer.</p>
|
|
<p>As for the other configuration files, the normal usage is to have a
|
|
<code class="docutils literal"><span class="pre">mimeview</span></code> inside your own configuration directory, with just the
|
|
non-default entries, which will override those from the central
|
|
configuration file.</p>
|
|
<p>All viewer definition entries must be placed under a <code class="docutils literal"><span class="pre">[view]</span></code> section.</p>
|
|
<p>The keys in the file are normally MIME types. You can add an application
|
|
tag to specialize the choice for an area of the filesystem (using a
|
|
<code class="docutils literal"><span class="pre">localfields</span></code> specification in <code class="docutils literal"><span class="pre">mimeconf</span></code>). The syntax for the key
|
|
is mimetype<code class="docutils literal"><span class="pre">|</span></code>tag</p>
|
|
<p>The <code class="docutils literal"><span class="pre">nouncompforviewmts</span></code> entry (placed at the top level, outside of
|
|
the <code class="docutils literal"><span class="pre">[view]</span></code> section) holds a list of MIME types that should not be
|
|
uncompressed before starting the viewer (if they are found compressed,
|
|
e.g. mydoc.doc.gz).</p>
|
|
<p>The right side of each assignment holds a command to be executed for
|
|
opening the file. The following substitutions are performed:</p>
|
|
<ul>
|
|
<li><p class="first"><strong>%D.</strong></p>
|
|
<p>Document date</p>
|
|
</li>
|
|
<li><p class="first"><strong>%f.</strong></p>
|
|
<p>File name. This may be the name of a temporary file if it was
|
|
necessary to create one (e.g. to extract a subdocument from a
|
|
container).</p>
|
|
</li>
|
|
<li><p class="first"><strong>%i.</strong></p>
|
|
<p>Internal path, for subdocuments of containers. The format depends on
|
|
the container type. If this appears in the command line, RCL will not
|
|
create a temporary file to extract the subdocument, expecting the
|
|
called application (possibly a script) to be able to handle it.</p>
|
|
</li>
|
|
<li><p class="first"><strong>%M.</strong></p>
|
|
<p>MIME type</p>
|
|
</li>
|
|
<li><p class="first"><strong>%p.</strong></p>
|
|
<p>Page index. Only significant for a subset of document types,
|
|
currently only PDF, PostScript and DVI files. Can be used to start
|
|
the editor at the right page for a match or snippet.</p>
|
|
</li>
|
|
<li><p class="first"><strong>%s.</strong></p>
|
|
<p>Search term. The value will only be set for documents with indexed
|
|
page numbers (e.g. PDF). The value will be one of the matched search
|
|
terms. It would allow pre-setting the value in the “Find” entry
|
|
inside Evince for example, for easy highlighting of the term.</p>
|
|
</li>
|
|
<li><p class="first"><strong>%u.</strong></p>
|
|
<p>URL.</p>
|
|
</li>
|
|
</ul>
|
|
<p>In addition to the predefined values above, all strings like
|
|
<code class="docutils literal"><span class="pre">%(fieldname)</span></code> will be replaced by the value of the field named
|
|
<code class="docutils literal"><span class="pre">fieldname</span></code> for the document. This could be used in combination with
|
|
field customisation to help with opening the document.</p>
|
|
</div>
|
|
<div class="section" id="the-ptrans-file">
|
|
<h4>The <code class="docutils literal"><span class="pre">ptrans</span></code> file<a class="headerlink" href="#the-ptrans-file" title="Permalink to this headline">¶</a></h4>
|
|
<p><code class="docutils literal"><span class="pre">ptrans</span></code> specifies query-time path translations. These can be useful
|
|
in <a class="reference external" href="#RCL.SEARCH.PTRANS">multiple cases</a>.</p>
|
|
<p>The file has a section for any index which needs translations, either
|
|
the main one or additional query indexes. The sections are named with
|
|
the XAP index directory names. No slash character should exist at the
|
|
end of the paths (all comparisons are textual). An example should make
|
|
things sufficiently clear:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="p">[</span><span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">me</span><span class="o">/.</span><span class="n">recoll</span><span class="o">/</span><span class="n">xapiandb</span><span class="p">]</span>
|
|
<span class="o">/</span><span class="n">this</span><span class="o">/</span><span class="n">directory</span><span class="o">/</span><span class="n">moved</span> <span class="o">=</span> <span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">this</span><span class="o">/</span><span class="n">place</span>
|
|
|
|
<span class="p">[</span><span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">to</span><span class="o">/</span><span class="n">additional</span><span class="o">/</span><span class="n">xapiandb</span><span class="p">]</span>
|
|
<span class="o">/</span><span class="n">server</span><span class="o">/</span><span class="n">volume1</span><span class="o">/</span><span class="n">docdir</span> <span class="o">=</span> <span class="o">/</span><span class="n">net</span><span class="o">/</span><span class="n">server</span><span class="o">/</span><span class="n">volume1</span><span class="o">/</span><span class="n">docdir</span>
|
|
<span class="o">/</span><span class="n">server</span><span class="o">/</span><span class="n">volume2</span><span class="o">/</span><span class="n">docdir</span> <span class="o">=</span> <span class="o">/</span><span class="n">net</span><span class="o">/</span><span class="n">server</span><span class="o">/</span><span class="n">volume2</span><span class="o">/</span><span class="n">docdir</span>
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="examples-of-configuration-adjustments">
|
|
<h4>Examples of configuration adjustments<a class="headerlink" href="#examples-of-configuration-adjustments" title="Permalink to this headline">¶</a></h4>
|
|
<div class="section" id="adding-an-external-viewer-for-an-non-indexed-type">
|
|
<h5>Adding an external viewer for a non-indexed type<a class="headerlink" href="#adding-an-external-viewer-for-an-non-indexed-type" title="Permalink to this headline">¶</a></h5>
|
|
<p>Imagine that you have some kind of file which does not have indexable
|
|
content, but for which you would like to have a functional Open link in
|
|
the result list (when found by file name). The file names end in .blob
|
|
and can be displayed by the blobviewer application.</p>
|
|
<p>You need two entries in the configuration files for this to work:</p>
|
|
<ul>
|
|
<li><p class="first">In <code class="docutils literal"><span class="pre">$RECOLL_CONFDIR/mimemap</span></code> (typically <code class="docutils literal"><span class="pre">~/.recoll/mimemap</span></code>), add
|
|
the following line:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o">.</span><span class="n">blob</span> <span class="o">=</span> <span class="n">application</span><span class="o">/</span><span class="n">x</span><span class="o">-</span><span class="n">blobapp</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Note that the MIME type is made up here, and you could call it
|
|
diesel/oil just the same.</p>
|
|
</li>
|
|
<li><p class="first">In <code class="docutils literal"><span class="pre">$RECOLL_CONFDIR/mimeview</span></code> under the <code class="docutils literal"><span class="pre">[view]</span></code> section, add:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">application</span><span class="o">/</span><span class="n">x</span><span class="o">-</span><span class="n">blobapp</span> <span class="o">=</span> <span class="n">blobviewer</span> <span class="o">%</span><span class="n">f</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>We are supposing that blobviewer wants a file name parameter here;
|
|
you would use <code class="docutils literal"><span class="pre">%u</span></code> if it liked URLs better.</p>
|
|
</li>
|
|
</ul>
|
|
<p>If you just wanted to change the application used by RCL to display a
|
|
MIME type which it already knows, you would just need to edit
|
|
<code class="docutils literal"><span class="pre">mimeview</span></code>. The entries you add in your personal file override those
|
|
in the central configuration, which you do not need to alter.
|
|
<code class="docutils literal"><span class="pre">mimeview</span></code> can also be modified from the GUI.</p>
|
|
</div>
|
|
<div class="section" id="adding-indexing-support-for-a-new-file-type">
|
|
<h5>Adding indexing support for a new file type<a class="headerlink" href="#adding-indexing-support-for-a-new-file-type" title="Permalink to this headline">¶</a></h5>
|
|
<p>Let us now imagine that the above .blob files actually contain indexable
|
|
text and that you know how to extract it with a command line program.
|
|
Getting RCL to index the files is easy. You need to perform the above
|
|
alteration, and also to add data to the <code class="docutils literal"><span class="pre">mimeconf</span></code> file (typically in
|
|
<code class="docutils literal"><span class="pre">~/.recoll/mimeconf</span></code>):</p>
|
|
<ul>
|
|
<li><p class="first">Under the <code class="docutils literal"><span class="pre">[index]</span></code> section, add the following line (more about the
|
|
rclblob indexing script later):</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">application</span><span class="o">/</span><span class="n">x</span><span class="o">-</span><span class="n">blobapp</span> <span class="o">=</span> <span class="n">exec</span> <span class="n">rclblob</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>Or if the files are mostly text and you don’t need to process them
|
|
for indexing:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">application</span><span class="o">/</span><span class="n">x</span><span class="o">-</span><span class="n">blobapp</span> <span class="o">=</span> <span class="n">internal</span> <span class="n">text</span><span class="o">/</span><span class="n">plain</span>
|
|
</pre></div>
|
|
</div>
|
|
</li>
|
|
<li><p class="first">Under the <code class="docutils literal"><span class="pre">[icons]</span></code> section, you should choose an icon to be
|
|
displayed for the files inside the result lists. Icons are normally
|
|
64x64 pixel PNG files which live in <code class="docutils literal"><span class="pre">/usr/share/recoll/images</span></code>.</p>
|
|
</li>
|
|
<li><p class="first">Under the <code class="docutils literal"><span class="pre">[categories]</span></code> section, you should add the MIME type
|
|
where it makes sense (you can also create a category). Categories may
|
|
be used for filtering in advanced search.</p>
|
|
</li>
|
|
</ul>
|
|
<p>The rclblob handler should be an executable program or script which
|
|
exists inside <code class="docutils literal"><span class="pre">/usr/share/recoll/filters</span></code>. It will be given a file
|
|
name as argument and should output the text or HTML contents on the
|
|
standard output.</p>
|
|
<p>The <a class="reference external" href="#RCL.PROGRAM.FILTERS">filter programming</a> section describes in
|
|
more detail how to write an input handler.</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
|
|
<div class="sphinxsidebarwrapper">
|
|
<h3><a href="#">Table Of Contents</a></h3>
|
|
<ul>
|
|
<li><a class="reference internal" href="#">Recoll user manual</a><ul>
|
|
<li><a class="reference internal" href="#introduction">Introduction</a><ul>
|
|
<li><a class="reference internal" href="#giving-it-a-try">Giving it a try</a></li>
|
|
<li><a class="reference internal" href="#full-text-search">Full text search</a></li>
|
|
<li><a class="reference internal" href="#recoll-overview">Recoll overview</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#indexing">Indexing</a><ul>
|
|
<li><a class="reference internal" href="#id1">Introduction</a><ul>
|
|
<li><a class="reference internal" href="#indexing-modes">Indexing modes</a></li>
|
|
<li><a class="reference internal" href="#configurations-multiple-indexes">Configurations, multiple indexes</a></li>
|
|
<li><a class="reference internal" href="#document-types">Document types</a></li>
|
|
<li><a class="reference internal" href="#indexing-failures">Indexing failures</a></li>
|
|
<li><a class="reference internal" href="#recovery">Recovery</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#index-storage">Index storage</a><ul>
|
|
<li><a class="reference internal" href="#xap-index-formats">XAP index formats</a></li>
|
|
<li><a class="reference internal" href="#security-aspects">Security aspects</a></li>
|
|
<li><a class="reference internal" href="#special-considerations-for-big-indexes">Special considerations for big indexes</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#index-configuration">Index configuration</a><ul>
|
|
<li><a class="reference internal" href="#multiple-indexes">Multiple indexes</a></li>
|
|
<li><a class="reference internal" href="#index-case-and-diacritics-sensitivity">Index case and diacritics sensitivity</a></li>
|
|
<li><a class="reference internal" href="#indexing-threads-configuration">Indexing threads configuration</a></li>
|
|
<li><a class="reference internal" href="#the-index-configuration-gui">The index configuration GUI</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#indexing-the-web-pages-which-you-wisit">Indexing the WEB pages which you visit.</a></li>
|
|
<li><a class="reference internal" href="#extended-attributes-data">Extended attributes data</a></li>
|
|
<li><a class="reference internal" href="#importing-external-tags">Importing external tags</a></li>
|
|
<li><a class="reference internal" href="#the-pdf-input-handler">The PDF input handler</a><ul>
|
|
<li><a class="reference internal" href="#ocr-with-tesseract">OCR with Tesseract</a></li>
|
|
<li><a class="reference internal" href="#xmp-fields-extraction">XMP fields extraction</a></li>
|
|
<li><a class="reference internal" href="#pdf-attachment-indexing">PDF attachment indexing</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#periodic-indexing">Periodic indexing</a><ul>
|
|
<li><a class="reference internal" href="#running-indexing">Running indexing</a></li>
|
|
<li><a class="reference internal" href="#using-cron-to-automate-indexing">Using <code class="docutils literal"><span class="pre">cron</span></code> to automate indexing</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#real-time-indexing">Real time indexing</a><ul>
|
|
<li><a class="reference internal" href="#real-time-indexing-automatic-daemon-start">Real time indexing: automatic daemon start</a></li>
|
|
<li><a class="reference internal" href="#real-time-indexing-miscellaneous-details">Real time indexing: miscellaneous details</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#searching">Searching</a><ul>
|
|
<li><a class="reference internal" href="#searching-with-the-qt-graphical-user-interface">Searching with the Qt graphical user interface</a><ul>
|
|
<li><a class="reference internal" href="#simple-search">Simple search</a></li>
|
|
<li><a class="reference internal" href="#the-default-result-list">The default result list</a><ul>
|
|
<li><a class="reference internal" href="#no-results-the-spelling-suggestions">No results: the spelling suggestions</a></li>
|
|
<li><a class="reference internal" href="#the-result-list-right-click-menu">The result list right-click menu</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#the-result-table">The result table</a></li>
|
|
<li><a class="reference internal" href="#running-arbitrary-commands-on-result-files-1-20-and-later">Running arbitrary commands on result files (1.20 and later)</a></li>
|
|
<li><a class="reference internal" href="#displaying-thumbnails">Displaying thumbnails</a></li>
|
|
<li><a class="reference internal" href="#the-preview-window">The preview window</a><ul>
|
|
<li><a class="reference internal" href="#searching-inside-the-preview">Searching inside the preview</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#the-query-fragments-window">The Query Fragments window</a></li>
|
|
<li><a class="reference internal" href="#complex-advanced-search">Complex/advanced search</a><ul>
|
|
<li><a class="reference internal" href="#avanced-search-the-find-tab">Advanced search: the “find” tab</a></li>
|
|
<li><a class="reference internal" href="#avanced-search-the-filter-tab">Advanced search: the “filter” tab</a></li>
|
|
<li><a class="reference internal" href="#avanced-search-history">Advanced search history</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#the-term-explorer-tool">The term explorer tool</a></li>
|
|
<li><a class="reference internal" href="#id2">Multiple indexes</a></li>
|
|
<li><a class="reference internal" href="#document-history">Document history</a></li>
|
|
<li><a class="reference internal" href="#sorting-search-results-and-collapsing-duplicates">Sorting search results and collapsing duplicates</a></li>
|
|
<li><a class="reference internal" href="#search-tips-shortcuts">Search tips, shortcuts</a><ul>
|
|
<li><a class="reference internal" href="#terms-and-search-expansion">Terms and search expansion</a></li>
|
|
<li><a class="reference internal" href="#working-with-phrases-and-proximity">Working with phrases and proximity</a></li>
|
|
<li><a class="reference internal" href="#others">Others</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#saving-and-restoring-queries-1-21-and-later">Saving and restoring queries (1.21 and later)</a></li>
|
|
<li><a class="reference internal" href="#customizing-the-search-interface">Customizing the search interface</a><ul>
|
|
<li><a class="reference internal" href="#the-result-list-format">The result list format</a><ul>
|
|
<li><a class="reference internal" href="#the-paragraph-format">The paragraph format</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#searching-with-the-kde-kio-slave">Searching with the KDE KIO slave</a><ul>
|
|
<li><a class="reference internal" href="#what-s-this">What’s this</a></li>
|
|
<li><a class="reference internal" href="#searchable-documents">Searchable documents</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#searching-on-the-command-line">Searching on the command line</a></li>
|
|
<li><a class="reference internal" href="#using-synonyms-1-22">Using Synonyms (1.22)</a></li>
|
|
<li><a class="reference internal" href="#path-translations">Path translations</a></li>
|
|
<li><a class="reference internal" href="#the-query-language">The query language</a><ul>
|
|
<li><a class="reference internal" href="#range-clauses">Range clauses</a></li>
|
|
<li><a class="reference internal" href="#modifiers">Modifiers</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#search-case-and-diacritics-sensitivity">Search case and diacritics sensitivity</a></li>
|
|
<li><a class="reference internal" href="#anchored-searches-and-wildcards">Anchored searches and wildcards</a><ul>
|
|
<li><a class="reference internal" href="#more-about-wildcards">More about wildcards</a><ul>
|
|
<li><a class="reference internal" href="#wildcards-and-path-filtering">Wildcards and path filtering</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#anchored-searches">Anchored searches</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#desktop-integration">Desktop integration</a><ul>
|
|
<li><a class="reference internal" href="#hotkeying-recoll">Hotkeying recoll</a></li>
|
|
<li><a class="reference internal" href="#the-kde-kicker-recoll-applet">The KDE Kicker Recoll applet</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#removable-volumes">Removable volumes</a><ul>
|
|
<li><a class="reference internal" href="#indexing-removable-volumes-in-the-main-index">Indexing removable volumes in the main index</a></li>
|
|
<li><a class="reference internal" href="#self-contained-volumes">Self contained volumes</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#programming-interface">Programming interface</a><ul>
|
|
<li><a class="reference internal" href="#writing-a-document-input-handler">Writing a document input handler</a><ul>
|
|
<li><a class="reference internal" href="#simple-input-handlers">Simple input handlers</a></li>
|
|
<li><a class="reference internal" href="#multiple-handlers">“Multiple” handlers</a></li>
|
|
<li><a class="reference internal" href="#telling-rcl-about-the-handler">Telling RCL about the handler</a></li>
|
|
<li><a class="reference internal" href="#input-handler-output">Input handler output</a></li>
|
|
<li><a class="reference internal" href="#page-numbers">Page numbers</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#field-data-processing">Field data processing</a></li>
|
|
<li><a class="reference internal" href="#python-api">Python API</a><ul>
|
|
<li><a class="reference internal" href="#id3">Introduction</a></li>
|
|
<li><a class="reference internal" href="#interface-elements">Interface elements</a></li>
|
|
<li><a class="reference internal" href="#python-search-interface">Python search interface</a><ul>
|
|
<li><a class="reference internal" href="#the-recoll-module">The recoll module</a></li>
|
|
<li><a class="reference internal" href="#the-rclextract-module">The rclextract module</a></li>
|
|
<li><a class="reference internal" href="#search-api-usage-example">Search API usage example</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#creating-python-external-indexers">Creating Python external indexers</a><ul>
|
|
<li><a class="reference internal" href="#python-update-interface">Python update interface</a></li>
|
|
<li><a class="reference internal" href="#query-data-access-for-external-indexers-1-23">Query data access for external indexers (1.23)</a></li>
|
|
<li><a class="reference internal" href="#external-indexer-samples">External indexer samples</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#package-compatibility-with-the-previous-version">Package compatibility with the previous version</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#installation-and-configuration">Installation and configuration</a><ul>
|
|
<li><a class="reference internal" href="#installing-a-binary-copy">Installing a binary copy</a></li>
|
|
<li><a class="reference internal" href="#supporting-packages">Supporting packages</a></li>
|
|
<li><a class="reference internal" href="#building-from-source">Building from source</a><ul>
|
|
<li><a class="reference internal" href="#prerequisites">Prerequisites</a></li>
|
|
<li><a class="reference internal" href="#building">Building</a></li>
|
|
<li><a class="reference internal" href="#installing">Installing</a></li>
|
|
<li><a class="reference internal" href="#python-api-package">Python API package</a></li>
|
|
<li><a class="reference internal" href="#building-on-solaris">Building on Solaris</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#configuration-overview">Configuration overview</a><ul>
|
|
<li><a class="reference internal" href="#environment-variables">Environment variables</a></li>
|
|
<li><a class="reference internal" href="#recoll-main-configuration-file-recoll-conf">Recoll main configuration file, recoll.conf</a><ul>
|
|
<li><a class="reference internal" href="#parameters-affecting-what-documents-we-index">Parameters affecting what documents we index</a></li>
|
|
<li><a class="reference internal" href="#parameters-affecting-how-we-generate-terms-and-organize-the-index">Parameters affecting how we generate terms and organize the index</a></li>
|
|
<li><a class="reference internal" href="#parameters-affecting-where-and-how-we-store-things">Parameters affecting where and how we store things</a></li>
|
|
<li><a class="reference internal" href="#parameters-affecting-indexing-performance-and-resource-usage">Parameters affecting indexing performance and resource usage</a></li>
|
|
<li><a class="reference internal" href="#miscellaneous-parameters">Miscellaneous parameters</a></li>
|
|
<li><a class="reference internal" href="#query-time-parameters-no-impact-on-the-index">Query-time parameters (no impact on the index)</a></li>
|
|
<li><a class="reference internal" href="#parameters-for-the-pdf-input-script">Parameters for the PDF input script</a></li>
|
|
<li><a class="reference internal" href="#parameters-set-for-specific-locations">Parameters set for specific locations</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#the-fields-file">The fields file</a><ul>
|
|
<li><a class="reference internal" href="#extended-attributes-in-the-fields-file">Extended attributes in the fields file</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a class="reference internal" href="#the-mimemap-file">The mimemap file</a></li>
|
|
<li><a class="reference internal" href="#the-mimeconf-file">The mimeconf file</a></li>
|
|
<li><a class="reference internal" href="#the-mimeview-file">The mimeview file</a></li>
|
|
<li><a class="reference internal" href="#the-ptrans-file">The <code class="docutils literal"><span class="pre">ptrans</span></code> file</a></li>
|
|
<li><a class="reference internal" href="#examples-of-configuration-adjustments">Examples of configuration adjustments</a><ul>
|
|
<li><a class="reference internal" href="#adding-an-external-viewer-for-an-non-indexed-type">Adding an external viewer for a non-indexed type</a></li>
|
|
<li><a class="reference internal" href="#adding-indexing-support-for-a-new-file-type">Adding indexing support for a new file type</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
</li>
|
|
</ul>
|
|
<!-- "Related Topics" box: a single link that points at the top of this document (href="#"). -->
<div class="relations">
  <h3>Related Topics</h3>
  <ul>
    <!-- The empty nested <ul></ul> was dropped: a list with no <li> is not valid XHTML. -->
    <li><a href="#">Documentation overview</a></li>
  </ul>
</div>
|
|
<!-- "This Page" menu: one "Show Source" link to _sources/usermanual.txt. -->
<div role="note" aria-label="source link">
  <h3>This Page</h3>
  <ul class="this-page-menu">
    <!-- Start tag kept on one line so no stray tokens end up between its attributes. -->
    <li><a href="_sources/usermanual.txt" rel="nofollow">Show Source</a></li>
  </ul>
</div>
|
|
<!--
  Quick search form: sends the query as "q" (plus two fixed hidden fields) to
  search.html via GET. The box is rendered hidden and revealed by the script
  right after it, so it only shows up when scripting (jQuery) is available.
-->
<div id="searchbox" style="display: none" role="search">
  <h3>Quick search</h3>
  <form class="search" action="search.html" method="get">
    <!-- aria-label gives the otherwise unlabelled text field an accessible name. -->
    <div><input type="text" name="q" aria-label="Search query" /></div>
    <div><input type="submit" value="Go" /></div>
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
|
|
</div>
|
|
</div>
|
|
<div class="clearer"></div>
|
|
</div>
|
|
<!--
  Page footer: copyright notice, generator credits and a link to the page
  source. The "|" characters below are intentional visible separators.
-->
<div class="footer">
  ©2019, J.F. Dockes.
  |
  Powered by <a href="https://www.sphinx-doc.org/">Sphinx 1.4.9</a>
  &amp; <a href="https://github.com/bitprophet/alabaster">Alabaster 0.7.8</a>
  |
  <a href="_sources/usermanual.txt" rel="nofollow">Page source</a>
</div>
|
|
|
|
|
|
|
|
|
|
</body>
|
|
</html> |