moved website
@ -45,8 +45,8 @@ debdir=debian
|
|||||||
# Note: no new releases for lucid: no webkit. Or use old debianrclqt4 dir.
|
# Note: no new releases for lucid: no webkit. Or use old debianrclqt4 dir.
|
||||||
# No new releases for trusty either because of risk of kio compat (kio
|
# No new releases for trusty either because of risk of kio compat (kio
|
||||||
# wont build)
|
# wont build)
|
||||||
series="xenial yakkety zesty"
|
series="xenial yakkety zesty artful"
|
||||||
series=
|
series=artful
|
||||||
|
|
||||||
if test "X$series" != X ; then
|
if test "X$series" != X ; then
|
||||||
check_recoll_orig
|
check_recoll_orig
|
||||||
@ -77,8 +77,8 @@ done
|
|||||||
|
|
||||||
### KIO. Does not build on trusty from recoll 1.23 because of the need
|
### KIO. Does not build on trusty from recoll 1.23 because of the need
|
||||||
### for c++11
|
### for c++11
|
||||||
series="xenial yakkety zesty"
|
series="xenial yakkety zesty artful"
|
||||||
#series=
|
series=
|
||||||
|
|
||||||
debdir=debiankio
|
debdir=debiankio
|
||||||
topdir=kio-recoll-${RCLVERS}
|
topdir=kio-recoll-${RCLVERS}
|
||||||
@ -146,8 +146,8 @@ for series in $series ; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
### Unity Scope
|
### Unity Scope
|
||||||
series="trusty xenial yakkety"
|
series="trusty xenial yakkety zesty artful"
|
||||||
series=
|
#series=
|
||||||
|
|
||||||
debdir=debianunityscope
|
debdir=debianunityscope
|
||||||
if test ! -d ${debdir}/ ; then
|
if test ! -d ${debdir}/ ; then
|
||||||
|
|||||||
@ -20,8 +20,8 @@ alink="#0000FF">
|
|||||||
<div class="titlepage">
|
<div class="titlepage">
|
||||||
<div>
|
<div>
|
||||||
<div>
|
<div>
|
||||||
<h1 class="title"><a name="idp56557776" id=
|
<h1 class="title"><a name="idm44986984150384" id=
|
||||||
"idp56557776"></a>Recoll user manual</h1>
|
"idm44986984150384"></a>Recoll user manual</h1>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
@ -109,13 +109,14 @@ alink="#0000FF">
|
|||||||
multiple indexes</a></span></dt>
|
multiple indexes</a></span></dt>
|
||||||
|
|
||||||
<dt><span class="sect2">2.1.3. <a href=
|
<dt><span class="sect2">2.1.3. <a href=
|
||||||
"#idp62130176">Document types</a></span></dt>
|
"#idm44986952097312">Document types</a></span></dt>
|
||||||
|
|
||||||
<dt><span class="sect2">2.1.4. <a href=
|
<dt><span class="sect2">2.1.4. <a href=
|
||||||
"#idp62154272">Indexing failures</a></span></dt>
|
"#idm44986952072736">Indexing
|
||||||
|
failures</a></span></dt>
|
||||||
|
|
||||||
<dt><span class="sect2">2.1.5. <a href=
|
<dt><span class="sect2">2.1.5. <a href=
|
||||||
"#idp62161280">Recovery</a></span></dt>
|
"#idm44986952065728">Recovery</a></span></dt>
|
||||||
</dl>
|
</dl>
|
||||||
</dd>
|
</dd>
|
||||||
|
|
||||||
@ -1017,8 +1018,9 @@ alink="#0000FF">
|
|||||||
<div class="titlepage">
|
<div class="titlepage">
|
||||||
<div>
|
<div>
|
||||||
<div>
|
<div>
|
||||||
<h3 class="title"><a name="idp62130176" id=
|
<h3 class="title"><a name="idm44986952097312" id=
|
||||||
"idp62130176"></a>2.1.3. Document types</h3>
|
"idm44986952097312"></a>2.1.3. Document
|
||||||
|
types</h3>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -1131,8 +1133,8 @@ indexedmimetypes = application/pdf
|
|||||||
<div class="titlepage">
|
<div class="titlepage">
|
||||||
<div>
|
<div>
|
||||||
<div>
|
<div>
|
||||||
<h3 class="title"><a name="idp62154272" id=
|
<h3 class="title"><a name="idm44986952072736" id=
|
||||||
"idp62154272"></a>2.1.4. Indexing
|
"idm44986952072736"></a>2.1.4. Indexing
|
||||||
failures</h3>
|
failures</h3>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -1172,8 +1174,8 @@ indexedmimetypes = application/pdf
|
|||||||
<div class="titlepage">
|
<div class="titlepage">
|
||||||
<div>
|
<div>
|
||||||
<div>
|
<div>
|
||||||
<h3 class="title"><a name="idp62161280" id=
|
<h3 class="title"><a name="idm44986952065728" id=
|
||||||
"idp62161280"></a>2.1.5. Recovery</h3>
|
"idm44986952065728"></a>2.1.5. Recovery</h3>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -1778,7 +1780,7 @@ thrQSizes = -1 -1 -1
|
|||||||
|
|
||||||
<p>A current pointer to the extension can be found, along
|
<p>A current pointer to the extension can be found, along
|
||||||
with up-to-date instructions, on the <a class="ulink" href=
|
with up-to-date instructions, on the <a class="ulink" href=
|
||||||
"http://bitbucket.org/medoc/recoll/wiki/IndexWebHistory"
|
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory"
|
||||||
target="_top">Recoll wiki</a>.</p>
|
target="_top">Recoll wiki</a>.</p>
|
||||||
|
|
||||||
<p>A copy of the indexed WEB pages is retained by Recoll in
|
<p>A copy of the indexed WEB pages is retained by Recoll in
|
||||||
@ -3057,7 +3059,7 @@ MimeType=*/*
|
|||||||
thumbnails.</p>
|
thumbnails.</p>
|
||||||
|
|
||||||
<p>There are also <a class="ulink" href=
|
<p>There are also <a class="ulink" href=
|
||||||
"http://bitbucket.org/medoc/recoll/wiki/ResultsThumbnails.wiki"
|
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/ResultsThumbnails.wiki"
|
||||||
target="_top">some pointers about thumbnail
|
target="_top">some pointers about thumbnail
|
||||||
generation</a> on the <span class=
|
generation</a> on the <span class=
|
||||||
"application">Recoll</span> wiki.</p>
|
"application">Recoll</span> wiki.</p>
|
||||||
@ -5898,7 +5900,7 @@ dir:recoll dir:src -dir:utils -dir:common
|
|||||||
<li class="listitem">
|
<li class="listitem">
|
||||||
<p>If you use a recent version of Ubuntu Linux, you
|
<p>If you use a recent version of Ubuntu Linux, you
|
||||||
may find the <a class="ulink" href=
|
may find the <a class="ulink" href=
|
||||||
"http://bitbucket.org/medoc/recoll/wiki/UnityLens"
|
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/UnityLens"
|
||||||
target="_top">Ubuntu Unity Lens</a> module
|
target="_top">Ubuntu Unity Lens</a> module
|
||||||
useful.</p>
|
useful.</p>
|
||||||
</li>
|
</li>
|
||||||
@ -5932,7 +5934,7 @@ dir:recoll dir:src -dir:utils -dir:common
|
|||||||
"application">libwnck</span> window manager interface
|
"application">libwnck</span> window manager interface
|
||||||
library, which will allow you to do just this. The
|
library, which will allow you to do just this. The
|
||||||
detailed instructions are on <a class="ulink" href=
|
detailed instructions are on <a class="ulink" href=
|
||||||
"http://bitbucket.org/medoc/recoll/wiki/HotRecoll"
|
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/HotRecoll"
|
||||||
target="_top">this wiki page</a>.</p>
|
target="_top">this wiki page</a>.</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@ -6642,9 +6644,9 @@ or
|
|||||||
comments inside the file.</p>
|
comments inside the file.</p>
|
||||||
|
|
||||||
<p>You can also have a look at the <a class="ulink" href=
|
<p>You can also have a look at the <a class="ulink" href=
|
||||||
"http://bitbucket.org/medoc/recoll/wiki/HandleCustomField"
|
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/HandleCustomField"
|
||||||
target="_top">example on the Wiki</a>, detailing how one
|
target="_top">example in the FAQs area</a>, detailing how
|
||||||
could add a <span class="emphasis"><em>page
|
one could add a <span class="emphasis"><em>page
|
||||||
count</em></span> field to pdf documents for displaying
|
count</em></span> field to pdf documents for displaying
|
||||||
inside result lists.</p>
|
inside result lists.</p>
|
||||||
</div>
|
</div>
|
||||||
@ -8978,7 +8980,7 @@ thesame = "some string with spaces"
|
|||||||
function similar to skippedNames, but works
|
function similar to skippedNames, but works
|
||||||
independantly. Can be redefined for subdirectories.
|
independantly. Can be redefined for subdirectories.
|
||||||
Supported by recoll 1.20 and newer. See
|
Supported by recoll 1.20 and newer. See
|
||||||
https://bitbucket.org/medoc/recoll/wiki/Filtering%20out%20Zip%20archive%20members</p>
|
https://www.lesbonscomptes.com/recoll/faqsandhowtos/Filtering%20out%20Zip%20archive%20members</p>
|
||||||
</dd>
|
</dd>
|
||||||
|
|
||||||
<dt><a name=
|
<dt><a name=
|
||||||
|
|||||||
1091
website/BUGS.html
@ -1,945 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Recoll changes</title>
|
|
||||||
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux
|
|
||||||
based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search, desktop search, unix, linux">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Home</a></li>
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
<li><a href="doc.html">Documentation</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1>Recoll journal of user-visible changes </h1>
|
|
||||||
|
|
||||||
<p>Newer releases are described in their release notes document:</p>
|
|
||||||
<p>
|
|
||||||
<a href="release-1.20.html">1.20</a>
|
|
||||||
<a href="release-1.19.html">1.19</a>
|
|
||||||
<a href="release-1.18.html">1.18</a>
|
|
||||||
<a href="release-1.17.html">1.17</a>
|
|
||||||
<a href="release-1.16.html">1.16</a>
|
|
||||||
<a href="release-1.15.html">1.15</a>
|
|
||||||
<a href="release-1.14.4.html">1.14.4</a>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<h2><a name="1.14.3">1.14.3</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li>Get rid of permanent filter subprocess at the end of a GUI
|
|
||||||
indexing pass.</li>
|
|
||||||
<li>Add new filter for indexing GNU info files.</li>
|
|
||||||
<li>Index the file name from a zip or chm internal path.</li>
|
|
||||||
<li>Add hotrecoll.py script to help with one-key recoll
|
|
||||||
activation/hiding. Move focus to search entry when unminimized.</li>
|
|
||||||
<li>Handle bad mbox format from Thunderbird.</li>
|
|
||||||
<li>Catch exception which was causing stderr messages while
|
|
||||||
indexing encrypted zip files.</li>
|
|
||||||
<li>Change result list "Edit" links to "Open" for consistency
|
|
||||||
with menus.</li>
|
|
||||||
<li>Change the type of character set conversion occurring
|
|
||||||
when using "Copy file path" from the result list.
|
|
||||||
Should work in more cases than the previous approach (but
|
|
||||||
will still fail sometimes).</li>
|
|
||||||
<li>Update lyx filter.</li>
|
|
||||||
<li>Fix problems with white space in file name in several
|
|
||||||
input filters.</li>
|
|
||||||
<li>Support mutagen versions older than 1.17.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.14.2">1.14.2</a></h2>
|
|
||||||
<p>Note: most of the changes are in release 1.14.0. Release 1.14.1 fixed 2
|
|
||||||
bugs. Release 1.14.2 fixes the help browser which was broken
|
|
||||||
by 1.14.1. Sigh ...</p>
|
|
||||||
<ul>
|
|
||||||
<li><a href="usermanual/usermanual.html#RCL.SEARCH.LANG">
|
|
||||||
date selection in queries</a>.</li>
|
|
||||||
<li>Pure negative queries (ie: <i>-someterm date:P10D/</i>).</li>
|
|
||||||
<li>Autosuffs: option to automatically turn words into <tt>ext:</tt>
|
|
||||||
clauses (ie: <i>xls</i> -> <i>ext:xls</i>) (GUI preferences
|
|
||||||
panel).</li>
|
|
||||||
<li>Allow extracting arbitrary mail headers and use them as
|
|
||||||
index/search fields (configured in the <tt>fields</tt>
|
|
||||||
file).</li>
|
|
||||||
<li><tt>nonumbers</tt> configuration parameter: disable
|
|
||||||
indexing of all numbers, useful for some data files with lots
|
|
||||||
of numerical data.</li>
|
|
||||||
<li>Shortcuts for the results page: <tt>PageUp/Down</tt> can
|
|
||||||
be used even when the focus is in the search
|
|
||||||
entry. <tt>Shift+Home</tt>: back to first page of results.
|
|
||||||
<tt>Ctrl+Shift+s</tt>: return focus to the search
|
|
||||||
entry. </li>
|
|
||||||
<li>Add full screen mode for small devices.</li>
|
|
||||||
<li>Added -i option to recollq to specify extra indexes.</li>
|
|
||||||
<li>Removed use of id3lib for extracting mp3 tags. A Python filter
|
|
||||||
based on mutagen now handles all audio formats
|
|
||||||
(mp3/flac/ogg). <i>If you are currently indexing audio
|
|
||||||
files, you need to install mutagen, Recoll will not use
|
|
||||||
id3lib or the Flac/Ogg tools any more</i>. </li>
|
|
||||||
<li>Filter for <b>fictionbook</b> (.fb2) documents.</li>
|
|
||||||
<li>Cleaned up the Python samples and made recollq.py a usable
|
|
||||||
clone of recollq.</li>
|
|
||||||
<li>Errors when opening additional indexes for a query are now
|
|
||||||
fatal. They could easily go unnoticed before.</li>
|
|
||||||
<li>Proper LARGEFILE support.</li>
|
|
||||||
<li>Use <b>xsltproc</b> instead of misc dirty tricks to
|
|
||||||
extract text from most current XML-based documents (except
|
|
||||||
those in which the XML is too broken).</li>
|
|
||||||
<li>Implement <tt>configure --enable-pic</tt> and use it for
|
|
||||||
the KIO slave and Python and PHP modules.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p>Bugs also fixed in the 1.13 branch:</p>
|
|
||||||
<ul>
|
|
||||||
<li>The <tt>filename</tt> (transcoded file name) field
|
|
||||||
could not be stored, so it could not be displayed in the
|
|
||||||
result list. Can now be displayed as %(filename).</li>
|
|
||||||
<li>Html files would always be indexed even when filtered
|
|
||||||
out by <tt>indexedmimetypes</tt></li>
|
|
||||||
<li>Preview: toggling between main text and metadata
|
|
||||||
display would confuse the text format.</li>
|
|
||||||
<li>Restore <tt>indexallfilenames=0</tt> functionality.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.13.04">1.13.04</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li>Provide a set of configuration defaults so that compilation has a
|
|
||||||
chance to succeed on unknown systems.</li>
|
|
||||||
<li>Install icon to the pixmaps directory.</li>
|
|
||||||
<li>Fixes stemming, which was broken for all previous 1.13
|
|
||||||
releases.</li>
|
|
||||||
<li><a href="BUGS.html#b_1_13_02">Bugs fixed between 1.13.02
|
|
||||||
and 1.13.04.</a></li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.13.02">1.13.02</a></h2>
|
|
||||||
<ul><li>This version has a single fix to work around a problem in the
|
|
||||||
Qt 4.6.1 uic utility. If you are not using Qt 4.6.1 and are
|
|
||||||
currently running Recoll 1.13.01, you do not need to
|
|
||||||
upgrade.</li></ul>
|
|
||||||
|
|
||||||
<h2><a name="1.13.01">1.13.01</a></h2>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li>Recoll has a new class of persistent external filters
|
|
||||||
with the capability to process several documents, or
|
|
||||||
multi-document files, in the same instance. Benefits: much
|
|
||||||
faster image tag indexing, and new file formats. Except for
|
|
||||||
the Perl image tag filter (because of ExifTool), the new
|
|
||||||
filters are written in Python.</li>
|
|
||||||
|
|
||||||
<li>New file formats: chm (microsoft help), zip archives, .ics
|
|
||||||
calendar files. Individual pages in chm files are indexed and
|
|
||||||
can be previewed. Zip is quite convenient for maildir
|
|
||||||
archives (for example).</li>
|
|
||||||
|
|
||||||
<li>Recoll can now use the output of the Beagle Firefox plugin
|
|
||||||
to index visited web pages and bookmarks. This is only usable
|
|
||||||
if Beagle itself is not running, else Recoll and Beagle will be
|
|
||||||
fighting for the same queue.</li>
|
|
||||||
|
|
||||||
<li>Big text files (like application logs) can now be paged for
|
|
||||||
indexing, avoiding excess memory usage during indexing and
|
|
||||||
improving the usability at query time. They can also be
|
|
||||||
altogether skipped by setting a maximum size configuration
|
|
||||||
parameter. These parameters have default values (1 MB and 20
|
|
||||||
MB) which change Recoll behaviour compared to previous
|
|
||||||
versions. You can set <i>textfilepagekbs</i>
|
|
||||||
and <i>textfilemaxmbs</i> to -1 in the configuration to
|
|
||||||
restore the old behaviour.</li>
|
|
||||||
|
|
||||||
<li>A cache was implemented for mbox message header offsets. This
|
|
||||||
speeds up message previews for big mbox files.</li>
|
|
||||||
|
|
||||||
<li>Miscellaneous usability improvements:
|
|
||||||
<ul>
|
|
||||||
<li>Allow using page-up/down and shift-home to scroll the
|
|
||||||
result list while the focus is in the search entry. </li>
|
|
||||||
<li>Make 'Use desktop preferences' the default for new
|
|
||||||
Recoll installations, and make this choice more
|
|
||||||
prominent in the external viewer dialog.</li>
|
|
||||||
<li>^P starts the print dialog on a preview window.</li>
|
|
||||||
<li>If a search has no result, alternate spellings are
|
|
||||||
suggested. This feature is still a bit raw and will be
|
|
||||||
improved.</li>
|
|
||||||
<li>If the text of a document is empty, preview will switch to
|
|
||||||
displaying the document fields.</li>
|
|
||||||
<li>New entry in the result list contextual menu for opening
|
|
||||||
the parent document of a result list hit with its native
|
|
||||||
application. Useful, for example, for pages inside chm files.</li>
|
|
||||||
</ul>
|
|
||||||
</li>
|
|
||||||
|
|
||||||
<li>Indentation is now preserved when displaying text documents
|
|
||||||
inside the preview window. This is particularly welcome for
|
|
||||||
program source files.</li>
|
|
||||||
|
|
||||||
<li>Allow substituting arbitrary fields in the result
|
|
||||||
paragraph, using a %(fieldname) syntax</li>
|
|
||||||
|
|
||||||
<li>The real-time indexing monitor will now accumulate
|
|
||||||
modifications for 30 seconds before indexing.</li>
|
|
||||||
|
|
||||||
<li>The indexer can now split camelCase words, allowing search on
|
|
||||||
component terms. This is not enabled by default as it can
|
|
||||||
confuse phrase searches (ie: "MySQL manual" is matched by
|
|
||||||
phrase queries for "my sql manual" and "MySQL manual"
|
|
||||||
but not "mysql manual"). Use "configure --enable-camelcase"
|
|
||||||
to activate it.
|
|
||||||
</li>
|
|
||||||
|
|
||||||
<li>The ipath is now printed by default after the url in the
|
|
||||||
default result list format.</li>
|
|
||||||
|
|
||||||
<li><i>recoll_noindex</i> and <i>skippedNames</i> can now be
|
|
||||||
changed at any point in the tree (only for topdirs previously).</li>
|
|
||||||
|
|
||||||
<li>Allow using location/application sensitivity in external viewer
|
|
||||||
choice. This uses several new functions:
|
|
||||||
<ul>
|
|
||||||
<li>Allow the substitution of arbitrary document fields inside
|
|
||||||
external viewer command line arguments.</li>
|
|
||||||
<li>Allow field values to be set on all documents
|
|
||||||
in a file system subtree. For example, you can
|
|
||||||
set an application tag (ie: rclaptg = gnus) on all mailbox
|
|
||||||
files under a specific directory.</li>
|
|
||||||
<li>New syntax in mimeview for including the rclaptg field in
|
|
||||||
viewer choice
|
|
||||||
(<i>mimetype</i>|<i>tagvalue</i> = ...).</li>
|
|
||||||
</ul>
|
|
||||||
</li>
|
|
||||||
|
|
||||||
<li>Allow specifying a specific default character set for mail
|
|
||||||
messages. This is mainly useful for readpst dumps. All
|
|
||||||
reasonable non-ascii messages specify their character set.</li>
|
|
||||||
|
|
||||||
<li>Added a --without-gui configure option. Removes all X11 and
|
|
||||||
Qt dependencies and only compiles the command-line interface.</li>
|
|
||||||
|
|
||||||
<li>Improved the kio_recoll build. There is no need to run
|
|
||||||
configure manually in the main directory any more. Ubuntu
|
|
||||||
packages for kio_recoll are now built on the
|
|
||||||
<a href="http://launchpad.net/~recoll-backports/+archive/ppa">
|
|
||||||
recoll-backports PPA on launchpad.net</a>.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
|
|
||||||
<h2><a name="1.12.4">1.12.4</a></h2>
|
|
||||||
<p>Bugs fixed:</p>
|
|
||||||
<ul>
|
|
||||||
<li>Qt4 version only: the search inside the preview window
|
|
||||||
could become unbearably slow for big documents (quadratically
|
|
||||||
so), and could not be interrupted (Qt bug). The Qt3 version of
|
|
||||||
the code was included in the preview tool to restore good
|
|
||||||
performance. This bug is the main reason for this release.</li>
|
|
||||||
</ul>
|
|
||||||
<p>Build system improvements:</p>
|
|
||||||
<ul>
|
|
||||||
<li>Perform minimal base package configuration inside the kio
|
|
||||||
cmake code to permit building it from scratch (without a build
|
|
||||||
of the main code). Mainly useful for builds on the Ubuntu
|
|
||||||
PPA.</li>
|
|
||||||
<li>Implement a --without-gui option to build a pure
|
|
||||||
command-line version with no Qt or X11 dependencies.</li>
|
|
||||||
<li>Ensure that the user's PATH settings determine where we
|
|
||||||
look first for qmake in all cases.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.12.3">1.12.3</a></h2>
|
|
||||||
<p>This is a bug fix release.</p>
|
|
||||||
<ul>
|
|
||||||
<li>Fix the sort tool which had been broken since 1.11 with
|
|
||||||
some (or all?) qt3 versions.</li>
|
|
||||||
<li>Catch two Xapian exceptions which could crash the GUI when a query
|
|
||||||
was run while the index was being updated.</li>
|
|
||||||
<li>Ensure that the result list right-click pop up menu will appear even
|
|
||||||
when the click is inside a table.</li>
|
|
||||||
<li>Fix the way we retrieve the Xapian library version to avoid
|
|
||||||
GUI compilation problems.</li>
|
|
||||||
<li>Inside the real-time indexer: only use the main thread to test that
|
|
||||||
the X11 server is still alive. Multithreaded calls to x11IsAlive()
|
|
||||||
would sometimes crash the process because of an X11 error.</li>
|
|
||||||
<li>Define filter timeout so that a looping filter (ie: rclps trying to
|
|
||||||
index loop.ps) will not completely stop the indexing. Default value:
|
|
||||||
20mn. Add loop.ps to skippedNames.</li>
|
|
||||||
<li>Improve filter subprocesses management. Some could previously be
|
|
||||||
left around after recollindex was killed. Improve cancellation
|
|
||||||
request acknowledgment by recollindex (two ^C were sometimes
|
|
||||||
necessary to make it terminate).</li>
|
|
||||||
<li>Signals SIGUSR1 and SIGUSR2 are now blocked in addition to
|
|
||||||
INTR/TERM/QUIT.</li>
|
|
||||||
<li>Extended attributes indexing now works for all file types.</li>
|
|
||||||
<li>Ensure that queries started from the command line are handled as
|
|
||||||
normal ones (they previously could not be sorted).</li>
|
|
||||||
<li>Improve man page indexing: do not index section header terms.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
|
|
||||||
<h2><a name="1.12.1">1.12.1</a></h2>
|
|
||||||
<p>This is a very minor release, mainly to fix compilation
|
|
||||||
issues and a few very minor bugs. No need to upgrade if
|
|
||||||
you don't experience these.</p>
|
|
||||||
<ul>
|
|
||||||
<li>Fixed compilation errors for new gcc and gnu libc.</li>
|
|
||||||
<li>Use groff html output in rclman to get rid of control
|
|
||||||
characters in output (improve manual pages indexing). Fix
|
|
||||||
8bit character issues in file names in rcllyx.</li>
|
|
||||||
<li>Fixed command line arguments processing problem with
|
|
||||||
"recoll -q"</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.12.0">1.12.0</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li>Recoll now implements a KIO slave to allow searching
|
|
||||||
directly from KDE applications. This does not affect the
|
|
||||||
main application and is not enabled by default (go to the
|
|
||||||
kde/kio/recoll source directory for build
|
|
||||||
instructions). </li>
|
|
||||||
<li>Recoll now computes md5 checksums for all indexed
|
|
||||||
documents and optionally collapses duplicate entries inside
|
|
||||||
the result list. This needs a full reindex to become
|
|
||||||
effective for older documents already in the index. The
|
|
||||||
option to activate collapsing is in the <i>Query
|
|
||||||
Configuration</i>.</li>
|
|
||||||
<li>Typing F1 anywhere in the GUI should bring up the
|
|
||||||
appropriate section of the manual in the application
|
|
||||||
configured for viewing HTML documents.</li>
|
|
||||||
<li>The result list right click menu now has an entry to
|
|
||||||
save the document to a file. This is only enabled for
|
|
||||||
documents contained inside another file (ie, messages inside
|
|
||||||
an mbox folder, or attachments), and is especially useful for
|
|
||||||
extracting an attachment with no associated external
|
|
||||||
editor.</li>
|
|
||||||
<li>The preview window now has a right-click menu, with an
|
|
||||||
entry to toggle between viewing the main text or all the
|
|
||||||
metadata for the document. This is most useful in the case
|
|
||||||
where the search match actually occurred in a field not
|
|
||||||
visible in the main text (ie: author or HTML title).</li>
|
|
||||||
<li>Words glued by an underscore character like
|
|
||||||
<i>compound_word</i> are now split during indexing, and
|
|
||||||
will be found when queried either as themselves or in a
|
|
||||||
search for the components.</li>
|
|
||||||
<li>There is now a size limit over which no attempt will be made to
|
|
||||||
uncompress/identify/index compressed files. Not active by
|
|
||||||
default, to be set in the <i>Indexing Configuration</i>.</li>
|
|
||||||
<li>Added support for fetching field values from extended file
|
|
||||||
attributes. This is not enabled by default, use
|
|
||||||
<i>configure --enable-xattr</i>. You'll also need to
|
|
||||||
set up a map from the attributes names to the Recoll field
|
|
||||||
names (see comment at the end of the <i>fields</i>
|
|
||||||
configuration file).</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.11.4">1.11.4</a></h2>
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>Bugs fixed:
|
|
||||||
check the <a href="BUGS.html#b_1_11_1">list</a>.</li>
|
|
||||||
|
|
||||||
<li>The right-click menu "Copy" commands inside the result list
|
|
||||||
now copy to the clipboard in addition to the main selection,
|
|
||||||
enabling subsequent ^v commands.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.11.0">1.11.0</a></h2>
|
|
||||||
|
|
||||||
<p><i>Recoll release 1.11 has relatively extensive changes that have
|
|
||||||
necessitated a modification of the index format. Hence installing this
|
|
||||||
release implies a full re-indexing, which is enforced by the
|
|
||||||
software.</i></p>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li>Filtering on category (message/text/media etc.) as a function of
|
|
||||||
the main window for quick access.</li>
|
|
||||||
|
|
||||||
<li>Use html for preview when available (ex: html files or "colorized"
|
|
||||||
python) instead of converting to text. This can be turned off in the
|
|
||||||
preferences. </li>
|
|
||||||
|
|
||||||
<li>New Python query and index interfaces. The Python query
|
|
||||||
interface will be used for building a Xesam adapter for
|
|
||||||
Recoll when the specification is stabilized, and could be
|
|
||||||
useful for other things, such as indexing contents from an
|
|
||||||
RDBMS (see
|
|
||||||
<a href="usermanual/usermanual.html#RCL.PROGRAM.PYTHONAPI">
|
|
||||||
the manual</a> for details). Restructured and cleaned up
|
|
||||||
internal Recoll interfaces.</li>
|
|
||||||
|
|
||||||
<li>Improved filter framework. Can now process either html or text output
|
|
||||||
from the filters, and more easily execute "raw" commands instead of
|
|
||||||
Recoll scripts. Avoided wasteful repeated execution of filters for
|
|
||||||
which the helper application is missing.</li>
|
|
||||||
|
|
||||||
<li>Query language now closer to Xesam specification, (but
|
|
||||||
still far from a
|
|
||||||
complete implementation). See the Recoll manual and
|
|
||||||
<a href="http://www.xesam.org/main/XesamUserSearchLanguage">
|
|
||||||
http://www.xesam.org/main/XesamUserSearchLanguage</a> </li>
|
|
||||||
|
|
||||||
<li>Much improved configuration for fields. Fields like
|
|
||||||
"author" can now be specified as storable (displayable in
|
|
||||||
results) and/or indexed (searchable). Added alias facility
|
|
||||||
for translating from user-level names to internal.</li>
|
|
||||||
|
|
||||||
<li>Added "recipient" as an indexed/searchable field for emails.</li>
|
|
||||||
|
|
||||||
<li>rcltext filter for processing text such as C code for which no specific
|
|
||||||
processing is needed when indexing but a specific viewer is
|
|
||||||
desired.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.10.6">1.10.6</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li>Fix a simple and mildly nasty bug that would cause the
|
|
||||||
indexer to stop
|
|
||||||
indexing an mbox on encountering a specific but not exceptional error
|
|
||||||
condition (like a few dozen errors while indexing attachments for which
|
|
||||||
no filter was installed).</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
|
|
||||||
<h2><a name="1.10.5">1.10.5</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li>Ensure that file names indexed as terms don't overflow the maximum term
|
|
||||||
size.</li>
|
|
||||||
|
|
||||||
<li> Handle non-standard date format in mbox separator lines sometimes
|
|
||||||
generated by thunderbird.
|
|
||||||
|
|
||||||
<li> Use attachment file names to help identify a better mime type for
|
|
||||||
parts only described as application/octet-stream
|
|
||||||
|
|
||||||
<li> For Phrase/Near searches, highlight all term groups in preview, not just
|
|
||||||
the first
|
|
||||||
|
|
||||||
<li> Added Open XML filters
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.10.2">1.10.2</a></h2>
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>Fixed openSuse 11 compile issues.
|
|
||||||
|
|
||||||
<li>Fixed bug in interpreting email mime structure, which resulted in base-64
|
|
||||||
decoding errors.
|
|
||||||
|
|
||||||
<li>Fixed "Prev" button in preview window. Would actually go forward when
|
|
||||||
walking the search terms.
|
|
||||||
|
|
||||||
<li> Allow setting the highlight color for search terms in result list and
|
|
||||||
preview (yes: feature change, should have waited for major release...)
|
|
||||||
|
|
||||||
<li> Added svg filter
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.10.1">1.10.1</a></h2>
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li> Ensure that in case the data of a file can't be indexed because of some
|
|
||||||
error, at least the file name is indexed.
|
|
||||||
|
|
||||||
<li> Improve query language to support OR queries of terms with field
|
|
||||||
specifications (ie: title:someterm OR author:someauthor).
|
|
||||||
|
|
||||||
<li> Fix filename search to split patterns on white space, so that
|
|
||||||
a "*.jpg *.jpeg" search does what's expected. Means you now need to use
|
|
||||||
double-quotes if there is actual embedded white space.
|
|
||||||
|
|
||||||
<li> Jump directly to the external editor choice dialog instead of opening
|
|
||||||
preferences when an external viewer is not found.
|
|
||||||
|
|
||||||
<li> Allow stopping indexing through menu action (only works with qt4 for now).
|
|
||||||
|
|
||||||
<li> Create an "indexedmimetypes" configuration variable to allow explicitly
|
|
||||||
restricting the file types which do get indexed.
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.10.0">1.10.0</a></h2>
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li> Added a GUI dialog to configure the indexing parameters.
|
|
||||||
|
|
||||||
<li> Added better support for indexing CJK text (Chinese, Japanese, Korean).
|
|
||||||
Please note that:
|
|
||||||
- You will need a full reindex to take good advantage of this. (You
|
|
||||||
*don't* need to reindex if you don't need to search CJK, even if there
|
|
||||||
is some in your index).
|
|
||||||
- When entering CJK search terms, words (single or multiple characters)
|
|
||||||
should be separated with white space.
|
|
||||||
- The specific CJK processing can be turned off by setting the nocjk
|
|
||||||
variable to true in the configuration file (this may make sense if you
|
|
||||||
have a mixed cjk/other document base and don't want to index the cjk
|
|
||||||
part, as it will save some disk space and a minuscule amount of cpu).
|
|
||||||
|
|
||||||
<li> Changed the way Recoll handles searches including composite words (like
|
|
||||||
an email address). The new approach looks saner, but could have
|
|
||||||
side-effects, please report any problems in this area.
|
|
||||||
|
|
||||||
<li> The query language got a new "dir:" specifier to filter results on location.
|
|
||||||
|
|
||||||
<li> New rclimg perl filter for better indexing of picture tags, thanks to
|
|
||||||
Cedric Scott. This depends on Exiftool.
|
|
||||||
<li> New rcltex filter.
|
|
||||||
|
|
||||||
<li> Changed and improved how the preview window local search finds the
|
|
||||||
query terms, this does not involve weird characters any more. The
|
|
||||||
display is cleaner and cut and paste works better.
|
|
||||||
|
|
||||||
<li> Fixed the fact that a newline-separated word list in simple search would
|
|
||||||
wrongly trigger a phrase search.
|
|
||||||
|
|
||||||
<li> Fixed the way we input text to the preview textedit (the old way would
|
|
||||||
sometimes confuse the window into displaying tags instead of acting on
|
|
||||||
them).
|
|
||||||
|
|
||||||
<li> Fixed transcoding to utf-8 for text/plain email attachments
|
|
||||||
|
|
||||||
<li> Improved mbox From_ line detection
|
|
||||||
|
|
||||||
<li> Added indexedmimetypes variables to allow restricting the list of indexed
|
|
||||||
mime types.
|
|
||||||
|
|
||||||
<li> KDE kicker applet: start a recoll search from the panel and get a
|
|
||||||
Recoll window. This is a clone from the find_applet, originally meant to
|
|
||||||
start a Tracker search. Not so useful presently because it will start a
|
|
||||||
new Recoll instance for every search. Not part of the main source (the
|
|
||||||
configure script is a whopping 1MB...), linked from the download page.
|
|
||||||
<li> Added recoll command line options to define a query and execute it
|
|
||||||
immediately when the program starts. This is used in practice from the
|
|
||||||
applet and could be used from other programs. There is also a new
|
|
||||||
option to not start the GUI and print the results to stdout.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.9.0">1.9.0</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Incompatible change: the icon image reference is now part of the result
|
|
||||||
list paragraph format string:
|
|
||||||
- If you had a standard config, you need do nothing.
|
|
||||||
- If you had a custom format string, you need to add
|
|
||||||
&lt;img src="%I" align="left"&gt; at its beginning to get the same result as
|
|
||||||
before.
|
|
||||||
- If you had unchecked the "show icons" option, you need to remove the
|
|
||||||
above string from the paragraph format to make the icons go away.
|
|
||||||
Changes to the format string are performed in the
|
|
||||||
"Preferences->Query Configuration->User Interface" dialog tab.
|
|
||||||
|
|
||||||
<li> New filters: wordperfect, abiword and kword, rcljpeg, rclflac, rclogg
|
|
||||||
(contributed filters). The jpeg and audio filters should be extended to
|
|
||||||
make use of the new field indexing/search capability (hint :) )
|
|
||||||
|
|
||||||
<li> When searching for an empty string inside the preview window, position
|
|
||||||
the window to the next occurrence of a primary search term.
|
|
||||||
|
|
||||||
<li> Added ext: and mime: selectors to the query language.
|
|
||||||
|
|
||||||
<li> Added an adjustable flush threshold during indexing: should help control
|
|
||||||
memory usage. See the idxflushmb configuration variable.
|
|
||||||
|
|
||||||
<li> Added a check for file system free space. Indexing will stop if the
|
|
||||||
threshold is reached. See the maxfsoccuppc configuration parameter.
|
|
||||||
|
|
||||||
<li> Added 'followLinks' configuration option to have the indexer follow
|
|
||||||
symbolic links while walking the tree (the default is false).
|
|
||||||
|
|
||||||
<li> Allow symbolic links as 'topdirs' members. These are always followed.
|
|
||||||
|
|
||||||
<li> Add preference option to remember sort tool state between program
|
|
||||||
invocations (it is reset to inactive by default)
|
|
||||||
|
|
||||||
<li> Added File menu entry to erase document history.
|
|
||||||
|
|
||||||
<li> Bound the space and backspace keys to PgUp/PgDown in preview.
|
|
||||||
|
|
||||||
<li> (Hopefully) Improved abstract (keyword in context) generation
|
|
||||||
|
|
||||||
<li> Added support for arbitrary fields. Filters can now produce any number of
|
|
||||||
fields which will be selectively searchable through the query
|
|
||||||
language. This could be useful, for example, for the mp3 and jpeg filters
|
|
||||||
(but it is not currently used).
|
|
||||||
|
|
||||||
<li> Improved qt4 build: no more need for --enable-qt4. Note: the qt4 build
|
|
||||||
still needs the qt3 support library.
|
|
||||||
|
|
||||||
<li> Changed the icon to an ugly one. The previous one was nicer but looked
|
|
||||||
too much like Xapian's.
|
|
||||||
|
|
||||||
<li> Added some kind of support for a stopword list.
|
|
||||||
|
|
||||||
<li> Have email attachments inherit date and author from their parent message
|
|
||||||
(instead of mail folder).
|
|
||||||
|
|
||||||
<li> Fix bus error on rclmon exit
|
|
||||||
|
|
||||||
<li> Better handling of aspell errors inside rclmon
|
|
||||||
|
|
||||||
<li> Fixed a number of qt4 glitches: selection and keyboard shortcuts.
|
|
||||||
|
|
||||||
<li> New query configuration parameter to set the maximum text size beyond
|
|
||||||
which text won't be highlighted before preview (takes too much time). This
|
|
||||||
was a fixed value in 1.8.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.8.2">1.8.2 2007-05-19</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fixed method name for compatibility with xapian 1.0.0
|
|
||||||
<li> Add .beagle to default list of skipped names (avoids indexing beagle
|
|
||||||
document cache...)
|
|
||||||
<li> Fix configure.ac to use $libdir instead of /usr/lib
|
|
||||||
<li> Fix recollinstall to properly copy translations and pictures for qt4
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.8.1">1.8.1 2007-02-20</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Add a small query language with some field-based searches (author, title,
|
|
||||||
etc.)
|
|
||||||
<li> Add wildcard handling everywhere. *, ?, [] can be used in any
|
|
||||||
search. Warning: using a wild card at the left of a term can make
|
|
||||||
for a very slow search.
|
|
||||||
<li> Allow skipping specific paths during indexing (in addition to file name
|
|
||||||
patterns)
|
|
||||||
<li> Improved external index choice dialog, accessible from the top-level menu.
|
|
||||||
<li> Many small bugs fixed: stemming language choice ignored in term explorer,
|
|
||||||
qt4 preview window reentrancy crashes, issues with saving the default
|
|
||||||
advanced search file, type filter, display more clearly missing helper
|
|
||||||
errors, etc.
|
|
||||||
<li> Option to use the desktop defaults (with xdg-open) to choose the native
|
|
||||||
viewer for files (instead of recoll's mimeview).
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.7.6">1.7.6 2007-01-30</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fixes an issue with the openoffice filter on debian systems.
|
|
||||||
<li> Adds Scribus and Lyx filters.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.7.5">1.7.5 2007-01-15</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fixes two email indexing bugs in 1.7.3, which would bail out from an
|
|
||||||
mbox folder on the first attachment filtering error, and would decline
|
|
||||||
to handle multipart/signed bodies. You may need to run a full indexing
|
|
||||||
pass (recollindex -z), to force reindexing of old folders.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.7.3">1.7.3 2007-01-09</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Email attachments are now indexed.
|
|
||||||
<li> Right-click menu option to access the parent document of an embedded
|
|
||||||
result (ie from mail attachment to parent message), or the parent folder
|
|
||||||
of a given file (which is opened with the application configured for
|
|
||||||
directories)
|
|
||||||
<li> The sort tool has been improved: no need to restart the query after sort
|
|
||||||
criteria change.
|
|
||||||
<li> Support for real-time indexing with inotify is now enabled by default
|
|
||||||
when appropriate.
|
|
||||||
<li> Recoll now warns when the configured native viewer can not be found and
|
|
||||||
starts an interface for choosing another one.
|
|
||||||
<li> Categories (text, presentation, spreadsheets, etc.) can be used instead
|
|
||||||
of raw mime types when filtering on file types in advanced search.
|
|
||||||
<li> The port to qt4 is functional and can be enabled with configure --enable-qt4
|
|
||||||
<li> 'autophrase' option improved and may now actually be useful.
|
|
||||||
<li> Improved highlighting (again...)
|
|
||||||
<li> Display term frequencies in term explorer.
|
|
||||||
<li> Recollindex -e to remove data from index for listed files.
|
|
||||||
<li> Directory names now indexed. Directories can be 'edited' with the
|
|
||||||
configured application (rox by default)
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.6.3">1.6.3</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fixed problem with bad detection of mbox message boundaries.
|
|
||||||
Upgrading can change the message numbering in some cases, and you should
|
|
||||||
perform a full index update (recollindex -z) after installing
|
|
||||||
the new version.
|
|
||||||
<li> Fixed problem with execution of external viewer for files with
|
|
||||||
single-quotes in the name.
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.6.2">1.6.2</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Minor solaris compilation glitches only.
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.6.1">1.6.1</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Term explorer: a multimode wildcard-regexp-spell/phonetic tool to search
|
|
||||||
the index for terms. This uses aspell for the orthographic/phonetic part.
|
|
||||||
<li> A more dynamic advanced search window. You now have a choice of the top
|
|
||||||
level conjunction (OR/AND) and of any number of clauses, including NEAR
|
|
||||||
and PHRASE clauses with an adjustable proximity parameter.
|
|
||||||
<li> User-settable format for the result-list entries, which use an HTML
|
|
||||||
string with %xx printf-like replacements (accessible from the user
|
|
||||||
preferences).
|
|
||||||
<li> Real time monitoring/indexing support. This is not configured by
|
|
||||||
default, and must be specified at build time (configure --help).
|
|
||||||
<li> Improved phrase/group highlighting in abstracts and preview
|
|
||||||
<li> Better sample selection for synthetic abstracts.
|
|
||||||
<li> Improved performance of the text splitter, good for indexing and previewing.
|
|
||||||
<li> Shift+click link to open new preview window instead of tab in existing
|
|
||||||
window.
|
|
||||||
<li> The key sequence for term completion in the simple search entry was
|
|
||||||
changed from CTRL+TAB to "Escape Space" to avoid interaction with window
|
|
||||||
managers.
|
|
||||||
<li> Improved recall for phrases with composite words like email addresses.
|
|
||||||
|
|
||||||
|
|
||||||
Updating from 1.2 to 1.3 or 1.4 or 1.5:
|
|
||||||
<li>--------------------------------------
|
|
||||||
From version 1.3 up, there is a new feature to search specifically for file
|
|
||||||
names (with wildcard processing). If you want to take full advantage of
|
|
||||||
this, you should perform a full reindex after installing the new version
|
|
||||||
(ie: use recollindex -z, or delete ~/.recoll/xapiandb).
|
|
||||||
Also, we now use the central copies of configuration files for default
|
|
||||||
values, and the user ones only for overrides. Your old configuration files
|
|
||||||
will still work, but, you may want to remove them if they are unmodified,
|
|
||||||
or keep only the modified parameters.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.5.9 ">1.5.9 </a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fix bad timezone conversion in email dates. Display timezone in result
|
|
||||||
list dates.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.5.8">1.5.8</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fix stored and displayed dates which used to come from the file's ctime,
|
|
||||||
now use mtime (which was already used for deciding re-indexing).
|
|
||||||
<li> Fix problem with some weird MIME messages (with null boundaries) which
|
|
||||||
crashed the indexer.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.5.6">1.5.6</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Small fixes dealing with the build process or compiler issues.
|
|
||||||
1.5.6 has updated Ukrainian and Russian messages.
|
|
||||||
Otherwise no functional changes, and no need to upgrade from 1.5.1
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.5.1">1.5.1</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fix serious bug with non ascii strings in simple search history
|
|
||||||
<li> Improve synthetic abstracts: remove size limitations, handle overlapping
|
|
||||||
extracts, avoid printing several terms from the same position.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.5.0">1.5.0 2006-09-20</a></h2>
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li> Added support for powerpoint and excel files, with the catdoc package.
|
|
||||||
<li> Allow viewing consecutive documents from the result list inside a single
|
|
||||||
preview window using the shift-arrow-up and shift-arrow-down keys.
|
|
||||||
<li> Colorize search terms in abstracts in the result list.
|
|
||||||
<li> A number of elements are now remembered between program invocations:
|
|
||||||
sort criteria, list of ignored file types (always starts inactive),
|
|
||||||
subtree restriction, better handling of the recent searches listbox, the
|
|
||||||
buildAbstract and replaceAbstract settings are not forgotten any more.
|
|
||||||
<li> New option to automatically add a phrase to simple searches.
|
|
||||||
<li> Possibility to adjust the length and context width for synthetic abstracts.
|
|
||||||
<li> Handle weird html better.
|
|
||||||
<li> When indexing mail messages, walk the full mime tree instead of staying
|
|
||||||
at the top level, index all text parts and attachment file names.
|
|
||||||
<li> Add -c &lt;confdir&gt; option to recoll and recollindex to specify the
|
|
||||||
configuration directory on the command line
|
|
||||||
<li> Better synchronization between the active preview and the highlighted
|
|
||||||
paragraph inside the list
|
|
||||||
<li> Improved recall for some special cases of stemming.
|
|
||||||
<li> Much better handling of email dates, allowing better email sorting by
|
|
||||||
date (previously the message date was quite often the date when the file
|
|
||||||
was indexed).
|
|
||||||
<li> Store the external database lists in the configuration directory, not the
|
|
||||||
qt preferences.
|
|
||||||
<li> Ensure dialogs are sized according to font size
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.4.3">1.4.3 2006-05-07</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Multiple search databases.
|
|
||||||
<li> Optionally auto-search when a word is entered in the simple search
|
|
||||||
field.
|
|
||||||
<li> Show possible term completions in simple search by typing CTRL+TAB
|
|
||||||
<li> Add 'more like this' option to result list right-click menu, to look for
|
|
||||||
documents related to the current result.
|
|
||||||
<li> Double-click in preview or result list adds the selected word to the
|
|
||||||
simple search text field.
|
|
||||||
<li> The simple search text entry field is now a combobox and remembers
|
|
||||||
previous searches.
|
|
||||||
<li> Additional OR field in complex search.
|
|
||||||
<li> Improved indexing cancellability (interrupting recollindex or closing
|
|
||||||
recoll with an indexing thread active), and status reporting.
|
|
||||||
<li> Fixed filters to handle file paths with embedded spaces.
|
|
||||||
<li> Misc small bug and memory leaks fixes.
|
|
||||||
<li> More compact result list.
|
|
||||||
<li> Set mode 0700 on .recoll directory by default
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.3.3">1.3.3 2006-04-04</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Implement specific search on file names with wildcard
|
|
||||||
support. Indexing can optionally process all file names or only those
|
|
||||||
with mime types supported for normal indexing. UPDATING: you need a
|
|
||||||
full re-indexing to take advantage of this.
|
|
||||||
<li> Use links and a right-click popup menu to replace confusing use of
|
|
||||||
mouse clicks and double-clicks inside the result list.
|
|
||||||
<li> The 'example' configuration files are now used as default, and are not
|
|
||||||
copied any more to the user directory during installation. Overrides can
|
|
||||||
be set in the personal files for any value that the user wishes to
|
|
||||||
modify, with unchanged formats and file names (so that the files from
|
|
||||||
previous versions remain valid, but you may wish to trim them of values
|
|
||||||
that duplicate the central ones).
|
|
||||||
<li> Use NLS information (LC_CTYPE, LANG) to determine default charset when
|
|
||||||
possible.
|
|
||||||
<li> Mp3 file indexing, either filenames only or also id3 tags if id3info is
|
|
||||||
available. c/c++ ext edit. Use gnuclient instead of xemacs for text files.
|
|
||||||
<li> Russian and Ukrainian translations and many improvement ideas thanks to
|
|
||||||
Michael Shigorin.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.2.3">1.2.3 2006-03-03</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Added support for dvi (with dvips), and djvu (with DjVuLibre).
|
|
||||||
<li> Ensure that configure and make use the same qt version.
|
|
||||||
<li> Fix sorted sequence title display.
|
|
||||||
<li> Discriminate fatal errors and missing docs while loading a doc list.
|
|
||||||
<li> Improved and cleaned up way to position a preview on the first search term.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.2.2">1.2.2 2006-02-02</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fix minor compilation glitches (FreeBSD 4, QT 3.1, xapian-config problem)
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
<h2><a name="1.2.0">1.2.0 2006-02-01</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Improved preview loading: don't highlight very big documents (over 1Mb),
|
|
||||||
allow cancellation while loading.
|
|
||||||
<li> Abstracts generated in the result list by looking at search term
|
|
||||||
contexts. This can slow down result list display for big documents, and
|
|
||||||
can be turned off in the preferences menu.
|
|
||||||
<li> Wrap query detail line displayed when clicking on result list header.
|
|
||||||
<li> Text splitting cleanup with less spurious terms should result in
|
|
||||||
slightly smaller databases.
|
|
||||||
<li> Slightly improved presentation in preview, esp. line breaks.
|
|
||||||
<li> Color icons...
|
|
||||||
<li> Let the user select the html browser used for help display.
|
|
||||||
<li> autoconf/Makefile change: allow building UI from inside the qtgui
|
|
||||||
directory.
|
|
||||||
<li> autoconf/Makefile: improved search and diagnostics for qt/qmake.
|
|
||||||
<li> Internal code cleanup for maintainability: text splitting, user
|
|
||||||
interface.
|
|
||||||
<li> Added prototype kio_slave to show result inside Konqueror, doesn't seem
|
|
||||||
particularly useful.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.1.0">1.1.0 2006-01-12</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> A much better user manual, which can be browsed from the help menu.
|
|
||||||
<li> man pages for recoll, recollindex, recoll.conf
|
|
||||||
<li> User/query interface configuration dialog.
|
|
||||||
<li> Click on result list header will display the exact boolean search which
|
|
||||||
was used.
|
|
||||||
<li> recollindex can be used to create stem expansion databases independently
|
|
||||||
of a full indexing pass.
|
|
||||||
<li> Misc user interface improvements, like an 'all terms' checkbox for
|
|
||||||
simple search.
|
|
||||||
<li> Fixed case-insensitivity issues. Probably needs more testing.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.0.16">1.0.16 2006-01-05</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Minor installation tweaks for rpm compatibility
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.0.15 ">1.0.15 </a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fix problems with prefix != /usr/local
|
|
||||||
<li> Remove '.*' from the default list of ignored file/dir names: this
|
|
||||||
prevented mozilla/thunderbird mail indexing.
|
|
||||||
<li> Fix some 64 bits issues
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.0.14">1.0.14</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Small changes for FreeBSD 4 compilation.
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.0.13">1.0.13</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Install of recollinstall program not done or needed any more.
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.0.12">1.0.12</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fixed nasty html parsing bug introduced in 1.0.9. Html parsing failed
|
|
||||||
whenever the document charset name differed from the default only in
|
|
||||||
character case or punctuation.
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.0.11">1.0.11</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Create personal configuration on first start.
|
|
||||||
<li> Use qt toolbars.
|
|
||||||
<li> Also index terms in file paths.
|
|
||||||
<li> Tool for sorting on dates or mime types.
|
|
||||||
<li> Fixed pdf filter which was broken by more recent xpdf
|
|
||||||
<li> Filters now installed/executed from /usr/local
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.0.10">1.0.10</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Added tool to manage the history of consulted documents.
|
|
||||||
<li> Try harder to convert email messages with wrongly declared charsets.
|
|
||||||
<li> Add option to reset the database before indexing (easier than rm -rf).
|
|
||||||
<li> Small gui improvements.
|
|
||||||
<li> Install partial french translation as a tease for future translators...
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.0.9">1.0.9</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Fixed 2 really annoying bugs in 1.0.8: wouldn't preview 2nd document
|
|
||||||
from same file + spurious db close when filter could not be executed.
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="1.0.8">1.0.8</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li> Add support for rtf and gaim logs
|
|
||||||
<li> Optionally show icons to indicate mime types in result list
|
|
||||||
<li> Better (but imperfect) feedback during the preview
|
|
||||||
loading for big files
|
|
||||||
<li> Remember main window geometry when closing
|
|
||||||
<li> Fix stem expansion in advanced search
|
|
||||||
<li> Some autoconf
|
|
||||||
<li> Option to use the system's 'file' command as a final step of
|
|
||||||
identification for suffix-less or unknown files.
|
|
||||||
<li> Typo had removed support for .Z compression
|
|
||||||
<li>Use more appropriate conjunction operators when
|
|
||||||
computing the advanced search query (OP_AND_MAYBE,
|
|
||||||
OP_FILTER instead of OP_AND)
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@ -1,17 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
set -x
|
|
||||||
docdir=/home/dockes/projets/fulltext/recoll/src/doc/user/
|
|
||||||
|
|
||||||
#(cd $docdir;make) || exit 1
|
|
||||||
|
|
||||||
test -d usermanual || mkdir usermanual || exit 1
|
|
||||||
cd usermanual
|
|
||||||
|
|
||||||
thisdir=`pwd`
|
|
||||||
(cd $docdir; find . -name templates -prune -o -print | cpio -vudp $thisdir)
|
|
||||||
|
|
||||||
mv usermanual.pdf recoll_user_manual.pdf
|
|
||||||
# The freebsd tool chain generates a link to book.html in the index. Too
|
|
||||||
# lazy to check if this can be changed
|
|
||||||
cp -p usermanual.html book.html
|
|
||||||
#cp usermanual.html index.html
|
|
||||||
@ -1,80 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>RECOLL: credits</title>
|
|
||||||
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux
|
|
||||||
based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search, desktop search, unix, linux">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Home</a></li>
|
|
||||||
<li><a href="pics/index.html">Screenshots</a></li>
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
<li><a href="doc.html">User manual</a></li>
|
|
||||||
<li><a href="index.html#support">Support</a></li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h3><a name="credits">Credits</a></h3>
|
|
||||||
|
|
||||||
<p>First of all, many thanks to the users who provided criticism
|
|
||||||
and ideas to make <span class="application">Recoll</span> go
|
|
||||||
forward ! Please
|
|
||||||
<a href="mailto:jfd@recoll.org">
|
|
||||||
contact me</a> if you have something to suggest.</p>
|
|
||||||
|
|
||||||
<p><span class="application">Recoll</span> borrows
|
|
||||||
from the following projects. I tried to include the relevant
|
|
||||||
copyright attributions with the code. Any omission is
|
|
||||||
unintentional and will be fixed as soon as notified. </p>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li><a href="http://www.xapian.org">Xapian</a>: The database module
|
|
||||||
(core) is used unmodified, and quite a lot of code has been
|
|
||||||
borrowed from Omega, the web-based search application (ie:
|
|
||||||
the html parser, plus miscellaneous bits and ideas). </li>
|
|
||||||
<li><a href="http://estraier.sourceforge.net/">Estraier</a>:
|
|
||||||
Some of the input handlers still have bits of Estraier code
|
|
||||||
in them.</li>
|
|
||||||
<li><a href="http://www.senga.org/">Unac</a>: for accent
|
|
||||||
removal. This package is unmaintained and the (quite modified)
|
|
||||||
code is carried with the <span class="application">Recoll</span>
|
|
||||||
source.</li>
|
|
||||||
<li><a href="http://www.gnu.org/software/libiconv/">Iconv</a>, for
|
|
||||||
character set conversion.</li>
|
|
||||||
<li><a href="http://www.bincimap.org/">Binc IMAP</a> for MIME
|
|
||||||
parsing code. The original package is unmaintained and the
|
|
||||||
relevant code is carried with the <span
|
|
||||||
class="application">Recoll</span> source.</li>
|
|
||||||
<li>The icons mainly come from the <a
|
|
||||||
href="http://www.everaldo.com/">Crystal SVG</a> KDE set.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li>I fear that bugs found elsewhere are mostly mine:
|
|
||||||
<a href="mailto:jfd@recoll.org">jfd@recoll.org</a></li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
||||||
@ -1,630 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>RECOLL: result list customisation tips</title>
|
|
||||||
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux
|
|
||||||
based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search, desktop search, unix, linux">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
|
|
||||||
|
|
||||||
<style type="text/css">
|
|
||||||
/* Photo-Caption PZ3 CSS v080630
|
|
||||||
* copyright: http://randsco.com/copyright
|
|
||||||
* www.randsco.com
|
|
||||||
*/
|
|
||||||
|
|
||||||
.PZ3-l { float:left; margin-right:10px; }
|
|
||||||
.PZ3-r { float:right; margin-left:10px; direction:rtl; }
|
|
||||||
html>/**/body .PZ3-r { position:relative; }
|
|
||||||
|
|
||||||
.PZ3zoom { border:1px solid #369; }
|
|
||||||
.PZ3zoom a,.PZ3zoom a:visited { display:block;
|
|
||||||
padding:0; overflow:hidden; text-decoration:none;
|
|
||||||
height:100%; width:100%; }
|
|
||||||
html>/**/body .PZ3-r a { right:0; }
|
|
||||||
|
|
||||||
.PZ3zoom a:hover { position:absolute;
|
|
||||||
z-index:999; padding:0; background:none;
|
|
||||||
cursor:default; height:auto; width:auto;
|
|
||||||
overflow:visible; border:1px solid #369;
|
|
||||||
margin:-1px 0 0 -1px; }
|
|
||||||
html>body .PZ3zoom a:hover { margin:-1px -1px 0 -1px; }
|
|
||||||
|
|
||||||
.PZ3zoom a img { border:0; height:100%; width:100%; }
|
|
||||||
.PZ3zoom a:hover img { height:auto; width:auto;
|
|
||||||
border:0; }
|
|
||||||
|
|
||||||
a:hover .PZ3cap,
|
|
||||||
a:hover .PZ31cap { display:block;
|
|
||||||
direction:ltr; font:10pt verdana,sans-serif;
|
|
||||||
margin-top:-3px; background:#369; color:#fff;
|
|
||||||
text-align:left; }
|
|
||||||
a:hover .PZ3cap { padding:3px 5px; }
|
|
||||||
.PZ3inr { display:block; padding:2px 5px; }
|
|
||||||
|
|
||||||
.noCap a:hover .PZ3cap,
|
|
||||||
.noCap a:hover .PZ31cap { display:none; }
|
|
||||||
.noBdr,.noBdr a:hover { border:0; }
|
|
||||||
.Lnk a:hover { cursor:pointer; }
|
|
||||||
|
|
||||||
/* End Photo-Caption Zoom CSS */
|
|
||||||
</style>
|
|
||||||
|
|
||||||
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Home</a></li>
|
|
||||||
<li><a href="pics/index.html">Screenshots</a></li>
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
<li><a href="doc.html">User manual</a></li>
|
|
||||||
<li><a href="index.html#support">Support</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1>Recoll result list customising examples</h1>
|
|
||||||
|
|
||||||
<p>The Recoll result list is actually made of html text
|
|
||||||
displayed inside a Qt Widget. In all Recoll versions, you
|
|
||||||
can specify the format for the list entries: what data is
|
|
||||||
displayed for each hit document and how. This used to include
|
|
||||||
"almost full" support for HTML capabilities, with a few
|
|
||||||
restrictions due to the Qt QTextBrowser object. The details
|
|
||||||
are described in the
|
|
||||||
<a href="http://www.recoll.org/usermanual/usermanual.html#RCL.SEARCH.GUI.CUSTOM.RESLIST">
|
|
||||||
Recoll manual</a>.</p>
|
|
||||||
|
|
||||||
<p>As of Recoll 1.17, the result list is a WebKit object by
|
|
||||||
default (WebKit is the basis for several major browsers),
|
|
||||||
which yields full CSS and even Javascript support.</p>
|
|
||||||
|
|
||||||
<h2>New in Recoll 1.17: the WebKit result list</h2>
|
|
||||||
|
|
||||||
|
|
||||||
<p>For newer Recoll versions, you can specify the
|
|
||||||
individual result format, as for previous versions. You can
|
|
||||||
also define code to be included in the HTML
|
|
||||||
header (ie: CSS or Javascript), using
|
|
||||||
<tt>Preferences->Query Configuration->Result List->Edit result page html header insert</tt></p>
|
|
||||||
|
|
||||||
<p>This, plus the full Javascript and CSS support in WebKit,
|
|
||||||
opens a world of possibilities for result list formatting and
|
|
||||||
even behaviour.</p>
|
|
||||||
|
|
||||||
<p>The examples which follow are probably not generally
|
|
||||||
very useful but they show the kinds of things you can do, if
|
|
||||||
you are comfortable with Javascript/CSS, which is not the case for me.</p>
|
|
||||||
|
|
||||||
<h3>Using the icons as links</h3>
|
|
||||||
<p>You can now make the list icons links that activate the
|
|
||||||
preview or open action (or the document url which you can then
|
|
||||||
drag/drop to other windows). Using images as links did
|
|
||||||
not work with QTextBrowser.</p>
|
|
||||||
|
|
||||||
<h3>Alternating result backgrounds</h3>
|
|
||||||
<p>Using the following Javascript inside the header will yield
|
|
||||||
alternating backgrounds for the results:</p>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<script type="text/javascript">
|
|
||||||
function altRows() {
|
|
||||||
var rows = document.getElementsByClassName("rclresult");
|
|
||||||
for (i = 0; i < rows.length; i++) {
|
|
||||||
if (i % 2 == 0) {
|
|
||||||
rows[i].style.backgroundColor = "#d4e3e5";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
window.onload = function() {
|
|
||||||
altRows();
|
|
||||||
}
|
|
||||||
</script>
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
|
|
||||||
<h3>Zooming the paragraph font size</h3>
|
|
||||||
<p>If you are using a format with small fonts, it may be useful
|
|
||||||
to be able to zoom the text when the mouse hovers over it. A
|
|
||||||
very basic way to do this -<em>with the standard paragraph
|
|
||||||
format, which is a table</em>- would be to include the following
|
|
||||||
code in the header:</p>
|
|
||||||
<pre>
|
|
||||||
<style type="text/css">
|
|
||||||
table:hover {font-size: 130%;}
|
|
||||||
</style>
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<p>Of course, the selector should be adapted to your own
|
|
||||||
result format. You should know that every result will be
|
|
||||||
enclosed by Recoll inside a <tt><div
|
|
||||||
class="rclresult" rcldocnum="nn"></tt> element.</p>
|
|
||||||
|
|
||||||
<h3>Zooming the thumbnails</h3>
|
|
||||||
|
|
||||||
<p>Recoll 1.17 and newer will display document
|
|
||||||
thumbnails instead of the type icon if the thumbnail exists in
|
|
||||||
the standard Freedesktop location. The icons/thumbnails are
|
|
||||||
64x64 pixels in size, which is a bit small. The standard
|
|
||||||
thumbnail files are actually 128x128, which is much more
|
|
||||||
detailed. Using them statically would consume too much list
|
|
||||||
space though. Using CSS, you can get them to expand when the
|
|
||||||
mouse is over them. Recipe:</p>
|
|
||||||
|
|
||||||
<blockquote>
|
|
||||||
<p>Retrieve the CSS code
|
|
||||||
from <a href="http://randsco.com/_miscPgs/cssZoomPZ3.html">randsco
|
|
||||||
pure CSS photo-caption zoom</a>, and include it inside the
|
|
||||||
result list html header by using the "Edit result page html
|
|
||||||
header insert" from the GUI preferences. Don't forget to
|
|
||||||
enclose the CSS code between <code><style type="text/css">
|
|
||||||
</style></code> tags.</p>
|
|
||||||
|
|
||||||
<p>Use something like the following result paragraph format
|
|
||||||
(only the code around the img tag is relevant, the rest can be
|
|
||||||
what you want):</p>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<!--
|
|
||||||
<table><tr><td>
|
|
||||||
<div class="PZ3zoom PZ3-l noBdr noCap noLnk" style="width:64px;height:64px;">
|
|
||||||
<a href="%U"> <img src='%I' width='64'></a>
|
|
||||||
</div>
|
|
||||||
</td><td>
|
|
||||||
%R %S %L <b>%T</b><br>%M %D <i>%U</i> %i<br>%A %K
|
|
||||||
</td></tr></table>
|
|
||||||
-->
|
|
||||||
<table><tr><td>
|
|
||||||
|
|
||||||
<div class="PZ3zoom PZ3-l noBdr noCap noLnk" style="width:64px;height:64px;">
|
|
||||||
<a href="%U"> <img src='%I' width='64'></a>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
</td><td>
|
|
||||||
%R %S %L &nbsp;&nbsp;<b>%T</b><br>%M&nbsp;%D&nbsp;&nbsp;&nbsp;<i>%U</i>&nbsp;%i<br>%A %K
|
|
||||||
</td></tr></table>
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
</blockquote>
|
|
||||||
<div class="PZ3zoom PZ3-r noCap noLnk" style="width:100px;height:40px;">
|
|
||||||
<a href="resparpics/pz3.png" onclick="return false">
|
|
||||||
<img src="resparpics/pz3.png" alt="hover zoom" />
|
|
||||||
</a>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<p>Et voilà! The icons will grow to their full size when the mouse is
|
|
||||||
over them.</p>
|
|
||||||
|
|
||||||
<h2>Alternate icons theme</h2>
|
|
||||||
<p>There is an alternate set of icons
|
|
||||||
at <a href="http://kde-look.org/content/show.php?content=145669">
|
|
||||||
kde-look.org</a>. If you are running KDE desktop, it should
|
|
||||||
be more consistent with the rest of your applications.</p>
|
|
||||||
<p>You do not need to replace the standard Recoll set of icons
|
|
||||||
to use it, just extract it somewhere, and use
|
|
||||||
the <tt>iconsdir</tt> variable in <i>~/.recoll/recoll.conf</i> to
|
|
||||||
point Recoll to it. e.g.:
|
|
||||||
<blockquote><pre>
|
|
||||||
<tt>iconsdir = /path/to/my/icons</tt>
|
|
||||||
</pre></blockquote>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<h2>Result list paragraph format samples (for all versions)</h2>
|
|
||||||
|
|
||||||
<p>Here follow some sample formats. Most of them were contributed by
|
|
||||||
kind users, and I'll be happy to show their names if they so
|
|
||||||
wish (abstaining by default).</p>
|
|
||||||
|
|
||||||
<h3>Recoll 1.15 default</h3>
|
|
||||||
<pre>
|
|
||||||
|
|
||||||
<!--
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td><img src='%I'></td>
|
|
||||||
<td>%R %S %L <b>%T</b><br>
|
|
||||||
%M %D <i>%U</i><br>
|
|
||||||
%A %K
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
-->
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td><img src='%I'></td>
|
|
||||||
<td>%R %S %L&nbsp;&nbsp;<b>%T</b><br>
|
|
||||||
%M&nbsp;%D&nbsp;&nbsp;&nbsp;<i>%U</i><br>
|
|
||||||
%A %K
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<br clear="all">
|
|
||||||
<img src="resparpics/default.png"/>
|
|
||||||
|
|
||||||
<h3>Alternating bands, bigger previews, and custom paragraph
|
|
||||||
typesetting</h3>
|
|
||||||
|
|
||||||
<p>Paul, the author, gives the following description for his
|
|
||||||
result list formatting:
|
|
||||||
<blockquote>
|
|
||||||
It uses the "Alternating Results Background" from that page,
|
|
||||||
plus my own layout which incorporates a larger view of image
|
|
||||||
files. The 'large image' is scaled down from the actual
|
|
||||||
image, rather than a scaled up version of the thumbnail.
|
|
||||||
</blockquote>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<p>The header fragment has the javascript for
|
|
||||||
alternating backgrounds, and the CSS code:</p>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<!-- Custom Header -->
|
|
||||||
<script type="text/javascript">
|
|
||||||
function altRows() {
|
|
||||||
var rows = document.getElementsByClassName("rclresult");
|
|
||||||
for (i = 0; i < rows.length; i++) {
|
|
||||||
if (i % 2 == 0) {
|
|
||||||
rows[i].style.backgroundColor = "#f0f0f0";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
window.onload = function() {
|
|
||||||
altRows();
|
|
||||||
}
|
|
||||||
</script>
|
|
||||||
|
|
||||||
<style type="text/css">
|
|
||||||
.thumbnail {
|
|
||||||
display:block;
|
|
||||||
position:relative;
|
|
||||||
padding: 4px;
|
|
||||||
width: auto; /* set width of thumbnail image in 'paragraph' code - not here */
|
|
||||||
border:none;
|
|
||||||
z-index:0;
|
|
||||||
}
|
|
||||||
.thumbnail:hover {
|
|
||||||
border:none;
|
|
||||||
background-color: transparent;
|
|
||||||
z-index: 50;
|
|
||||||
}
|
|
||||||
.thumbnail span {
|
|
||||||
position: absolute;
|
|
||||||
left: -9999px;
|
|
||||||
visibility: hidden;
|
|
||||||
}
|
|
||||||
.thumbnail span img {
|
|
||||||
max-width:256px; /* set 'large image' max width/height - advise keeping these */
|
|
||||||
max-height:256px; /* the same to avoid inadvertently changing the aspect ratio */
|
|
||||||
width:auto; /* leave set to auto */
|
|
||||||
height:auto; /* leave set to auto */
|
|
||||||
background-color: gray;
|
|
||||||
padding: 1px;
|
|
||||||
border: 1px solid black;
|
|
||||||
}
|
|
||||||
.thumbnail:hover span {
|
|
||||||
visibility: visible;
|
|
||||||
top: 4px; /* top/left positions 'large image' relative to top left */
|
|
||||||
left: 88px; /* of parent thumbnail (plus padding) */
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
<!-- End of Custom Header -->
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<p>And the paragraph format:</p>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<!-- Custom Paragraph -->
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td>
|
|
||||||
<a class="thumbnail" href="#">
|
|
||||||
<img src="%I" width="64px" height="auto"> <!-- set width of thumbnail -->
|
|
||||||
<span>
|
|
||||||
<img src="%U">
|
|
||||||
</span>
|
|
||||||
</a>
|
|
||||||
<td>
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td>
|
|
||||||
<div>
|
|
||||||
<b>%T</b></br>
|
|
||||||
%L</br>
|
|
||||||
<p><font color="grey">%A </font><font color="#CD6688"><i>%K</i></font></p>
|
|
||||||
<font color="green"><font size=1>
|
|
||||||
%U</br>
|
|
||||||
%R — %S—%D — %M
|
|
||||||
</font></font></br>
|
|
||||||
</div>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
<!-- End Custom Paragraph -->
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<p>Result:</p>
|
|
||||||
<br clear="all">
|
|
||||||
<img src="resparpics/pip.png"/>
|
|
||||||
|
|
||||||
<h3>A simpler format, suggested in Bitbucket issue #69</h3>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<!--
|
|
||||||
<img src="%I" align="left">%R %L <b>%T</b><br>
|
|
||||||
<i><font color="#808080">%U</font></i> %i<br>
|
|
||||||
%A %K
|
|
||||||
-->
|
|
||||||
<img src="%I" align="left">%R %L&nbsp;&nbsp;<b>%T</b><br>
|
|
||||||
&nbsp;&nbsp;<i><font color="#808080">%U</font></i>&nbsp;%i<br>
|
|
||||||
%A %K
|
|
||||||
</pre>
|
|
||||||
<br clear="all">
|
|
||||||
<img src="resparpics/issue73.png"/>
|
|
||||||
|
|
||||||
|
|
||||||
<h3>Simple+table</h3>
|
|
||||||
|
|
||||||
<p>Same format, but using a table to avoid text flowing into the icon
|
|
||||||
area.</p>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<!--
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td><img src="%I" align="left"></td>
|
|
||||||
<td>%R %L <b>%T</b><br>
|
|
||||||
<i><font color="#808080">%U</font></i> %i<br>
|
|
||||||
%A %K
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
-->
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td><img src="%I" align="left"></td>
|
|
||||||
<td>%R %L&nbsp;&nbsp;<b>%T</b><br>
|
|
||||||
&nbsp;&nbsp;<i><font color="#808080">%U</font></i>&nbsp;%i<br>
|
|
||||||
%A %K
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<br clear="all">
|
|
||||||
<img src="resparpics/issue73+table.png"/>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<h3>Using a small font to make the size/date details less obtrusive</h3>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<!--
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td><img src="%I" align="left"></td>
|
|
||||||
<td><table bgcolor="#bababa">
|
|
||||||
<tr><td><div>
|
|
||||||
<font face="Tahoma, sans-serif"><u><b><a href="P%N">%T</a></b></u><br>
|
|
||||||
<font color=#008000>%L</font><br>
|
|
||||||
<font color=#510101>%A %K</font><br>
|
|
||||||
<font color=#0100FF>%U</font>
|
|
||||||
<p align="right"><font size=1><font color=#000000>%S
|
|
||||||
- %D
|
|
||||||
- %M</font></p>
|
|
||||||
</div></td></tr>
|
|
||||||
</table></td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
-->
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td><img src="%I" align="left"></td>
|
|
||||||
<td><table bgcolor="#bababa">
|
|
||||||
<tr><td><div>
|
|
||||||
<font face="Tahoma, sans-serif"><u><b><a href="P%N">%T</a></b></u><br>
|
|
||||||
<font color=#008000>%L</font><br>
|
|
||||||
<font color=#510101>%A %K</font><br>
|
|
||||||
<font color=#0100FF>%U</font>
|
|
||||||
<p align="right"><font size=1><font color=#000000>%S
|
|
||||||
&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp; %D
|
|
||||||
&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp; %M</font></p>
|
|
||||||
</div></td></tr>
|
|
||||||
</table></td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<br clear="all">
|
|
||||||
<img src="resparpics/detailSmallGreyTable.png"/>
|
|
||||||
|
|
||||||
|
|
||||||
<h3>A very structured table</h3>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<!--
|
|
||||||
<table border="1" bgcolor="lightyellow">
|
|
||||||
<tr>
|
|
||||||
<td rowspan="4" width="40px" align="center" valign="center">
|
|
||||||
<img src="%I" width="32" height="32">
|
|
||||||
<p><b>%R</b></p>
|
|
||||||
<p><a href="P%N">Aperçu</a></p>
|
|
||||||
</td>
|
|
||||||
<th colspan="3" bgcolor="lightgrey">%T</th>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td align="center">%M</td>
|
|
||||||
<td align="center">%D</td>
|
|
||||||
<td align="center">%S</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td colspan="3"><a href="E%N">%U</a></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td colspan="3">%A</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
-->
|
|
||||||
<table border="1" bgcolor="lightyellow">
|
|
||||||
<tr>
|
|
||||||
<td rowspan="4" width="40px" align="center" valign="center">
|
|
||||||
<img src="%I" width="32" height="32">
|
|
||||||
<p><b>%R</b></p>
|
|
||||||
<p><a href="P%N">Aperçu</a></p>
|
|
||||||
</td>
|
|
||||||
<th colspan="3" bgcolor="lightgrey">%T</th>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td align="center">%M</td>
|
|
||||||
<td align="center">%D</td>
|
|
||||||
<td align="center">%S</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td colspan="3"><a href="E%N">%U</a></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td colspan="3">%A</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
</pre>
|
|
||||||
<br clear="all">
|
|
||||||
<img src="resparpics/structuredTable.png"/>
|
|
||||||
|
|
||||||
|
|
||||||
<h3>Web-like from the user manual</h3>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<!--
|
|
||||||
<u><b><a href="P%N">%T</a></b></u><br>
|
|
||||||
%U<br>
|
|
||||||
%A <font color=#008000>%S</font> - <a href="E%N">Edit</a>
|
|
||||||
-->
|
|
||||||
<u><b><a href="P%N">%T</a></b></u><br>
|
|
||||||
%U<br>
|
|
||||||
%A <font color=#008000>%S</font> - <a href="E%N">Edit</a>
|
|
||||||
</pre>
|
|
||||||
<br clear="all">
|
|
||||||
<img src="resparpics/weblike.png"/>
|
|
||||||
|
|
||||||
|
|
||||||
<h3>Clean-Looking from the user manual</h3>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<!--
|
|
||||||
<table>
|
|
||||||
<tr><td><img src="%I" align="left"></td>
|
|
||||||
<td>%L <font color="#900000">%R</font> <b>%T</b><br>
|
|
||||||
%S <font color="#808080"><i>%U</i></font>
|
|
||||||
<table bgcolor="#e0e0e0">
|
|
||||||
<tr><td><div>%A</div> %K </td></tr>
|
|
||||||
</table></td>
|
|
||||||
</table>
|
|
||||||
-->
|
|
||||||
<table>
|
|
||||||
<tr><td><img src="%I" align="left"></td>
|
|
||||||
<td>%L <font color="#900000">%R</font> <b>%T</b><br>
|
|
||||||
%S <font color="#808080"><i>%U</i></font>
|
|
||||||
<table bgcolor="#e0e0e0">
|
|
||||||
<tr><td><div>%A</div> %K </td></tr>
|
|
||||||
</table></td>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
</pre>
|
|
||||||
<br clear="all">
|
|
||||||
<img src="resparpics/clean.png"/>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<h3>Another clean and nice one, using both a bit of header code and a
|
|
||||||
custom paragraph format</h3>
|
|
||||||
|
|
||||||
<p>This one also uses the custom icons set from
|
|
||||||
<a href="http://kde-look.org/content/show.php?content=145669">
|
|
||||||
this kde-look page</a>.</p>
|
|
||||||
|
|
||||||
<p>The header code:</p>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<style type="text/css">
|
|
||||||
body {
|
|
||||||
color: rgb(0, 0, 0);
|
|
||||||
background-color: rgb(224, 224, 224);
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<p>The paragraph code:</p>
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<table style="background-color: white; width: 950px;"
|
|
||||||
border-style="none" border-color:="" border="0">
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td rowspan="4"
|
|
||||||
style="width: 68px; text-align: center; background-color: rgb(238, 238, 238);">
|
|
||||||
<img src="%I" height="32" width="32">
|
|
||||||
<p style="font-family: sans-serif;"><b>%R</b></p>
|
|
||||||
<p style="font-family: sans-serif; color: rgb(0, 153, 0);"><br>
|
|
||||||
</p>
|
|
||||||
</td>
|
|
||||||
<td style="vertical-align: top;"><br>
|
|
||||||
</td>
|
|
||||||
<th
|
|
||||||
style="font-family: sans-serif; background-color: white; text-align: left;"
|
|
||||||
colspan="3" bgcolor="lightgrey">%T</th>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td style="vertical-align: top; width: 11px;"><br>
|
|
||||||
</td>
|
|
||||||
<td
|
|
||||||
style="text-align: center; font-family: sans-serif; background-color: rgb(249, 249, 249);">%M</td>
|
|
||||||
<td
|
|
||||||
style="text-align: center; font-family: sans-serif; background-color: rgb(249, 249, 249);">%D</td>
|
|
||||||
<td
|
|
||||||
style="font-family: sans-serif; text-align: right; background-color: rgb(249, 249, 249);">%S</td>
|
|
||||||
</tr>
|
|
||||||
<tr style="font-family: sans-serif; color: rgb(0, 153, 0);">
|
|
||||||
<td style="vertical-align: top;"><br>
|
|
||||||
</td>
|
|
||||||
<td colspan="3"><a href="E%N">%U</a></td>
|
|
||||||
</tr>
|
|
||||||
<tr style="font-family: sans-serif;" 8="">
|
|
||||||
<td style="vertical-align: top;"><br>
|
|
||||||
</td>
|
|
||||||
<td colspan="3">%A</td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<br clear="all">
|
|
||||||
<img src="resparpics/christopher.png"/>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
||||||
@ -1,109 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>RECOLL: a personal text search system for
|
|
||||||
Unix/Linux</title>
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content=
|
|
||||||
"text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Home</a></li>
|
|
||||||
<li><a href="pics/index.html">Screenshots</a></li>
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
<li><a href="doc.html">Documentation</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1>Contributing to Recoll development and availability</h1>
|
|
||||||
|
|
||||||
<p>If you are not a software developer, or have no time
|
|
||||||
available for testing the application or thinking about how it
|
|
||||||
could be improved, there is always the possibility of
|
|
||||||
contributing a donation, which will be much appreciated !<br/>
|
|
||||||
<a href="/donations/index.html">
|
|
||||||
<img src="/donations/btn_donate_LG.gif" /></a>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<p>If you wish to become involved in the development of <span
|
|
||||||
class="application">Recoll</span>, please send me an <a
|
|
||||||
href="mailto:jfd@recoll.org">email</a>.</p>
|
|
||||||
|
|
||||||
<h1><a name="translation">Translation</a></h1>
|
|
||||||
|
|
||||||
<p>More translations are always welcome! If you are a non-English speaker
|
|
||||||
(and understand English, which can probably be assumed since you
|
|
||||||
are reading this), you can take a little time to translate
|
|
||||||
the GUI messages file.</p>
|
|
||||||
<p>The newest versions of the message files can be found
|
|
||||||
in <a href="translations">this directory</a>. There
|
|
||||||
is an empty one (the xx one), the others are partially
|
|
||||||
translated, just needing an update for the new messages.</p>
|
|
||||||
<p>Updating the files can easily be done with
|
|
||||||
the <span class="application">Qt Linguist</span>. Contact me
|
|
||||||
for more directions if needed.</p>
|
|
||||||
|
|
||||||
<h1><a name="development">Development</a></h1>
|
|
||||||
|
|
||||||
<p>The Recoll source repository is on
|
|
||||||
<a href="https://opensourceprojects.eu/p/recoll1/code/">opensourceprojects.eu</a>.
|
|
||||||
Use git to clone it and hack away.</p>
|
|
||||||
|
|
||||||
<p>Apart from possible tickets in the
|
|
||||||
<a href="https://opensourceprojects.eu/p/recoll1/tickets/">tracking
|
|
||||||
system</a>, these are the general areas where help or
|
|
||||||
ideas are particularly welcome:</p>
|
|
||||||
<ul>
|
|
||||||
<li>A better GUI design (both the ergonomics and the
|
|
||||||
appearance). Adding missing shortcuts or fixing the menu
|
|
||||||
accelerators for example is easy and useful.</li>
|
|
||||||
|
|
||||||
<li>More support for the more advanced <span class=
|
|
||||||
"application">Xapian</span> concepts like relevance
|
|
||||||
feedback.</li>
|
|
||||||
|
|
||||||
<li>More filters for less common or less obviously
|
|
||||||
useful file types.</li>
|
|
||||||
|
|
||||||
<li>Integration with the <span class="application">KDE</span>
|
|
||||||
desktop.</li>
|
|
||||||
|
|
||||||
<li>Integration with some mail user agent. We need a way to
|
|
||||||
jump from a message preview to the message in thread context
|
|
||||||
inside the MUA.</li>
|
|
||||||
|
|
||||||
<li>Etc. :)</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h1><a name="problemreport">Problem reporting</a></h1>
|
|
||||||
|
|
||||||
<p>Once in a while it will happen that a Recoll program will
|
|
||||||
crash (either the "recoll" graphical interface or the
|
|
||||||
"recollindex" command line indexing command).</p>
|
|
||||||
|
|
||||||
<p>Reporting crashes is very useful. It can help others, and it
|
|
||||||
can get your own problem to be solved.</p>
|
|
||||||
|
|
||||||
<p>You will find help and information about producing a useful
|
|
||||||
problem report on this
|
|
||||||
<a href="faqsandhowtos/ProblemSolvingData.html">
|
|
||||||
Howto page</a>.</p>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
||||||
@ -1,71 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Recoll documentation</title>
|
|
||||||
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux
|
|
||||||
based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search, desktop search, unix, linux">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Home</a></li>
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
<li>Documentation</li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1>Recoll user manual</h1>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li><a href="usermanual/webhelp/docs/index.html">English, HTML, many
|
|
||||||
pages, nicer format (needs javascript).</a></li>
|
|
||||||
<li><a href="usermanual/usermanual.html">English, HTML, one page</a></li>
|
|
||||||
<li><a href="http://stupidbeauty.com/Blog/2012/03/recoll%E7%94%A8%E6%88%B6%E6%89%8B%E5%86%8A%E7%BF%BB%E8%AD%AF%EF%BC%8Crecoll-user-manual-2/">
|
|
||||||
中文,HTML</a></li>
|
|
||||||
<li><a href="usermanual/recoll_user_manual.pdf">English, PDF</a></li>
|
|
||||||
<li><a href="http://mcz.altervista.org/Pagine/usermanual-italian.html">
|
|
||||||
Italian (rather old)</a></li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><br></p>
|
|
||||||
|
|
||||||
<h1>Faqs and Howtos</h1>
|
|
||||||
|
|
||||||
<p>You will find a number of useful tips for common
|
|
||||||
issues and extensions on the
|
|
||||||
<a href="faqsandhowtos/index.html">
|
|
||||||
Faqs and Howtos section</a>.</p>
|
|
||||||
|
|
||||||
<h1>Other documentation</h1>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li><a href="recoll_XMP/index.html">Indexing PDF
|
|
||||||
XMP-metadata</a>: a nice example of customizing a Recoll
|
|
||||||
configuration and the PDF filter to use additional
|
|
||||||
metadata, by Jeffrey Dick.</li>
|
|
||||||
<li><a href="perfs.html">Index size and indexing performance
|
|
||||||
data.</a></li>
|
|
||||||
<li><a href="custom.html">Result list format samples.</a></li>
|
|
||||||
<li><a href="idxthreads/threadingRecoll.html">Lessons learned
|
|
||||||
while modifying Recoll indexing to be multithreaded</a>.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@ -1,462 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Recoll download</title>
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description"
|
|
||||||
content="recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content="full text search, desktop search, unix, linux">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
|
|
||||||
<script type="text/javascript">
|
|
||||||
function showdiv(viewid)
|
|
||||||
{
|
|
||||||
var ids = ["general", "bugs", "source", "packages",
|
|
||||||
"windows", "ports", "filters", "translations"];
|
|
||||||
for (var i = 0; i < ids.length; i++) {
|
|
||||||
document.getElementById(ids[i]).style.display = "none";
|
|
||||||
}
|
|
||||||
document.getElementById(viewid).style.display = "block";
|
|
||||||
}
|
|
||||||
</script>
|
|
||||||
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html.en">Home</a></li>
|
|
||||||
<li><b>Downloads</b></li>
|
|
||||||
<li><a href="doc.html">Documentation</a></li>
|
|
||||||
<li><a href="usermanual/usermanual.html#RCL.INSTALL">Installation</a></li>
|
|
||||||
<li><a href="support.html">Support</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
<h1>Recoll downloads</h1>
|
|
||||||
|
|
||||||
<div class="intrapage">
|
|
||||||
|
|
||||||
<table width="100%">
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td><a href="#general" onmouseover="showdiv('general')">
|
|
||||||
General</a></td>
|
|
||||||
<td><a href="#source" onmouseover="showdiv('source')">
|
|
||||||
Source</a></td>
|
|
||||||
<td><a href="#packages" onmouseover="showdiv('packages')">
|
|
||||||
Linux Packages (.rpm and .deb)</a></td>
|
|
||||||
<td><a href="#windows" onmouseover="showdiv('windows')">
|
|
||||||
Windows</a></td>
|
|
||||||
<td><a href="BUGS.html" onmouseover="showdiv('bugs')">
|
|
||||||
Known bugs</a></td>
|
|
||||||
<td><a href="#ports" onmouseover="showdiv('ports')">
|
|
||||||
Mac ports</a></td>
|
|
||||||
<td><a href="filters/filters.html" onmouseover="showdiv('filters')">
|
|
||||||
Updated Filters</a></td>
|
|
||||||
<td><a href="#translations" onmouseover="showdiv('translations')">
|
|
||||||
Translations</a></td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="general">
|
|
||||||
<h2><a name="general">General information</a></h2>
|
|
||||||
|
|
||||||
<p>The current version is 1.23.2. <a href="release-1.23.html">Release
|
|
||||||
notes</a>.</p>
|
|
||||||
|
|
||||||
<p>Recoll <a href="usermanual/usermanual.html#RCL.INSTALL">Installation
|
|
||||||
/ building manual</a>.</p>
|
|
||||||
|
|
||||||
<p>The indexing filters used for some document types may need external
|
|
||||||
packages not installed on your system by default, and not installed
|
|
||||||
automatically with Recoll: <a href="features.html#doctypes">take a
|
|
||||||
look at the list</a> and decide what you need to install.</p>
|
|
||||||
|
|
||||||
<p>The Recoll term explorer tool in phonetic mode (marginally useful and
|
|
||||||
optional) uses the <b>aspell</b> package, version 0.60
|
|
||||||
(utf-8 support) or newer.</p>
|
|
||||||
|
|
||||||
<p>If you find problems with this page, the package or its
|
|
||||||
installation, <em>please</em> <a href="mailto:jfd@recoll.org">report
|
|
||||||
them</a>.</p>
|
|
||||||
|
|
||||||
<h4>What do the release numbers mean?</h4>
|
|
||||||
|
|
||||||
<p>The Recoll releases are numbered X.Y.Z. The X would only
|
|
||||||
change for really major modifications like a big change in
|
|
||||||
the index format, and possibly won't ever reach 2.</p>
|
|
||||||
|
|
||||||
<p>Y is for functional modifications. These may bring bugs, so
|
|
||||||
if you don't need the new features, you may want to wait a
|
|
||||||
little, and especially skip the first release (X.Y.0), at
|
|
||||||
least for a few weeks.</p>
|
|
||||||
|
|
||||||
<p>Z changes for <a href="BUGS.html">bug fixes</a> only, and
|
|
||||||
moving from X.Y.Z to X.Y.Z+u should in general involve
|
|
||||||
little risk of regression. But, <em>any</em> change can
|
|
||||||
bring problems, if you are not affected by the corrected
|
|
||||||
bugs (check the <a href="release-1.21.html">release
|
|
||||||
file</a>), there is probably no necessity to upgrade
|
|
||||||
anyway.</p>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="bugs">
|
|
||||||
<h2><a name="bugs">Known bugs</a></h2>
|
|
||||||
<p>There is a <a href="BUGS.html">history of known bugs</a>, sorted
|
|
||||||
by fix release. Also see
|
|
||||||
the <a href="https://opensourceprojects.eu/p/recoll1/tickets/new/">
|
|
||||||
Recoll issue tracker</a>.
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="source">
|
|
||||||
<h2><a name="source">Source</a></h2>
|
|
||||||
|
|
||||||
<h3>Current release distribution: 1.23.2:</h3>
|
|
||||||
<!-- Attention: source packages must remain here, not in a
|
|
||||||
subdirectory, because of all the places they're referred from
|
|
||||||
(package watches) -->
|
|
||||||
|
|
||||||
<p><a href="recoll-1.23.2.tar.gz">recoll-1.23.2.tar.gz</a>.</p>
|
|
||||||
<p><a href="release-1.23.html">Release notes</a>.</p>
|
|
||||||
|
|
||||||
<h3>Previous release: 1.22.4:</h3>
|
|
||||||
<p><a href="recoll-1.22.4.tar.gz">recoll-1.22.4.tar.gz</a>.</p>
|
|
||||||
<p><a href="release-1.22.html">Release notes</a>.</p>
|
|
||||||
|
|
||||||
|
|
||||||
<!--
|
|
||||||
<h3>Snapshot</h3>
|
|
||||||
<p>I sometimes release a source tarfile when I consider that the
|
|
||||||
current development version is stable enough. The current
|
|
||||||
snapshot contains commits up to 2240 (see
|
|
||||||
<a href="https://opensourceprojects.eu/p/recoll1/code/commit_browser">
|
|
||||||
the changelog</a>, and a synthetic abstract in the
|
|
||||||
current <a href="release-1.16.html">1.16 release notes</a>).
|
|
||||||
<p><a href="betarecoll-2240.tar.gz">betarecoll-2240.tar.gz</a>.</p>
|
|
||||||
-->
|
|
||||||
|
|
||||||
<h3>Ubuntu Unity Lens and Scope</h3>
|
|
||||||
|
|
||||||
<p>You will probably get these from the <a href="#ubuntu">PPA</a>, but
|
|
||||||
here are the source files. These are not included in the main tar file
|
|
||||||
any more. For any Recoll version after 1.19 (choose based on the
|
|
||||||
Ubuntu version, not the Recoll one):
|
|
||||||
|
|
||||||
<blockquote>
|
|
||||||
<a href="recoll-lens-1.19.10.3543.tar.gz">
|
|
||||||
recoll-lens-1.19.10.3543.tar.gz</a> (Ubuntu up to 13.04
|
|
||||||
Raring)<br>
|
|
||||||
|
|
||||||
<a href="unity-scope-recoll-1.20.2.4.tar.gz">
|
|
||||||
unity-scope-recoll-1.20.2.4.tar.gz</a> (Ubuntu 13.10 and
|
|
||||||
later).<br>
|
|
||||||
|
|
||||||
</blockquote>
|
|
||||||
|
|
||||||
For Recoll 1.18:
|
|
||||||
<a href="recoll-lens-1.18.1.2997.tar.gz">
|
|
||||||
recoll-lens-1.18.1.2997.tar.gz</a><br>
|
|
||||||
For Recoll 1.17:
|
|
||||||
<a href="recoll-lens-1.17.2.2697.tar.gz">
|
|
||||||
recoll-lens-1.17.2.2697.tar.gz</a>
|
|
||||||
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<h3>Prerequisites for building from source:</h3>
|
|
||||||
<ul>
|
|
||||||
<li>C++ compiler. Be aware that its absence sometimes
|
|
||||||
manifests itself by quite cryptic messages.</li>
|
|
||||||
|
|
||||||
<li><p>Xapian core development libraries. Most Linux
|
|
||||||
distributions carry them in their package repository. Or
|
|
||||||
you will find source and binary packages on
|
|
||||||
the <a href="http://www.xapian.org/download.php">Xapian
|
|
||||||
download page</a>.
|
|
||||||
<br>
|
|
||||||
<p><em>Note on building Xapian for older CPUs:</em> The build
|
|
||||||
configurations for Xapian releases 1.0.21 and 1.2.1 or
|
|
||||||
newer enable the use of SSE2 floating point
|
|
||||||
instructions. These instructions are not available in
|
|
||||||
CPUs older than Intel Pentium 4 or AMD Athlon 64. When
|
|
||||||
building for such a CPU, you need to add the
|
|
||||||
--disable-sse flag to the Xapian library configure
|
|
||||||
command. If this is not done, the problem signals itself
|
|
||||||
by "Illegal instruction" crashes (SIGILL) in recollindex
|
|
||||||
and recoll. </p>
|
|
||||||
</li>
|
|
||||||
<li>Qt development files: Qt 4.4, 5.3 or newer (5.2 not ok).</li>
|
|
||||||
<li>Qt WebKit development files: these are quite often
|
|
||||||
distributed apart from the main Qt libraries. It is
|
|
||||||
possible to configure Recoll not to use Qt WebKit (see
|
|
||||||
configure --help).</li>
|
|
||||||
<li>zlib development files.</li>
|
|
||||||
<li>X11 development files.</li>
|
|
||||||
<li>Python development package: you can avoid needing this
|
|
||||||
by configuring with --disable-python-module.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h3>Source repository:</h3>
|
|
||||||
|
|
||||||
<p>The <span class="application">Recoll</span> source
|
|
||||||
repository is hosted
|
|
||||||
on <a href="https://opensourceprojects.eu/p/recoll1/code/">
|
|
||||||
opensourceprojects.eu</a>. The trunk is usually a bit on the
|
|
||||||
bleeding edge, but there is always a maintenance branch for
|
|
||||||
the current production version.</p>
|
|
||||||
|
|
||||||
<h3>Instructions for building</h3>
|
|
||||||
|
|
||||||
<p>Normally, it's just:</p>
|
|
||||||
<div class="code">./configure; make; make install</div>
|
|
||||||
<p>If a bit more detail is needed,
|
|
||||||
<a href="http://www.recoll.org/usermanual/usermanual.html#RCL.INSTALL.BUILDING">
|
|
||||||
there is some in the manual</a>.
|
|
||||||
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="packages">
|
|
||||||
<h2><a name="packages">Packages</a></h2>
|
|
||||||
|
|
||||||
<p>Packages or ports for Recoll are available in the standard
|
|
||||||
repositories for many distributions.</p>
|
|
||||||
|
|
||||||
<p>However they are often a bit older or built with older
|
|
||||||
Xapian releases. Here follow some pointers to find newer
|
|
||||||
packages for some distributions. In most cases, you will
|
|
||||||
just need to use an alternate repository.</p>
|
|
||||||
|
|
||||||
<h3><a name="debian">Debian</a></h3>
|
|
||||||
|
|
||||||
<p>The Debian Recoll packages are not always up to date in
|
|
||||||
stable distributions. Debian Wheezy and Jessie have Recoll
|
|
||||||
1.17.3, which is ancient (it was an accident for
|
|
||||||
Jessie). Stretch has 1.22.4 which is largely ok.</p>
|
|
||||||
|
|
||||||
<p>I am maintaining a repository for newer versions of the packages.
|
|
||||||
The repository currently has recoll 1.23.x for Jessie and
|
|
||||||
Stretch, Intel 32 and 64 bits, and armhf, and slightly older
|
|
||||||
1.22 packages for Wheezy. There is a separate
|
|
||||||
repository for Raspbian Jessie, which is <em>not</em>
|
|
||||||
compatible with vanilla Debian.</p>
|
|
||||||
|
|
||||||
<p>To add the Debian or Raspbian repository to your sources:</p>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
|
|
||||||
<li>See <a href="../pages/signatures.html">here</a> for the
|
|
||||||
keys used to sign the repository. You will need to import
|
|
||||||
them to suppress <b>apt-get</b> messages about unverified
|
|
||||||
signatures (the method is described on the
|
|
||||||
<a href="../pages/signatures.html">same page</a>).</li>
|
|
||||||
|
|
||||||
<li>Create and edit <span class="filename">
|
|
||||||
/etc/apt/sources.list.d/recoll.list</span>
|
|
||||||
and add the following lines:<br>
|
|
||||||
for Debian wheezy (debian 7.x, recoll 1.22.3):<br>
|
|
||||||
<div class="code">
|
|
||||||
deb http://www.lesbonscomptes.com/recoll/debian/ wheezy main
|
|
||||||
deb-src http://www.lesbonscomptes.com/recoll/debian/ wheezy main
|
|
||||||
</div>
|
|
||||||
for Debian jessie (debian 8.x):<br>
|
|
||||||
<div class="code">
|
|
||||||
deb http://www.lesbonscomptes.com/recoll/debian/ jessie main
|
|
||||||
deb-src http://www.lesbonscomptes.com/recoll/debian/ jessie main
|
|
||||||
</div>
|
|
||||||
for Debian stretch (debian 9.x):<br>
|
|
||||||
<div class="code">
|
|
||||||
deb http://www.lesbonscomptes.com/recoll/debian/ stretch main
|
|
||||||
deb-src http://www.lesbonscomptes.com/recoll/debian/ stretch main
|
|
||||||
</div>
|
|
||||||
for Raspbian jessie (raspbian 8.x):<br>
|
|
||||||
<div class="code">
|
|
||||||
deb http://www.lesbonscomptes.com/recoll/raspbian/ jessie main
|
|
||||||
deb-src http://www.lesbonscomptes.com/recoll/raspbian/ jessie main
|
|
||||||
</div>
|
|
||||||
<li>Then:
|
|
||||||
<div class="code">
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install recoll python-recoll python3-recoll
|
|
||||||
</div>
|
|
||||||
</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p>If you prefer to manually install the packages, they are here:
|
|
||||||
<a href="debian/pool/main/r/recoll/">
|
|
||||||
debian/pool/main/r/recoll/</a><br/>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<h3><a name="ubuntu">Ubuntu</a></h3>
|
|
||||||
|
|
||||||
<p>There are Personal Package Archives on launchpad.net for
|
|
||||||
<a href="https://launchpad.net/~recoll-backports/+archive/recoll-1.15-on">
|
|
||||||
Recoll, kio-recoll and recoll-lens</a>. These were built
|
|
||||||
from the latest versions, for the current set of supported Ubuntu
|
|
||||||
versions. Procedure:</p>
|
|
||||||
<div class="code">
|
|
||||||
sudo add-apt-repository ppa:recoll-backports/recoll-1.15-on
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install recoll
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<p>The packages in the PPA now have a separate package for the Python
|
|
||||||
extension, like the standard ones, so there should be no more
|
|
||||||
conflict issues while switching from the PPA to the normal
|
|
||||||
repositories and back.</p>
|
|
||||||
|
|
||||||
<h3><a name="mint">Linux Mint</a></h3>
|
|
||||||
|
|
||||||
<p>The Ubuntu PPA works perfectly for Mint 13 (and probably other releases
|
|
||||||
too). Just follow the instructions for Ubuntu.</p>
|
|
||||||
|
|
||||||
<h3>RPMS</h3>
|
|
||||||
|
|
||||||
<p>You'll need to install the Xapian, Qt, Qt-Webkit and zlib development
|
|
||||||
packages if you want to use the source rpms.</p>
|
|
||||||
|
|
||||||
<h3>Fedora</h3>
|
|
||||||
|
|
||||||
<p>Recoll is present in the standard Fedora package repositories starting from
|
|
||||||
F-12. Recoll packages in Fedora are usually fairly up to
|
|
||||||
date. Please get in touch if you have a need for a Recoll package
|
|
||||||
for Fedora.</p>
|
|
||||||
|
|
||||||
<h3>CentOS 7.1</h3>
|
|
||||||
|
|
||||||
<p><a href="https://fedoraproject.org/wiki/EPEL">EPEL</a> now
|
|
||||||
has a package for Recoll. It is currently in
|
|
||||||
the <a href="https://fedoraproject.org/wiki/EPEL/testing">testing
|
|
||||||
section</a>, but it should hopefully move on
|
|
||||||
shortly. If you install the test package (which runs just
|
|
||||||
fine as far as I can see), please add feedback to
|
|
||||||
the
|
|
||||||
<a href="https://bodhi.fedoraproject.org/updates/FEDORA-EPEL-2017-ede90eda56">
|
|
||||||
package page</a>.
|
|
||||||
|
|
||||||
<p>If EPEL does not work for you, there are still a few
|
|
||||||
<a href="downloads/centos71">pre-EPEL packages
|
|
||||||
here</a>. They will be deleted shortly, except if someone
|
|
||||||
provides me with a good reason to keep them. There are
|
|
||||||
only x86_64 binaries, use the source rpm for other
|
|
||||||
archs. As base CentOS does not seem to have the Qt WebKit
|
|
||||||
module, the Recoll build uses QTextBrowser instead of a
|
|
||||||
WebKit QWebView, so no Javascript or advanced CSS in the
|
|
||||||
result list or snippets window for you (the EPEL package
|
|
||||||
uses WebKit, so this is another way it is better).</p>
|
|
||||||
|
|
||||||
|
|
||||||
<h3>OpenSUSE</h3>
|
|
||||||
|
|
||||||
<p>Recoll is in the KDE:Extra repository. You just need to add the
|
|
||||||
repository to your software
|
|
||||||
sources (Yast2->software->Software repositories).<br>
|
|
||||||
<a href="http://download.opensuse.org/repositories/KDE:/Extra/">
|
|
||||||
Repository list (supported Suse versions)</a>.
|
|
||||||
After adding the appropriate repository to your software sources,
|
|
||||||
you will be able to install recoll and kio_recoll from the software
|
|
||||||
management interface. The Xapian dependency will also be satisfied
|
|
||||||
from the build service repository. Some of the older repositories do
|
|
||||||
not build antiword, just tell the software manager to "break" recoll
|
|
||||||
by installing anyway, and get antiword somewhere else.</p>
|
|
||||||
|
|
||||||
|
|
||||||
</div> <!-- Packages -->
|
|
||||||
|
|
||||||
<div id="windows">
|
|
||||||
<h2><a name="windows">Microsoft Windows Setup Files</a></h2>
|
|
||||||
|
|
||||||
<p>The port of Recoll to Windows is still a bit experimental and
|
|
||||||
lacking things like real-time indexing or spelling
|
|
||||||
suggestions. However it works well enough to be useful. More info
|
|
||||||
and links to the setup
|
|
||||||
files <a href="pages/recoll-windows.html">here</a>.</p>
|
|
||||||
</div> <!-- windows -->
|
|
||||||
|
|
||||||
<div id="ports">
|
|
||||||
<h2><a name="ports">Ports</a></h2>
|
|
||||||
|
|
||||||
<h3>Mac port</h3>
|
|
||||||
|
|
||||||
<p>It seems that Recoll will sometimes find data that Spotlight misses
|
|
||||||
(especially inside pdfs apparently, which is probably more to the credit of
|
|
||||||
poppler than recoll itself).</p>
|
|
||||||
|
|
||||||
<p>Recoll is in MacPorts and really easy to install:</p>
|
|
||||||
<ol>
|
|
||||||
<li><a href="https://trac.macports.org/wiki/InstallingMacPorts">Install
|
|
||||||
MacPorts</a>.</li>
|
|
||||||
<li>Type "sudo port install recoll"</li>
|
|
||||||
</ol>
|
|
||||||
|
|
||||||
<p>Recoll is then available from the command line and as an icon in the usual
|
|
||||||
MacPorts applications place.</p>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="filters">
|
|
||||||
<h2><a name="filters">Updated filters</a></h2>
|
|
||||||
<p><a href="filters/filters.html">new or updated filters</a>
|
|
||||||
sometimes become available after a release. As a rule, all
|
|
||||||
filters are compatible with all Recoll versions. Any
|
|
||||||
compatibility problem will be explicitly mentioned.</p>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="translations">
|
|
||||||
<h2><a name="translations">Translations</a></h2>
|
|
||||||
|
|
||||||
<p>Most of the translations for 1.22/23 are incomplete. The source
|
|
||||||
translation files are included in the source release. If
|
|
||||||
your language has some english messages left and you want to
|
|
||||||
take a shot at fixing the problem, you can send the results
|
|
||||||
to <a href="mailto:jfd@recoll.org">me</a> and earn my
|
|
||||||
gratefulness (and your less multilingual
|
|
||||||
compatriot's)...</p>
|
|
||||||
|
|
||||||
<p>You can use the <em>.ts</em> file to alter the translations
|
|
||||||
if you wish (use Qt's <em>linguist</em> tool to edit the
|
|
||||||
source file, then <em>lrelease</em> to produce
|
|
||||||
the <em>.qm</em> file). The <em>.qm</em> file should be copied
|
|
||||||
to <span class="filename">/usr/[local/]share/recoll/translations</span>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<p><a href="translations/recoll_xx.ts">recoll_xx.ts</a> is a blank
|
|
||||||
Recoll 1.22 message file, handy to work on a new translation. You can
|
|
||||||
also <a href="translations/">list the directory</a> to see all the
|
|
||||||
translation files (same as those in the maintenance source branch on
|
|
||||||
opensourceprojects.eu).</p>
|
|
||||||
|
|
||||||
<h3>Updated 1.22 translations that became available after the
|
|
||||||
release:</h3>
|
|
||||||
|
|
||||||
<p>Greek translation by Dimitrios Glentadakis:
|
|
||||||
<a href="translations/recoll_el.ts">recoll_el.ts</a>
|
|
||||||
<a href="translations/recoll_el.qm">recoll_el.qm</a><br/>
|
|
||||||
</p>
|
|
||||||
<p>Dutch translation by Leslie Scheelings:
|
|
||||||
<a href="translations/recoll_nl.ts">recoll_nl.ts</a>
|
|
||||||
<a href="translations/recoll_nl.qm">recoll_nl.qm</a><br/>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<p>Danish translation by Morten Langlo:
|
|
||||||
<a href="translations/recoll_da.ts">recoll_da.ts</a>
|
|
||||||
<a href="translations/recoll_da.qm">recoll_da.qm</a><br/>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<p>Note that, if you are running an older release, you may find updated
|
|
||||||
messages by looking inside the appropriate maintenance
|
|
||||||
branch in
|
|
||||||
<a href="https://opensourceprojects.eu/p/recoll1/code/">
|
|
||||||
the source repository</a>.</p>
|
|
||||||
|
|
||||||
</div> <!-- translations -->
|
|
||||||
</div> <!-- content -->
|
|
||||||
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@ -1,35 +0,0 @@
|
|||||||
== Extending the Recoll Firefox visited web page indexing mechanism to other browsers
|
|
||||||
|
|
||||||
The *Recoll* _Web Queue_ function allows using WEB browser plug-ins
|
|
||||||
originally designed for indexing visited WEB pages with *Beagle* (rip). The
|
|
||||||
browser plug-ins work very simply by creating copies of the visited pages
|
|
||||||
in a designated directory. Two files are created for each page, one for the
|
|
||||||
contents, the other for the metadata.
|
|
||||||
|
|
||||||
When activated, *Recoll* will visit the queue directory and index each HTML
|
|
||||||
page and its associated metadata. There is more detail about the mechanism
|
|
||||||
on the [[IndexWebHistory|page about the Recoll Web queue]], but mostly, you
|
|
||||||
just need to go to the _Indexing Preferences_ in the *recoll* GUI, open the
|
|
||||||
_Web history_ panel and check the top button.
|
|
||||||
|
|
||||||
Franck, a *Recoll* and *Elinks* user from New Zealand, designed a method
|
|
||||||
and wrote a script to index the *Elinks* WEB history in this fashion.
|
|
||||||
|
|
||||||
The script works by using *wget* to fetch the visited page into the queue
|
|
||||||
directory. This means that it would be reusable to index arbitrary WEB
|
|
||||||
pages in contexts other than *Elinks* visits.
|
|
||||||
|
|
||||||
Recipe for *Elinks* and Recoll 1.18 and later:
|
|
||||||
|
|
||||||
* Retrieve the
|
|
||||||
link:https://www.recoll.org/files/elinks_recoll.sh[elinks_recoll.sh] shell
|
|
||||||
script and make it executable (`chmod a+x elinks_recoll.sh`).
|
|
||||||
* In the Elinks Keyboard shortcut manager (k)/Main, add a shortcut to pass
|
|
||||||
the current URL to an external command, e.g. _Ctrl-P_.
|
|
||||||
* In the Options manager (o) /Document/Uri Passing, add an action named for
|
|
||||||
example _ToIndex_
|
|
||||||
* Modify the ToIndex action to execute `/path/to/the/script/elinks_recoll.sh %c`
|
|
||||||
* Save, you are done
|
|
||||||
|
|
||||||
For Recoll 1.17, the method is analogous, but the script is named
|
|
||||||
link:https://www.recoll.org/files/elinks_recoll.sh[elinks_beagle.sh].
|
|
||||||
@ -1,82 +0,0 @@
|
|||||||
== Recoll input handlers
|
|
||||||
|
|
||||||
In the end, Recoll indexes plain UTF-8 text, remembering where it came
|
|
||||||
from.
|
|
||||||
|
|
||||||
But of course, this is not what the source data looks like.
|
|
||||||
The text content of the original documents is encoded in many fashions
|
|
||||||
(ie pdf, ms-word, html, etc.), and it can also be stored in quite
|
|
||||||
involved ways (inside archives, email attachments ...).
|
|
||||||
|
|
||||||
For getting to the data and converting it to plain text, Recoll uses a set
|
|
||||||
of modules which it calls input handlers (or filters), which either operate
|
|
||||||
on the storage structure (ie: a zip handler), or the storage format (ie a
|
|
||||||
pdf to text translator), or both. In addition, there is a tentative notion
|
|
||||||
of a higher level storage backend which we will ignore for now (for
|
|
||||||
reference there are currently two of those: the file system and the web
|
|
||||||
history cache).
|
|
||||||
|
|
||||||
The basic task of filters is to take a document as input and produce a
|
|
||||||
series of subdocuments as output. The subdocument's format is defined
|
|
||||||
either dynamically (as part of the output data), or statically, in the
|
|
||||||
filter definition.
|
|
||||||
|
|
||||||
=== Simple filters
|
|
||||||
|
|
||||||
These are executed by the **mh_exec** recoll module. They are the vast
|
|
||||||
majority.
|
|
||||||
|
|
||||||
These filters are very simple. They are designed to perform a simple task
|
|
||||||
with minimal interface, they mostly don't know anything about each other,
|
|
||||||
and they don't know much about their context. This makes writing a filter
|
|
||||||
quite easy as there is not much to learn about their environment.
|
|
||||||
|
|
||||||
Only one output document is produced and the format is fixed.
|
|
||||||
|
|
||||||
In practice the filter, which is most generally a shell-script (but could
|
|
||||||
be any executable program), takes a file name on the command line and
|
|
||||||
outputs an html or plain text document on standard output, then exits.
|
|
||||||
|
|
||||||
For example, the pdf filter takes one pdf file name as input on the command
|
|
||||||
line and produces one html document on stdout. The fact that the output is
|
|
||||||
html is statically defined in a configuration file.
|
|
||||||
|
|
||||||
For filters which produce plain text, the output character set information
|
|
||||||
is in general defined in the configuration file. Else it will be obtained
|
|
||||||
from the locale (hoping that it makes sense).
|
|
||||||
|
|
||||||
Filters that output html can produce metadata information in the html
|
|
||||||
header (ie author etc.). Filters that output plain text can only output
|
|
||||||
main text data, no metadata fields.
|
|
||||||
|
|
||||||
Besides the file name, there is one other piece of input information, which
|
|
||||||
is in the form of an environment variable, and can be safely ignored:
|
|
||||||
+RECOLL_FILTER_FORPREVIEW+. This indicates if the filter is being used
|
|
||||||
for previewing or for indexing data. Some filters will elect to suppress
|
|
||||||
repetitive parts of the output text when indexing to avoid distorting the
|
|
||||||
term statistics. For example, the man filter suppresses the section
|
|
||||||
headers (NAME, SYNOPSIS...) when indexing.
|
|
||||||
|
|
||||||
=== Multiple input filters
|
|
||||||
|
|
||||||
These filters are more complex, but still quite easy to write, especially
|
|
||||||
if you can use Python, because they can then use a common module which
|
|
||||||
manages the communication with the indexer.
|
|
||||||
|
|
||||||
Newer Recoll versions have converted many previously 'simple' filters to
|
|
||||||
this kind as part of the port to Windows.
|
|
||||||
|
|
||||||
These filters are executed by the *mh_execm* Recoll module.
|
|
||||||
|
|
||||||
They are persistent (one instance will persist through a whole indexing
|
|
||||||
pass), and will index successive multiple input files (the point being to
|
|
||||||
avoid startup performance penalty), and possibly multiple documents per
|
|
||||||
input file if this makes sense for their input format (ie: zip archive, chm
|
|
||||||
help file).
|
|
||||||
|
|
||||||
They use a simple communication protocol over a pipe with the main recoll
|
|
||||||
or recollindex process, with file names and a few other parameters being
|
|
||||||
sent as input, and decoded data and attributes being sent in return.
|
|
||||||
|
|
||||||
The shared Python module is 'filters/rclexecm.py'. You can look at 'rclzip'
|
|
||||||
or 'rclaudio' for reasonably straightforward examples.
|
|
||||||
@ -1,62 +0,0 @@
|
|||||||
== Installing a filter for a new document type
|
|
||||||
|
|
||||||
It will sometimes happen that a newer Recoll release has support for a
|
|
||||||
document type which would be useful to you, but which your older release
|
|
||||||
does not support.
|
|
||||||
|
|
||||||
It is in general easy to import support from the newer to the older
|
|
||||||
release: the Recoll input handler interface is very stable, so things should just
|
|
||||||
work.
|
|
||||||
|
|
||||||
Input Handler updates are generally described on the Recoll web site
|
|
||||||
link:https://www.recoll.org/filters/filters.html[new filters pages]. They
|
|
||||||
may include notes about which versions need the new input handler, or specifics
|
|
||||||
about installing it.
|
|
||||||
|
|
||||||
An up to date copy of input handlers and configuration files is also kept
|
|
||||||
link:https://www.recoll.org/filters/[at the same location].
|
|
||||||
|
|
||||||
We will take an example to make things more concrete: Tomboy and Gnote
|
|
||||||
files are directly supported by Recoll 1.19, but not in older Recoll
|
|
||||||
releases. The *rclxml* handler is needed to process them.
|
|
||||||
|
|
||||||
The following procedure will allow you to retrofit support:
|
|
||||||
|
|
||||||
- Retrieve the *rclxml* input handler from:
|
|
||||||
link:https://www.lesbonscomptes.com/recoll/filters/rclxml[]
|
|
||||||
|
|
||||||
- Copy it to '/usr/share/recoll/filters' and make it executable:
|
|
||||||
`chmod +x rclxml`
|
|
||||||
The input handler needs *xsltproc*, but this is probably already on your
|
|
||||||
system (else get it with the package manager).
|
|
||||||
|
|
||||||
- Edit '~/.recoll/mimemap', add the following line:
|
|
||||||
`.note = application/x-gnote`
|
|
||||||
- Edit '~/.recoll/mimeconf', add the following lines:
|
|
||||||
+
|
|
||||||
----
|
|
||||||
[index]
|
|
||||||
application/x-gnote = exec rclxml
|
|
||||||
----
|
|
||||||
- Edit '~/.recoll/mimeview', add the following lines:
|
|
||||||
+
|
|
||||||
----
|
|
||||||
[view]
|
|
||||||
application/x-gnote = tomboy %f
|
|
||||||
----
|
|
||||||
|
|
||||||
- The easiest way to make sure the files are indexed with the new input
|
|
||||||
handlers may then be to just run a full indexing pass (`recollindex -z`).
|
|
||||||
|
|
||||||
Notes:
|
|
||||||
|
|
||||||
- The MIME type which is used is not crucial, you could prefer to use,
|
|
||||||
e.g., +application/x-tomboy+ instead, it just has to be consistent. To
|
|
||||||
avoid future trouble, it's better to use the type used by newer Recoll
|
|
||||||
releases though.
|
|
||||||
- The 'mimeview' entry is necessary even if you are using the desktop
|
|
||||||
preferences to open files. The value will not be used, but it has to be
|
|
||||||
there.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,34 +0,0 @@
|
|||||||
== Filtering out Zip archive members ==
|
|
||||||
|
|
||||||
The *rclzip* Zip archive extraction input handler does not use the general
|
|
||||||
configuration variables which define what file system objects should be
|
|
||||||
skipped, but it has an equivalent internal function.
|
|
||||||
|
|
||||||
The name-skipping code depends on a recent member of the Recoll Python
|
|
||||||
package. This will become standard for release 1.20, but for earlier
|
|
||||||
releases, you need to do two things to use this function:
|
|
||||||
|
|
||||||
- Fetch 'python/recoll/recoll/rclconfig.py' and 'filters/rclzip' from the
|
|
||||||
source repository.
|
|
||||||
- Copy both to '/usr/share/recoll/filters' and make 'rclzip' executable.
|
|
||||||
|
|
||||||
You can then set a variable named +zipSkippedNames+ inside
|
|
||||||
'recoll.conf'. +zipSkippedNames+ should be a space-separated list of
|
|
||||||
patterns which will be passed to the Python fnmatch() function. The +/+
|
|
||||||
characters are not special (matched as any character).
|
|
||||||
|
|
||||||
You can't use embedded spaces in patterns (no double-quote quoting for now).
|
|
||||||
|
|
||||||
This can be redefined for file system directories using the usual section
|
|
||||||
indicators (Zip archives in different file-system directories can have
|
|
||||||
different skip lists).
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
----
|
|
||||||
zipSkippedNames = *.txt
|
|
||||||
[/path/to/the/dir]
|
|
||||||
zipSkippedNames = somedir/*/*.html
|
|
||||||
----
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,60 +0,0 @@
|
|||||||
== Recoll GUI keyboard navigation
|
|
||||||
|
|
||||||
Using Recoll without the mouse is not completely straightforward, but it is
|
|
||||||
mostly feasible. Here follows a description of the usable shortcuts.
|
|
||||||
|
|
||||||
=== Anywhere
|
|
||||||
|
|
||||||
`Ctrl+q` should exit Recoll from anywhere.
|
|
||||||
|
|
||||||
=== Main window and result list ===
|
|
||||||
|
|
||||||
When Recoll starts up, the focus is in the simple search entry. The main
|
|
||||||
window tab order is as follows:
|
|
||||||
|
|
||||||
* Clear
|
|
||||||
* Search
|
|
||||||
* Search type combo
|
|
||||||
* Search entry (Initial focus)
|
|
||||||
* Result list (scrolling etc)
|
|
||||||
* Result list 1st link
|
|
||||||
* Result list next links...
|
|
||||||
* Back to Clear
|
|
||||||
|
|
||||||
Each result list entry has 3 links: the icon link is not active, but its
|
|
||||||
value is the URL, so that it can be dragged and dropped to another
|
|
||||||
application. The 2 other links are _Preview_ and _Open_ and can be
|
|
||||||
activated by typing _Enter_.
|
|
||||||
|
|
||||||
Typing _Ctrl+Shift+s_ anywhere in the main window should return the focus to the search entry. So will _Ctrl+l_ in future versions (for compatibility with Web browser usage).
|
|
||||||
|
|
||||||
For pure keyboard usage, you can improve this by:
|
|
||||||
|
|
||||||
- Disabling the icon link: use _Preferences->GUI configuration->Result
|
|
||||||
List->Edit result paragraph_ and remove the `<a href='%U'>` and `</a>`
|
|
||||||
around the `<img...>` tag.
|
|
||||||
- Making the active link more visible by adding the following code to the
|
|
||||||
result page HTML header insert (same preferences tab). Feel free to
|
|
||||||
adjust the color :=) :
|
|
||||||
|
|
||||||
----
|
|
||||||
<style type="text/css">
|
|
||||||
a:focus {background-color: red;}
|
|
||||||
</style>
|
|
||||||
----
|
|
||||||
|
|
||||||
=== Result table
|
|
||||||
|
|
||||||
The same _Ctrl+Shift+s_ will return the focus to the search entry when
|
|
||||||
working with the result table.
|
|
||||||
|
|
||||||
_Ctrl+r_ will move the focus from the entry to the spreadsheet. When in
|
|
||||||
there the arrow keys will navigate the lines.
|
|
||||||
|
|
||||||
When a line is selected:
|
|
||||||
|
|
||||||
* _Ctrl+o_ will _Open_ the document.
|
|
||||||
* _Ctrl+Shift+o_ will _Open_ the document and exit Recoll.
|
|
||||||
* _Ctrl+d_ (detail) will start a _Preview_
|
|
||||||
|
|
||||||
_Esc_ will deselect the current line so that mouse hovering will work again.
|
|
||||||
@ -1,69 +0,0 @@
|
|||||||
== Generating a custom field and using it to sort results
|
|
||||||
|
|
||||||
We are going to show how to generate a custom field from a Recoll filter,
|
|
||||||
and use it for sorting results. The example chosen comes from an actual
|
|
||||||
user request: sorting results on pdf page counts.
|
|
||||||
|
|
||||||
The details here are obsolete, as the +pdf+ input handler is now a quite
|
|
||||||
different python program, but the general idea is still relevant.
|
|
||||||
|
|
||||||
The page count from a pdf file can be displayed by the pdfinfo command
|
|
||||||
(xpdf or poppler tools).
|
|
||||||
|
|
||||||
We first modify a copy of the rclpdf filter
|
|
||||||
('/usr/[local/]share/recoll/filters/rclpdf'), to compute the pdf page count,
|
|
||||||
and output the value as an html meta field. This is a not very interesting
|
|
||||||
bit of shell/awk magic. Another approach would be to just rewrite the
|
|
||||||
rclpdf filter in your favorite scripting language (ie: perl, python...), as
|
|
||||||
all it does is execute pdftotext and pdfinfo and output html, nothing
|
|
||||||
complicated. Here follows the rclpdf modification as a pseudo patch:
|
|
||||||
|
|
||||||
----
|
|
||||||
# compute the page count and format it so that it's alphabetically sortable
|
|
||||||
+set `pdfinfo "$infile" | egrep ^Pages:`
|
|
||||||
+pages=`printf "%04d" $2`
|
|
||||||
[skip...]
|
|
||||||
# Pass the page count value to awk
|
|
||||||
-awk 'BEGIN'\
|
|
||||||
+awk -v Pages="$pages" 'BEGIN'\
|
|
||||||
[skip...]
|
|
||||||
# Inside the awk program startup section: compute the "meta" field line
|
|
||||||
+ pagemeta = "<meta name=\"pdfpages\" content=\"" Pages "\">\n"
|
|
||||||
[skip...]
|
|
||||||
# Then print it as part of the header:
|
|
||||||
+ $0 = part1 charsetmeta pagemeta part2
|
|
||||||
[skip...]
|
|
||||||
----
|
|
||||||
|
|
||||||
You can execute your own version of rclpdf by modifying '~/.recoll/mimeconf':
|
|
||||||
|
|
||||||
----
|
|
||||||
[index]
|
|
||||||
application/pdf = exec /path/to/my/own/rclpdf
|
|
||||||
----
|
|
||||||
|
|
||||||
At this point, recollindex would receive and extract a +pdfpages+ field,
|
|
||||||
but it would not know what to do with it. We are going to tell it to store
|
|
||||||
the value inside the document data record so that it can be displayed in
|
|
||||||
the results, and sorted on. For this we modify the '~/.recoll/fields' file:
|
|
||||||
|
|
||||||
----
|
|
||||||
[stored]
|
|
||||||
pdfpages=
|
|
||||||
----
|
|
||||||
|
|
||||||
That's it ! After reindexing, you can now display +pdfpages+ inside the
|
|
||||||
result list (add a +%(pdfpages)+ value to the paragraph format), and display
|
|
||||||
+pdfpages+ inside the result table (right-click the table header), and sort
|
|
||||||
the results on page count (click the column header).
|
|
||||||
|
|
||||||
Note that +pdfpages+ has not been defined as searchable (this would not make
|
|
||||||
much sense). For this, you'd have to define a prefix and add it to the
|
|
||||||
[prefixes] fields file section:
|
|
||||||
|
|
||||||
----
|
|
||||||
[prefixes]
|
|
||||||
pdfpages = XYPDFP
|
|
||||||
----
|
|
||||||
|
|
||||||
Have a look at the comments inside the 'fields' file for more information.
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
== Welcome to the Recoll Faqs and Recipes
|
|
||||||
|
|
||||||
link:FaqsAndHowTos.html[FAQs and Howtos] are stored here, but
|
|
||||||
the main source for Recoll user documentation is
|
|
||||||
link:https://www.recoll.org/doc.html[the _Recoll user manual_] on the
|
|
||||||
link:https://www.recoll.org/[Recoll Web site] where you will also find a
|
|
||||||
lot of other Recoll information, source code tarballs and contact
|
|
||||||
information.
|
|
||||||
|
|
||||||
If you want to make your problem report as useful as possible, you may want
|
|
||||||
to take a look at link:ProblemSolvingData.html[this page].
|
|
||||||
|
|
||||||
link:WikiIndex.html[Full file index]
|
|
||||||
@ -1,79 +0,0 @@
|
|||||||
== Recoll hotkey: starting / hiding recoll with a keyboard shortcut
|
|
||||||
|
|
||||||
Type a key (ie: F12) and have recoll appear or disappear. On the first
|
|
||||||
occurrence, recoll is started if it's not already running. Further
|
|
||||||
occurrences toggle recoll between visible and minimized states. Never
|
|
||||||
thought this would be useful until someone asked for it. Can't do without
|
|
||||||
it anymore :)
|
|
||||||
|
|
||||||
This works well with both Gnome and KDE, but is implemented using a gnome
|
|
||||||
library (*libwnck*) and its python interface, which you may have to install
|
|
||||||
on your system if you are a pure KDE user. The library most probably exists
|
|
||||||
in the package repositories for your distribution, so this should not be
|
|
||||||
too complicated.
|
|
||||||
|
|
||||||
This should also work with other window managers, because it is based on a
|
|
||||||
standard window manager interface extension (EWMH) that most modern window
|
|
||||||
managers implement.
|
|
||||||
|
|
||||||
=== Installing the script (all desktops):
|
|
||||||
|
|
||||||
- You will need the libwnck library and its python interface. These are
|
|
||||||
usually part of a gnome installation, otherwise check and possibly
|
|
||||||
install them. For OpenSuse, the library should already be there but you
|
|
||||||
need to install gnome-python-desktop.
|
|
||||||
- Download the
|
|
||||||
link:https://www.recoll.org/files/hotrecoll.py[http://www.recoll.org/files/hotrecoll.py
|
|
||||||
script]. If you have a recent recoll installation (1.14.3 and
|
|
||||||
further), it's already in the recoll filters directory
|
|
||||||
('/usr/[local/]share/recoll/filters')
|
|
||||||
- Copy the script to some permanent place (ie: '~/bin') and make it
|
|
||||||
executable (you can leave it in the filters dirs if it's there). In a
|
|
||||||
shell window: `chmod +x hotrecoll.py`.
|
|
||||||
- You can check that the script works (or not) by executing it on the
|
|
||||||
command line. It does not need an argument. Recoll should appear or
|
|
||||||
disappear every time you execute the script. A few warning messages may
|
|
||||||
be considered normal. If the script says that it does not find the wnck
|
|
||||||
library or some other module, you'll have to install them.
|
|
||||||
|
|
||||||
=== Installing the keyboard shortcut (Gnome):
|
|
||||||
|
|
||||||
- _System->Preferences->Keyboard shortcuts_, or execute
|
|
||||||
*gnome-keybinding-properties*
|
|
||||||
- Click add, Name, ie: StartRecoll, Action: /path/to/hotrecoll.py
|
|
||||||
- This will add the shortcut to the "Custom shortcuts" section. You can
|
|
||||||
then click in the "Shortcut" column for "StartRecoll", and type any key
|
|
||||||
combination (ie: push F12) to assign a key shortcut.
|
|
||||||
|
|
||||||
=== Installing the keyboard shortcut (KDE):
|
|
||||||
|
|
||||||
Under KDE installing a global custom keyboard shortcut like we need is most
|
|
||||||
helpfully not under "Keyboard Shortcuts" but under "Input Actions".
|
|
||||||
|
|
||||||
- _Kmenu -> Configure Desktop -> Input Actions -> Edit -> New -> Global
|
|
||||||
Shortcut -> Command/Url_
|
|
||||||
- A new Action appears, named _New Action_. You can rename it something
|
|
||||||
like +hotrecoll+ for clarity.
|
|
||||||
- Click the _Trigger_ tab, click the input area and press your preferred
|
|
||||||
key combination (ie: F12)
|
|
||||||
- Click the _Action_ tab, and enter +hotrecoll.py+ (if it's in your PATH),
|
|
||||||
or else the full path to the command (e.g.:
|
|
||||||
'/usr/share/recoll/filters/hotrecoll.py').
|
|
||||||
- Click _Apply_.
|
|
||||||
|
|
||||||
=== Installing the keyboard shortcut (XFCE):
|
|
||||||
|
|
||||||
Open the settings manager, and add the shortcut in the
|
|
||||||
_Application Shortcuts_ panel inside the _Keyboard_ tool.
|
|
||||||
|
|
||||||
|
|
||||||
=== Other environments
|
|
||||||
|
|
||||||
Many window managers have a way to set up a keyboard shortcut for running
|
|
||||||
an arbitrary command. You'll need to look at the documentation for yours,
|
|
||||||
or search the web for a solution.
|
|
||||||
|
|
||||||
An alternative independent of the environment would be to use the XBindKeys
|
|
||||||
utility. See this link:http://www.linux.com/archive/feed/59494[linux.com
|
|
||||||
article] for helpful instructions.
|
|
||||||
|
|
||||||
@ -1,33 +0,0 @@
|
|||||||
== Indexing arbitrary mail headers
|
|
||||||
|
|
||||||
By default the Recoll mail handler only processes a subset of email headers
|
|
||||||
(+From+, +To+, +Cc+, +Date+, +Subject+). It is possible to index additional
|
|
||||||
headers by specifying them inside the 'fields' configuration file, inside
|
|
||||||
the configuration directory (typically '~/.recoll/').
|
|
||||||
|
|
||||||
Lengthy explanations are not really needed here, and I'll just show an
|
|
||||||
example (duplicated from the configuration section of the manual):
|
|
||||||
|
|
||||||
----
|
|
||||||
[prefixes]
|
|
||||||
# Index mailmytag contents (with the given prefix)
|
|
||||||
mailmytag = XMTAG
|
|
||||||
|
|
||||||
[stored]
|
|
||||||
# Store mailmytag inside the document data record (so that it can be
|
|
||||||
# displayed - as %(mailmytag) - in result lists).
|
|
||||||
mailmytag =
|
|
||||||
|
|
||||||
[mail]
|
|
||||||
# Extract the X-My-Tag mail header, and use it internally with the
|
|
||||||
# mailmytag field name
|
|
||||||
x-my-tag = mailmytag
|
|
||||||
|
|
||||||
----
|
|
||||||
|
|
||||||
Limitations:
|
|
||||||
|
|
||||||
- The mail filter will only process the first instance for a header
|
|
||||||
occurring several times.
|
|
||||||
- No decoding will take place (ie for non-ascii headers which would have
|
|
||||||
some kind of encoding).
|
|
||||||
@ -1,32 +0,0 @@
|
|||||||
== Indexing Mozilla calendar data
|
|
||||||
|
|
||||||
Mozilla calendar programs (*Sunbird*, *Lightning*) do not store their
|
|
||||||
data in +ics+ files natively. They use an *SQLite* database (the
|
|
||||||
'storage.sdb' file inside the profile). This means that calendar data
|
|
||||||
cannot be indexed directly.
|
|
||||||
|
|
||||||
To get Recoll to index calendar data, you need to export it to an +ics+
|
|
||||||
file. This can be done manually, from the application menus, or, by
|
|
||||||
installing the
|
|
||||||
link:https://addons.mozilla.org/en-US/sunbird/addon/3740[Automatic Export
|
|
||||||
extension].
|
|
||||||
|
|
||||||
The extension can be configured to export the data when exiting the
|
|
||||||
program, or at regular time intervals. You can even set up a command to be
|
|
||||||
executed after the export. If you are not using real time indexing, this
|
|
||||||
can usefully be *recollindex*.
|
|
||||||
|
|
||||||
In _Tools->Add Ons->Automatic Export preferences_, in the _Start an
|
|
||||||
application after export_ subpanel, set _Path of application_ to
|
|
||||||
'/usr/[local/]bin/recollindex' and _Parameters of application_ to
|
|
||||||
something like _-i;/home/me/path/to/nameofexportedcal.ics_
|
|
||||||
|
|
||||||
This will ensure that the calendar is indexed every time it is exported
|
|
||||||
(this is not necessary though, you can let the next batch indexing pass
|
|
||||||
take care of it).
|
|
||||||
|
|
||||||
It may happen that the exported data has some syntax errors which will
|
|
||||||
prevent indexing with the *rclics* filter which was distributed up to
|
|
||||||
Recoll 1.13.04 (included). You may get an updated filter from the
|
|
||||||
link:https://www.recoll.org/download.html[Recoll download page].
|
|
||||||
|
|
||||||
@ -1,24 +0,0 @@
|
|||||||
== Laptops: starting or stopping indexing according to AC power status
|
|
||||||
|
|
||||||
For people using real time indexing on a laptop, kind user "The Doctor"
|
|
||||||
contributed a script to automatically start and stop indexing according to
|
|
||||||
power status. The script can be found here:
|
|
||||||
link:https://opensourceprojects.eu/p/recoll1/code/ci/144da4a5caa2b39d23d9d7cf262f03b6d80a4739/tree/src/desktop/recoll_index_on_ac.sh[recoll_index_on_ac.sh]
|
|
||||||
|
|
||||||
To use it, you need to copy it somewhere (e.g.: '/usr/bin', but any place
|
|
||||||
will do), make it executable (`chmod a+x recoll_index_on_ac.sh`), and edit
|
|
||||||
'~/.config/autostart/recollindex.desktop'
|
|
||||||
|
|
||||||
Change the following line:
|
|
||||||
|
|
||||||
Exec=recollindex -w 60 -m
|
|
||||||
|
|
||||||
to something like the following (depending where you copied the script):
|
|
||||||
|
|
||||||
Exec=/usr/bin/recoll_index_on_ac.sh
|
|
||||||
|
|
||||||
You may also want to change
|
|
||||||
'/usr/share/recoll/examples/recollindex.desktop', otherwise your change
|
|
||||||
will be reverted the next time you toggle real time indexing through the
|
|
||||||
GUI. And, yes, sorry about it, _this_ change will be lost on the next
|
|
||||||
Recoll update, so save a copy.
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
== Indexing Outlook archives ==
|
|
||||||
|
|
||||||
Recoll has no direct support for indexing Microsoft Outlook data, because,
|
|
||||||
if you are a Windows user, you probably are not a good customer for Linux
|
|
||||||
desktop indexing...
|
|
||||||
|
|
||||||
However, if you have a need to index Outlook data at some point, I can
|
|
||||||
recommend the excellent link:http://www.five-ten-sg.com/libpst/[libpst]
|
|
||||||
library and its link:http://www.five-ten-sg.com/libpst/rn01re01.html[readpst]
|
|
||||||
utility. Using this you can very easily convert the Outlook data into MH or
|
|
||||||
mbox format, and then index the result with Recoll.
|
|
||||||
@ -1,29 +0,0 @@
|
|||||||
== Indexing Web history with the Firefox extension ==
|
|
||||||
|
|
||||||
Note: this document is valid for Recoll versions from 1.18.
|
|
||||||
|
|
||||||
The link:http://sourceforge.net/projects/recollfirefox/[Recoll Firefox
|
|
||||||
extension]
|
|
||||||
works together with Recoll to index the Web pages that you visit. The
|
|
||||||
extension is based on an older one which was initially written for the
|
|
||||||
Beagle indexer.
|
|
||||||
|
|
||||||
The extension works by copying the data for the visited pages to a queue
|
|
||||||
directory ('~/.recollweb/ToIndex' by default), from which they are
|
|
||||||
indexed and removed by Recoll, and then stored in a local cache.
|
|
||||||
|
|
||||||
The extension is now hosted on the Mozilla add-ons site, so you can install
|
|
||||||
it very simply in Firefox: link:https://addons.mozilla.org/fr/firefox/addon/recoll-indexer-1/[Recoll Firefox add-on page].
|
|
||||||
|
|
||||||
This feature can be enabled in the Recoll GUI index configuration panel
|
|
||||||
(Web history section), or by editing the configuration file (set
|
|
||||||
+processwebqueue+ to 1).
|
|
||||||
|
|
||||||
Please remember that Recoll only stores a limited amount of cached web data
|
|
||||||
(adjustable from the GUI Index Configuration section), and that old pages
|
|
||||||
will be purged from the index. Pages that you want to archive permanently
|
|
||||||
need to be saved elsewhere, as they will otherwise eventually disappear
|
|
||||||
from the Recoll results.
|
|
||||||
|
|
||||||
Recoll will index +.maff+ files, which may be a better choice for archival
|
|
||||||
usage.
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
.SUFFIXES: .txt .html
|
|
||||||
|
|
||||||
.txt.html:
|
|
||||||
asciidoc $<
|
|
||||||
|
|
||||||
all: $(addsuffix .html,$(basename $(wildcard *.txt)))
|
|
||||||
|
|
||||||
clean:
|
|
||||||
rm *.html
|
|
||||||
@ -1,97 +0,0 @@
|
|||||||
== Creating and using multiple indexes
|
|
||||||
|
|
||||||
=== Why would you want to do this ?
|
|
||||||
|
|
||||||
- Easy adjustment of search areas: you can filter results by using the
|
|
||||||
directory filter in the advanced search panel, but, if you have
|
|
||||||
separate well-defined places where you store different kinds of data,
|
|
||||||
it is easier to maintain separate indexes and use the External indexes
|
|
||||||
dialog to switch them on or off, and it will also yield much better
|
|
||||||
search performance.
|
|
||||||
- Shared indexes: it may be useful to maintain one or several indexes
|
|
||||||
for shared data, and separate personal indexes for each user. Indexes
|
|
||||||
can be shared over the network.
|
|
||||||
- Creating separate indexes for removable volumes.
|
|
||||||
|
|
||||||
=== How to do it
|
|
||||||
|
|
||||||
As an example we'll suppose that you have Recoll installed and indexing
|
|
||||||
your home directory, and that you would like to have a separate index for
|
|
||||||
'/usr/share/doc'.
|
|
||||||
|
|
||||||
You need to create a separate configuration for the new index, then add it
|
|
||||||
to the external indexes list in the user interface, and activate it as
|
|
||||||
needed.
|
|
||||||
|
|
||||||
. Create a directory for the new index, and create an empty configuration
|
|
||||||
file
|
|
||||||
+
|
|
||||||
----
|
|
||||||
cd
|
|
||||||
mkdir .recoll-sharedoc
|
|
||||||
touch .recoll-sharedoc/recoll.conf
|
|
||||||
----
|
|
||||||
. Either edit the new configuration by hand or start recoll to use the GUI
|
|
||||||
configuration editor.
|
|
||||||
+
|
|
||||||
----
|
|
||||||
cd .recoll-sharedoc
|
|
||||||
echo "topdirs = /usr/share/doc" > recoll.conf
|
|
||||||
# OR
|
|
||||||
recoll -c ~/.recoll-sharedoc
|
|
||||||
----
|
|
||||||
+
|
|
||||||
If using the GUI, click _Cancel_ when asked, to start the configuration
|
|
||||||
editor.
|
|
||||||
|
|
||||||
. Perform initial indexing. If you chose the GUI route, indexing will
|
|
||||||
start as soon as you leave the configuration editor. Else, on the
|
|
||||||
command line:
|
|
||||||
+
|
|
||||||
----
|
|
||||||
recollindex -c ~/.recoll-sharedoc
|
|
||||||
----
|
|
||||||
. Optionally set up *cron* to perform nightly indexing, use +crontab -e+
|
|
||||||
and insert a line like the following:
|
|
||||||
+
|
|
||||||
----
|
|
||||||
45 20 * * * recollindex -c ~/.recoll-sharedoc
|
|
||||||
----
|
|
||||||
+
|
|
||||||
This would start the indexing at 20:45. `crontab -e` will use the *vi*
|
|
||||||
editor by default, you can change this by using the EDITOR
|
|
||||||
environment variable. Example: `EDITOR=kate crontab -e`
|
|
||||||
Your favorite desktop may also have a dedicated tool to add crontab entries.
|
|
||||||
|
|
||||||
. Start recoll and choose the _Preferences->External_ index dialog menu
|
|
||||||
entry, then click the Browse button (near the bottom), and select the
|
|
||||||
new index Xapian database directory '~/.recoll-sharedoc/xapiandb'
|
|
||||||
Then click _Add index_.
|
|
||||||
|
|
||||||
. You can then activate or deactivate the new index by clicking the box
|
|
||||||
in front of the directory name in the list.
|
|
||||||
|
|
||||||
When adding an index shared by multiple users, it may be helpful to use the
|
|
||||||
RECOLL_EXTRA_DBS environment variable instead of editing individual
|
|
||||||
configurations, see the manual for more details.
|
|
||||||
|
|
||||||
=== Paths adjustments
|
|
||||||
|
|
||||||
When sharing indexes over a network, in most cases, the indexed data will
|
|
||||||
be accessible through different paths on the different hosts. This will
|
|
||||||
prevent the Preview and Open functions from working because the paths they get
|
|
||||||
from the index do not match the ones which are usable from the local
|
|
||||||
host.
|
|
||||||
|
|
||||||
For example my home directory is accessed as '/home/me' on my home
|
|
||||||
machine, and as '/net/myhost/home/me' on other hosts. By default, trying
|
|
||||||
to access a result from a remote host would use the first path, when the
|
|
||||||
second is the one that would work.
|
|
||||||
|
|
||||||
As of release 1.19 **Recoll** has a facility to perform index-dependent
|
|
||||||
path translations. This facility is accessible from the _external index
|
|
||||||
dialog_ in the GUI preferences. Paths translations can be set for the main
|
|
||||||
index if no index is selected (rarely useful), or for the selected
|
|
||||||
additional index.
|
|
||||||
link:../usermanual/webhelp/docs/RCL.SEARCH.PTRANS.html[See
|
|
||||||
the manual] for more detail.
|
|
||||||
@ -1,77 +0,0 @@
|
|||||||
== Interfacing Recoll and Mutt
|
|
||||||
|
|
||||||
It is possible to either use Mutt as a Recoll search result viewer, or
|
|
||||||
start Recoll from the Mutt search.
|
|
||||||
|
|
||||||
=== Starting Mutt to view Recoll search results
|
|
||||||
|
|
||||||
This method and the associated
|
|
||||||
link:http://www.recoll.org/files/recoll2mutt[recoll2mutt script] were kindly
|
|
||||||
contributed by Morten Langlo.
|
|
||||||
|
|
||||||
This allows finding mail messages in recoll and then calling *mutt*
|
|
||||||
or *mutt-kz* to read or process the mail.
|
|
||||||
|
|
||||||
Installation:
|
|
||||||
|
|
||||||
- Copy the link:http://www.recoll.org/files/recoll2mutt[recoll2mutt script]
|
|
||||||
somewhere in your PATH, and make it executable.
|
|
||||||
- In the **recoll** GUI menus:
|
|
||||||
_Preferences->GUI configuration->User interface->Choose editor applications_
|
|
||||||
change the entry for "message/rfc822" to: +recoll2mutt %f+
|
|
||||||
|
|
||||||
The script has options for setting a number of parameters, you may not need
|
|
||||||
to set any of them, the defaults are:
|
|
||||||
|
|
||||||
- -c mutt
|
|
||||||
- -F .muttrc
|
|
||||||
- -m Mail
|
|
||||||
- -x "-fn 10*20 -geometry 115x40"
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
----
|
|
||||||
recoll2mutt -c mutt-kz -F .mutt_kzrc -m Mail -x "-fn 10*20 -geometry 115x40" %f
|
|
||||||
----
|
|
||||||
|
|
||||||
The option +-x+ is passed to *xterm*, which is used to call *mutt* or
|
|
||||||
*mutt-kz*.
|
|
||||||
|
|
||||||
The script works for both _mbox_ and _maildir_ mail boxes, and it
|
|
||||||
expects the configuration file for mutt and the mail directory to reside in
|
|
||||||
your $HOME and the spool file to be '/var/spool/mail/$USER' if it is
|
|
||||||
not in your mail directory. But it is easy to change the values in the
|
|
||||||
script if you need to.
|
|
||||||
|
|
||||||
*mutt* is opened with the right mailbox and limit set to _Date_ and
|
|
||||||
_Sender_. In theory you could set limit to _Message-Id_, but very often
|
|
||||||
*mutt* reports that there are invalid patterns in _Message-Id_, so play it
|
|
||||||
safe, even though all emails in the opened mail box with the same date from
|
|
||||||
the sender are shown.
|
|
||||||
|
|
||||||
|
|
||||||
=== Starting Recoll from the Mutt search
|
|
||||||
|
|
||||||
This will work only when using maildir storage (messages in individual
|
|
||||||
files). It will not work with mailbox files. The latter would probably be
|
|
||||||
possible by extracting the individual result messages using the Python
|
|
||||||
interface, but I did not try.
|
|
||||||
|
|
||||||
The classic way to interface Mutt and a search application is to create a
|
|
||||||
shortcut to an external command which creates a temporary Maildir
|
|
||||||
containing the search results.
|
|
||||||
|
|
||||||
There is such a script for Recoll, you will find it link:https://bitbucket.org/medoc/recoll/raw/41d41799dbac4c69a34db985b3ab9f1597c9c742/src/python/samples/mutt-recoll.py[here].
|
|
||||||
|
|
||||||
Copy the script somewhere in your PATH, and make it executable, then add
|
|
||||||
the following line to your '.muttrc':
|
|
||||||
|
|
||||||
|
|
||||||
----
|
|
||||||
|
|
||||||
macro index S "<enter-command>unset wait_key<enter><shell-escape>mutt-recoll.py -G<enter><change-folder-readonly>~/.cache/mutt_results<enter>" \
|
|
||||||
"search mail (using recoll)"
|
|
||||||
|
|
||||||
----
|
|
||||||
|
|
||||||
Obviously, you can replace the 'S' letter with whatever will suit you (e.g:/)
|
|
||||||
@ -1,85 +0,0 @@
|
|||||||
== Unix and non-ASCII file names, a summary of issues
|
|
||||||
|
|
||||||
Unix/Linux file and directory names are binary byte C strings. Only the
|
|
||||||
null byte and the slash character (/) are forbidden inside a name,
|
|
||||||
nowhere does the kernel interpret the strings as meaningful or
|
|
||||||
printable.
|
|
||||||
|
|
||||||
In the old times, all utilities that would display to the user were
|
|
||||||
ASCII-based, and people would use pure printable ASCII file names (even
|
|
||||||
using space characters inside names was a cause for trouble). Non
|
|
||||||
alphanumeric characters were exclusively used for playing tricks on
|
|
||||||
colleagues. And all was well.
|
|
||||||
|
|
||||||
Then the devil came under the guise of accented 8 bit characters. The
|
|
||||||
system has no problem with them, file names are still binary C strings, but
|
|
||||||
the utilities have to display them or take them as input, and, because
|
|
||||||
there is no encoding specification stored with the file names, they can
|
|
||||||
only do this according to the character encoding taken from the user's
|
|
||||||
current locale.
|
|
||||||
|
|
||||||
For example fr_FR.UTF-8, and fr_FR.ISO8859-1 could be used simultaneously
|
|
||||||
on the same system (by different users), but they are completely
|
|
||||||
incompatible: ISO-8859-1 strings are illegal when viewed in a UTF-8 locale
|
|
||||||
(will display as question marks or some other conventional error
|
|
||||||
marker). UTF-8 strings will display as gibberish in an ISO-8859-1 locale.
|
|
||||||
|
|
||||||
This means that the file names created by a UTF-8 user are displayed as
|
|
||||||
garbage to the ISO-8859 one...
|
|
||||||
|
|
||||||
If you ever change your locale, your old files are still there and named
|
|
||||||
the same (in the binary sense), but the names display badly and you have
|
|
||||||
great trouble inputting them. If you add distributed (NFS) file system
|
|
||||||
issues, things become totally unmanageable. Also think about archives sent
|
|
||||||
from another system with a different encoding.
|
|
||||||
|
|
||||||
For what concerns Recoll:
|
|
||||||
|
|
||||||
- The file names inside recoll.conf are not transcoded, they are taken as
|
|
||||||
binary strings (mostly, only +\n+ and +space+ are a bit special), and
|
|
||||||
passed as is to the system. So if you edit 'recoll.conf' with a text
|
|
||||||
editor, inside the same locale that is or has been used for file names,
|
|
||||||
you'll be fine.
|
|
||||||
- There was a bug in the GUI configuration tool, up to 1.12, it should
|
|
||||||
transcode between the internal Qt format and locale-dependent strings,
|
|
||||||
but it doesn't or does it badly.
|
|
||||||
- There is also an exception for the +unac_except_trans+ variable, this
|
|
||||||
*has* to be UTF-8, so if the rest of the file uses another encoding,
|
|
||||||
you'll need to edit two separate files and concatenate them.
|
|
||||||
|
|
||||||
As of version 1.13, Recoll uses local8Bit()/fromLocal8Bit() to convert
|
|
||||||
recoll.conf file names from/to QStrings (it uses UTF-8 for all string
|
|
||||||
values which are not file names).
|
|
||||||
|
|
||||||
The Qt file dialog is broken (at least was, I have not checked this on
|
|
||||||
recent versions). It should consider file paths as almost-binary data, not
|
|
||||||
QStrings, but doesn't. In consequence, things are even more broken than
|
|
||||||
necessary as seen from there:
|
|
||||||
|
|
||||||
With LANG="C", non-ASCII paths can't be used at all:
|
|
||||||
|
|
||||||
- Strings read from recoll.conf are stripped of 8bit characters before display.
|
|
||||||
- Directory entries with 8bit characters are not displayed at all in the
|
|
||||||
selection dialog.
|
|
||||||
|
|
||||||
With LANG="fr_FR.UTF-8", only UTF-8 paths can be used:
|
|
||||||
|
|
||||||
- Strings read from recoll.conf are damaged when converted to QString
|
|
||||||
(except those that were actually UTF-8)
|
|
||||||
- Only the UTF-8 directory entries are displayed in the selection dialog.
|
|
||||||
|
|
||||||
|
|
||||||
With LANG="fr_FR.iso8859-1", everything works ok.
|
|
||||||
|
|
||||||
- Strings read from recoll.conf are displayed with weird characters if
|
|
||||||
they use another encoding such as UTF-8, but are correctly maintained
|
|
||||||
and can be read back from the dialogs and rewritten without damage.
|
|
||||||
- Directory entries with 8 bit characters are displayed weirdly (normal),
|
|
||||||
but can be manipulated without trouble (this includes utf-8 names of
|
|
||||||
course).
|
|
||||||
|
|
||||||
In conclusion, only the iso-8859 locales can be used for handling mixed
|
|
||||||
encoding situations. This is a possible workaround for people who need it.
|
|
||||||
|
|
||||||
More data about path encoding issues:
|
|
||||||
http://www.dwheeler.com/essays/fixing-unix-linux-filenames.html
|
|
||||||
@ -1,71 +0,0 @@
|
|||||||
== Starting native applications
|
|
||||||
|
|
||||||
It is sometimes difficult to start a native application on a result
|
|
||||||
document, especially when the result comes from a container file (ie: email
|
|
||||||
folder file, chm file).
|
|
||||||
|
|
||||||
The problem is that native applications usually expect at most a file name
|
|
||||||
on the command line, and sometimes not even that (emailers).
|
|
||||||
|
|
||||||
The _Open parent documents_ link in the result list right click menu is
|
|
||||||
sometimes useful in this situation (e.g.: +chm+ files).
|
|
||||||
|
|
||||||
In some other cases it may help that Recoll does make a lot of data
|
|
||||||
available to the application. This data may have to be pre-processed in a
|
|
||||||
script before calling the actual application.
|
|
||||||
|
|
||||||
Details about configuring how the native application or script are called
|
|
||||||
are given with the
|
|
||||||
link:http://www.recoll.org/usermanual/usermanual.html#RCL.INSTALL.CONFIG.MIMEVIEW[description of the mimeview configuration file]
|
|
||||||
|
|
||||||
Information about
|
|
||||||
link:http://www.recoll.org/usermanual/usermanual.html#RCL.INSTALL.CONFIG.FIELDS[configuring
|
|
||||||
customised fields] may also be useful in combination.
|
|
||||||
|
|
||||||
=== Example
|
|
||||||
|
|
||||||
This is a simple example, because it does not need to use special
|
|
||||||
fields. It just shows how to solve a simple issue by using an intermediary
|
|
||||||
script. The problem is due to the fact that thunderbird's +-file+ option
|
|
||||||
won't open a file if the extension is not '.eml'. Jorge, the kind Recoll
|
|
||||||
user who supplied the example stores his email in Maildir++ format, the
|
|
||||||
file names have no extension, so an intermediary script is necessary to get
|
|
||||||
thunderbird to open them:
|
|
||||||
|
|
||||||
Note that this only works with messages stored in Maildir or MH format (one
|
|
||||||
message per file). As far as I know, there is no way to get Thunderbird to
|
|
||||||
open an arbitrary mbox file.
|
|
||||||
|
|
||||||
The 'recoll-thunderbird-open-file' script:
|
|
||||||
|
|
||||||
----
|
|
||||||
#!/bin/sh
|
|
||||||
cp $1 /tmp/$$.eml
|
|
||||||
thunderbird -file /tmp/$$.eml
|
|
||||||
----
|
|
||||||
|
|
||||||
Create the file in an editor, save it somewhere, and make it executable
|
|
||||||
(`chmod +x recoll-thunderbird-open-file`).
|
|
||||||
|
|
||||||
The mail line in the '~/.recoll/mimeview' file:
|
|
||||||
|
|
||||||
----
|
|
||||||
[view]
|
|
||||||
message/rfc822 = recoll-thunderbird-open-file %f
|
|
||||||
----
|
|
||||||
|
|
||||||
If the place where you saved the script is not in your PATH, you will need
|
|
||||||
to use the full path instead of just the script name, as in
|
|
||||||
|
|
||||||
----
|
|
||||||
[view]
|
|
||||||
message/rfc822 = /home/me/somewhere/recoll-thunderbird-open-file %f
|
|
||||||
----
|
|
||||||
|
|
||||||
You should then be able to open the messages in Thunderbird, which is
|
|
||||||
useful, for example, to handle the attachments.
|
|
||||||
|
|
||||||
With recent Recoll versions, if using the normal option of letting the
|
|
||||||
Desktop choose the _Open_ application to use (_Use Desktop default_),
|
|
||||||
you should also add +message/rfc822+ to the exceptions, and the whole
|
|
||||||
thing is probably more easily done from the Recoll GUI.
|
|
||||||
@ -1,30 +0,0 @@
|
|||||||
== Preventing indexing in a directory
|
|
||||||
|
|
||||||
=== Why would you want to do this ?
|
|
||||||
|
|
||||||
By default, recollindex (or the indexing thread inside the recoll QT user
|
|
||||||
interface) will process your home directory and most of its subdirectories,
|
|
||||||
with the exception of some well known places (thumbnails, beagle and web
|
|
||||||
browser caches, etc.)
|
|
||||||
|
|
||||||
You may want to prevent indexing in some directories where you don't expect
|
|
||||||
interesting search results. This will avoid polluting the search result
|
|
||||||
lists, speed up indexing times and make the index smaller.
|
|
||||||
|
|
||||||
=== How to do it
|
|
||||||
|
|
||||||
There are two ways to block indexing at certain points: either by listing
|
|
||||||
specific paths, or by directory name pattern matches.
|
|
||||||
|
|
||||||
- Blocking specific paths: this is controlled by the 'skippedPaths'
|
|
||||||
variable in the main configuration file. You can adjust the value either
|
|
||||||
by editing the file or by using the indexing configuration dialog:
|
|
||||||
_Preferences->Indexing configuration->Global parameters->Skipped paths_
|
|
||||||
- Using pattern matches: these are listed in the skippedNames variable in
|
|
||||||
the main configuration file. You can adjust the value either by editing
|
|
||||||
the file or by using the GUI: _Preferences->Indexing configuration->Local
|
|
||||||
parameters->Skipped names_
|
|
||||||
|
|
||||||
The
|
|
||||||
link:../usermanual/webhelp/docs/RCL.INSTALL.CONFIG.RECOLLCONF.WHATDOCS.html[configuration
|
|
||||||
section] of the manual has a bit more detail about the two variables.
|
|
||||||
@ -1,157 +0,0 @@
|
|||||||
== Gathering useful data for asking help about or reporting a Recoll issue
|
|
||||||
|
|
||||||
Once in a while it will happen that a Recoll program will either signal an
|
|
||||||
error, or even crash (either the *recoll* graphical interface or the
|
|
||||||
*recollindex* command line indexing command).
|
|
||||||
|
|
||||||
Reporting errors and crashes is very useful. It can help others, and it can
|
|
||||||
get your own problem solved.
|
|
||||||
|
|
||||||
Any problem report should include the exact Recoll and system versions.
|
|
||||||
|
|
||||||
If at all possible, reading the following and performing part of the
|
|
||||||
suggested steps will be useful. This is not a condition for obtaining help
|
|
||||||
though ! If you have any problem and have a difficulty with the following,
|
|
||||||
just contact the mailing list or the developers (see contacts on
|
|
||||||
link:https://www.recoll.org/support.html[the Recoll site support page]).
|
|
||||||
|
|
||||||
If the problem concerns indexing, and was initially found using the
|
|
||||||
*recoll* GUI, you should try to reproduce it using the
|
|
||||||
*recollindex* command-line indexer, which is much simpler and easier to
|
|
||||||
debug.
|
|
||||||
|
|
||||||
There are then two sources of useful information to diagnose the issue: the
|
|
||||||
debug log file and, possibly, in case of a crash, a stack trace.
|
|
||||||
|
|
||||||
Crash and other problem reports are of very high value to me, and I am
|
|
||||||
willing to help you with any of the steps described below if it is not
|
|
||||||
familiar to you. I do realize that not everybody is a programmer or a
|
|
||||||
system administrator.
|
|
||||||
|
|
||||||
=== Obtaining information from the log file
|
|
||||||
|
|
||||||
All Recoll commands write a varying amount of information to a common log file.
|
|
||||||
|
|
||||||
_All commands use the same log, and the file is reset every time a command
|
|
||||||
is started: so it is important to make a copy right after the problem
|
|
||||||
occurs (for example, do not start *recoll* after a *recollindex*
|
|
||||||
crash, this would reset the log). A workaround for this issue is to let the
|
|
||||||
messages go to the default +stderr+, and redirect this._
|
|
||||||
|
|
||||||
By default, the messages are output to +stderr+, and you probably don't even
|
|
||||||
see them if Recoll is started from the desktop. In this case, you need to
|
|
||||||
set the parameters so that output goes to a file, and the appropriate
|
|
||||||
verbosity level is set. When using the command-line, you may actually
|
|
||||||
prefer to redirect stderr to avoid the log-truncating issue described
|
|
||||||
above.
|
|
||||||
|
|
||||||
You can set the log parameters from the GUI _Indexing parameters_
|
|
||||||
section or by editing the '~/.recoll/recoll.conf' file: set the
|
|
||||||
+loglevel+ and +logfilename+ parameters. E.g.:
|
|
||||||
|
|
||||||
----
|
|
||||||
loglevel = 6
|
|
||||||
logfilename = /tmp/recolltrace
|
|
||||||
----
|
|
||||||
|
|
||||||
The log file can become very big if you need a big indexing run to
|
|
||||||
reproduce the problem. Choose a file system with enough space available
|
|
||||||
(possibly a few gigabytes).
|
|
||||||
|
|
||||||
Then run the sequence that leads to the problem, and make a copy of the log
|
|
||||||
file just after. If the log is too big, it will usually be sufficient to
|
|
||||||
use the last 500 lines or so (tail -500).
|
|
||||||
|
|
||||||
==== Single file indexing issues
|
|
||||||
|
|
||||||
When the problem concerns, or can be reproduced with, a single file it is
|
|
||||||
very cumbersome to have to run a full indexing pass to reproduce it. There
|
|
||||||
are two ways around this:
|
|
||||||
|
|
||||||
- Set up an ad hoc configuration with only the file of interest, or its
|
|
||||||
parent directory:
|
|
||||||
----
|
|
||||||
cd
|
|
||||||
mkdir recoll-test
|
|
||||||
cd recoll-test
|
|
||||||
echo /path/to/my/file/or/its/parent/dir > recoll.conf
|
|
||||||
echo 'loglevel = 6' >> recoll.conf
|
|
||||||
echo 'logfilename = /tmp/recolltrace' >> recoll.conf
|
|
||||||
recollindex -z -c .
|
|
||||||
----
|
|
||||||
- Use the -e and -i options to recollindex to erase/reindex a single
|
|
||||||
file. Set up the log, then:
|
|
||||||
----
|
|
||||||
recollindex -e /path/to/my/file
|
|
||||||
recollindex -i /path/to/my/file
|
|
||||||
----
|
|
||||||
|
|
||||||
When using the second approach, you must take care that the path used is
|
|
||||||
consistent with the paths listed/used in the configuration (ie: if '/home' is
|
|
||||||
a link to '/usr/home', and '/usr/home/me' is used in the configuration
|
|
||||||
+topdirs+, `recollindex -i /home/me/myfile` will not work, you need
|
|
||||||
to use `recollindex -i /usr/home/me/myfile`).
|
|
||||||
|
|
||||||
|
|
||||||
=== Obtaining a stack trace
|
|
||||||
|
|
||||||
If the program actually crashes, and in order to maximize usefulness, a
|
|
||||||
crash report should also include a so-called stack trace, something that
|
|
||||||
indicates what the program was doing when it crashed. Getting a useful
|
|
||||||
stack trace is not very difficult, but it may need a little work on your
|
|
||||||
part (which will then enable me to do my part of the work).
|
|
||||||
|
|
||||||
If your distribution includes a separate package for Recoll debugging
|
|
||||||
symbols, it probably also has a page on its web site explaining how to use
|
|
||||||
them to get a stack trace. You should follow these instructions. If there
|
|
||||||
is no debugging package, you should follow the instructions below. A little
|
|
||||||
familiarity with the command line will be necessary.
|
|
||||||
|
|
||||||
==== Compiling and installing a debugging version
|
|
||||||
|
|
||||||
- Obtain the recoll source for the version you are using (www.recoll.org),
|
|
||||||
and extract the source tree.
|
|
||||||
- Follow the
|
|
||||||
link:http://www.lesbonscomptes.com/recoll/usermanual/rcl.install.building.html[instructions
|
|
||||||
for building Recoll from source] with the following modifications:
|
|
||||||
- Before running configure, edit the mk/localdefs.in file and remove the
|
|
||||||
-O2 option(s).
|
|
||||||
- When running configure, specify the standard installation location for
|
|
||||||
your system as a prefix (to avoid ending up with two installed versions,
|
|
||||||
which would almost certainly end in confusion). On Linux this would
|
|
||||||
typically be: `configure --prefix=/usr`
|
|
||||||
- When installing, arrange for the installed executables not to be stripped
|
|
||||||
of debugging symbols by specifying a value for the STRIP environment
|
|
||||||
variable (ie: *echo* or *ls*): `sudo make install STRIP=ls`
|
|
||||||
|
|
||||||
==== Getting a core dump
|
|
||||||
|
|
||||||
You will need to run the operation that caused the crash inside a writable
|
|
||||||
directory, and tell the system that you accept core dumps. The commands
|
|
||||||
need to be run in a shell inside a terminal window. E.g.:
|
|
||||||
|
|
||||||
----
|
|
||||||
cd
|
|
||||||
ulimit -c unlimited
|
|
||||||
recoll #(or recollindex or whatever you want to run).
|
|
||||||
----
|
|
||||||
|
|
||||||
Hopefully, you will succeed in getting the command to crash, and you will
|
|
||||||
get a core file. A possible approach then would be to make both the
|
|
||||||
executable and the core files available to me by uploading it to a file
|
|
||||||
sharing site (the core file may be quite big). You should be aware though
|
|
||||||
that the core file may contain some of the data that was being indexed,
|
|
||||||
which may be a privacy issue. Another approach is to generate the stack
|
|
||||||
trace yourself.
|
|
||||||
|
|
||||||
=== Using gdb to get a stack trace
|
|
||||||
|
|
||||||
- Install gdb if it is not already on the system.
|
|
||||||
- Run gdb on the command that crashed and the core file (depending on the
|
|
||||||
system, the core file may be named "core" or something else, like
|
|
||||||
recollindex.core, or core.pid), ie: `gdb /usr/bin/recollindex core`
|
|
||||||
- Inside gdb, you need to use different commands to get a stack trace for
|
|
||||||
recoll and recollindex. For recollindex you can use the bt command. For
|
|
||||||
recoll use `thread apply all bt full`
|
|
||||||
- Copy/paste the output to your report email :), and quit gdb ("q").
|
|
||||||
|
|
||||||
@ -1,61 +0,0 @@
|
|||||||
== Starting native applications ==
|
|
||||||
|
|
||||||
Another example of using an intermediary script for an application with a
|
|
||||||
command line syntax which can't be directly defined in mimeview.
|
|
||||||
|
|
||||||
We use a script to preprocess and adapt the options before calling the
|
|
||||||
actual command.
|
|
||||||
|
|
||||||
Details about configuring how the native application or script are called
|
|
||||||
are given with the
|
|
||||||
link:http://www.recoll.org/usermanual/usermanual.html#RCL.INSTALL.CONFIG.MIMEVIEW[description
|
|
||||||
of the mimeview configuration file].
|
|
||||||
|
|
||||||
*qpdfview* (link:http://launchpad.net/qpdfview[web site]) is a very
|
|
||||||
lightweight tabbed PDF viewer with great search performance and result
|
|
||||||
highlighting.
|
|
||||||
|
|
||||||
It does support parsing the search term and page number from the command
|
|
||||||
line with the following syntax:
|
|
||||||
|
|
||||||
----
|
|
||||||
qpdfview --unique "%f"#%p --search "%s"
|
|
||||||
----
|
|
||||||
|
|
||||||
However, qpdfview will not launch if either %p or %s are empty in the
|
|
||||||
command above. To accommodate for that, Recoll user Florian has written a
|
|
||||||
small wrapper shell script:
|
|
||||||
|
|
||||||
----
|
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
qpdfviewpath=qpdfview
|
|
||||||
|
|
||||||
if [ -z $2 ]
|
|
||||||
then
|
|
||||||
page=""
|
|
||||||
|
|
||||||
else
|
|
||||||
page="#"$2""
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -z $3 ]
|
|
||||||
then
|
|
||||||
search=""
|
|
||||||
|
|
||||||
else
|
|
||||||
search="--search "$3""
|
|
||||||
fi
|
|
||||||
|
|
||||||
$qpdfviewpath --unique "$1"$page $search >&0 2>&0 &
|
|
||||||
----
|
|
||||||
|
|
||||||
|
|
||||||
The corresponding handler line for Recoll would be (depending on how you
|
|
||||||
name the script and where you store it):
|
|
||||||
|
|
||||||
----
|
|
||||||
qpdfviewwrapper %f %p %s
|
|
||||||
----
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,18 +0,0 @@
|
|||||||
== Querying Recoll from a C program
|
|
||||||
|
|
||||||
The easiest way to query Recoll from a C or C++ program is to execute an
|
|
||||||
external search command (`recollq` or `recoll -t`).
|
|
||||||
|
|
||||||
I have written a simple C module which deals with the related housekeeping
|
|
||||||
and presents an easy to use API to the rest of the code. You will find it
|
|
||||||
here:
|
|
||||||
|
|
||||||
https://bitbucket.org/medoc/recoll-capi
|
|
||||||
|
|
||||||
It is a bit experimental and will only work with recoll 1.20 for now
|
|
||||||
(because it uses a new option for recollq). However it would be trivial to
|
|
||||||
modify for working with 1.19, get in touch with me if you need this.
|
|
||||||
|
|
||||||
The other approach is to link with the Recoll library. This has no official
|
|
||||||
API, but in practice, the internal one is fairly stable, and if you want to
|
|
||||||
choose this approach, you should start from the code in recollq.cpp
|
|
||||||
@ -1,58 +0,0 @@
|
|||||||
== Replacing the Category filter controls
|
|
||||||
|
|
||||||
The document category filter controls normally appear at the top of the
|
|
||||||
*recoll* GUI, either as checkboxes just above the result list, or as a
|
|
||||||
dropbox in the tool area.
|
|
||||||
|
|
||||||
By default, they are labeled _Media_, _Message_, _Spreadsheet_, _Text_,
|
|
||||||
etc. and each map to a document category.
|
|
||||||
|
|
||||||
The mapping used to be fixed. You could change the number and composition
|
|
||||||
of categories by redefining them inside the 'mimeconf' configuration
|
|
||||||
file (you still can), but the filters always used document categories.
|
|
||||||
|
|
||||||
Categories can also be selected from the query language by using an
|
|
||||||
+rclcat:+ selector. E.g.: _rclcat:message_.
|
|
||||||
|
|
||||||
As of Recoll release 1.17, the filters are not hard-wired any more. They
|
|
||||||
map to query language fragments. This means that you can freely redefine
|
|
||||||
what they do.
|
|
||||||
|
|
||||||
The associations are configured inside the 'mimeconf' file, in the
|
|
||||||
+[guifilters]+ section. Most GUI parameters are stored in the *Qt*
|
|
||||||
configuration file, so this is not entirely consistent, and you will have
|
|
||||||
to bear with my laziness here.
|
|
||||||
|
|
||||||
A simple example will hopefully make things clearer. If you add the
|
|
||||||
following to your '~/.recoll/mimeconf' file:
|
|
||||||
|
|
||||||
----
|
|
||||||
[guifilters]
|
|
||||||
|
|
||||||
Big Books = dir:"~/My Books" size>10K
|
|
||||||
My Docs = dir:"~/My Documents"
|
|
||||||
Small Books = dir:"~/My Books" size<10K
|
|
||||||
System Docs = dir:/usr/share/doc
|
|
||||||
|
|
||||||
----
|
|
||||||
|
|
||||||
You will have four filter checkboxes, labelled _Big Books_, _My Docs_, etc.
|
|
||||||
|
|
||||||
The text after the equal sign must be a valid query language fragment, and
|
|
||||||
will be translated to a *Recoll* query and combined with the rest of the
|
|
||||||
query with an AND conjunction.
|
|
||||||
|
|
||||||
Any name text before a colon character will be erased in the display, but
|
|
||||||
used for sorting. You can use this to display the checkboxes in any order
|
|
||||||
you like. For example, the following would do exactly the same as above,
|
|
||||||
but ordering the checkboxes in the reverse order.
|
|
||||||
|
|
||||||
----
|
|
||||||
[guifilters]
|
|
||||||
|
|
||||||
d:Big Books = dir:"~/My Books" size>10K
|
|
||||||
c:My Docs = dir:"~/My Documents"
|
|
||||||
b:Small Books = dir:"~/My Books" size<10K
|
|
||||||
a:System Docs = dir:/usr/share/doc
|
|
||||||
|
|
||||||
----
|
|
||||||
@ -1,23 +0,0 @@
|
|||||||
== Result list thumbnails and how to create them
|
|
||||||
|
|
||||||
Recoll will display thumbnails for the results if the images exist in the
|
|
||||||
standard location ('$HOME/.thumbnails' or '$HOME/.cache/thumbnails' depending
|
|
||||||
on the xdg version).
|
|
||||||
|
|
||||||
But it will not create thumbnails, mainly because it is very hard to do
|
|
||||||
portably.
|
|
||||||
|
|
||||||
Thumbnails are most commonly created when you visit a directory with your
|
|
||||||
file manager, but visiting the whole file tree just to create thumbnails is
|
|
||||||
a bit tedious.
|
|
||||||
|
|
||||||
One simple trick to create thumbnails from the recoll GUI is to visit the
|
|
||||||
parent directory for a result by using the _Open parent document/folder_
|
|
||||||
entry in the right-click menu.
|
|
||||||
|
|
||||||
You can also find tools for the systematic creation of thumbnails for a
|
|
||||||
directory tree. Three such tools are discussed on this
|
|
||||||
link:http://askubuntu.com/questions/199110/how-can-i-instruct-nautilus-to-pre-generate-pdf-thumbnails[askubuntu.com discussion]
|
|
||||||
|
|
||||||
Also please note that no thumbnails can currently be generated or displayed
|
|
||||||
for embedded documents (attachments, archive members, etc.).
|
|
||||||
@ -1,61 +0,0 @@
|
|||||||
== User configuration backup
|
|
||||||
|
|
||||||
=== Why you would want to do this
|
|
||||||
|
|
||||||
If you are going to reinstall your system, and have some custom
|
|
||||||
configuration, you may save some time by making a backup of your
|
|
||||||
configuration and restoring it on the new system, rather than going through
|
|
||||||
the menus to recreate it.
|
|
||||||
|
|
||||||
=== How to do it
|
|
||||||
|
|
||||||
==== Index/search configuration
|
|
||||||
|
|
||||||
The main recoll configuration data is normally kept inside '~/.recoll' or
|
|
||||||
whatever *$RECOLL_CONFDIR* is set to.
|
|
||||||
|
|
||||||
This directory contains both configuration files and generated index
|
|
||||||
data. In a standard configuration, the following files and directories
|
|
||||||
contain generated data:
|
|
||||||
|
|
||||||
- 'xapiandb' contains the Xapian index, which normally consumes most of the
|
|
||||||
total space.
|
|
||||||
- 'aspdict.en.rws' contains the aspell dictionary used for spelling
|
|
||||||
corrections.
|
|
||||||
- 'mboxcache' contains cached offset data for email messages inside mbox
|
|
||||||
folders.
|
|
||||||
- 'webcache' contains saved web pages. This is more than a cache as
|
|
||||||
destroying it will purge the corresponding data during the next
|
|
||||||
indexing.
|
|
||||||
|
|
||||||
The other files are either very small or contain configuration data.
|
|
||||||
|
|
||||||
If you want to only save configuration, using minimum space, you can
|
|
||||||
destroy the above files and directories (with the possible exception of
|
|
||||||
'webcache'). Then taking a copy of the '.recoll' directory and adding the
|
|
||||||
GUI configuration data described in the next section will get you a full
|
|
||||||
configuration data backup.
|
|
||||||
|
|
||||||
==== GUI configuration
|
|
||||||
|
|
||||||
The parameters set from the _Query configuration_ Qt menus are stored in
|
|
||||||
Qt standard places:
|
|
||||||
|
|
||||||
- '~/.qt/recollrc' for Qt 3.x
|
|
||||||
- '~/.config/Recoll.org/recoll.conf' for Qt 4 and later
|
|
||||||
|
|
||||||
|
|
||||||
==== Other data
|
|
||||||
|
|
||||||
If you wish to save index data in addition to the customisation files,
|
|
||||||
which only makes sense if the document access paths do not change after
|
|
||||||
reinstallation, you can just take a backup of the full '.recoll'
|
|
||||||
directory, taking care that the storage locations for some data elements
|
|
||||||
can be changed (not be inside '.recoll'):
|
|
||||||
|
|
||||||
- The index data is normally kept inside '~/.recoll/xapiandb', but the
|
|
||||||
location of this directory can be modified by the +dbdir+
|
|
||||||
configuration parameter if it is set (check 'recoll.conf').
|
|
||||||
- If you use the Firefox Recoll plugin, the WEB history cache is normally
|
|
||||||
kept inside '~/.recoll/webcache', but the location can be modified by
|
|
||||||
the +webcachedir+ configuration parameter.
|
|
||||||
@ -1,109 +0,0 @@
|
|||||||
== Building and Installing the Ubuntu Unity Recoll Lens
|
|
||||||
|
|
||||||
Important preliminary notes:
|
|
||||||
|
|
||||||
- This only makes sense for Ubuntu versions using the Unity environment:
|
|
||||||
Natty (11.04), Oneiric (11.10), Precise (12.04), and later.
|
|
||||||
- _Remember that you still need to use the recoll GUI (or the recollindex
|
|
||||||
command) to get the indexing going !_
|
|
||||||
- The Lens is artificially limited to showing at most 20 results. Use the
|
|
||||||
recoll GUI for more complete capabilities (or edit rclsearch.py, change
|
|
||||||
the "if actual_results >= 20:" line).
|
|
||||||
|
|
||||||
|
|
||||||
=== The Lens with Recoll 1.17 and later
|
|
||||||
|
|
||||||
If you are willing to install or upgrade to Recoll version 1.17, all
|
|
||||||
necessary packages are on the Recoll PPA, you just need to add the
|
|
||||||
repository to your system sources and add or upgrade the packages: *_This
|
|
||||||
is the recommended approach!_*
|
|
||||||
|
|
||||||
----
|
|
||||||
sudo add-apt-repository ppa:recoll-backports/recoll-1.15-on
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install recoll-lens recoll
|
|
||||||
----
|
|
||||||
|
|
||||||
This document may still be useful if you want to modify the lens source
|
|
||||||
code.
|
|
||||||
|
|
||||||
=== The Lens with older Recoll versions
|
|
||||||
|
|
||||||
If, for some reason, you wish to test the Lens with an older Recoll
|
|
||||||
version, read the following.
|
|
||||||
|
|
||||||
Please note that such an installation is somewhat crippled: you will not be
|
|
||||||
able to display results for embedded documents (emails inside an mbox,
|
|
||||||
attachments etc.). This requires a recoll command line option which is only
|
|
||||||
available in 1.17
|
|
||||||
|
|
||||||
The Lens is based on the Recoll Python module which is not built by default
|
|
||||||
for versions prior to 1.17, so you will first need to pull the Recoll
|
|
||||||
source code (for your version), then untar and proceed with the
|
|
||||||
configure/build instructions below.
|
|
||||||
|
|
||||||
The following uses --prefix=/usr. I have no real reason to believe
|
|
||||||
that this would not work with /usr/local (lenses are also searched there by
|
|
||||||
default). If you confirm that things work with another prefix, please drop
|
|
||||||
me a line.
|
|
||||||
|
|
||||||
When doing this over a previous Recoll compilation, run a "make clean" to
|
|
||||||
get rid of the non-PIC objects.
|
|
||||||
|
|
||||||
Note that the following instructions change nothing to your existing Recoll
|
|
||||||
installation, they only install the Python module and the Unity Lens,
|
|
||||||
recoll, recollindex etc. are unaffected.
|
|
||||||
|
|
||||||
'/TOP/OF/RECOLL/SRC' designates the top of the recoll source tree.
|
|
||||||
|
|
||||||
=== Configure and build the recoll library and python module, install the module
|
|
||||||
|
|
||||||
The following needs the development packages for Xapian, Python and zlib.
|
|
||||||
|
|
||||||
----
|
|
||||||
cd /TOP/OF/RECOLL/SRC
|
|
||||||
# May fail if no previous build was performed
|
|
||||||
make clean
|
|
||||||
|
|
||||||
# the gui/x11 disabling is just here to avoid having to install the
|
|
||||||
# development libraries for Qt.
|
|
||||||
configure --prefix=/usr --enable-pic --without-x --disable-qtgui
|
|
||||||
make
|
|
||||||
|
|
||||||
cd python/recoll
|
|
||||||
python setup.py build
|
|
||||||
sudo python setup.py install
|
|
||||||
----
|
|
||||||
|
|
||||||
=== Build and install the Unity Lens
|
|
||||||
|
|
||||||
----
|
|
||||||
cd /TOP/OF/RECOLL/SRC
|
|
||||||
cd desktop/unity-lens-recoll
|
|
||||||
configure --prefix=/usr --sysconfdir=/etc
|
|
||||||
sudo make install
|
|
||||||
|
|
||||||
----
|
|
||||||
|
|
||||||
Voilà, it should work...
|
|
||||||
|
|
||||||
Try to start the Dash, you should see the Recoll checkerboard (or
|
|
||||||
whatever...) in the Lens list.
|
|
||||||
|
|
||||||
The Recoll Lens expects a Recoll query language string, so you can use
|
|
||||||
field searches, directory, size, and date filtering (see the
|
|
||||||
link:http://www.lesbonscomptes.com/recoll/usermanual/rcl.search.lang.html[Recoll
|
|
||||||
manual] for a description of the query language).
|
|
||||||
|
|
||||||
If you want to disable the Lens, I think that you just have to delete
|
|
||||||
'/usr/share/unity/lenses/recoll'
|
|
||||||
|
|
||||||
Other installed files:
|
|
||||||
|
|
||||||
----
|
|
||||||
/usr/libexec/unity-recoll-daemon
|
|
||||||
/usr/share/dbus-1/services/unity-lens-recoll.service
|
|
||||||
/usr/share/doc/unity-lens-recoll
|
|
||||||
/usr/share/unity-lens-recoll
|
|
||||||
----
|
|
||||||
|
|
||||||
@ -1,68 +0,0 @@
|
|||||||
== Using the _Open With_ context menu in recoll 1.20 and newer
|
|
||||||
|
|
||||||
Recoll versions 1.20 and newer have an _Open With_ entry in the result list
|
|
||||||
context menu (the thing which pops up on a right click).
|
|
||||||
|
|
||||||
This allows choosing the application used to edit the document, instead of
|
|
||||||
using the default one.
|
|
||||||
|
|
||||||
The list of applications is built from the desktop files found inside
|
|
||||||
'/usr/share/applications'. For each application on the system, these
|
|
||||||
files list the mime types that the application can process.
|
|
||||||
|
|
||||||
If the application which you would want listed does not appear, the most
|
|
||||||
probable cause is that it has no desktop file, which could happen due to a
|
|
||||||
number of reasons.
|
|
||||||
|
|
||||||
This can be fixed very easily: just add a +.desktop+ file to
|
|
||||||
'/usr/share/applications', starting from an existing one as a template.
|
|
||||||
|
|
||||||
As an example, based on an original idea from Recoll user +florianbw+,
|
|
||||||
the following describes setting up a script for editing a PDF document
|
|
||||||
title found in the recoll result list.
|
|
||||||
|
|
||||||
The script uses the *zenity* shell script dialog box tool to let you
|
|
||||||
enter the new title, and then executes *exiftool* to actually change
|
|
||||||
the document.
|
|
||||||
|
|
||||||
----
|
|
||||||
#!/bin/sh
|
|
||||||
|
|
||||||
PDF=$1
|
|
||||||
TITLE=`exiftool -Title -s3 "$PDF"`
|
|
||||||
|
|
||||||
RES=`zenity --entry \
|
|
||||||
--title="Change PDF Title" \
|
|
||||||
--text="Enter the Title:" \
|
|
||||||
--entry-text "$TITLE"`
|
|
||||||
|
|
||||||
if [ "$RES" != "" ]; then
|
|
||||||
echo -n "Changing title to $RES ... " && \
|
|
||||||
exiftool -Title="$RES" "$PDF" && \
|
|
||||||
recollindex -i "$PDF" && echo "Done!"
|
|
||||||
else
|
|
||||||
echo "No title entered"
|
|
||||||
fi
|
|
||||||
----
|
|
||||||
|
|
||||||
Name it, for example, 'pdf-edit-title.sh', and make it executable
|
|
||||||
(`chmod a+x pdf-edit-title.sh`).
|
|
||||||
|
|
||||||
Then create a file named 'pdf-edit-title.desktop' inside
|
|
||||||
'/usr/share/applications'. The file name does not need to be the same as the
|
|
||||||
script's, this is just to make things clearer:
|
|
||||||
|
|
||||||
----
|
|
||||||
[Desktop Entry]
|
|
||||||
Name=PDF Title Editor
|
|
||||||
Comment=Small script based on exiftool used to edit a pdf document title
|
|
||||||
Exec=/home/dockes/bin/pdf-edit-title.sh %F
|
|
||||||
Type=Application
|
|
||||||
MimeType=application/pdf;
|
|
||||||
----
|
|
||||||
|
|
||||||
You're done ! Restart Recoll, perform a search and right-click on a PDF
|
|
||||||
result: you should see an entry named _PDF Title Editor_ in the _Open
|
|
||||||
With_ list. Click on it, and you will be able to edit the title.
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,100 +0,0 @@
|
|||||||
== Using the log file to investigate indexing issues
|
|
||||||
|
|
||||||
All *Recoll* processes print trace messages. By default these go to the
|
|
||||||
standard error output, and you may not ever see them (in the case, for
|
|
||||||
example, of the *recoll* GUI started from the desktop interface).
|
|
||||||
|
|
||||||
There are a number of potential issues with indexing that may need
|
|
||||||
investigation, such as:
|
|
||||||
|
|
||||||
- A file can't be found by searching even if it appears that it should have
|
|
||||||
been indexed (this could happen because the file is not selected at all or
|
|
||||||
because a filter program crashes).
|
|
||||||
- The indexing process gets stuck and never finishes.
|
|
||||||
- The indexing process ends up with an error.
|
|
||||||
- The indexing process seems to be using too much system capacity.
|
|
||||||
|
|
||||||
The right way to approach these problems is to use the *recollindex*
|
|
||||||
command line tool (instead of the *recoll* GUI), and to set up the
|
|
||||||
trace log to provide information about what indexing is actually doing.
|
|
||||||
|
|
||||||
Trace log parameters can be set either from the GUI _Preferences->Indexing
|
|
||||||
Configuration->Global Parameters_ panel, or by editing the configuration
|
|
||||||
file '~/.recoll/recoll.conf'. You should set the following parameters:
|
|
||||||
|
|
||||||
----
|
|
||||||
loglevel = 6
|
|
||||||
logfilename = stderr
|
|
||||||
thrQSizes = -1 -1 -1
|
|
||||||
----
|
|
||||||
|
|
||||||
We use _stderr_ instead of an actual file in order to capture direct filter
|
|
||||||
messages (such as a *python* stack trace) along with normal
|
|
||||||
*recollindex* messages.
|
|
||||||
|
|
||||||
The last line sets recollindex for single-threaded operation, which will
|
|
||||||
make the log much more readable.
|
|
||||||
|
|
||||||
You should then check that no *recoll* or *recollindex* process is
|
|
||||||
currently running, and kill any you find.
|
|
||||||
|
|
||||||
Then, if this is an issue about an identified file, try indexing it only:
|
|
||||||
|
|
||||||
----
|
|
||||||
recollindex -i myunfindablefile.xxx > /tmp/myindexlog 2>&1
|
|
||||||
----
|
|
||||||
|
|
||||||
If this is a general issue with indexing (process not finishing properly),
|
|
||||||
just start it:
|
|
||||||
|
|
||||||
----
|
|
||||||
recollindex > /tmp/myindexlog 2>&1
|
|
||||||
----
|
|
||||||
|
|
||||||
Usually, having a look at the trace will allow you to see what is wrong (e.g.:
|
|
||||||
a configuration issue or missing filter), and solve the problem.
|
|
||||||
|
|
||||||
In case of indexer misbehaviour (e.g. using too much memory), you should run
|
|
||||||
_tail -f_ on the log to see what is going on.
|
|
||||||
|
|
||||||
If this is not enough, please
|
|
||||||
link:https://opensourceprojects.eu/p/recoll1/tickets/new/[open a tracker
|
|
||||||
issue] and attach or link to the log data, or just email me (jfd at
|
|
||||||
recoll.org).
|
|
||||||
|
|
||||||
*recollindex* and *recollindex -i* usually have the same criteria to
|
|
||||||
include a file or not (but see the _Path gotcha_ note below). It may
|
|
||||||
happen that they behave differently, so it may sometimes be useful to run a
|
|
||||||
full *recollindex* even for a specific file, but this will produce a
|
|
||||||
big log file.
|
|
||||||
|
|
||||||
When you are done, it is better to reset the verbosity to a reasonable
|
|
||||||
level (e.g.: +2+ : just errors, +3+ : information, listing indexed files).
|
|
||||||
|
|
||||||
=== Note: the path gotcha
|
|
||||||
|
|
||||||
*recollindex -i* will only index files under the directories defined by the
|
|
||||||
+topdirs+ configuration variable (your home directory by
|
|
||||||
default). Unfortunately, the test is done on the file path text, ignoring
|
|
||||||
possible symbolic links. If you give a simple file name as a parameter to
|
|
||||||
*recollindex -i* and there are symbolic links inside the +topdirs+
|
|
||||||
entries, the comparison may fail. For example, if your home directory is
|
|
||||||
'/home/me/' and '/home/' is a link to '/usr/home/', *recollindex -i
|
|
||||||
somefilename* will actually try to index '/usr/home/me/somefilename', and
|
|
||||||
fail (because '/usr/home/me/' is not a subdirectory of '/home/me/'). This
|
|
||||||
will manifest itself in the log by a message like the following.
|
|
||||||
|
|
||||||
----
|
|
||||||
:4:../index/fsindexer.cpp:149:FsIndexer::indexFiles: skipping [/usr/home/me/somefile] (ntd)
|
|
||||||
----
|
|
||||||
|
|
||||||
If this happens, give a full path consistent with what is found in the
|
|
||||||
configuration file (e.g.: _recollindex -i /home/me/somefile_).
|
|
||||||
|
|
||||||
=== File system occupation
|
|
||||||
|
|
||||||
One of the possible reasons for failed indexing is a +maxfsoccup+
|
|
||||||
parameter set too low. This is the value of file system occupation, not
|
|
||||||
free space, where indexing will stop. It is set from the GUI indexing
|
|
||||||
configuration or by editing 'recoll.conf'. A value of 0 implies no
|
|
||||||
checking, but a very low, non-zero, value will just prevent indexing.
|
|
||||||
@ -1,65 +0,0 @@
|
|||||||
== Recoll Wiki file index
|
|
||||||
link:ElinksWeb.html[Extending the Recoll Firefox visited web page indexing mechanism to other browsers]
|
|
||||||
|
|
||||||
link:FaqsAndHowTos.html[Faqs and Howtos]
|
|
||||||
|
|
||||||
link:FilterArch.html[Recoll input filters ]
|
|
||||||
|
|
||||||
link:FilterRetrofit.html[Installing a filter for a new document type]
|
|
||||||
|
|
||||||
link:FilteringOutZipArchiveMembers.html[Filtering out Zip archive members]
|
|
||||||
|
|
||||||
link:GUIKeyboard.html[Recoll GUI keyboard navigation]
|
|
||||||
|
|
||||||
link:HandleCustomField.html[Generating a custom field and using it to sort results]
|
|
||||||
|
|
||||||
link:Home.html[Welcome to the Recoll Wiki]
|
|
||||||
|
|
||||||
link:HotRecoll.html[Recoll hotkey: starting / hiding recoll with a keyboard shortcut]
|
|
||||||
|
|
||||||
link:IndexMailHeader.html[Indexing arbitrary mail headers ]
|
|
||||||
|
|
||||||
link:IndexMozillaCalendari.html[Indexing Mozilla calendar data ]
|
|
||||||
|
|
||||||
link:IndexOnAc.html[Laptops: automatically starting or stopping indexing according to AC power status]
|
|
||||||
|
|
||||||
link:IndexOutlook.html[Indexing Outlook archives]
|
|
||||||
|
|
||||||
link:IndexWebHistory.html[Indexing Web history with the Firefox extension ]
|
|
||||||
|
|
||||||
link:MultipleIndexes.html[Creating and using multiple indexes]
|
|
||||||
|
|
||||||
link:MuttAndRecoll.html[Interfacing Recoll and Mutt]
|
|
||||||
|
|
||||||
link:NonAsciiFileNames.html[Unix and non-ASCII file names, a summary of issues]
|
|
||||||
|
|
||||||
link:OpenHelperScript.html[Starting native applications ]
|
|
||||||
|
|
||||||
link:PreventIndexingDir.html[Preventing indexing in a directory]
|
|
||||||
|
|
||||||
link:ProblemSolvingData.html[Gathering useful data for asking help about or reporting a Recoll issue]
|
|
||||||
|
|
||||||
link:QpdfviewHelperScript.html[Starting native applications ]
|
|
||||||
|
|
||||||
link:QueryFromC.html[Querying Recoll from a C program]
|
|
||||||
|
|
||||||
link:ReplaceCategories.html[Replacing the Category filter controls]
|
|
||||||
|
|
||||||
link:ResultsThumbnails.html[Result list thumbnails and how to create them]
|
|
||||||
|
|
||||||
link:SavingConfig.html[User configuration backup]
|
|
||||||
|
|
||||||
link:UnityLens.html[Building and Installing the Ubuntu Unity Recoll Lens]
|
|
||||||
|
|
||||||
link:UsingOpenWith.html[Using the Open With context menu in recoll 1.20 and newer]
|
|
||||||
|
|
||||||
link:WhyIsMyFileNotIndexed.html[Using the log file to investigate indexing issues]
|
|
||||||
|
|
||||||
link:XDGBase.html[XDG: Tidying Recoll data storage]
|
|
||||||
|
|
||||||
link:ZDevCaseAndDiacritics1.html[Character case and diacritic marks (1), issues with stemming]
|
|
||||||
|
|
||||||
link:ZDevCaseAndDiacritics2.html[Character case and diacritic marks (2), user interface]
|
|
||||||
|
|
||||||
link:ZDevCaseAndDiacritics3.html[Character case and diacritic marks (3), implementation]
|
|
||||||
|
|
||||||
@ -1,42 +0,0 @@
|
|||||||
== XDG: Tidying Recoll data storage ==
|
|
||||||
|
|
||||||
The default storage structure of Recoll configuration and index data is
|
|
||||||
quite at odds with the recommendations of the
|
|
||||||
link:http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html[XDG
|
|
||||||
Base Directory Specification], the reason being that it predates said spec.
|
|
||||||
|
|
||||||
By default, Recoll stores all its data in a single directory: '$HOME/.recoll'
|
|
||||||
|
|
||||||
This is not going to change, because it would be quite disturbing for
|
|
||||||
current users.
|
|
||||||
|
|
||||||
However, the location of this directory can be modified using the
|
|
||||||
+$RECOLL_CONFDIR+ environment variable.
|
|
||||||
|
|
||||||
Furthermore all significant Recoll data categories can be moved away from
|
|
||||||
the configuration directory (maybe to '$HOME/.cache'), by setting
|
|
||||||
configuration variables:
|
|
||||||
|
|
||||||
* _dbdir_ defines the location for storing the Xapian
|
|
||||||
index. This could be set to, e.g., '$HOME/.cache/recoll/xapiandb'. It is
|
|
||||||
quite recommended that
|
|
||||||
this directory be dedicated to Xapian (don't store other things in
|
|
||||||
there).
|
|
||||||
* _mboxcachedir_ defines the location for caching access speedup information
|
|
||||||
about mail folders in mbox format, e.g. '$HOME/.cache/recoll/mboxcache'
|
|
||||||
* New in 1.22: you can use _aspellDictDir_ to define the storage
|
|
||||||
location for the aspell spelling approximation
|
|
||||||
dictionary. E.g. '$HOME/.cache/recoll'
|
|
||||||
* _webcachedir_ may be used to define where the visited web pages
|
|
||||||
archive is stored. E.g. '$HOME/.cache/recoll/webcache'. This is only used
|
|
||||||
if you activate the Firefox plugin and web history indexing. You may
|
|
||||||
want to think a bit more about where to store it, because, contrary to
|
|
||||||
the above, this is not discardable data: your Recoll Web history goes
|
|
||||||
away if you delete it.
|
|
||||||
|
|
||||||
If you use multiple Recoll configurations, each will have to be customized.
|
|
||||||
|
|
||||||
Once these are put away, there are still a few modifiable files in the
|
|
||||||
configuration directory, for example the 'recoll.pid' and 'history'
|
|
||||||
files, but these are small files. Moving 'recoll.pid' away would be a
|
|
||||||
serious headache because it is used by scripts.
|
|
||||||
@ -1,143 +0,0 @@
|
|||||||
== Character case and diacritic marks (1), issues with stemming
|
|
||||||
|
|
||||||
=== Case and diacritics in Recoll
|
|
||||||
|
|
||||||
Recoll versions up to 1.17 almost fully ignore character case and diacritic
|
|
||||||
marks.
|
|
||||||
|
|
||||||
All terms are converted to lower case and unaccented before they are
|
|
||||||
written to the index. There are only two exceptions:
|
|
||||||
|
|
||||||
* File paths (as used in _dir:_ clauses) are not converted. This might
|
|
||||||
be a bug or a feature, but the main reason is that we don't know how they
|
|
||||||
are encoded.
|
|
||||||
* It is possible to specify that some characters will keep their diacritic
|
|
||||||
marks, because the entity formed by the character and the diacritic mark
|
|
||||||
is considered to be a different letter, not a modified one. This is
|
|
||||||
highly dependent on the language. For example, in Swedish, +å+ should
|
|
||||||
be preserved, not turned into +a+.
|
|
||||||
|
|
||||||
As a necessary consequence, the same transformations are applied to search
|
|
||||||
terms, and it is impossible to search for a specific capitalization of a
|
|
||||||
word (+US+ is looked for as +us+), or a specific accented form
|
|
||||||
(+café+ will be looked for as +cafe+).
|
|
||||||
|
|
||||||
However, there are some cases where you would like to be more specific:
|
|
||||||
|
|
||||||
* Searching for +US+ or +us+ should probably return different results.
|
|
||||||
* Diacritics are seldom significant in English, but we can find a
|
|
||||||
few examples anyway: +sake+ and +saké+, +mate+ and +maté+. Of
|
|
||||||
course, there are many more cases in languages which use more diacritics.
|
|
||||||
|
|
||||||
On the other hand, accents are often mistyped or forgotten (résumé, résume,
|
|
||||||
resume?), and capitalization is most often insignificant, so that it is
|
|
||||||
very important to retain the capability to ignore accent and character
|
|
||||||
case differences, and that the discrimination can be easily switched on or
|
|
||||||
off for each search (or even for specific terms).
|
|
||||||
|
|
||||||
This text and other pages which will follow will discuss issues in adding
|
|
||||||
character case and diacritics sensitivity to Recoll, under the assumption
|
|
||||||
that the main index will contain the raw source terms instead of
|
|
||||||
case-folded and unaccented ones.
|
|
||||||
|
|
||||||
The following will use the _unaccent_ neologism to mean _remove
|
|
||||||
diacritic marks_ (and not only accents).
|
|
||||||
|
|
||||||
English examples are used when possible, but given the limited use of
|
|
||||||
diacritics in English, some French will probably creep in.
|
|
||||||
|
|
||||||
=== Diacritics and stemming
|
|
||||||
|
|
||||||
Stemming is the process by which we extend a search to terms related by
|
|
||||||
grammatical inflexion, for example singular/plural, verb tenses, etc. For
|
|
||||||
example a search for +floor+ is normally expanded by Recoll to +floors,
|
|
||||||
floored, flooring, ...+
|
|
||||||
|
|
||||||
In practice Recoll has a separate data structure that has stemmed terms
|
|
||||||
(stems) as keys pointing to a list of expansion terms
|
|
||||||
+floor -> (floor,floors,floorings,...)+
|
|
||||||
|
|
||||||
Stemming should be applied to terms before they are stripped of
|
|
||||||
diacritics. Accents may have a grammatical significance, and the accent may
|
|
||||||
change how the term is stemmed. For example, in French the +âmes+ suffix
|
|
||||||
generally marks a past conjugation but +ames+ does not. The standard
|
|
||||||
Xapian French stemmer will turn +évitâmes+ (avoided) into an +évit+ stem,
|
|
||||||
but +évitames+ will be turned into +évitam+ (stripping
|
|
||||||
plural and feminine suffixes).
|
|
||||||
|
|
||||||
When the search is set to ignore diacritics, this poses a specific problem:
|
|
||||||
if the user enters the search term without accents (which is correct
|
|
||||||
because the system is supposed to ignore them), there is no guarantee that
|
|
||||||
the term will be correctly expanded by stemming.
|
|
||||||
|
|
||||||
The diacritic mismatch breaks the family relationship between the stem
|
|
||||||
siblings, and this is independent of the type of index: it will happen with
|
|
||||||
an index where diacritics are stripped just as with a raw one.
|
|
||||||
|
|
||||||
The simpler case where diacritics in the original term only affect
|
|
||||||
diacritics in the stem also necessitates specific processing, but it is
|
|
||||||
easier to work around.
|
|
||||||
|
|
||||||
Two examples illustrating these issues follow.
|
|
||||||
|
|
||||||
==== The simple case: diacritics in the term only affect diacritics in the stem
|
|
||||||
|
|
||||||
Let's imagine that the document set contains the term +éviter+
|
|
||||||
(infinitive of +to avoid+), but not +évite+ (present). The only term in
|
|
||||||
the actual index is then +éviter+.
|
|
||||||
|
|
||||||
The user enters an unaccented +evite+, counting on the
|
|
||||||
diacritics-insensitive search mode to deal with the accents. As +évite+
|
|
||||||
is not present in the index, we have no way to guess that +evite+ is
|
|
||||||
really +évite+.
|
|
||||||
|
|
||||||
The stemmer will turn +evite+ into +evit+. There is no way that this
|
|
||||||
can be related to +éviter+, and this legitimate result can't be found.
|
|
||||||
|
|
||||||
There is a way around this: we can compute a separate
|
|
||||||
stem expansion dictionary for unaccented terms. This dictionary, to be used
|
|
||||||
with diacritic-insensitive searches only, contains the relationship
|
|
||||||
between +evit+ and +eviter+ (as +éviter+ is in the index). We can
|
|
||||||
then relate +eviter+ and +éviter+ because they differ only by accents,
|
|
||||||
and the search will find the document with +éviter+.
|
|
||||||
|
|
||||||
==== The bad case: diacritics in the term change the stem beyond diacritics
|
|
||||||
|
|
||||||
Some grammatically significant accents will cause unexpectedly missing
|
|
||||||
search results when using a supposedly diacritics-insensitive search mode.
|
|
||||||
|
|
||||||
Let's imagine that the document set contains the term +éviter+
|
|
||||||
(infinitive of +to avoid+), but not +évitâmes+ (past). So the stemming
|
|
||||||
expansion table has an entry for +évit+ -> +éviter+.
|
|
||||||
|
|
||||||
If the user enters an unaccented +evitames+, she would expect to find the
|
|
||||||
documents containing +éviter+ in the results, because the latter term is
|
|
||||||
a stemming sibling of +évitâmes+ and the search is supposedly not
|
|
||||||
influenced by diacritics, so that +evitames+ and +évitâmes+ should be
|
|
||||||
equivalent.
|
|
||||||
|
|
||||||
However, our search is now in trouble, because +évitâmes+ is not in any
|
|
||||||
document, so that there is no data in the index which would inform us about
|
|
||||||
how to transform the input term into something that differs only by accents
|
|
||||||
but would yield a correct input for the stemmer.
|
|
||||||
|
|
||||||
If we try to feed the raw user input to the stemmer, it will propose
|
|
||||||
an +evitam+ stem, which will not work, because the stem that actually
|
|
||||||
exists is +évit+, and +evitam+ can not be related to +éviter+.
|
|
||||||
|
|
||||||
The only palliative approach I can think of would be a spelling correction
|
|
||||||
of the input, performed independently of the actual index contents, which
|
|
||||||
would notice that +évitames+ is not a French word and propose a change or an
|
|
||||||
expansion to +évitâmes+, which would correctly stem to +évit+ and allow
|
|
||||||
us to find +éviter+.
|
|
||||||
|
|
||||||
This issue is not specific to Recoll or indeed to the fact that the index
|
|
||||||
retains accents or not. As far as I can see, it is an intrinsic bad
|
|
||||||
interaction between diacritics insensitivity and stemming.
|
|
||||||
|
|
||||||
It is also interesting to note that this case becomes less probable when
|
|
||||||
the data set becomes bigger, because more term inflexions will then be
|
|
||||||
present in the index.
|
|
||||||
|
|
||||||
We'll next think about an link:ZDevCaseAndDiacritics2.html[appropriate
|
|
||||||
interface].
|
|
||||||
@ -1,122 +0,0 @@
|
|||||||
== Character case and diacritic marks (2), user interface
|
|
||||||
|
|
||||||
In a link:ZDevCaseAndDiacritics1.html[previous document], we discussed some
|
|
||||||
of the problems which arise when mixing case/diacritics sensitivity and
|
|
||||||
stemming.
|
|
||||||
|
|
||||||
As of version 1.18, Recoll can create two types of indexes:
|
|
||||||
* _Dumb_ indexes contain terms which are lowercased and stripped of
|
|
||||||
diacritics. Searches using such an index are naturally case- and
|
|
||||||
diacritics- insensitive: search terms are stripped before processing.
|
|
||||||
* _Raw_ indexes contain terms which are just like they were found in the
|
|
||||||
source document. Searching such an index is naturally sensitive to case
|
|
||||||
and diacritics, and can be made insensitive by further processing.
|
|
||||||
|
|
||||||
The following explains how users can control these Recoll features.
|
|
||||||
|
|
||||||
=== Controlling the type of index we create: stripped or raw
|
|
||||||
|
|
||||||
The kind of index that recoll creates is determined by:
|
|
||||||
|
|
||||||
* A build-time *configure* switch: _--enable-stripchars_. If this is
|
|
||||||
set, the code for case and diacritics sensitivity is not compiled in and
|
|
||||||
recoll will work like the previous versions: unaccented and casefolded
|
|
||||||
index, no runtime options for case or diacritics sensitivity
|
|
||||||
|
|
||||||
* An indexing configuration switch (in recoll.conf): if Recoll was built
|
|
||||||
with _--disable-stripchars_, this will provide a dynamic way to return
|
|
||||||
to the "traditional" index. The case and diacritics code will be present
|
|
||||||
but inactive. Normally, a recoll installation with this switch set
|
|
||||||
should behave exactly like one built with _--enable-stripchars_. When
|
|
||||||
using multiple indexes, this switch MUST be consistent between
|
|
||||||
indexes. There is no support whatsoever for mixing raw and dumb indexes.
|
|
||||||
The option is named _indexStripChars_, and it is not settable from the
|
|
||||||
GUI to avoid errors. This is something that would typically be set once
|
|
||||||
and for all for a given installation. We need to decide what the default
|
|
||||||
value will be for 1.18
|
|
||||||
|
|
||||||
* A number of query time switches. Using these it is also possible to
|
|
||||||
perform a search insensitive to case and diacritics on a raw index. Note
|
|
||||||
however, that, given the complexity of the issues involved, I give no
|
|
||||||
guarantee at this time that this will yield exactly the same results as
|
|
||||||
searching a dumb index. Details about query time behaviour follow.
|
|
||||||
|
|
||||||
|
|
||||||
=== Controlling stem, case and diacritics expansion: user query interface
|
|
||||||
|
|
||||||
Recoll versions up to 1.17 were insensitive to case and diacritics. We only
|
|
||||||
needed to give the user a way to control stem expansion. This was done in
|
|
||||||
three ways:
|
|
||||||
|
|
||||||
* Globally, by setting a menu option.
|
|
||||||
* Globally, by setting the stemming language value to empty.
|
|
||||||
* On a term by term basis by Capitalizing the term, or, in query language
|
|
||||||
mode only, by using an 'l' clause modifier (_"term"l_).
|
|
||||||
|
|
||||||
After switching to an unstripped index, capable of case and diacritic
|
|
||||||
sensitivity, we need ways to control what processing is performed among:
|
|
||||||
|
|
||||||
* Case expansion.
|
|
||||||
* Diacritics expansion.
|
|
||||||
* Stem expansion.
|
|
||||||
|
|
||||||
The default mode will be compatible with the previous version, because
|
|
||||||
this is most generally what we want to do: ignore case and diacritics,
|
|
||||||
expand stems.
|
|
||||||
|
|
||||||
There are two easy approaches for controlling the parameters:
|
|
||||||
* Global options set in the GUI menus or as *recollq* command line
|
|
||||||
switches.
|
|
||||||
* Per-clause options set by modifiers in the query language.
|
|
||||||
|
|
||||||
We would like, however, to let the user entry automatically override the
|
|
||||||
defaults in a sensible way. For example:
|
|
||||||
|
|
||||||
* If a term is entered with diacritics, diacritic sensitivity is turned on
|
|
||||||
(for this term only).
|
|
||||||
* If a term is entered with upper-case characters, case sensitivity is
|
|
||||||
turned on. In this case, we turn off stem expansion, because it makes
|
|
||||||
really no sense with case sensitivity.
|
|
||||||
|
|
||||||
With this method we are stuck with 3 problems (only if the global mode is
|
|
||||||
set to insensitive, and we're not using the query language):
|
|
||||||
|
|
||||||
* Turning off stemming without turning on case sensitivity.
|
|
||||||
* Searching for an all lower-case term in case-sensitive mode.
|
|
||||||
* Searching for a term without diacritics in diacritic-sensitive mode.
|
|
||||||
|
|
||||||
The two latter issues are relatively marginal and can be worked around easily
|
|
||||||
by switching to query language mode or using negative clauses in the
|
|
||||||
advanced search.
|
|
||||||
|
|
||||||
However, we need to be able to turn stemming off while remaining
|
|
||||||
insensitive to case, and we need to stay reasonably compatible with the
|
|
||||||
previous versions. This means that a term which has a capital first letter
|
|
||||||
but is otherwise lowercase will turn stemming off, but not case sensitivity
|
|
||||||
on.
|
|
||||||
|
|
||||||
So we're left with how to search for such a term in a case-sensitive way,
|
|
||||||
and for this, you'll have to use global options or the query language.
|
|
||||||
|
|
||||||
The modified method is:
|
|
||||||
|
|
||||||
* If a term is entered with diacritics, diacritic sensitivity is turned on
|
|
||||||
(for this term only).
|
|
||||||
* If the first letter in a term is upper-case and the rest is lower-case,
|
|
||||||
we turn stem expansion off, but we do not become case-sensitive
|
|
||||||
* If any letter in a term except the first is upper-case, case sensitivity
|
|
||||||
is turned on. Stem expansion is also turned-off (even if the first
|
|
||||||
letter is lower-case), because it makes really no sense with case
|
|
||||||
sensitivity.
|
|
||||||
* To search for an all lower-case or capitalized term in a case-sensitive
|
|
||||||
way, use the query language: "Capitalized"C, "lowercase"C
|
|
||||||
* Use the query language and the "D" modifier to turn on diacritics
|
|
||||||
sensitivity.
|
|
||||||
|
|
||||||
It can be noted that some combinations of choices do not make sense and
|
|
||||||
they are not allowed by Recoll: for example, diacritics or case sensitivity
|
|
||||||
do not make sense with stem expansion (which cannot preserve diacritics in
|
|
||||||
any meaningful general way).
|
|
||||||
|
|
||||||
The link:ZDevCaseAndDiacritics3.html[next page] describes the actual
|
|
||||||
implementation in Recoll 1.18.
|
|
||||||
@ -1,67 +0,0 @@
|
|||||||
== Character case and diacritic marks (3), implementation
|
|
||||||
|
|
||||||
In previous pages, we discussed link:ZDevCaseAndDiacritics1.html[diacritics
|
|
||||||
and stemming], and an link:ZDevCaseAndDiacritics2.html[appropriate
|
|
||||||
interface] for switchable search sensitivity to diacritics and character
|
|
||||||
case.
|
|
||||||
|
|
||||||
So you are in this mood again and you don't want to type accents (maybe you're
|
|
||||||
stuck with a QWERTY American English keyboard), or conversely you
|
|
||||||
want to resume looking for your résumé, and you've told Recoll as much,
|
|
||||||
using the appropriate interface. What happens then ?
|
|
||||||
|
|
||||||
The second case is easy if the index is raw, and mostly impossible if it is
|
|
||||||
stripped. So we'll concentrate on the first case: how to achieve case and
|
|
||||||
diacritics insensitivity on a raw index ?
|
|
||||||
|
|
||||||
Recoll uses three expansion tables:
|
|
||||||
|
|
||||||
* The first table has stripped and lowercased terms as keys and raw terms as
|
|
||||||
data: +mate -> (mate, maté, MATE,...)+.
|
|
||||||
|
|
||||||
* The second table has lowercased stems as keys and original lowercase terms
|
|
||||||
as data (when using multiple languages, there are several such tables):
|
|
||||||
+évit -> (éviter, évite, évitâmes, ...)+.
|
|
||||||
|
|
||||||
* The third table has stripped and lowercased stems as keys and stripped
|
|
||||||
lowercased terms as data:
|
|
||||||
+evit -> (eviter, evite, evitons)+ and +evitam -> (evitames, ...)+
|
|
||||||
|
|
||||||
The first table can be used for full case and diacritics expansion or for
|
|
||||||
only one of those, by post-filtering the results of full expansion (e.g. if
|
|
||||||
we only want diacritics expansion, we filter by stripping diacritics from
|
|
||||||
each result term and check that it's identical to the input). For example
|
|
||||||
if we have +mate -> (mate, maté, MATE, MATÉ)+ in the table and want to
|
|
||||||
only perform case expansion for an input of +maté+, we apply case folding
|
|
||||||
to the initial output and keep only +maté+, as +mate+ differs from the
|
|
||||||
input.
|
|
||||||
|
|
||||||
We only perform stemming expansion when case and diacritics sensitivity is
|
|
||||||
off. It is performed using the second and third tables, both on the
|
|
||||||
lowercased and lowercased/stripped output of the first step, and each term
|
|
||||||
in the output stemming is expanded again for case (using the first table).
|
|
||||||
|
|
||||||
A full example of the expansion occurring during an insensitive search
|
|
||||||
for +resume+ using French stemming on a mixed English/French index
|
|
||||||
follows. An important thing to remember is that the result of each
|
|
||||||
expansion is a function of the terms actually present in the index, not
|
|
||||||
some arbitrary computation (and so, of course, many of the possible but
|
|
||||||
absent variations are missing).
|
|
||||||
|
|
||||||
# The case and diacritics expansion of +resume+ yields +RESUME Resume
|
|
||||||
Résumé resumé résume résumé resume+
|
|
||||||
|
|
||||||
# The Stem expansion input list (lower-cased) is:
|
|
||||||
+resume resumé résume résumé+, and the output is:
|
|
||||||
+resum resume resumenes resumer resumes resumé resumée résum résumait
|
|
||||||
résumant résume résumer résumerai résumerait résumes résumez résumé résumée
|
|
||||||
résumées résumés+
|
|
||||||
|
|
||||||
# Each of the above terms is then fed to case and diacritics expansion (first
|
|
||||||
table), for the final output:
|
|
||||||
+resume résumé Résumé résumer résume Resume résumés RESUME resumes
|
|
||||||
resumer résumant resúmenes resumé résumait résumes résumée resumee
|
|
||||||
résumerait Résumez résumerai RÉSUMÉES Resumée Resumes résumées+.
|
|
||||||
|
|
||||||
A Xapian OR query is finally constructed from the expanded term list.
|
|
||||||
|
|
||||||
@ -1,67 +0,0 @@
|
|||||||
== Recoll Faqs and Howtos file index
|
|
||||||
link:ElinksWeb.html[Extending the Recoll Firefox visited web page indexing mechanism to other browsers]
|
|
||||||
|
|
||||||
link:FilterArch.html[Recoll input handlers]
|
|
||||||
|
|
||||||
link:FilterRetrofit.html[Installing a filter for a new document type]
|
|
||||||
|
|
||||||
link:FilteringOutZipArchiveMembers.html[Filtering out Zip archive members]
|
|
||||||
|
|
||||||
link:GUIKeyboard.html[Recoll GUI keyboard navigation]
|
|
||||||
|
|
||||||
link:HandleCustomField.html[Generating a custom field and using it to sort results]
|
|
||||||
|
|
||||||
link:Home.html[Welcome to the Recoll Faqs and Recipes]
|
|
||||||
|
|
||||||
link:HotRecoll.html[Recoll hotkey: starting / hiding recoll with a keyboard shortcut]
|
|
||||||
|
|
||||||
link:IndexMailHeader.html[Indexing arbitrary mail headers]
|
|
||||||
|
|
||||||
link:IndexMozillaCalendari.html[Indexing Mozilla calendar data]
|
|
||||||
|
|
||||||
link:IndexOnAc.html[Laptops: starting or stopping indexing according to AC power status]
|
|
||||||
|
|
||||||
link:IndexOutlook.html[Indexing Outlook archives]
|
|
||||||
|
|
||||||
link:IndexWebHistory.html[Indexing Web history with the Firefox extension ]
|
|
||||||
|
|
||||||
link:MultipleIndexes.html[Creating and using multiple indexes]
|
|
||||||
|
|
||||||
link:MuttAndRecoll.html[Interfacing Recoll and Mutt]
|
|
||||||
|
|
||||||
link:NonAsciiFileNames.html[Unix and non-ASCII file names, a summary of issues]
|
|
||||||
|
|
||||||
link:OpenHelperScript.html[Starting native applications]
|
|
||||||
|
|
||||||
link:PreventIndexingDir.html[Preventing indexing in a directory]
|
|
||||||
|
|
||||||
link:ProblemSolvingData.html[Gathering useful data for asking help about or reporting a Recoll issue]
|
|
||||||
|
|
||||||
link:QpdfviewHelperScript.html[Starting native applications ]
|
|
||||||
|
|
||||||
link:QueryFromC.html[Querying Recoll from a C program]
|
|
||||||
|
|
||||||
link:ReplaceCategories.html[Replacing the Category filter controls]
|
|
||||||
|
|
||||||
link:ResultsThumbnails.html[Result list thumbnails and how to create them]
|
|
||||||
|
|
||||||
link:SavingConfig.html[User configuration backup]
|
|
||||||
|
|
||||||
link:UnityLens.html[Building and Installing the Ubuntu Unity Recoll Lens]
|
|
||||||
|
|
||||||
link:UsingOpenWith.html[Using the _Open With_ context menu in recoll 1.20 and newer]
|
|
||||||
|
|
||||||
link:WhyIsMyFileNotIndexed.html[Using the log file to investigate indexing issues]
|
|
||||||
|
|
||||||
link:WikiIndex.html[Recoll Wiki file index]
|
|
||||||
|
|
||||||
link:XDGBase.html[XDG: Tidying Recoll data storage]
|
|
||||||
|
|
||||||
link:ZDevCaseAndDiacritics1.html[Character case and diacritic marks (1), issues with stemming]
|
|
||||||
|
|
||||||
link:ZDevCaseAndDiacritics2.html[Character case and diacritic marks (2), user interface]
|
|
||||||
|
|
||||||
link:ZDevCaseAndDiacritics3.html[Character case and diacritic marks (3), implementation]
|
|
||||||
|
|
||||||
link:index.html[Faqs and Howtos]
|
|
||||||
|
|
||||||
@ -1,41 +0,0 @@
|
|||||||
== Faqs and Howtos
|
|
||||||
|
|
||||||
link:..[Back to recoll.org top page]
|
|
||||||
|
|
||||||
link:faqsindex.html[Full file index]
|
|
||||||
|
|
||||||
=== Indexing
|
|
||||||
* link:WhyIsMyFileNotIndexed.html[Why is this file not indexed ? Investigating indexing issues]
|
|
||||||
* link:PreventIndexingDir.html[Preventing the indexing of a directory]
|
|
||||||
* link:IndexOnAc.html[Starting/stopping the indexer depending on power/battery status]
|
|
||||||
* link:IndexMozillaCalendari.html[Indexing Mozilla Sunbird / Lightning calendar data]
|
|
||||||
* link:MultipleIndexes.html[Creating and using multiple indexes]
|
|
||||||
* link:IndexWebHistory.html[Indexing Web history with the Firefox browser extension]
|
|
||||||
* link:ElinksWeb.html[Extending the Web queue mechanism to other browsers and general WEB indexing]
|
|
||||||
* link:IndexMailHeader.html[Indexing arbitrary mail headers]
|
|
||||||
* link:IndexOutlook.html[Indexing Outlook archives]
|
|
||||||
* link:HandleCustomField.html[Generating a custom field and using it to sort results]
|
|
||||||
* link:http://www.recoll.org/recoll_XMP/index.html[An example of filter/field customisation, using XMP metadata with PDFs]
|
|
||||||
* link:FilteringOutZipArchiveMembers.html[Filtering out Zip archive members]
|
|
||||||
|
|
||||||
=== Searching
|
|
||||||
* link:GUIKeyboard.html[Recoll GUI keyboard navigation]
|
|
||||||
* link:HotRecoll.html[On the desktop: using a keyboard shortcut for starting/hiding recoll]
|
|
||||||
* link:OpenHelperScript.html[Handling issues for starting native apps, esp. email clients - getting Thunderbird to open message files]
|
|
||||||
* link:QpdfviewHelperScript.html[Another example open helper script - using qpdfview to open pdf and postscript files, with support for page and search options]
|
|
||||||
* link:UsingOpenWith.html[Using the new Open With menu in recoll 1.20 with a custom app]
|
|
||||||
* link:ReplaceCategories.html[Replacing the document category filters]
|
|
||||||
* link:ResultsThumbnails.html[Result list thumbnails and how to create them]
|
|
||||||
* link:MuttAndRecoll.html[Interfacing Recoll and Mutt]
|
|
||||||
* link:QueryFromC.html[Querying from a C program]
|
|
||||||
|
|
||||||
=== Administration and miscellaneous
|
|
||||||
* link:http://www.recoll.org/pages/recoll-webui-install-wsgi.html[Installation of the Recoll WebUI with Apache]
|
|
||||||
* link:FilterRetrofit.html[Installing a filter for a new document type]
|
|
||||||
* link:UnityLens.html[Building and Installing the Ubuntu Unity Recoll Lens]
|
|
||||||
* link:SavingConfig.html[Recoll configuration backup]
|
|
||||||
* link:XDGBase.html[Tidying Recoll data storage]
|
|
||||||
* link:ProblemSolvingData.html[Collecting diagnostic information]
|
|
||||||
* link:NonAsciiFileNames.html[Unix and non-ascii file names]
|
|
||||||
* link:FilterArch.html[Recoll filters]
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
WIDX=faqsindex.txt
|
|
||||||
|
|
||||||
echo "== Recoll Faqs and Howtos file index" > $WIDX
|
|
||||||
for f in *.txt; do
|
|
||||||
if test "$f" = $WIDX ; then continue; fi
|
|
||||||
h="`basename $f .txt`.html"
|
|
||||||
title=`head -1 "$f" | sed -e 's/=//g' -e 's/^ *//' -e 's/ *$//' -e 's/
//g'`
|
|
||||||
echo 'link:'$h'['$title']' >> $WIDX
|
|
||||||
echo >> $WIDX
|
|
||||||
done
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
# Check and display what files are in the index but not in the contents table:
|
|
||||||
|
|
||||||
grep \| FaqsAndHowTos.txt | awk -F\| '{print $1}' | sed -e 's/\* \[\[//' -e 's/.wiki//' |sort > ctfiles.tmp
|
|
||||||
grep '\[\[' WikiIndex.txt | awk -F\| '{print $1}' | sed -e 's/\[\[//' -e 's/.wiki//' -e 's/.md//' | sort > ixfiles.tmp
|
|
||||||
echo 'diff ContentFiles IndexFiles:'
|
|
||||||
diff ctfiles.tmp ixfiles.tmp
|
|
||||||
rm ctfiles.tmp ixfiles.tmp
|
|
||||||
|
Before Width: | Height: | Size: 318 B |
@ -1,490 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>RECOLL: a personal text search system for
|
|
||||||
Unix/Linux</title>
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content=
|
|
||||||
"text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Home</a></li>
|
|
||||||
|
|
||||||
<li><a href="pics/index.html">Screenshots</a></li>
|
|
||||||
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
|
|
||||||
<li><a href="doc.html">Documentation</a></li>
|
|
||||||
|
|
||||||
<li><a href="support.html">Support</a></li>
|
|
||||||
|
|
||||||
<li><a href="devel.html">Development</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
<h1>Recoll features</h1>
|
|
||||||
|
|
||||||
<div class="intrapage">
|
|
||||||
<table width="100%">
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td><a href="#systems">Supported systems</a></td>
|
|
||||||
<td><a href="#doctypes">Document types</a></td>
|
|
||||||
<td><a href="#other">Other features</a></td>
|
|
||||||
<td><a href="#integration">Desktop and web integration</a></td>
|
|
||||||
<td><a href="#stemming">Stemming</a></td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<h2><a name="general">General features</a></h2>
|
|
||||||
<ul>
|
|
||||||
<li>Easy installation, few dependencies. No database daemon,
|
|
||||||
web server, desktop environment or exotic language necessary.</li>
|
|
||||||
<li>Will run on most Unix-based <a href="features.html#systems">
|
|
||||||
systems</a>, and on MS-Windows too.</li>
|
|
||||||
<li>Qt 4 GUI, plus command line, Unity Lens, KIO and krunner
|
|
||||||
interfaces.</li>
|
|
||||||
|
|
||||||
<li>Searches most common
|
|
||||||
<a href="features.html#doctypes">document types</a>, emails and
|
|
||||||
their attachments. Transparently handles decompression
|
|
||||||
(gzip, bzip2).</li>
|
|
||||||
|
|
||||||
<li>Powerful query facilities, with boolean searches,
|
|
||||||
phrases, proximity, wildcards, filter on file types and directory
|
|
||||||
tree.</li>
|
|
||||||
|
|
||||||
<li>Multi-language and multi-character set with Unicode based
|
|
||||||
internals.</li>
|
|
||||||
|
|
||||||
<li>Extensive documentation, with a
|
|
||||||
complete <a href="usermanual/usermanual.html">user
|
|
||||||
manual</a> and manual pages for each command.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="systems">Supported systems</a></h2>
|
|
||||||
|
|
||||||
<p><span class="application">Recoll</span> has been compiled and
|
|
||||||
tested on Linux, MS-Windows 7-10, MacOS X and Solaris (initial
|
|
||||||
versions Redhat 7, Fedora Core 5, Suse 10, Gentoo, Debian 3.1,
|
|
||||||
Solaris 8). It should compile and run on all subsequent releases
|
|
||||||
of these systems and probably a few others too.</p>
|
|
||||||
|
|
||||||
<p>Qt versions from 4.7 and later</p>
|
|
||||||
|
|
||||||
<h2><a name="doctypes">Document types</a></h2>
|
|
||||||
|
|
||||||
<p><span class="application">Recoll</span> can index many document
|
|
||||||
types (along with their compressed versions). Some types are
|
|
||||||
handled internally (no external application needed). Other types
|
|
||||||
need a separate application to be installed to extract the
|
|
||||||
text. Types that only need very common utilities
|
|
||||||
(awk/sed/groff/Python etc.) are listed in the native section.</p>
|
|
||||||
|
|
||||||
<p>The MS-Windows installer includes the supporting application,
|
|
||||||
the only additional package you will need is the Python language
|
|
||||||
installation.</p>
|
|
||||||
|
|
||||||
<p>Many formats are processed
|
|
||||||
by <span class="application">Python</span> scripts. The Python
|
|
||||||
dependency will not always be mentioned. In general, Recoll
|
|
||||||
expects Python 2.x to be available (many, but not all, scripts
|
|
||||||
are compatible with Python 3). Formats which are processed
|
|
||||||
using <span class="application">Python</span> and its standard
|
|
||||||
library are listed in the <i>native</i> section.</p>
|
|
||||||
|
|
||||||
<h4>File types indexed natively</h4>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li><span class="application">text</span>.</li>
|
|
||||||
<li><span class="application">html</span>.</li>
|
|
||||||
<li><span class="application">maildir</span>,
|
|
||||||
<span class="application">mh</span>, and
|
|
||||||
<span class="application">mailbox</span> (
|
|
||||||
<span class="application">Mozilla</span>,
|
|
||||||
<span class="application">Thunderbird</span> and
|
|
||||||
<span class="application">Evolution</span> mail ok).
|
|
||||||
<em><b>Evolution note</b>: be sure to remove <tt>.cache</tt> from
|
|
||||||
the <tt>skippedNames</tt> list in the GUI <tt>Indexing
|
|
||||||
preferences/Local Parameters/</tt> pane if you want to
|
|
||||||
index local copies of Imap mail.</em>
|
|
||||||
</li>
|
|
||||||
|
|
||||||
<li><span class="application">gaim</span> and
|
|
||||||
<span class="application">purple</span> log files.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Scribus</span> files.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Man pages</span> (needs
|
|
||||||
<span class="application">groff</span>).</li>
|
|
||||||
|
|
||||||
<li><span class="application">Dia</span> diagrams.</li>
|
|
||||||
<li><span class="application">Excel</span>
|
|
||||||
and <span class="application">Powerpoint</span>
|
|
||||||
for <span class="application">Recoll</span> versions 1.19.12
|
|
||||||
and later.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Tar</span> archives. Tar file
|
|
||||||
indexing is disabled by default (because tar archives don't
|
|
||||||
typically contain the kind of documents that people search
|
|
||||||
for), you will need to enable it explicitly, like with the
|
|
||||||
following in your
|
|
||||||
<span class="filename">$HOME/.recoll/mimeconf</span> file:
|
|
||||||
<pre>
|
|
||||||
[index]
|
|
||||||
application/x-tar = execm rcltar
|
|
||||||
</pre>
|
|
||||||
</li>
|
|
||||||
|
|
||||||
<li><span class="application">Zip</span> archives.</li>
|
|
||||||
<li><span class="application">Konqueror webarchive</span>
|
|
||||||
format with Python (uses the <tt>tarfile</tt> standard
|
|
||||||
library module).</li>
|
|
||||||
|
|
||||||
<li><span class="application">Mimehtml web archive
|
|
||||||
format</span> (support based on the mail
|
|
||||||
filter, which introduces some mild weirdness, but still
|
|
||||||
usable).</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<h4>File types indexed with external helpers</h4>
|
|
||||||
|
|
||||||
<p>Many document types need the <span class="command">iconv</span>
|
|
||||||
command in addition to the applications specifically listed.</p>
|
|
||||||
|
|
||||||
<h5>The XML ones</h5>
|
|
||||||
|
|
||||||
<p>The following types need <span class="command">
|
|
||||||
xsltproc</span> from the <b>libxslt</b> package for recoll
|
|
||||||
versions before 1.22, and in addition, python-libxslt1 and
|
|
||||||
python-libxml2 for 1.22 and newer.
|
|
||||||
Quite a few also need <span class="command">unzip</span>:</p>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li><span class="application">Abiword</span> files.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Fb2</span> ebooks.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Kword</span> files.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Microsoft Office Open XML</span>
|
|
||||||
files.</li>
|
|
||||||
|
|
||||||
<li><span class="application">OpenOffice</span> files.</li>
|
|
||||||
|
|
||||||
<li><span class="application">SVG</span> files.</li>
|
|
||||||
<li><span class="application">Gnumeric</span> files.</li>
|
|
||||||
<li><span class="application">Okular</span> annotations files.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h5>Other formats</h5>
|
|
||||||
|
|
||||||
<p>The following need miscellaneous helper programs to decode
|
|
||||||
the internal formats.</p>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li><span class="application">pdf</span> with the <span class=
|
|
||||||
"command">pdftotext</span> command, which comes with
|
|
||||||
<a href="http://poppler.freedesktop.org/">poppler</a>,
|
|
||||||
(the package name is quite often <tt>poppler-utils</tt>). <br/>
|
|
||||||
Note: the older <span class="command">pdftotext</span> command
|
|
||||||
which comes with <span class="application">xpdf</span> is
|
|
||||||
not compatible with <span class="application">
|
|
||||||
Recoll</span><br/>
|
|
||||||
|
|
||||||
<em>New in 1.21</em>: if the <span class="application">
|
|
||||||
tesseract</span> OCR application, and the
|
|
||||||
<span class="command">pdftoppm</span> command are available
|
|
||||||
on the system, the <span class="command">rclpdf</span>
|
|
||||||
filter has the capability to run OCR. See the comments at
|
|
||||||
the top of <span class="command">rclpdf</span> (usually
|
|
||||||
found
|
|
||||||
in <span class="filename">/usr/share/recoll/filters</span>)
|
|
||||||
for how to enable this and configuration details.<br/>
|
|
||||||
<em>Opening PDFs at the right page</em>: the default
|
|
||||||
configuration uses <span class="command">evince</span>,
|
|
||||||
which has options for direct page access and pre-setting the
|
|
||||||
search strings (hits will be highlighted). There is an
|
|
||||||
example line in the default mimeview for doing the same
|
|
||||||
thing with <span class="command">qpdfview</span>
|
|
||||||
(<span class="literal">qpdfview --search %s %f#%p</span>).
|
|
||||||
Okular does not have a search string option (but it does
|
|
||||||
have a page number one).
|
|
||||||
</li>
|
|
||||||
|
|
||||||
<li><span class="application">msword</span> with <a href=
|
|
||||||
"http://www.winfield.demon.nl/">antiword</a>. It is also useful to
|
|
||||||
have <a href="http://wvware.sourceforge.net/">wvWare</a> installed
|
|
||||||
as it may be used as a fallback for some files which antiword
|
|
||||||
does not handle.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Wordperfect</span> with the
|
|
||||||
<span class="command">wpd2html</span> command from <a href=
|
|
||||||
"http://libwpd.sourceforge.net">libwpd</a>. On some distributions,
|
|
||||||
the command may come with a package named <span
|
|
||||||
class="literal">libwpd-tools</span> or such, not the base <span
|
|
||||||
class="literal">libwpd</span> package.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Lyx</span> files (needs
|
|
||||||
<span class="application">Lyx</span> to be installed).</li>
|
|
||||||
|
|
||||||
<li><span class="application">Powerpoint</span> and <span
|
|
||||||
class="application">Excel</span> with the <a href=
|
|
||||||
"http://vitus.wagner.pp.ru/software/catdoc/">catdoc</a>
|
|
||||||
utilities up to recoll 1.19.12. Recoll 1.19.12 and later use
|
|
||||||
internal Python filters for Excel and Powerpoint, and catdoc
|
|
||||||
is not needed at all (catdoc did not work on many semi-recent
|
|
||||||
Excel and Powerpoint files).</li>
|
|
||||||
|
|
||||||
<li><span class="application">CHM (Microsoft help)</span> files
|
|
||||||
with <span class="command">Python,
|
|
||||||
<a href="http://gnochm.sourceforge.net/pychm.html">pychm</a>
|
|
||||||
and <a href="http://www.jedrea.com/chmlib/">chmlib</a></span>.</li>
|
|
||||||
|
|
||||||
<li><span class="application">GNU info</span> files
|
|
||||||
with <span class="command">Python</span> and the
|
|
||||||
<span class="command">info</span> command.</li>
|
|
||||||
|
|
||||||
<li><span class="application">EPUB</span> files
|
|
||||||
with <span class="command">Python</span> and this
|
|
||||||
<a href="http://pypi.python.org/pypi/epub/">Python epub</a>
|
|
||||||
decoding module, which is packaged on Fedora, but not Debian.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Rar</span> archives (needs <span
|
|
||||||
class="command">Python</span>), the
|
|
||||||
<a href="http://pypi.python.org/pypi/rarfile/">rarfile</a> Python
|
|
||||||
module and the <a
|
|
||||||
href="http://www.rarlab.com/rar_add.htm">unrar</a>
|
|
||||||
utility. The Python module is packaged by Fedora, not by Debian.</li>
|
|
||||||
|
|
||||||
<li><span class="application">7zip</span> archives (needs
|
|
||||||
<span class="command">Python</span> and
|
|
||||||
the <a href="https://pypi.python.org/pypi/pylzma">pylzma
|
|
||||||
module</a>). This is a recent addition, and you need to
|
|
||||||
download the filter from
|
|
||||||
the <a href="filters/filters.html">filters pages</a> for
|
|
||||||
all Recoll versions prior to 1.21.</li>
|
|
||||||
|
|
||||||
<li><span class="application">iCalendar</span>(.ics) files
|
|
||||||
(needs <span class="command">Python, <a href=
|
|
||||||
"http://pypi.python.org/pypi/icalendar/2.1">icalendar</a></span>).</li>
|
|
||||||
|
|
||||||
<li><span class="application">Mozilla calendar data</span> See
|
|
||||||
<a href="faqsandhowtos/IndexMozillaCalendari.html">
|
|
||||||
the Howto</a> about this.</li>
|
|
||||||
|
|
||||||
<li><span class="application">postscript</span> with <a href=
|
|
||||||
"http://www.gnu.org/software/ghostscript/ghostscript.html">
|
|
||||||
ghostscript</a> and <a href=
|
|
||||||
"http://www.cs.wisc.edu/~ghost/doc/pstotext.htm">pstotext</a>.
|
|
||||||
Pstotext 1.9 has a serious issue with special characters in
|
|
||||||
file names, and you should either use the version packaged for
|
|
||||||
your system which is probably patched, or apply the Debian
|
|
||||||
patch which is stored <a href=
|
|
||||||
"files/pstotext-1.9_4-debian.patch">here</a> for
|
|
||||||
convenience. See http://packages.debian.org/squeeze/pstotext
|
|
||||||
and http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=356988
|
|
||||||
for references/explanations.
|
|
||||||
<blockquote>
|
|
||||||
To make things a bit easier, I also
|
|
||||||
store <a href="files/pstotext-1.9-patched.tar.gz">an
|
|
||||||
already patched version</a>. I added an
|
|
||||||
install target to the Makefile... This installs to
|
|
||||||
/usr/local, use <i>make install PREFIX=/usr</i> to
|
|
||||||
change. So all you need is:
|
|
||||||
<pre>
|
|
||||||
tar xvzf pstotext-1.9-patched.tar.gz
|
|
||||||
cd pstotext-1.9-patched
|
|
||||||
make
|
|
||||||
make install
|
|
||||||
</pre>
|
|
||||||
</blockquote>
|
|
||||||
</li>
|
|
||||||
|
|
||||||
|
|
||||||
<li><span class="application">RTF</span> files with
|
|
||||||
<a href="http://www.gnu.org/software/unrtf/unrtf.html">
|
|
||||||
unrtf</a>. Please note that up to version 0.21.3,
|
|
||||||
<span class="command">unrtf</span> mostly does not work with
|
|
||||||
non western-european character sets. Many serious problems
|
|
||||||
(crashes with serious security implications and infinite
|
|
||||||
loops) were fixed in unrtf 0.21.8, so you really want to use
|
|
||||||
this or a newer release. Building Unrtf from source is quick
|
|
||||||
and easy.</li>
|
|
||||||
|
|
||||||
<li><span class="application">TeX</span> with <span class=
|
|
||||||
"command">untex</span>. If there is no untex package for
|
|
||||||
your distribution, <a href="untex/untex-1.3.jf.tar.gz">a
|
|
||||||
source package is stored on this site</a> (as untex has no
|
|
||||||
obvious home). Will also work with <a href=
|
|
||||||
"http://www.cs.purdue.edu/homes/trinkle/detex/">detex</a>
|
|
||||||
if this is installed.</li>
|
|
||||||
|
|
||||||
<li><span class="application">dvi</span> with <a href=
|
|
||||||
"http://www.radicaleye.com/dvips.html">dvips</a>.</li>
|
|
||||||
|
|
||||||
<li><span class="application">djvu</span> with <a href=
|
|
||||||
"http://djvu.sourceforge.net">DjVuLibre</a>.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Audio file tags</span>.
|
|
||||||
Recoll releases 1.14 and later use a Python filter based
|
|
||||||
on <a href="http://code.google.com/p/mutagen/">mutagen</a>
|
|
||||||
for all audio types.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Image file tags</span> with <a href=
|
|
||||||
"http://www.sno.phy.queensu.ca/~phil/exiftool/">exiftool</a>.
|
|
||||||
This is a perl program, so you also need perl on the
|
|
||||||
system. This works with about any possible image file and
|
|
||||||
tag format (jpg, png, tiff, gif etc.).</li>
|
|
||||||
|
|
||||||
<li><span class="application">Midi karaoke files</span> with
|
|
||||||
Python, the
|
|
||||||
<a href="http://pypi.python.org/pypi/midi/0.2.1">
|
|
||||||
midi module</a>, and some help
|
|
||||||
from <a href="http://chardet.feedparser.org/">chardet</a>. There
|
|
||||||
is probably a <tt>python-chardet</tt> package for your distribution,
|
|
||||||
but you will quite probably need to build the midi
|
|
||||||
package. This is easy but see the <a href="helpernotes.html#midi">
|
|
||||||
notes here</a>.
|
|
||||||
</li>
|
|
||||||
|
|
||||||
<li><span class="application">MediaWiki dump files</span>:
|
|
||||||
Thomas Levine has written a handler for these, you will find
|
|
||||||
it here:
|
|
||||||
<a href="https://bitbucket.org/tlevine/recoll/src/0127be78bffdd8a294067966a3ba7b2663d7b0cf/src/filters/rclmwdump?at=default&amp;fileviewer=file-view-default">rclmwdump</a>.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="other">Other features</a></h2>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li>Can use a Firefox extension to index visited Web pages
|
|
||||||
history. See <a href="faqsandhowtos/IndexWebHistory.html">the
|
|
||||||
Howto</a> for more detail.</li>
|
|
||||||
|
|
||||||
<li>Processes all email attachments, and more generally any
|
|
||||||
realistic level of container nesting (the "msword attachment to
|
|
||||||
a message inside a mailbox in a zip" thingy...) .</li>
|
|
||||||
|
|
||||||
<li>Multiple selectable databases.</li>
|
|
||||||
|
|
||||||
<li>Powerful query facilities, with boolean searches,
|
|
||||||
phrases, filter on file types and directory tree.</li>
|
|
||||||
|
|
||||||
<li>Xesam-compatible query language.</li>
|
|
||||||
|
|
||||||
<li>Wildcard searches (with a specific and faster function
|
|
||||||
for file names).</li>
|
|
||||||
|
|
||||||
<li>Support for multiple charsets. Internal processing and
|
|
||||||
storage uses Unicode UTF-8.</li>
|
|
||||||
|
|
||||||
<li><a href="#stemming">Stemming</a> performed at query
|
|
||||||
time (can switch stemming language after indexing).</li>
|
|
||||||
|
|
||||||
<li>Easy installation. No database daemon, web server or
|
|
||||||
exotic language necessary.</li>
|
|
||||||
|
|
||||||
<li>An indexer which runs either as a batch, cron'able
|
|
||||||
program, or as a real-time indexing daemon, depending on
|
|
||||||
preference.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2><a name="integration">Desktop and web integration</a></h2>
|
|
||||||
|
|
||||||
<p>The <span class="application">Recoll</span> GUI has many
|
|
||||||
features that help to specify an efficient search and to manage
|
|
||||||
the results. However, it may sometimes be preferable to use a
|
|
||||||
simpler tool with a better integration with your desktop
|
|
||||||
interfaces. Several solutions exist:</p>
|
|
||||||
<ul>
|
|
||||||
<li>The <span class="application">Recoll</span> KIO module
|
|
||||||
allows starting queries and viewing results from the
|
|
||||||
Konqueror browser or KDE applications <em>Open</em> dialogs.</li>
|
|
||||||
<li>The <a href="http://kde-apps.org">recollrunner</a> krunner
|
|
||||||
module allows integrating Recoll search results into a
|
|
||||||
krunner query.</li>
|
|
||||||
<li>The Ubuntu Unity Recoll Lens (or Scope for newer Unity
|
|
||||||
versions) lets you access Recoll search
|
|
||||||
from the Unity Dash. More
|
|
||||||
slightly obsolete information <a href="faqsandhowtos/UnityLens.html">
|
|
||||||
here</a>. </li>
|
|
||||||
<li>The <a href="http://github.com/medoc92/recoll-webui">Recoll
|
|
||||||
Web UI</a> lets you query a Recoll index from a web browser</li>
|
|
||||||
</ul>
|
|
||||||
<p>Recoll also has
|
|
||||||
<a href="usermanual/usermanual.html#RCL.PROGRAM.PYTHONAPI">
|
|
||||||
<span class="application">Python</span></a> and
|
|
||||||
<span class="application">PHP</span> modules which can allow
|
|
||||||
easy integration with web or other applications.</p>
|
|
||||||
|
|
||||||
<h2><a name="stemming"></a>Stemming</h2>
|
|
||||||
|
|
||||||
<p>Stemming is a process which transforms inflected words
|
|
||||||
into their most basic form. For example, <i>flooring</i>,
|
|
||||||
<i>floors</i>, <i>floored</i> would probably all be
|
|
||||||
transformed to <i>floor</i> by a stemmer for the English
|
|
||||||
language.</p>
|
|
||||||
|
|
||||||
<p>In many search engines, the stemming process occurs during
|
|
||||||
indexing. The index will only contain the stemmed form of
|
|
||||||
words, with exceptions for terms which are detected as being
|
|
||||||
probably proper nouns (ie: capitalized). At query time, the
|
|
||||||
terms entered by the user are stemmed, then matched against
|
|
||||||
the index.</p>
|
|
||||||
|
|
||||||
<p>This process results in a smaller index, but it has the
|
|
||||||
serious drawback of irrevocably losing information during
|
|
||||||
indexing.</p>
|
|
||||||
|
|
||||||
<p>Recoll works in a different way. No stemming is performed
|
|
||||||
at indexing time, so that all information gets into the index.
|
|
||||||
The resulting index is bigger, but most people probably don't
|
|
||||||
care much about this nowadays, because they have a 100Gb disk
|
|
||||||
95% full of binary data <em>which does not get
|
|
||||||
indexed</em>.</p>
|
|
||||||
|
|
||||||
<p>At the end of an indexing pass, Recoll builds one or
|
|
||||||
several stemming dictionaries, where all word stems are
|
|
||||||
listed in correspondence to the list of their
|
|
||||||
derivatives.</p>
|
|
||||||
|
|
||||||
<p>At query time, by default, user-entered terms are stemmed,
|
|
||||||
then matched against the stem database, and the query is
|
|
||||||
expanded to include all derivatives. This will yield search
|
|
||||||
results analogous to those obtained by a classical engine.
|
|
||||||
The benefit of this approach is that stem expansion can be
|
|
||||||
controlled instantly at query time in several ways:</p>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li>It can be selectively turned-off for any query term by
|
|
||||||
capitalizing it (<i>Floor</i>).</li>
|
|
||||||
|
|
||||||
<li>The stemming language (ie: english, french...) can be
|
|
||||||
selected (this supposes that several stemming databases
|
|
||||||
have been built, which can be configured as part of the
|
|
||||||
indexing, or done later, in a reasonably fast way).</li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
||||||
@ -1,242 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Recoll updated filters</title>
|
|
||||||
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux
|
|
||||||
based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search, desktop search, unix, linux">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
|
|
||||||
<link type="text/css" rel="stylesheet" href="../styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="../index.html">Home</a></li>
|
|
||||||
<li><a href="../download.html">Downloads</a></li>
|
|
||||||
<li><a href="../usermanual/index.html">User manual</a></li>
|
|
||||||
<li><a href="../usermanual/RCL.INSTALL.html">Installation</a></li>
|
|
||||||
<li><a href="../index.html#support">Support</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1>Updated filters for Recoll</h1>
|
|
||||||
|
|
||||||
<p>The following describe new and updated filters, which will be
|
|
||||||
part of the next release, but can be installed on an older
|
|
||||||
release if you need them.</p>
|
|
||||||
|
|
||||||
<p>For updated filters, you just need to copy the script to the
|
|
||||||
filters directory which may be typically either <span
|
|
||||||
class="filename">/usr/share/recoll/filters</span>, or <span
|
|
||||||
class="filename">/usr/local/share/recoll/filters</span>. Please check
|
|
||||||
that the script is executable after copying it, and make it so if
|
|
||||||
needed (chmod a+x <i>scriptname</i>)</p>
|
|
||||||
|
|
||||||
<p>For new filters, you'll need to copy the script file as
|
|
||||||
above, possibly install the supporting application, and usually
|
|
||||||
edit the
|
|
||||||
<span class="filename">mimemap</span>,
|
|
||||||
<span class="filename">mimeview</span> and
|
|
||||||
<span class="filename">mimeconf</span> files, either in the
|
|
||||||
shared directory
|
|
||||||
(<span class="filename">
|
|
||||||
/usr[/local]/share/recoll/examples</span>), or
|
|
||||||
in your personal configuration directory
|
|
||||||
(<span class="filename">$HOME/.recoll</span> or
|
|
||||||
<span class="filename">$RECOLL_CONFDIR</span>).</p>
|
|
||||||
|
|
||||||
<p>Alternatively, you can replace your system files with
|
|
||||||
these updated and complete versions:
|
|
||||||
<a href="mimemap">mimemap</a>
|
|
||||||
<a href="mimeconf">mimeconf</a>
|
|
||||||
<a href="mimeview">mimeview</a>.</p>
|
|
||||||
|
|
||||||
<p>There is a slightly more detailed description of the filter
|
|
||||||
installation procedure on the
|
|
||||||
<a href="http://www.recoll.org/faqsandhowtos/FilterRetrofit.html">
|
|
||||||
Recoll Wiki</a>.</p>
|
|
||||||
|
|
||||||
<p>The following entries are in reverse chronological order. Each
|
|
||||||
lists the latest Recoll release on which the update makes sense
|
|
||||||
(newer releases have an up to date version of the filter).</p>
|
|
||||||
|
|
||||||
<p>However, if you are running a Recoll version older than 1.17,
|
|
||||||
you should really upgrade.</p>
|
|
||||||
|
|
||||||
<h2>PDF documents</h2>
|
|
||||||
<p>Fixed <a href="rclpdf">rclpdf</a> filter, compatible with
|
|
||||||
newer poppler pdftotext versions, which now properly escape
|
|
||||||
text inside the html &lt;head&gt; section (but not the body,
|
|
||||||
curiously).</p>
|
|
||||||
|
|
||||||
<h2>Scribus documents</h2>
|
|
||||||
<p>An improved <a href="rclscribus">rclscribus</a> filter,
|
|
||||||
thanks to Morten Langlo.</p>
|
|
||||||
|
|
||||||
<h2>7zip archives</h2>
|
|
||||||
<p>A new <a href="rcl7z">rcl7z</a> filter by François Botha
|
|
||||||
for 7zip archives. Needs the
|
|
||||||
<a href="https://pypi.python.org/pypi/pylzma">pylzma Python
|
|
||||||
module</a>. </p>
|
|
||||||
|
|
||||||
<h2>Attachments to PDF documents (1.20 and older)</h2>
|
|
||||||
|
|
||||||
<p>A new <a href="rclmpdf">rclmpdf</a> filter for processing
|
|
||||||
PDF files with attachments. This replaces the old <b>rclpdf</b>
|
|
||||||
filter. You need to add it to ~/.recoll/mimeconf until it is
|
|
||||||
made standard (this is still a bit experimental, and a big
|
|
||||||
change from the previous filter):
|
|
||||||
<pre><tt>
|
|
||||||
[index]
|
|
||||||
application/pdf = execm rclmpdf
|
|
||||||
</tt></pre>
|
|
||||||
Note the <tt>execm</tt> instead of <tt>exec</tt>. </p>
|
|
||||||
|
|
||||||
<h2><a name="soff1">Open/Libre-Office documents (1.19 and older)</a></h2>
|
|
||||||
|
|
||||||
<p><a href="rclsoff">rclsoff</a>: the previous version did not
|
|
||||||
produce white space between input tab-separated words, leading
|
|
||||||
to search failures.</p>
|
|
||||||
|
|
||||||
|
|
||||||
<h2>Purple logs (1.20 and older)</h2>
|
|
||||||
|
|
||||||
<p>New <a href="rclpurple">rclpurple</a> filter for Pidgin and
|
|
||||||
other chat applications log files. Handles newer log
|
|
||||||
formats. </p>
|
|
||||||
|
|
||||||
<h2>PowerPoint documents (1.19 and older)</h2>
|
|
||||||
|
|
||||||
<p>The <b>rclppt</b> filter was based on <b>catppt</b>, but this
|
|
||||||
seems to fail quite often on newer PPT
|
|
||||||
documents. The new version is based on code from
|
|
||||||
the <b>libreoffice</b> <b>mso-dump</b> project. It is both
|
|
||||||
reasonably fast and quite thorough.
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<p>Installation:<ul>
|
|
||||||
<li>As <tt>recollindex</tt> was executing <b>catppt</b>
|
|
||||||
directly in the default configuration, you will also need to add
|
|
||||||
the following to
|
|
||||||
the <tt>mimeconf</tt> file (e.g.: ~/.recoll/mimeconf):
|
|
||||||
<pre>
|
|
||||||
[index]
|
|
||||||
application/vnd.ms-powerpoint = exec rclppt
|
|
||||||
</pre>
|
|
||||||
</li>
|
|
||||||
<li>Copy the 3 following files to the Recoll filters directory (e.g:
|
|
||||||
<i>/usr/share/recoll/filters</i>) and make sure
|
|
||||||
that <tt>ppt-dump.py</tt> and <tt>rclppt</tt> are executable.
|
|
||||||
<ul>
|
|
||||||
<li><a href="rclppt">rclppt</a></li>
|
|
||||||
<li><a href="ppt-dump.py">ppt-dump.py</a></li>
|
|
||||||
<li><a href="msodump.zip">msodump.zip</a></li>
|
|
||||||
</ul>
|
|
||||||
</li>
|
|
||||||
</ul>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<h2>EPUB documents (1.17 and older)</h2>
|
|
||||||
|
|
||||||
<p>New <a href="rclepub">rclepub</a> filter for EPUB documents.
|
|
||||||
This needs
|
|
||||||
the <a href="http://pypi.python.org/pypi/epub/0.5.0">
|
|
||||||
python epub decoding module</a>. </p>
|
|
||||||
|
|
||||||
<h2>CHM files (1.17.1 and older)</h2>
|
|
||||||
<p><a href="rclchm">rclchm</a>. The previous version of the
|
|
||||||
filter mishandled files which had encoded internal URLs (not
|
|
||||||
very frequent, but happens).</p>
|
|
||||||
|
|
||||||
<h2>Updated Open Document filter (1.17 and older)</h2>
|
|
||||||
|
|
||||||
<p>The <a href="rclsoff">new filter</a> will correctly handle
|
|
||||||
exported Google Docs documents and also Open/LibreOffice ones in
|
|
||||||
some cases. The previous filters concatenated all the text
|
|
||||||
inside the exported Google docs without any spacing...</p>
|
|
||||||
|
|
||||||
<h2>TAR archives (1.17 and older)</h2>
|
|
||||||
|
|
||||||
<p>New <a href="rcltar">rcltar</a> filter for tar archives. The
|
|
||||||
indexing of tar archives is disabled by default in the sample
|
|
||||||
configuration (stored here). This is an <tt>execm</tt>
|
|
||||||
filter! You'll need to add an <br>
|
|
||||||
<tt>application/x-tar = execm rcltar</tt><br>
|
|
||||||
line in the [index] section of your
|
|
||||||
$HOME/.recoll/mimeconf to enable it, not an <tt>exec</tt> one.</p>
|
|
||||||
|
|
||||||
<h2>XML files (1.17 and older)</h2>
|
|
||||||
|
|
||||||
<p>By default, the current recoll version does not index xml
|
|
||||||
content (except for known formats like dia, svg etc.). This
|
|
||||||
new <a href="rclxml">rclxml</a> filter will extract the data
|
|
||||||
from any xml file. Only text data is extracted, no attribute
|
|
||||||
values. The other option is to treat xml file as plain text
|
|
||||||
one (see comment in mimeconf), and index everything, including
|
|
||||||
a lot of garbage.</p>
|
|
||||||
|
|
||||||
<h2>DIA files (1.16 and older)</h2>
|
|
||||||
<p><a href="rcldia">rcldia</a> is a new filter
|
|
||||||
for <a href="http://projects.gnome.org/dia/">Dia</a> files,
|
|
||||||
contributed by Stefan Friedel.</p>
|
|
||||||
|
|
||||||
|
|
||||||
<h2>Okular annotations (1.16 and older)</h2>
|
|
||||||
<p><a href="rclokulnote">rclokulnote</a>. Okular lets you create
|
|
||||||
annotations for PDF documents and stores them in xml format
|
|
||||||
somewhere under ~/.kde. This filter does not do a nice job to
|
|
||||||
format the data, but will at least let you find it...</p>
|
|
||||||
|
|
||||||
<h2>Gnumeric (1.16 and older)</h2>
|
|
||||||
<p><a href="rclgnm">rclgnm</a>. Needs xsltproc and
|
|
||||||
gunzip. As <tt>.gnumeric</tt> was in the list of
|
|
||||||
explicitly ignored suffixes, you can't just add the mime
|
|
||||||
and indexer script lines to your local mimemap and mimeconf, you
|
|
||||||
also need to define recoll_noindex in the local mimemap (to
|
|
||||||
override the system one which
|
|
||||||
contains <tt>.gnumeric</tt>). The simplest approach may be to
|
|
||||||
just replace the system files with those above.</p>
|
|
||||||
|
|
||||||
<h2>Rar archive support (1.15 and older)</h2>
|
|
||||||
<p><a href="rclrar">rclrar</a>. This is up to date in Recoll
|
|
||||||
1.16.2 but may be added to Recoll 1.15. It needs the Python
|
|
||||||
rarfile module. </p>
|
|
||||||
|
|
||||||
<h2>Mimehtml support (1.15)</h2>
|
|
||||||
<p>This is based on the internal mail filter, you just need to
|
|
||||||
download and install the configuration files (mimemap and
|
|
||||||
mimeconf). Will only work with 1.15 and later.</p>
|
|
||||||
|
|
||||||
<h2>Konqueror webarchive (.war) filter (1.15)</h2>
|
|
||||||
<p><a href="rclwar">rclwar</a></p>
|
|
||||||
|
|
||||||
<h2>Updated zip archive filter (1.15)</h2>
|
|
||||||
<p>The filter is corrected to handle utf-8 paths in zip archives:
|
|
||||||
<a href="rclzip">rclzip</a>. Up to date in Recoll 1.16, but
|
|
||||||
may be useful with Recoll 1.15</p>
|
|
||||||
|
|
||||||
<h2>Updated audio tag filter (1.14)</h2>
|
|
||||||
<p>The mutagen-based rclaudio filter delivered with recoll 1.14.2
|
|
||||||
used a very recent mutagen interface which will only work with
|
|
||||||
mutagen versions after 1.17 (probably; it at least works with 1.19,
|
|
||||||
doesn't with 1.15).
|
|
||||||
You can download the <a href="rclaudio">corrected script
|
|
||||||
here</a>. Not useful with Recoll 1.5 or 1.6.
|
|
||||||
</p>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@ -1,211 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>RECOLL: un outil personnel de recherche textuelle pour
|
|
||||||
Unix et Linux</title>
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll est un logiciel personnel de recherche textuelle pour unix et linux basé sur Xapian, un moteur d'indexation puissant et mature.">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"recherche textuelle,desktop,unix,linux,solaris,open source,free">
|
|
||||||
<meta http-equiv="Content-language" content="fr">
|
|
||||||
<meta http-equiv="content-type" content=
|
|
||||||
"text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
<link type="text/css" rel="stylesheet" href="../styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="../index.html">Base</a></li>
|
|
||||||
<li><a href="../pics/index.html">Copies d'écrans</a></li>
|
|
||||||
<li><a href="../download.html">Téléchargements</a></li>
|
|
||||||
<li><a href="../doc.html">Documentation</a></li>
|
|
||||||
<li><a href="../index.html#support">Support</a></li>
|
|
||||||
<li><a href="../devel.html">Développement</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1 class="intro">Caractéristiques de Recoll</h1>
|
|
||||||
|
|
||||||
<dl>
|
|
||||||
<dt><a name="systems">Systèmes</a></dt>
|
|
||||||
<dd><span class="application">Recoll</span> a été compilé et
|
|
||||||
testé sur FreeBSD, Linux, Darwin, Solaris (versions
|
|
||||||
FreeBSD 5/6, Fedora Core 5/6, Suse 10.1, Gentoo,
|
|
||||||
Debian 3.1, Ubuntu Edgy, Solaris 8/9, mais d'autres versions
|
|
||||||
récentes conviennent sans doute également).</dd>
|
|
||||||
|
|
||||||
<dd>Versions de QT: 3.2, 3.3 et 4.2</dd>
|
|
||||||
|
|
||||||
<dt><a name="doctypes">Types de documents</a></dt>
|
|
||||||
<dd>Recoll peut traiter les types de documents suivants, ainsi
|
|
||||||
que des fichiers compressés du même type:
|
|
||||||
|
|
||||||
<dl>
|
|
||||||
<dt>En interne</dt>
|
|
||||||
|
|
||||||
<dd>
|
|
||||||
<ul>
|
|
||||||
<li><var class="literal">text</var>.</li>
|
|
||||||
|
|
||||||
<li><var class="literal">html</var>.</li>
|
|
||||||
|
|
||||||
<li><span class="application">OpenOffice</span>
|
|
||||||
(avec l'aide de la commande <b>unzip</b>).</li>
|
|
||||||
|
|
||||||
<li><span class="application">Abiword</span>.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Kword</span>.</li>
|
|
||||||
|
|
||||||
<li><var class="literal">maildir</var>,
|
|
||||||
<var class="literal">mh</var> et <var
|
|
||||||
class="literal">mailbox</var> (<span class=
|
|
||||||
"application">Mozilla</span>, <span class=
|
|
||||||
"application">Thunderbird</span>, <span class=
|
|
||||||
"application">Evolution</span> et sans doute
|
|
||||||
d'autres).</li>
|
|
||||||
|
|
||||||
<li>Fichiers de conversation <span class="application">
|
|
||||||
gaim</span>.</li>
|
|
||||||
<li><span class="application">Lyx</span> (qui doit
|
|
||||||
être présent).</li>
|
|
||||||
|
|
||||||
<li><span class="application">Scribus</span>.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
</dd>
|
|
||||||
|
|
||||||
<dt>Avec des paquets externes</dt>
|
|
||||||
|
|
||||||
<dd>
|
|
||||||
<ul>
|
|
||||||
<li><var class="literal">pdf</var> avec <a href=
|
|
||||||
"http://www.foolabs.com/xpdf/">xpdf</a>.</li>
|
|
||||||
|
|
||||||
<li><var class="application">Wordperfect</var> avec <a href=
|
|
||||||
"http://libwpd.sourceforge.net">libwpd</a>.</li>
|
|
||||||
|
|
||||||
<li><var class="literal">postscript</var> avec
|
|
||||||
<a href="http://www.gnu.org/software/ghostscript/ghostscript.html">
|
|
||||||
ghostscript</a> et
|
|
||||||
<a href="http://www.cs.wisc.edu/~ghost/doc/pstotext.htm">
|
|
||||||
pstotext</a>.</li>
|
|
||||||
|
|
||||||
<li><span class="application">msword</span> avec <a href=
|
|
||||||
"http://www.winfield.demon.nl/">antiword</a>.</li>
|
|
||||||
|
|
||||||
<li><span class="application">Powerpoint</span> et
|
|
||||||
<span class="application">Excel</span> avec les utilitaires
|
|
||||||
<a href="http://www.45.free.net/~vitus/software/catdoc/">
|
|
||||||
catdoc</a>.</li>
|
|
||||||
|
|
||||||
<li><var class="literal">rtf</var> avec <a href=
|
|
||||||
"http://www.gnu.org/software/unrtf/unrtf.html">unrtf</a>.</li>
|
|
||||||
|
|
||||||
<li><var class="literal">dvi</var> avec
|
|
||||||
<a href="http://www.radicaleye.com/dvips.html">dvips</a>.
|
|
||||||
</li>
|
|
||||||
|
|
||||||
<li><var class="literal">djvu</var> avec
|
|
||||||
<a href="http://djvulibre.djvuzone.org/doc/index.html">
|
|
||||||
DjVuLibre</a>. </li>
|
|
||||||
|
|
||||||
<li>Tags <var class="literal">mp3</var> avec
|
|
||||||
<a href="http://id3lib.sourceforge.net/">
|
|
||||||
id3info (id3lib)</a>. </li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
</dd>
|
|
||||||
</dl>
|
|
||||||
</dd>
|
|
||||||
|
|
||||||
<dt>Autres caractéristiques</dt>
|
|
||||||
<dd>
|
|
||||||
<ul>
|
|
||||||
<li>Index multiples interrogeables ensemble ou séparément.</li>
|
|
||||||
|
|
||||||
<li>Fonctions de recherche puissantes, avec expressions
|
|
||||||
booléennes, phrases et proximité, caractères jokers,
|
|
||||||
filtrage sur les types de fichiers ou l'emplacement.</li>
|
|
||||||
|
|
||||||
<li>Fonction spécifique de recherche de noms de fichiers.</li>
|
|
||||||
|
|
||||||
<li>Support de jeux de caractères multiples. Les traitements
|
|
||||||
internes et l'index utilisent l'encodage Unicode UTF-8.</li>
|
|
||||||
|
|
||||||
<li>L'extraction des racines de mots <a href="#Stemming">
|
|
||||||
Stemming</a> est effectuée au moment de la recherche
|
|
||||||
(permet de changer de langue après l'indexation).</li>
|
|
||||||
|
|
||||||
<li>Installation facile. Pas de processus permanent, de
|
|
||||||
serveur web ou environnement exotique.</li>
|
|
||||||
|
|
||||||
<li>Un indexeur qui peut fonctionner soit comme un
|
|
||||||
processus léger dans l'interface de consultation, comme un
|
|
||||||
programme batch externe intégrable par
|
|
||||||
<span class="application">cron</span>, ou comme un processus
|
|
||||||
permanent pour l'indexation au fil de l'eau.</li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
</dd>
|
|
||||||
</dl>
|
|
||||||
|
|
||||||
<h2><a name="Stemming"></a>Lemmatisation</h2>
|
|
||||||
|
|
||||||
<p><em>Note: je serais preneur d'une traduction française
|
|
||||||
agréable pour "stemming".</em></p>
|
|
||||||
<p>La lemmatisation transforme un mot dérivé vers sa racine.
|
|
||||||
Par exemple, <i>aimer</i>, <i>aimerai</i>, <i>aimait</i>,
|
|
||||||
<i>aimez</i> etc. seraient transformés en <i>aim</i> en
|
|
||||||
français. Une recherche de l'un quelconque des dérivés peut
|
|
||||||
automatiquement être étendue vers tous les autres.</p>
|
|
||||||
|
|
||||||
<p>Certains moteurs de recherche appliquent la transformation
|
|
||||||
pendant l'indexation. L'index ne stocke que les racines des
|
|
||||||
mots, avec des exceptions pour les termes qui sont reconnus
|
|
||||||
comme des noms propres (capitalisation). Au moment de la
|
|
||||||
recherche, les termes de la requête sont également transformés
|
|
||||||
avant comparaison à l'index.</p>
|
|
||||||
|
|
||||||
<p>Cette approche permet un index plus petit, mais elle perd
|
|
||||||
irrévocablement de l'information pendant l'indexation.</p>
|
|
||||||
|
|
||||||
<p>Recoll fonctionne différemment. Les termes sont indexés sans
|
|
||||||
transformation. L'index résultant est plus gros, ce qui n'a
|
|
||||||
probablement pas beaucoup d'importance à une époque de disques
|
|
||||||
de 100 Go principalement remplis d'information multimédia
|
|
||||||
<em>non indexée</em>.
|
|
||||||
|
|
||||||
<p>À la fin de l'indexation, Recoll construit un ou plusieurs
|
|
||||||
dictionnaires de transformation (pour différents langages), où
|
|
||||||
toutes les racines sont listées avec leurs transformations
|
|
||||||
possibles.</p>
|
|
||||||
|
|
||||||
|
|
||||||
<p>Au moment de la recherche, par défaut, les termes de
|
|
||||||
l'utilisateur sont transformés, et étendus aux dérivés par
|
|
||||||
utilisation du dictionnaire.
|
|
||||||
Les résultats obtenus sont analogues à ceux de
|
|
||||||
l'autre méthode. L'avantage est que l'expansion peut être
|
|
||||||
contrôlée au moment de la recherche:
|
|
||||||
<ul>
|
|
||||||
<li>On peut la supprimer pour n'importe quel terme de la
|
|
||||||
requête, (en le faisant débuter par une capitale:
|
|
||||||
<em>Aime</em> par exemple pour chercher la ville d'Aime la
|
|
||||||
Plagne). </li>
|
|
||||||
<li>Le langage de transformation peut également être changé,
|
|
||||||
en supposant que plusieurs dictionnaires de transformation
|
|
||||||
aient été construits lors de l'indexation.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
||||||
@ -1,74 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>RECOLL: a personal text search system for
|
|
||||||
Unix/Linux</title>
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content=
|
|
||||||
"text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Home</a></li>
|
|
||||||
|
|
||||||
<li><a href="features.html#doctypes">Back to document types</a></li>
|
|
||||||
|
|
||||||
<li><a href="pics/index.html">Screenshots</a></li>
|
|
||||||
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
|
|
||||||
<li><a href="doc.html">User manual</a></li>
|
|
||||||
|
|
||||||
<li><a href="index.html#support">Support</a></li>
|
|
||||||
|
|
||||||
<li><a href="devel.html">Development</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
<h1>Notes about building/using specific external helper
|
|
||||||
applications</h1>
|
|
||||||
|
|
||||||
<h2><a name="midi">The Python midi module</a></h2>
|
|
||||||
<p>The normal procedure for building a Python module
|
|
||||||
applies:</p>
|
|
||||||
<pre><tt>
|
|
||||||
tar xvzf midi-0.2.1.tar.gz
|
|
||||||
cd midi-0.2.1
|
|
||||||
python setup.py build
|
|
||||||
sudo python setup.py install
|
|
||||||
</tt></pre>
|
|
||||||
|
|
||||||
<p>However, the midi module includes an alsa driver interface
|
|
||||||
which needs Swig to build and probably does not build at all
|
|
||||||
on recent Linux versions (the last version for the package
|
|
||||||
dates from 2006). Recoll does not need midi sequencer hardware
|
|
||||||
:), so if you don't need it for other purposes, you can disable
|
|
||||||
the Alsa interface by editing setup.py and changing the
|
|
||||||
platform name at line 37 (the Alsa thing is only tried on
|
|
||||||
Linux):</p>
|
|
||||||
|
|
||||||
|
|
||||||
<pre><tt>
|
|
||||||
37c37
|
|
||||||
< if platform.startswith('linux'):
|
|
||||||
---
|
|
||||||
> if platform.startswith('NONE'):
|
|
||||||
</tt></pre>
|
|
||||||
|
|
||||||
<p>The package should then build and install just fine.</p>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@ -1,57 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>RECOLL: building id3lib with gcc 4.4</title>
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content=
|
|
||||||
"text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Home</a></li>
|
|
||||||
<li><a href="features.html">Features</a></li>
|
|
||||||
<li><a href="pics/index.html">Screenshots</a></li>
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
<li><a href="doc.html">User manual</a></li>
|
|
||||||
<li><a href="index.html#support">Support</a></li>
|
|
||||||
<li><a href="devel.html">Development</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h2>Compiling id3lib with recent gcc versions (2010-06-29)</h2>
|
|
||||||
<p>Recoll uses a program installed by the id3lib package for
|
|
||||||
indexing mp3 files. Id3lib has not been updated for some time and
|
|
||||||
will not compile with gcc versions after 4.4 because of gcc
|
|
||||||
incompatibilities.</p>
|
|
||||||
<p><a href="files/id3lib-3.8.3-gcc44.patch">Here is a minuscule
|
|
||||||
patch</a> to help compiling id3lib. To use it:</p>
|
|
||||||
<ul>
|
|
||||||
<li>Download the patch (right-click the link and use 'Save As').</li>
|
|
||||||
<li>Extract the id3 lib source
|
|
||||||
(<tt>tar xvzf id3lib-3.8.3.tar.gz</tt>).</li>
|
|
||||||
<li>Change your current directory to the top of the id3lib source
|
|
||||||
tree and apply the patch:<br>
|
|
||||||
<tt>cd id3lib-3.8.3<br>
|
|
||||||
patch -p1 < /path/to/the/saved/patch</tt></li>
|
|
||||||
<li>Run autoconf (you may have to install it, but your package
|
|
||||||
manager can certainly do it for you).</li>
|
|
||||||
<li>Run <tt>make</tt> and <tt>make install</tt>.</li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
||||||
@ -1,7 +0,0 @@
|
|||||||
.SUFFIXES: .txt .html
|
|
||||||
|
|
||||||
.txt.html:
|
|
||||||
asciidoc $<
|
|
||||||
|
|
||||||
all: threadingRecoll.html forkingRecoll.html xapDocCopyCrash.html
|
|
||||||
|
|
||||||
|
Before Width: | Height: | Size: 30 KiB |
@ -1,224 +0,0 @@
|
|||||||
= Recoll command execution performance
|
|
||||||
:Author: Jean-François Dockès
|
|
||||||
:Email: jfd@recoll.org
|
|
||||||
:Date: 2015-05-22
|
|
||||||
|
|
||||||
== Abstract
|
|
||||||
|
|
||||||
== Introduction
|
|
||||||
|
|
||||||
The Recoll indexer, *recollindex*, is a big process which executes many
|
|
||||||
others, mostly for extracting text from documents. Some of the executed
|
|
||||||
processes are quite short-lived, and the time used by the process execution
|
|
||||||
machinery can actually dominate the time used to translate data. This
|
|
||||||
document explores possible approaches to improving performance without
|
|
||||||
adding excessive complexity or damaging reliability.
|
|
||||||
|
|
||||||
Studying fork/exec performance is not exactly a new venture, and there are
|
|
||||||
many texts which address the subject. While researching, though, I found
|
|
||||||
out that not so many were accurate and that a lot of questions were left as
|
|
||||||
an exercise to the reader.
|
|
||||||
|
|
||||||
== Issues with fork
|
|
||||||
|
|
||||||
The traditional way for a Unix process to start another is the
|
|
||||||
+fork()+/+exec()+ system call pair.
|
|
||||||
|
|
||||||
+fork()+ duplicates the process address space and resources (open files
|
|
||||||
etc.), then duplicates the thread of execution, ending up with 2 mostly
|
|
||||||
identical processes.
|
|
||||||
|
|
||||||
+exec()+ then replaces part of the newly executing process with an address
|
|
||||||
space initialized from an executable file, inheriting some of the resources
|
|
||||||
under various conditions.
|
|
||||||
|
|
||||||
This was all fine with the small processes of the first Unix systems, but
|
|
||||||
as time progressed, processes became bigger and the copy-before-discard
|
|
||||||
operation was found to waste significant resources. It was optimized using
|
|
||||||
two methods (at very different points in time):
|
|
||||||
|
|
||||||
- The first approach was to supplement +fork()+ with the +vfork()+ call, which
|
|
||||||
is similar but does not duplicate the address space: the new process
|
|
||||||
thread executes in the old address space. The old thread is blocked
|
|
||||||
until the new one calls +exec()+ and frees up access to the memory
|
|
||||||
space. Any modification performed by the child thread persists when
|
|
||||||
the old one resumes.
|
|
||||||
|
|
||||||
- The more modern approach, which coexists with +vfork()+, was to replace
|
|
||||||
the full duplication of the memory space with duplication of the page
|
|
||||||
descriptors only. The pages in the new process are marked copy-on-write
|
|
||||||
so that the new process has write access to its memory without
|
|
||||||
disturbing its parent. This approach was supposed to make +vfork()+
|
|
||||||
obsolete, but the operation can still be a significant resource consumer
|
|
||||||
for big processes mapping a lot of memory, so that +vfork()+ is still
|
|
||||||
around. Programs can have big memory spaces not only because they have
|
|
||||||
huge data segments (rare), but just because they are linked to many
|
|
||||||
shared libraries (more common).
|
|
||||||
|
|
||||||
NOTE: Orders of magnitude: a *recollindex* process will easily grow into a
|
|
||||||
few hundred megabytes of virtual space. It executes the small and
|
|
||||||
efficient *antiword* command to extract text from *ms-word* files. While
|
|
||||||
indexing multiple such files, *recollindex* can spend '60% of its CPU time'
|
|
||||||
doing `fork()`/`exec()` housekeeping instead of useful work (this is on Linux,
|
|
||||||
where `fork()` uses copy-on-write).
|
|
||||||
|
|
||||||
Apart from the performance cost, another issue with +fork()+ is that a big
|
|
||||||
process can fail executing a small command because of the temporary need to
|
|
||||||
allocate twice its address space. This is a much discussed subject which we
|
|
||||||
will leave aside because it generally does not concern *recollindex*, which
|
|
||||||
in typical conditions uses a small portion of the machine virtual memory,
|
|
||||||
so that a temporary doubling is not an issue.
|
|
||||||
|
|
||||||
The Recoll indexer is multithreaded, which may introduce other issues. Here
|
|
||||||
is what happens to threads during the +fork()+/+exec()+ interval:
|
|
||||||
|
|
||||||
- +fork()+:
|
|
||||||
* The parent process threads all go on their merry way.
|
|
||||||
* The child process is created with only one thread active, duplicated
|
|
||||||
from the one which called +fork()+
|
|
||||||
- +vfork()+
|
|
||||||
* The parent process thread calling +vfork()+ is suspended, the others
|
|
||||||
are unaffected.
|
|
||||||
* The child is created with only one thread, as for +fork()+.
|
|
||||||
This thread shares the memory space with the parent ones, without
|
|
||||||
having any means to synchronize with them (pthread locks are not
|
|
||||||
supposed to work across processes): caution needed !
|
|
||||||
|
|
||||||
NOTE: for a multithreaded program using the classical pipe method to
|
|
||||||
communicate with children, the sequence between the `pipe()` call and the
|
|
||||||
parent `close()` of the unused side is a candidate for a critical section:
|
|
||||||
if several threads can interleave in there, child processes may inherit
|
|
||||||
descriptors which 'belong' to other `fork()`/`exec()` operations, which may
|
|
||||||
in turn be a problem or not depending on how descriptor cleanup is
|
|
||||||
performed in the child (if no cleanup is performed, pipes may remain open
|
|
||||||
at both ends which will prevent seeing EOFs etc.). Thanks to StackExchange
|
|
||||||
user Celada for explaining this to me.
|
|
||||||
|
|
||||||
For multithreaded programs, both +fork()+ and +vfork()+ introduce possibilities
|
|
||||||
of deadlock, because the resources held by a non-forking thread in the
|
|
||||||
parent process can't be released in the child because the thread is not
|
|
||||||
duplicated. This used to happen from time to time in *recollindex* because
|
|
||||||
of an error logging call performed if the +exec()+ failed after the +fork()+
|
|
||||||
(e.g. command not found).
|
|
||||||
|
|
||||||
With +vfork()+ it is also possible to trigger a deadlock in the parent by
|
|
||||||
(inadvertently) modifying data in the child. This could happen just
|
|
||||||
link:http://www.oracle.com/technetwork/server-storage/solaris10/subprocess-136439.html[because
|
|
||||||
of dynamic linker operation] (which, seriously, should be considered a
|
|
||||||
system bug).
|
|
||||||
|
|
||||||
|
|
||||||
In general, the state of program data in the child process is a semi-random
|
|
||||||
snapshot of what it was in the parent, and the official word about what you
|
|
||||||
can do is that you can only call
|
|
||||||
link:http://man7.org/linux/man-pages/man7/signal.7.html[async-safe library
|
|
||||||
functions] between +fork()+ and +exec()+. These are functions which are
|
|
||||||
safe to call from a signal handler because they are either reentrant or
|
|
||||||
can't be interrupted by a signal. A notable missing entry in the list is
|
|
||||||
`malloc()`.
|
|
||||||
|
|
||||||
These are normally not issues for programs which only fork to execute
|
|
||||||
another program (but the devil is in the details as demonstrated by the
|
|
||||||
logging call issue...).
|
|
||||||
|
|
||||||
One of the approaches often proposed for working around this mine-field is
|
|
||||||
to use an auxiliary small process to execute any command needed by the main
|
|
||||||
one. The small process can just use +fork()+/+exec()+ with no performance
|
|
||||||
issues. This has the drawback of complicating communication a lot if
|
|
||||||
data needs to be transferred one way or another.
|
|
||||||
|
|
||||||
////
|
|
||||||
Passing descriptors around
|
|
||||||
http://stackoverflow.com/questions/909064/portable-way-to-pass-file-descriptor-between-different-processes
|
|
||||||
http://www.normalesup.org/~george/comp/libancillary/
|
|
||||||
http://stackoverflow.com/questions/28003921/sending-file-descriptor-by-linux-socket/
|
|
||||||
|
|
||||||
The process would then be:
|
|
||||||
- Tell slave to fork/exec cmd (issue with cmd + args format)
|
|
||||||
- Get fds
|
|
||||||
- Tell slave to wait, recover status.
|
|
||||||
////
|
|
||||||
|
|
||||||
== The posix_spawn() Linux non-event
|
|
||||||
|
|
||||||
Given the performance issues of `fork()` and tricky behaviour of `vfork()`,
|
|
||||||
a "simpler" method for starting a child process was introduced by Posix:
|
|
||||||
`posix_spawn()`.
|
|
||||||
|
|
||||||
The `posix_spawn()` function is a black box, externally equivalent to a
|
|
||||||
`fork()`/`exec()` sequence, and has parameters to specify the usual
|
|
||||||
house-keeping performed at this time (file descriptors and signals
|
|
||||||
management etc.). Hiding the internals gives the system a chance to
|
|
||||||
optimize the performance and avoid `vfork()` pitfalls like the `ld.so`
|
|
||||||
lockup described in the Oracle article.
|
|
||||||
|
|
||||||
The Linux posix_spawn() is implemented by a `fork()`/`exec()` pair by default.
|
|
||||||
|
|
||||||
`vfork()` is used either if specified by an input flag or no
|
|
||||||
signal/scheduler/process_group changes are requested. There must be a
|
|
||||||
reason why signal handling changes would preclude `vfork()` usage, but I
|
|
||||||
could not find it (signal handling data is stored in the kernel task_struct).
|
|
||||||
|
|
||||||
The Linux glibc `posix_spawn()` currently does nothing that user code could
|
|
||||||
not do. Still, using it would probably be a good future-proofing idea, but
|
|
||||||
for a significant problem: there is no way to specify closing all open
|
|
||||||
descriptors bigger than a specified value (closefrom() equivalent). This is
|
|
||||||
available on Solaris and quite necessary in fact, because we have no way to
|
|
||||||
be sure that all open descriptors have the CLOEXEC flag set.
|
|
||||||
|
|
||||||
So, no `posix_spawn()` for us (support was implemented inside
|
|
||||||
*recollindex*, but the code is normally not used).
|
|
||||||
|
|
||||||
== The chosen solution
|
|
||||||
|
|
||||||
The previous version of +recollindex+ used to use +vfork()+ if it was running
|
|
||||||
a single thread, and +fork()+ if it ran multiple ones.
|
|
||||||
|
|
||||||
After another careful look at the code, I could see few issues with
|
|
||||||
using +vfork()+ in the multithreaded indexer, so this was committed.
|
|
||||||
|
|
||||||
The only change necessary was to get rid of an implementation of the
|
|
||||||
lacking Linux +closefrom()+ call (used to close all open descriptors above a
|
|
||||||
given value). The previous Recoll implementation listed the +/proc/self/fd+
|
|
||||||
directory to look for open descriptors but this was unsafe because of
|
|
||||||
possible memory allocations in +opendir()+ etc.
|
|
||||||
|
|
||||||
== Test results
|
|
||||||
|
|
||||||
.Indexing 12500 small .doc files
|
|
||||||
[options="header"]
|
|
||||||
|===============================
|
|
||||||
|call |real |user |sys
|
|
||||||
|fork |0m46.025s |0m26.574s |0m39.494s
|
|
||||||
|vfork |0m18.223s |0m17.753s |0m1.736s
|
|
||||||
|spawn/fork| 0m45.726s|0m27.082s| 0m40.575s
|
|
||||||
|spawn/vfork|0m18.915s|0m18.681s|0m3.828s
|
|
||||||
|recoll 1.18|1m47.589s|0m21.537s|0m29.458s
|
|
||||||
|================================
|
|
||||||
|
|
||||||
No surprise here, given the implementation of +posix_spawn()+, it gets the
|
|
||||||
same times as the +fork()+/+vfork()+ options.
|
|
||||||
|
|
||||||
The tests were performed on an Intel Core i5 750 (4 cores, 4 threads).
|
|
||||||
|
|
||||||
It would be painful to play it safe and discard the 60% reduction in
|
|
||||||
execution time offered by using +vfork()+, so this was adopted for Recoll
|
|
||||||
1.21. To this day, no problems were discovered, but, still crossing
|
|
||||||
fingers...
|
|
||||||
|
|
||||||
The last line in the table is just for the fun: *recollindex* 1.18
|
|
||||||
(single-threaded) needed almost 6 times as long to process the same
|
|
||||||
files...
|
|
||||||
|
|
||||||
////
|
|
||||||
Objections to vfork:
|
|
||||||
sigaction locks
|
|
||||||
https://bugzilla.redhat.com/show_bug.cgi?id=193631
|
|
||||||
Is Linux vfork thread-safe ? Quoting interesting comments from Solaris
|
|
||||||
implementation: No answer to the issues cited though.
|
|
||||||
https://sourceware.org/bugzilla/show_bug.cgi?id=378
|
|
||||||
Aussi:
|
|
||||||
http://blog.famzah.net/2009/11/20/fork-gets-slower-as-parent-process-use-more-memory/
|
|
||||||
http://blog.famzah.net/2009/11/20/a-much-faster-popen-and-system-implementation-for-linux/
|
|
||||||
Avec un workaround basé sur clone (donc linux-only). Tried it but crashes.
|
|
||||||
////
|
|
||||||
|
Before Width: | Height: | Size: 35 KiB |
|
Before Width: | Height: | Size: 19 KiB |
@ -1,406 +0,0 @@
|
|||||||
= Converting Recoll indexing to multithreading
|
|
||||||
:Author: Jean-François Dockès
|
|
||||||
:Email: jfd@recoll.org
|
|
||||||
:Date: 2012-12-03
|
|
||||||
|
|
||||||
== Abstract
|
|
||||||
|
|
||||||
This relates lessons learned while modifying *Recoll* indexing to be
|
|
||||||
multithreaded. I am by no means a threaded applications expert, so that a
|
|
||||||
few of the observations I made while doing this may be of use to other
|
|
||||||
novices.
|
|
||||||
|
|
||||||
== Introduction
|
|
||||||
|
|
||||||
http://www.recoll.org[*Recoll*] is a document indexing application, it
|
|
||||||
allows you to find documents by specifying search terms.
|
|
||||||
|
|
||||||
The documents need to be _indexed_ for searches to be fast. In a nutshell,
|
|
||||||
we convert the different document formats to text, then split the text into
|
|
||||||
terms and remember where those occur. This is a time-consuming operation.
|
|
||||||
|
|
||||||
Up to version 1.18 *Recoll* indexing is single-threaded: routines which
|
|
||||||
call each other sequentially.
|
|
||||||
|
|
||||||
In most personal indexer contexts, it is also CPU-bound. There is a lot of
|
|
||||||
conversion work necessary for turning those PDF (or other) files into
|
|
||||||
appropriately cleaned up pure text, then split it into terms and update the
|
|
||||||
index. Given the relatively modest amount of data, and the speed of
|
|
||||||
storage, I/O issues are secondary.
|
|
||||||
|
|
||||||
Looking at the _CPU idle_ *top* output stuck at 75% on my quad-core CPU,
|
|
||||||
while waiting for the indexing to finish, was frustrating, and I was
|
|
||||||
tempted to find a way to keep those other cores at temperature and shorten
|
|
||||||
the waiting.
|
|
||||||
|
|
||||||
For some usages, the best way to accomplish this may be to just partition
|
|
||||||
the index and independently start indexing on different configurations,
|
|
||||||
using multiple processes to better utilize the available processing power.
|
|
||||||
|
|
||||||
This is not a universal solution though, as it is complicated to set up,
|
|
||||||
not optimal in general for indexing performance, and not always optimal
|
|
||||||
either at query time.
|
|
||||||
|
|
||||||
The most natural way to improve indexing times is to increase CPU
|
|
||||||
utilization by using multiple threads inside an indexing process.
|
|
||||||
|
|
||||||
Something similar had been done with earlier versions of the *Recoll* GUI,
|
|
||||||
which had an internal indexing thread. This had been a frequent source of
|
|
||||||
trouble though, and linking the GUI and indexing process lifetimes was a
|
|
||||||
bad idea, so, in recent versions, the indexing is always performed by an
|
|
||||||
external process. Still, this experience had brought to light most of the
|
|
||||||
problem areas, and prepared the code for further work.
|
|
||||||
|
|
||||||
It should be noted that, as `recollindex` is both _nice_'d and _ionice_'d
|
|
||||||
as a lowest priority process, it will only use free computing power on the
|
|
||||||
machine, and will step down as soon as anything else wants to work.
|
|
||||||
|
|
||||||
****
|
|
||||||
|
|
||||||
The only case where you may notice that the indexing is at work
|
|
||||||
is when the machine is short on memory and things (such as
|
|
||||||
your Web browser) get swapped-out while you are not actively using
|
|
||||||
them. You then notice a long delay when you want to start, because they
|
|
||||||
need to be swapped back in. There is little which can be done about
|
|
||||||
this. Setting _idxflushmb_ to a low value may help in some cases (depending
|
|
||||||
on the document sizes). May I also suggest in this case that, if your
|
|
||||||
machine can take more memory, it may be a good idea to procure some, as
|
|
||||||
memory is nowadays quite cheap, and memory-starved machines are not fun.
|
|
||||||
|
|
||||||
****
|
|
||||||
|
|
||||||
In general, augmenting the machine utilisation by `recollindex` just does
|
|
||||||
not change its responsiveness. My PC has an Intel Pentium Core i5 750 (4
|
|
||||||
cores, no hyperthreading), which is far from being a high performance CPU
|
|
||||||
(nowadays...), and I often forget that I am running indexing tests, it is
|
|
||||||
just not noticeable. The machine does have a lot of memory though (12GB).
|
|
||||||
|
|
||||||
|
|
||||||
== The Recoll indexing processing flow
|
|
||||||
|
|
||||||
image::nothreads.png["Basic flow", float="right"]
|
|
||||||
|
|
||||||
There are 4 main steps in the `recollindex` processing pipeline:
|
|
||||||
|
|
||||||
. Find the file
|
|
||||||
. Convert it to text
|
|
||||||
. Process the text (split, strip etc.) and create a *Xapian* document
|
|
||||||
. Update the index
|
|
||||||
|
|
||||||
The first step, walking the file system (or some other data source), is
|
|
||||||
usually much faster than the others, and we just leave it alone to be
|
|
||||||
performed by the main thread. It outputs file names (and the associated
|
|
||||||
*POSIX* _stat_ data).
|
|
||||||
|
|
||||||
The last step, *Xapian* index updating, can only be single-threaded.
|
|
||||||
|
|
||||||
The first idea is to change the indexing pipeline so that each step is
|
|
||||||
performed by an independent worker thread, passing its output to the next
|
|
||||||
thread, in assembly-line fashion.
|
|
||||||
|
|
||||||
In order to achieve this, we need to decouple the different phases. They
|
|
||||||
are normally linked by procedure calls, which we replace with a job
|
|
||||||
control object: the 'WorkQueue'.
|
|
||||||
|
|
||||||
=== The WorkQueue
|
|
||||||
|
|
||||||
|
|
||||||
The _WorkQueue_ object is implemented by a reasonably simple class, which
|
|
||||||
manages an input queue on which clients append jobs, and a set of worker
|
|
||||||
threads, which retrieve and perform the jobs, and whose lifetimes are
|
|
||||||
managed by the _WorkQueue_ object. The implementation is straightforward
|
|
||||||
with *POSIX* threads synchronization functions and C++ *STL* data
|
|
||||||
structures.
|
|
||||||
|
|
||||||
In practice it proved quite simple to modify existing code to create a job
|
|
||||||
object and put it on the queue, instead of calling the downstream routine
|
|
||||||
with the job parameters, _while keeping the capacity to call the downstream
|
|
||||||
routine directly_. The kind of coupling is determined either by compilation
|
|
||||||
flags (for global disabling/enabling of multithreading), or according to
|
|
||||||
configuration data, which allows experimenting with different threads
|
|
||||||
arrangements just by changing parameters in a file, without recompiling.
|
|
||||||
|
|
||||||
Each _WorkQueue_ accepts two parameters: the length of the input queue
|
|
||||||
(before a client will block when trying to add a job), and the number of
|
|
||||||
worker threads. Both parameters can be set in the *Recoll* configuration
|
|
||||||
file for each of the three queues used in the indexing pipeline. Setting
|
|
||||||
the queue length to -1 will disable the corresponding queue (using a direct
|
|
||||||
call instead).
|
|
||||||
|
|
||||||
unfloat::[]
|
|
||||||
|
|
||||||
|
|
||||||
== The Assembly Line
|
|
||||||
|
|
||||||
image::assembly.png["Assembly line", float="right"]
|
|
||||||
|
|
||||||
So the first idea is to create 3 explicit threads to manage the file
|
|
||||||
conversion, the term generation, and the *Xapian* index update. The first
|
|
||||||
thread prepares a file, passes it on to the term generation thread, and
|
|
||||||
immediately goes back to work on the next file, etc.
|
|
||||||
|
|
||||||
The presumed advantage of this method is that the different stages, which
|
|
||||||
perform disjointed processing, should share little, so that we can hope to
|
|
||||||
minimize the changes necessitated by the threads interactions.
|
|
||||||
|
|
||||||
However some changes to the code were needed to make this work (and a few
|
|
||||||
bugs were missed, which only became apparent at later stages, confirming
|
|
||||||
that the _low interaction_ idea was not completely false).
|
|
||||||
|
|
||||||
=== Converting to multithreading: what to look for
|
|
||||||
|
|
||||||
I am probably stating the obvious here, but when preparing a program for
|
|
||||||
multi-threading, problems can only arise where non-constant data is
|
|
||||||
accessed by different threads.
|
|
||||||
|
|
||||||
Once you have solved the core problems posed by the obvious data that needs
|
|
||||||
to be shared, you will be left to deal with less obvious, hidden,
|
|
||||||
interactions inside the program.
|
|
||||||
|
|
||||||
Classically this would concern global or static data, but in a C++ program,
|
|
||||||
class members will be a concern if a single object can be accessed by
|
|
||||||
several threads.
|
|
||||||
|
|
||||||
Hunting for static data inside a program of non trivial size is not always
|
|
||||||
obvious. Two approaches can be used: hunting for the _static_ keyword in
|
|
||||||
source code, or looking at global and static data symbols in *nm* output.
|
|
||||||
|
|
||||||
Once found, there are mostly three types of static/global data:
|
|
||||||
|
|
||||||
* Things that need to be eliminated: for example, routines can be made
|
|
||||||
reentrant by letting the caller supply a storage buffer instead of using
|
|
||||||
an internal static one (which was a bad idea in the first place
|
|
||||||
anyway).
|
|
||||||
* Things that need to be protected: sometimes, the best approach is just
|
|
||||||
to protect the access with a mutex lock. It is trivial to encapsulate
|
|
||||||
the locks in C++ objects to use the "Resource Acquisition is
|
|
||||||
Initialization" idiom, easily making sure that locks are freed when
|
|
||||||
exiting the critical section. Recoll used to include a basic home-made
|
|
||||||
implementation, but now lets C++11 work for it.
|
|
||||||
* Things which can stay: this is mostly initialization data such as value
|
|
||||||
tables which are computed once, and then stay logically constant during
|
|
||||||
program execution. In order to be sure of a correct single-threaded
|
|
||||||
initialization, it is best to explicitly initialize the modules or
|
|
||||||
functions that use this kind of data in the main thread when the program
|
|
||||||
starts.
|
|
||||||
|
|
||||||
=== Assembly line approach: the results
|
|
||||||
|
|
||||||
Unfortunately, the assembly line approach yields very modest improvements
|
|
||||||
when used inside *Recoll* indexing. The reason is that this method needs
|
|
||||||
stages of equivalent complexity to be efficient. If one of the stages
|
|
||||||
dominates the others, its thread will be the only one active at any time,
|
|
||||||
and little will be gained.
|
|
||||||
|
|
||||||
What is especially problematic is that the balance between tasks must not
|
|
||||||
only exist on average, but also for the majority of individual jobs.
|
|
||||||
|
|
||||||
For *Recoll* indexing, even if the data preparation and index update steps
|
|
||||||
are often of the same order of magnitude _on average_, their balance
|
|
||||||
depends a lot on the kind of data being processed, so that things are
|
|
||||||
usually unbalanced at any given time: the index update thread is mostly
|
|
||||||
idle while processing PDF files, and the data preparation has little to do
|
|
||||||
when working on HTML or plain text.
|
|
||||||
|
|
||||||
In practice, very modest indexing time improvements from 5% to 15% were
|
|
||||||
achieved with this method.
|
|
||||||
|
|
||||||
[[recoll.idxthreads.multistage]]
|
|
||||||
== The next step: multi-stage parallelism
|
|
||||||
|
|
||||||
image::multipara.png["Multi-stage parallelism", float="right"]
|
|
||||||
|
|
||||||
Given the limitations of the assembly line approach, the next step in the
|
|
||||||
transformation of *Recoll* indexing was to enable full parallelism wherever
|
|
||||||
possible.
|
|
||||||
|
|
||||||
Of the four processing steps (see figures), two are not candidates for
|
|
||||||
parallelization:
|
|
||||||
|
|
||||||
* File system walking is so fast compared to the other steps that using
|
|
||||||
several threads would make no sense (it would also quite probably become
|
|
||||||
IO bound if we tried anyway).
|
|
||||||
* The *Xapian* library index updating code is not designed for
|
|
||||||
multi-threading and must stay protected from multiple accesses.
|
|
||||||
|
|
||||||
The two other steps are good candidates.
|
|
||||||
|
|
||||||
Most of the work to make *Recoll* code reentrant had been performed for the
|
|
||||||
previous transformation. Going full-parallel only implied protecting the
|
|
||||||
data structures that needed to be shared by the threads performing a given
|
|
||||||
processing step.
|
|
||||||
|
|
||||||
Just for the anecdotal value, a list of the elements that needed mutexes:
|
|
||||||
|
|
||||||
- Filter subprocesses cache: some file conversion subprocesses may be
|
|
||||||
expensive (starting a Python process is no piece of cake), so they are
|
|
||||||
cached for reuse after they are done translating a file. The shared cache
|
|
||||||
needs protection.
|
|
||||||
- Status updates: an object used to update the current file name and indexing
|
|
||||||
status to a shared file.
|
|
||||||
- Missing store: the list of missing helper programs
|
|
||||||
- The readonly *Xapian* database object: a Xapian::Database object which is
|
|
||||||
used for checking the validity of current index data against a file's
|
|
||||||
last modification date.
|
|
||||||
- Document existence map: a bit array used to store an existence bit about
|
|
||||||
every document, and purge the disappeared at the end of the indexing
|
|
||||||
pass. This is accessed both from the file conversion and database update
|
|
||||||
code, so it also needed protection in the previous assembly line
|
|
||||||
approach.
|
|
||||||
- Mbox offsets cache. Used to store the offsets of individual messages
|
|
||||||
inside *mbox* files.
|
|
||||||
- *iconv* control blocks: these are cached for reuse in several places, and
|
|
||||||
need protection. Actually, it might be better in multithreading context
|
|
||||||
to just suppress the reuse and locking. Rough tests seem to indicate that
|
|
||||||
the impact on overall performance is small, but this might change with
|
|
||||||
higher parallelism (or not...).
|
|
||||||
|
|
||||||
The *Recoll* configuration also used to be managed by a single shared
|
|
||||||
object, which is mutable as values may depend on what area of the
|
|
||||||
file-system we are exploring, so that the object is stateful and updated as
|
|
||||||
we change directories. The choice made here was to duplicate the object
|
|
||||||
where needed (each indexing thread gets its own). This gave rise to the
|
|
||||||
sneakiest bug in the whole transformation (see further down).
|
|
||||||
|
|
||||||
Having a dynamic way to define the threads configuration makes it easy to
|
|
||||||
experiment. For example, the following data defines the configuration that
|
|
||||||
was finally found to be best overall on my hardware:
|
|
||||||
|
|
||||||
thrQSizes = 2 2 2
|
|
||||||
thrTCounts = 4 2 1
|
|
||||||
|
|
||||||
This is using 3 queues of depth 2, 4 threads working on file conversion, 2
|
|
||||||
on text splitting and other document processing, and 1 on Xapian updating
|
|
||||||
(no choice here).
|
|
||||||
|
|
||||||
unfloat::[]
|
|
||||||
|
|
||||||
== Bench results
|
|
||||||
|
|
||||||
So the big question after all the work: was it worth it ? I could only get
|
|
||||||
a real answer when the program stopped crashing, so this took some time and
|
|
||||||
a little faith, but the answer is positive, as far as I'm
|
|
||||||
concerned. Performance has improved significantly and this was a fun
|
|
||||||
project.
|
|
||||||
|
|
||||||
|
|
||||||
.Results on a variety of file system areas:
|
|
||||||
[options="header", width="70%"]
|
|
||||||
|=======================
|
|
||||||
|Area |Seconds before |Seconds after| Percent Improvement| Speed Factor
|
|
||||||
|home |12742 | 6942 | 46%| 1.8
|
|
||||||
|mail |2700 | 1563 | 42% | 1.7
|
|
||||||
|projets | 5022 | 1970 | 61% | 2.5
|
|
||||||
|pdf | 2164 | 770 | 64% | 2.8
|
|
||||||
|otherhtml | 5593 | 4014| 28% | 1.4
|
|
||||||
|=======================
|
|
||||||
|
|
||||||
.Characteristics of the data
|
|
||||||
[options="header", width="70%"]
|
|
||||||
|=======================
|
|
||||||
|Area | Files MB | Files | DB MB | Documents
|
|
||||||
|home | 64106 | 44897 | 1197 | 104797
|
|
||||||
|mail | 813 | 232 | 663 | 47267
|
|
||||||
|projets | 2056 | 34504 | 549 | 40281
|
|
||||||
|pdf | 1123 | 1139 | 111 | 1139
|
|
||||||
|otherhtml | 3442 | 223007 | 2080 | 221890
|
|
||||||
|=======================
|
|
||||||
|
|
||||||
_home_ is my home directory. The high megabyte value is due to a number of
|
|
||||||
very big and not indexed *VirtualBox* images. Otherwise, it's a wide
|
|
||||||
mix of source files, email, miscellaneous documents and ebooks.
|
|
||||||
|
|
||||||
_mail_ is my mail directory, full of *mbox* files.
|
|
||||||
|
|
||||||
_projets_ mostly holds source files, and a number of documents.
|
|
||||||
|
|
||||||
_pdf_ holds random *pdf* files harvested on the internets. The performance
|
|
||||||
is quite spectacular, because most of the processing time goes to
|
|
||||||
converting them to text, and this is done in parallel. Probably could be
|
|
||||||
made a bit faster with more cores, until we hit the *Xapian* update speed
|
|
||||||
limit.
|
|
||||||
|
|
||||||
_otherhtml_ holds a myriad of small html files, mostly from
|
|
||||||
*wikipedia*. The improvement is not great here because a lot of time is
|
|
||||||
spent in the single-threaded *Xapian* index update.
|
|
||||||
|
|
||||||
The tests were made with queue depths of 2 on all queues, and 4 threads
|
|
||||||
working on the file conversion step, 2 on the term generation.
|
|
||||||
|
|
||||||
== A variation: linear parallelism
|
|
||||||
|
|
||||||
Once past the assembly-line idea, another possible transformation would be
|
|
||||||
to get rid of the two downstream queues, and just create a job for each
|
|
||||||
file and let it go to the end (using a mutex to protect accesses to the
|
|
||||||
writable *Xapian* database).
|
|
||||||
|
|
||||||
With the current *Recoll* code, this can be defined by the following
|
|
||||||
parameters (one can also use a deeper front queue, this changes little):
|
|
||||||
|
|
||||||
thrQSizes = 2 -1 -1
|
|
||||||
thrTCounts = 4 0 0
|
|
||||||
|
|
||||||
In practice, the performance is close to the one for the multistage
|
|
||||||
version.
|
|
||||||
|
|
||||||
If we were to hard-code this approach, this would be a simpler
|
|
||||||
modification, necessitating fewer changes to the code, but it has a slight
|
|
||||||
inconvenience: when working on a single big multi-document file, no
|
|
||||||
parallelism at all can be obtained. In this situation, the multi-stage
|
|
||||||
approach brings us back to the assembly-line behaviour, so the improvements
|
|
||||||
are not great, but they do exist.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
== Miscellany
|
|
||||||
|
|
||||||
=== The big gotcha: my stack dump staring days
|
|
||||||
|
|
||||||
Overall, debugging the modified program was reasonably
|
|
||||||
straightforward. Data access synchronization issues mostly provoke dynamic
|
|
||||||
data corruption, which can be beastly to debug. I was lucky enough that
|
|
||||||
most crashes occurred in the code that was actually related to the
|
|
||||||
corrupted data, not in some randomly located and unrelated dynamic memory
|
|
||||||
user, so that the issues were reasonably easy to find.
|
|
||||||
|
|
||||||
One issue though kept me working for a few days. The indexing process kept
|
|
||||||
crashing randomly at an interval of a few thousand documents, segfaulting
|
|
||||||
on a bad pointer. An access to the configuration data structure seemed to
|
|
||||||
be involved, but, as each thread was supposed to have its own copy, I was
|
|
||||||
out of ideas.
|
|
||||||
|
|
||||||
After reviewing all the uses for the configuration data (there are quite a
|
|
||||||
few), the problem was finally revealed to lie with the filter process
|
|
||||||
cache. Each filter structure stored in the cache stores a pointer to a
|
|
||||||
configuration structure. This belonged to the thread which initially
|
|
||||||
created the filter. But the filter would often be reused by a different
|
|
||||||
thread, with the consequence that the configuration object was now accessed
|
|
||||||
and modified by two unsynchronized threads... Resetting the config pointer
|
|
||||||
at the time of filter reuse was a very simple (almost) single-line fix to
|
|
||||||
this elusive problem.
|
|
||||||
|
|
||||||
Looking at multi-threaded stack dumps is mostly fun for people with several
|
|
||||||
heads, which is unfortunately not my case, so I was quite elated when this
|
|
||||||
was over.
|
|
||||||
|
|
||||||
=== Fork performance issues
|
|
||||||
|
|
||||||
On a quite unrelated note, something that I discovered while evaluating the
|
|
||||||
program performance is that forking a big process like `recollindex` can be
|
|
||||||
quite expensive. Even if the memory space of the forked process is not
|
|
||||||
copied (it's Copy On Write, and we write very little before the following
|
|
||||||
exec), just duplicating the memory maps can be slow when the process uses a
|
|
||||||
few hundred megabytes.
|
|
||||||
|
|
||||||
I modified the single-threaded version of `recollindex` to use *vfork*
|
|
||||||
instead of *fork*, but this can't be used with multiple threads (no
|
|
||||||
modification of the process memory space is allowed in the child between
|
|
||||||
*vfork* and *exec*, so we'd have to have a way to suspend all the threads
|
|
||||||
first).
|
|
||||||
|
|
||||||
I did not implement a solution to this issue, and I don't think
|
|
||||||
that a simple one exists. The workaround is to use modest *Xapian* flush
|
|
||||||
values to prevent the process from becoming too big.
|
|
||||||
|
|
||||||
A longer-term solution would be to implement a small slave process to do
|
|
||||||
the executing of ephemeral external commands.
|
|
||||||
@ -1,138 +0,0 @@
|
|||||||
= The case of the bad Xapian::Document copy
|
|
||||||
|
|
||||||
== How things were supposed to work
|
|
||||||
|
|
||||||
Coming from the link:threadingRecoll.html[threading *Recoll*] page,
|
|
||||||
you may remember that the third stage of the
|
|
||||||
processing pipeline breaks up text into terms, producing a *Xapian*
|
|
||||||
document (+Xapian::Document+) which is finally processed by the last stage,
|
|
||||||
the index updater.
|
|
||||||
|
|
||||||
What happens in practice is that the main routine in this stage has a local
|
|
||||||
+Xapian::Document+ object, automatically allocated on the stack, which it
|
|
||||||
updates appropriately and then copies into a task object which is placed on
|
|
||||||
the input queue for the last stage.
|
|
||||||
|
|
||||||
The text-splitting routine then returns, and its local +Xapian::Document+
|
|
||||||
object is (implicitly) deleted while the stack unwinds.
|
|
||||||
|
|
||||||
The idea is that the *copy* of the document which is on the queue should be
|
|
||||||
unaffected, it is independent of the original and will further be processed
|
|
||||||
by the index update thread, without interaction with the text-splitting one.
|
|
||||||
|
|
||||||
At no point do multiple threads access the +Xapian::Document+ data, so
|
|
||||||
there should be no problem.
|
|
||||||
|
|
||||||
== The problem
|
|
||||||
|
|
||||||
Most *Xapian* objects are reference-counted, which means that the object
|
|
||||||
itself is a small block of house-keeping variables. The actual data is
|
|
||||||
allocated on the heap through eventual calls to new/malloc, and is shared
|
|
||||||
by multiple copies of the object. This is the case for +Xapian::Document+.
|
|
||||||
|
|
||||||
This is abundantly documented, and users are encouraged to use copies
|
|
||||||
instead of passing pointers around (copies are cheap because only a small
|
|
||||||
block of auxiliary data is actually duplicated). This in general makes
|
|
||||||
memory management easier.
|
|
||||||
|
|
||||||
This is well-known, and it would not appear to be a problem in the above
|
|
||||||
case as the +Xapian::Document+ actual data is never accessed by multiple
|
|
||||||
threads.
|
|
||||||
|
|
||||||
The problem is that the reference counter which keeps track of the object
|
|
||||||
usage and triggers actual deletion when it goes to zero is accessed by two
|
|
||||||
threads:
|
|
||||||
|
|
||||||
- It is decremented while the first local object is destroyed during the
|
|
||||||
stack unwind in the first thread
|
|
||||||
- It is also updated by the last stage thread, incremented if copies are
|
|
||||||
made, then decremented until it finally goes down to 0 when we are done
|
|
||||||
with the object, at which point the document data is deallocated.
|
|
||||||
|
|
||||||
As the counter is not protected in any way against concurrent access, the
|
|
||||||
actual sequence of events is undefined and at least two kinds of problems
|
|
||||||
may occur: double deletion of the data, or accesses to already freed heap
|
|
||||||
data (potentially trashing other threads' allocations, or reading modified
|
|
||||||
data).
|
|
||||||
|
|
||||||
A relatively simple fix for this would be to use atomic test-and-set
|
|
||||||
operations for the counter (which is what the GNU +std::string+ does). But
|
|
||||||
the choice made by *Xapian* to let the application deal with all
|
|
||||||
synchronization issues is legitimate and documented, nothing to complain
|
|
||||||
about here. I just goofed.
|
|
||||||
|
|
||||||
Because the counter test and update operations are very fast, and occur
|
|
||||||
among a lot of processing from the final stage thread, the chances of
|
|
||||||
concurrent access are low, which is why the problem manifests itself very
|
|
||||||
rarely. Depending on thread scheduling and all manners of semi-random
|
|
||||||
conditions, it is basically impossible to reproduce reliably.
|
|
||||||
|
|
||||||
== The fix
|
|
||||||
|
|
||||||
The implemented fix was trivial: the upstream thread allocates the initial
|
|
||||||
+Xapian::Document+ on the heap, copies the pointer to the queue object, and
|
|
||||||
forgets about it. The index-updating thread peruses the object then
|
|
||||||
+delete+'s it. Real easy.
|
|
||||||
|
|
||||||
An alternative solution would have been to try and use locking to protect
|
|
||||||
the counter updates. The only place where such locking operations could
|
|
||||||
reasonably occur is inside the +Xapian::Document+ refcounted pointer
|
|
||||||
object, which we can't modify. Otherwise, we would have to protect the
|
|
||||||
_whole scopes of existence_ of the Xapian::Document object in any routine
|
|
||||||
which creates/copies or (implicitly) deletes it, which would cause many
|
|
||||||
problems and/or contention issues.
|
|
||||||
|
|
||||||
== Why did I miss this ?
|
|
||||||
|
|
||||||
The mechanism of the crashes is simple enough, quasi-obvious.
|
|
||||||
How on earth could I miss this problem while writing the code ?
|
|
||||||
|
|
||||||
For the sake of anecdote, my first brush with atomicity for updates of
|
|
||||||
reference counters was while debugging a System V release 4 kernel VFS file
|
|
||||||
system module, at the time when SVR4 got a preemptive kernel with SVR4-MP,
|
|
||||||
circa 1990... I ended up replacing a +counter+++ with +atomic_add()+ after
|
|
||||||
a set of _interesting_ debugging sessions interspersed with kernel crashes
|
|
||||||
and +fsck+ waits. This should have left some memories. So what went wrong ?
|
|
||||||
Here follows a list of possible reasons:
|
|
||||||
|
|
||||||
- Reasoning by analogy: std::string are safe to use in this way. The other
|
|
||||||
objects used in the indexing pipe are also safe. I just used
|
|
||||||
+Xapian::Document+ in the same way without thinking further.
|
|
||||||
- Probably not how I would do it: faced with designing +Xapian::Document+,
|
|
||||||
(not clever enough to do this anyway), I'd probably conclude that not
|
|
||||||
wanting to deal with full-on concurrency is one thing, not protecting the
|
|
||||||
reference counters is another, and going too far.
|
|
||||||
- The problem was not so easily visible because the object deletion is
|
|
||||||
implicitly performed during the stack unwind: this provides no clue, no
|
|
||||||
specific operation to think about.
|
|
||||||
- Pure laziness.
|
|
||||||
|
|
||||||
|
|
||||||
As a conclusion, a humble request to library designers: when an
|
|
||||||
interface works counter to the reasonable expectations of at least some of
|
|
||||||
the users (for example because it looks like, but works differently from, a
|
|
||||||
standard library interface), it is worth it to be very specific in the
|
|
||||||
documentation and header file comments about the gotchas. Saving people
|
|
||||||
from their own deficiencies is a worthy goal.
|
|
||||||
|
|
||||||
Here, a simple statement that the reference count was not mt-safe
|
|
||||||
(admittedly redundant with the general statement that the *Xapian* library
|
|
||||||
does not deal with threads), would have got me thinking and avoided the
|
|
||||||
error.
|
|
||||||
|
|
||||||
++++
|
|
||||||
<h2 id="comments">Comments</h2>
|
|
||||||
|
|
||||||
<div id="disqus_thread"></div>
|
|
||||||
<script type="text/javascript">
|
|
||||||
var disqus_shortname = 'lesbonscomptes';
|
|
||||||
(function() {
|
|
||||||
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
|
|
||||||
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
|
|
||||||
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
|
|
||||||
})();
|
|
||||||
</script>
|
|
||||||
<noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
|
|
||||||
<a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
|
|
||||||
|
|
||||||
++++
|
|
||||||
@ -1,401 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Recoll text search finds your documents</title>
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Description" content="Recoll is a desktop text search application for Unix, Linux, Microsoft Windows and Mac OS X, based on the Xapian search engine library.">
|
|
||||||
<meta name="Keywords" content="text search, pdf search, document search, full-text search, desktop search, open source,free">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
<link rel="shortcut icon" href="favicon.ico" />
|
|
||||||
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="http://www.recoll.org">Home</a></li>
|
|
||||||
<li><a href="pics/index.html">Screenshots</a></li>
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
<li><a href="doc.html">Documentation</a></li>
|
|
||||||
<li><a href="support.html">Support</a></li>
|
|
||||||
<li><a href="devel.html">Helping out</a></li>
|
|
||||||
<li><a href="index.html.fr">En Français</a></li>
|
|
||||||
<li><a class="weak" href="../pages/lbc-hosting.html">lesbonscomptes</a></li>
|
|
||||||
</ul>
|
|
||||||
<p class="indexthumb">
|
|
||||||
<a href="pics/index.html"><img width="100" alt=
|
|
||||||
"Thumbnail of recoll main screen" src=
|
|
||||||
"pics/recoll0-thumb.png"></a>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1><img align="center" src="pics/recoll64.png"/>
|
|
||||||
<a href="http://www.recoll.org/">Recoll</a> is
|
|
||||||
a desktop full-text search tool.</h1>
|
|
||||||
|
|
||||||
<p><span class="application">Recoll</span> finds keywords
|
|
||||||
inside documents as well as file names.</p>
|
|
||||||
<ul>
|
|
||||||
<li>Versions are available for <a href="download.html">Linux</a>
|
|
||||||
and <a href="pages/recoll-windows.html">MS Windows</a>.</li>
|
|
||||||
<li>A
|
|
||||||
<a href="https://github.com/koniu/recoll-webui">WEB
|
|
||||||
front-end</a> with preview and download features can
|
|
||||||
replace or supplement the GUI for remote
|
|
||||||
use.</li>
|
|
||||||
<li>It can search
|
|
||||||
most <span class="important"><a href="features.html#doctypes">document
|
|
||||||
formats</a></span>. <a href="features.html#doctypes">You may
|
|
||||||
need external applications for text extraction</a>.</li>
|
|
||||||
<li>It can reach any storage place: files,
|
|
||||||
archive members, email attachments, transparently
|
|
||||||
handling decompression.</li>
|
|
||||||
<li>One click will open the document inside a native editor or
|
|
||||||
display an even quicker text preview.</li>
|
|
||||||
<li>The software is free, open source,
|
|
||||||
and licensed under the GPL.</li>
|
|
||||||
<li><a href="features.html">Detailed features</a> and
|
|
||||||
application requirements for supported document types.</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p>The current <span class="application">Recoll</span> version is
|
|
||||||
<a href="download.html">1.23.2</a>
|
|
||||||
(<a href="release-1.23.html">Release notes</a>,
|
|
||||||
<a href="BUGS.html">known bugs</a>,
|
|
||||||
<a href="release-history.html">Release history</a>).</p>
|
|
||||||
|
|
||||||
|
|
||||||
<p><span class="application">Recoll</span> is based on the very
|
|
||||||
capable <a href="http://www.xapian.org">Xapian</a> search
|
|
||||||
engine library, for which it provides a powerful text
|
|
||||||
extraction layer and a complete, yet easy to use, Qt graphical
|
|
||||||
interface.</p>
|
|
||||||
|
|
||||||
<p class="remark">Recoll will index an <b>MS-Word</b> document
|
|
||||||
stored as an <b>attachment</b> to an <b>e-mail message</b> inside
|
|
||||||
a <b>Thunderbird folder</b> archived in a <b>Zip file</b> (and
|
|
||||||
more...). It will also help you search for it with a friendly and
|
|
||||||
powerful interface, and let you open a copy of a PDF at the right
|
|
||||||
page with two clicks. There is little that will remain
|
|
||||||
hidden on your disk.</p>
|
|
||||||
|
|
||||||
<p>Recoll has extensive <a href="doc.html">
|
|
||||||
documentation</a>. If you run into a problem, or want to
|
|
||||||
propose improvements, you are welcome to use
|
|
||||||
the <a href="support.html">
|
|
||||||
<span class="important">mailing list or problem
|
|
||||||
tracker</span></a>.</p>
|
|
||||||
|
|
||||||
<p><b><i>Recoll user ?</i></b> Maybe there are still a few useful
|
|
||||||
search tricks that you don't know about. A quick look at
|
|
||||||
the <a href="usermanual/RCL.SEARCH.html#RCL.SEARCH.GUI.TIPS">search
|
|
||||||
tips</a> might prove useful ! Also
|
|
||||||
the <a href="faqsandhowtos/index.html">
|
|
||||||
Faqs and Howtos section</a>, and some contributed
|
|
||||||
<a href="custom.html">result list formats</a>.</p>
|
|
||||||
|
|
||||||
<h2>Thanks</h2>
|
|
||||||
<p>Recoll borrows a lot of code
|
|
||||||
from other packages, and welcomes code and ideas from
|
|
||||||
contributors, see some of the
|
|
||||||
<a class="important" href="credits.html">Credits</a>.</p>
|
|
||||||
|
|
||||||
<h2>News</h2>
|
|
||||||
<div class="news">
|
|
||||||
|
|
||||||
<dl>
|
|
||||||
<dt>2017-07-31</dt><dd>Finalizing the move to the new site,
|
|
||||||
I am closing the old BitBucket project. The existing
|
|
||||||
BitBucket issues <a href="bitbucket-issues-recoll/index.html">
|
|
||||||
have been archived</a>.</dd>
|
|
||||||
|
|
||||||
<dt>2017-07-02</dt><dd>The source code repository and issue
|
|
||||||
tracker are moving to a
|
|
||||||
<a href="https://opensourceprojects.eu/p/recoll1/">
|
|
||||||
new place</a>.<br clear="all"></dd>
|
|
||||||
|
|
||||||
<dt>2017-05-23</dt><dd>Release 1.23.2 has gotten much
|
|
||||||
better at <a href="recoll_XMP">processing PDF XMP
|
|
||||||
data</a>.</dd>
|
|
||||||
|
|
||||||
<dt>2017-05-15</dt><dd>Release 1.23.2. This fixes a couple
|
|
||||||
of quite serious bugs. See
|
|
||||||
the <a href="release-1.23.html">Release notes</a></dd>
|
|
||||||
|
|
||||||
<dt>2017-03-09</dt><dd>Release 1.23.1. See
|
|
||||||
the <a href="release-1.23.html">Release notes</a></dd>
|
|
||||||
|
|
||||||
<dt>2016-11-25</dt><dd>Release 1.22.4 is available and fixes
|
|
||||||
an annoying qt5 glitch (advanced search 'start search'
|
|
||||||
button doing nothing). <a href="release-1.22.html">Release
|
|
||||||
notes</a></dd>
|
|
||||||
|
|
||||||
<dt>2016-06-21</dt><dd>Release 1.22.3 is available. This is
|
|
||||||
going to replace 1.21 as the main release. See
|
|
||||||
the <a href="release-1.22.html">release
|
|
||||||
notes</a>. Some input handler dependencies have changed.</dd>
|
|
||||||
|
|
||||||
<dt>2016-05-11</dt><dd>Release 1.21.7 fixes an annoying but
|
|
||||||
benign GUI crash-on-exit bug reported on Fedora 23 (qt5).</dd>
|
|
||||||
|
|
||||||
<dt>2016-04-21</dt><dd>I experimented with installing
|
|
||||||
the <a href="https://github.com/koniu/recoll-webui">Recoll
|
|
||||||
Web UI</a> with Apache, and found out
|
|
||||||
that <a href="pages/recoll-webui-install-wsgi.html">this
|
|
||||||
is really easy</a>, actually both easier to set up and
|
|
||||||
more useful than running it standalone. Recently added:
|
|
||||||
instructions for running with Nginx instead of Apache.</dd>
|
|
||||||
|
|
||||||
<dt>2016-04-18</dt><dd>Found a <a href="BUGS.html#GUIADV">GUI
|
|
||||||
crash bug</a> with a reasonably easy workaround.</dd>
|
|
||||||
|
|
||||||
<dt>2016-04-14</dt><dd>Release 1.22.0 is now available from
|
|
||||||
the download area. The binary packages should wait until
|
|
||||||
enough brave souls have tested it. See
|
|
||||||
the <a href="release-1.22.html">release notes</a>.</dd>
|
|
||||||
|
|
||||||
<dt>2016-04-07</dt><dd>Release 1.21.6 adds KDE5 compatibility
|
|
||||||
for the KIO slave.</dd>
|
|
||||||
|
|
||||||
<dt>2016-01-29</dt><dd>Release 1.21.5 is out. It fixes a
|
|
||||||
relatively nasty bug affecting all previous 1.21 versions:
|
|
||||||
the query language parser processed incorrectly multiple
|
|
||||||
mime type or category specifications, with missing results
|
|
||||||
as a consequence </dd>
|
|
||||||
|
|
||||||
<dt>2016-01-12</dt><dd>It seems that we currently have a
|
|
||||||
relatively frequent problem resulting in damaged indexes. If
|
|
||||||
you are experiencing heavy reindexing (incremental indexing
|
|
||||||
takes longer than it should), or missing search results,
|
|
||||||
please take a look at the top of
|
|
||||||
the <a href="BUGS.html">known bugs page</a></dd>
|
|
||||||
|
|
||||||
<dt>2015-11-09</dt>
|
|
||||||
<dd><a href="pics/windows-recoll.html">
|
|
||||||
<img align="left" width="100" alt="Recoll on MS-Windows"
|
|
||||||
src="pics/windows-recoll-thumb.png"></a>
|
|
||||||
<span class="important">Recoll for
|
|
||||||
MS-Windows</span>. Still a few things missing (like
|
|
||||||
real-time monitoring), but it does work, and it has a proper
|
|
||||||
installer, so you can easily get rid of it if you don't like
|
|
||||||
it. <a href="pages/recoll-windows.html">Have a look</a>.
|
|
||||||
This is an almost-native port, based on Qt and the Windows
|
|
||||||
API, no need for Cygwin. Thanks to Christian Motz for
|
|
||||||
helping with the filter interface (and the rest). I would
|
|
||||||
love some feedback!<br clear="all">
|
|
||||||
</dd>
|
|
||||||
|
|
||||||
<dt>2015-10-17</dt>
|
|
||||||
<dd>A bug in the verification of configuration file path variables
|
|
||||||
generates spurious warnings from recollindex when the
|
|
||||||
skippedPaths variable contains elements with wildcards. This
|
|
||||||
has no consequence except for the spurious error
|
|
||||||
message.</dd>
|
|
||||||
|
|
||||||
<dt>2015-10-01</dt>
|
|
||||||
<dd>Release 1.21.2 is out, and replaces 1.20 as production
|
|
||||||
release. </dd>
|
|
||||||
|
|
||||||
<dt>2015-06-30</dt>
|
|
||||||
<dd>A new rclpdf filter, with improved compatibility with
|
|
||||||
recent poppler pdftotext
|
|
||||||
versions. See <a href="filters/filters.html">rclpdf
|
|
||||||
filter</a>.</dd>
|
|
||||||
|
|
||||||
<dt>2015-06-16</dt>
|
|
||||||
<dd>Recoll 1.21.0 is out. This has a new query parser and
|
|
||||||
should be considered an unstable release, please do not
|
|
||||||
package it (1.20.6 is the one you want for stability). It
|
|
||||||
also <a href="idxthreads/forkingRecoll.html">changes the way
|
|
||||||
filters are executed</a> for better performance. See the
|
|
||||||
<a href="release-1.21.html">release notes</a> for more
|
|
||||||
detail about the few other changes.</dd>
|
|
||||||
|
|
||||||
<dt>2015-04-25</dt>
|
|
||||||
<dd>Recoll 1.20.6 is out, with mostly small fixes to
|
|
||||||
compressed file handling, which may make a big difference in
|
|
||||||
some cases. See the <a href="release-1.20.html">release
|
|
||||||
notes</a>. Of course it also incorporates the Qt 5
|
|
||||||
compatibility from 1.20.5 (Qt
|
|
||||||
5.3.2 ok, 5.2 does not work).</dd>
|
|
||||||
|
|
||||||
<dt>2015-03-30</dt>
|
|
||||||
<dd>Recoll 1.20.4 released. This fixes real time indexing of
|
|
||||||
the web history (when using the Firefox plugin).</dd>
|
|
||||||
|
|
||||||
<dt>2014-12-27</dt>
|
|
||||||
<dd><a href="https://www.gnu.org/software/unrtf/">
|
|
||||||
Unrtf 21.8</a> has been released. This fixes many issues
|
|
||||||
in unrtf, some with possible security implications. You
|
|
||||||
really want to use this version.</dd>
|
|
||||||
|
|
||||||
<dt>2014-12-18</dt> <dd>Recoll 1.20.1 is out and replaces 1.19
|
|
||||||
as the main version. I have been using 1.20 for months
|
|
||||||
(along with a number of fearless builders-from-source), and
|
|
||||||
it's as stable as 1.19, with nice
|
|
||||||
small <a href="release-1.20.html">new features</a>. Packages
|
|
||||||
will follow shortly. It is recommended (but not strictly
|
|
||||||
required, see the notes) to run an index reset when
|
|
||||||
upgrading.</dd>
|
|
||||||
|
|
||||||
<dt>2014-12-10</dt> <dd>The aspell command used for
|
|
||||||
orthographic suggestions is broken on Debian Jessie (because
|
|
||||||
of an aspell packaging issue), and this will not be fixed
|
|
||||||
for the Debian release. See the <a href="BUGS.html#aspelljessie">
|
|
||||||
simple workaround here</a>.</dd>
|
|
||||||
|
|
||||||
<dt>2014-11-09</dt> <dd>If you are still running anything
|
|
||||||
older than 1.19.14p2, <span class="important">YOU SHOULD
|
|
||||||
UPGRADE</span>. In
|
|
||||||
particular, <a href="release-1.19.html#rodb">this index
|
|
||||||
corruption issue</a> leading to repeated reindexing of
|
|
||||||
documents, and possibly query problems too, can be pretty
|
|
||||||
annoying.<br/>
|
|
||||||
GOTO <a href="download.html">download</a> and
|
|
||||||
install 1.19.14p2 or 1.20. <em>Reset your index after
|
|
||||||
upgrading (rm -rf ~/.recoll/xapiandb)</em>.</dd>
|
|
||||||
|
|
||||||
<dt>2014-07-28</dt> <dd>A nice new application to complement
|
|
||||||
Recoll: <a href="https://github.com/pidlug/recollfs">recollfs</a>
|
|
||||||
implements a Fuse filesystem where Recoll queries are
|
|
||||||
represented as directories, the contents of which are links
|
|
||||||
to the result documents.</dd>
|
|
||||||
|
|
||||||
<dt>2014-07-16</dt> <dd>Recoll version 1.19.14p2 fixes more
|
|
||||||
resource management issues in the Python module (only the
|
|
||||||
Python package needs upgrading for this), and the processing
|
|
||||||
of Bengali characters (no more diacritics stripping).</dd>
|
|
||||||
|
|
||||||
<dt>2014-06-24</dt> <dd><a href="filters/filters.html#soff1">An
|
|
||||||
updated filter</a> for Open/LibreOffice documents. The
|
|
||||||
previous version merged words which were tab-separated in
|
|
||||||
the input.</dd>
|
|
||||||
|
|
||||||
<dt>2014-06-17</dt> <dd>The source tarball for version 1.20.0
|
|
||||||
has been released. This version has
|
|
||||||
a <a href="release-1.20.html">number of improvements</a> over
|
|
||||||
1.19, but also some incompatibilities. The first minor
|
|
||||||
releases for 1.20 may contain some functional changes in
|
|
||||||
addition to bug fixes, so they may be slightly less stable
|
|
||||||
than 1.19, and 1.19 packages remain the "safe Recoll" for
|
|
||||||
now. Still, if you build from source, there are a few nice
|
|
||||||
things in 1.20...</dd>
|
|
||||||
|
|
||||||
<dt>2014-06-07</dt> <dd>Version 1.19.14 is out and fixes a
|
|
||||||
handful of minor-to-annoying indexing glitches (see the
|
|
||||||
<a href="release-1.19.html">Release notes</a>).</dd>
|
|
||||||
|
|
||||||
<dt>2014-05-06</dt> <dd>Version 1.19.13 is out and hopefully
|
|
||||||
fixes the remaining (rare) crashes of multithreaded
|
|
||||||
indexing.</dd>
|
|
||||||
|
|
||||||
<dt>2014-04-03</dt> <dd>I have separated the code for the
|
|
||||||
<a href="https://opensourceprojects.eu/p/unityscoperecol/">Recoll
|
|
||||||
Unity Scope</a> from the main body of code, in hope that it may
|
|
||||||
interest someone to work on it. It's Python and simple,
|
|
||||||
mostly depending on the Unity API. The Ubuntu Unity API is
|
|
||||||
apparently going to change *again* for the next version, and
|
|
||||||
I think I've seen enough of it.</dd>
|
|
||||||
|
|
||||||
<dt>2014-04-02</dt> <dd>1.19.12 is out. It's mostly identical
|
|
||||||
to 1.19.11 apart from a new parameter to change the max size
|
|
||||||
of stored attributes. No need to update in general.</dd>
|
|
||||||
|
|
||||||
<dt>2014-02-27</dt> <dd>I hear from time to time about
|
|
||||||
recollindex crashes. These appear to be quite rare, but they
|
|
||||||
do happen, and I think that they are linked to a yet unfound
|
|
||||||
bug in multithread indexing. If you experience such crashes or
|
|
||||||
stalls, you can disable multithreading by adding the following
|
|
||||||
to your recoll.conf:
|
|
||||||
<pre><tt>thrQSizes = -1 -1 -1</tt></pre>
|
|
||||||
</dd>
|
|
||||||
|
|
||||||
<dt>2014-02-27</dt><dd>While working on a
|
|
||||||
<a href="http://www.recoll.org/faqsandhowtos/MuttAndRecoll.html">
|
|
||||||
Recoll-Mutt interface</a> I discovered incidentally that
|
|
||||||
the <a href="https://github.com/koniu/recoll-webui">Recoll
|
|
||||||
Webui Web interface</a> works quite well with the
|
|
||||||
<a href="http://links.twibright.com/">links</a> web browser
|
|
||||||
inside a terminal window. This appears to be an interesting
|
|
||||||
solution for people looking for a search interface usable in
|
|
||||||
a non-GUI environment.</dd>
|
|
||||||
|
|
||||||
<dt>2013-11-19</dt> <dd>A <a href="filters/filters.html">new
|
|
||||||
filter</a> for PowerPoint files. The previous one was
|
|
||||||
based on the ancient <b>catppt</b> from the <b>catdoc</b>
|
|
||||||
utilities and usually extracted nothing from more recent
|
|
||||||
PowerPoint files (this is about .ppt: .pptx is handled by a native
|
|
||||||
Recoll filter).</dd>
|
|
||||||
|
|
||||||
<dt>2013-05-18</dt><dd>Sometimes things
|
|
||||||
<a href="http://www.lesbonscomptes.com/pages/happysearch.html">
|
|
||||||
just work</a>...</dd>
|
|
||||||
|
|
||||||
<dt>2013-04-30</dt><dd>Thanks to some of its users, Recoll now
|
|
||||||
has filters to
|
|
||||||
<a href="http://sourceforge.net/projects/rcollnotesfiltr/">
|
|
||||||
index and retrieve Lotus Notes messages</a>
|
|
||||||
(some
|
|
||||||
<a href="http://richardappleby.wordpress.com/2013/04/11/you-dont-have-to-know-the-answer-to-everything-just-how-to-find-it/">
|
|
||||||
implementation notes from an early user</a>), and there is
|
|
||||||
also now a
|
|
||||||
<a href="https://github.com/koniu/recoll-webui/">
|
|
||||||
Web browser interface</a> for querying your Recoll
|
|
||||||
indexes.</dd>
|
|
||||||
|
|
||||||
<dt>2012-10-25</dt> <dd>A problem with a simple workaround has caused
|
|
||||||
several reported <span class="important">recollindex
|
|
||||||
crashes</span> recently (for 1.17). If you store and index
|
|
||||||
Mozilla/Thunderbird email out of the standard location
|
|
||||||
(~/.thunderbird), you should add the following at the end of
|
|
||||||
your configuration file (e.g.:
|
|
||||||
~/.recoll/recoll.conf): <pre><tt>
|
|
||||||
[/path/to/my/mozilla/mail]
|
|
||||||
mhmboxquirks = tbird
|
|
||||||
</tt></pre> Adjust the path to your local value of course...
|
|
||||||
Without this hint, recollindex has trouble finding the
|
|
||||||
message delimiters inside the folder files, and will
|
|
||||||
possibly use all the computer's memory and crash. Apart from
|
|
||||||
crashes, which only occur for very big folders, this also
|
|
||||||
causes incorrect mail indexing.
|
|
||||||
</dd>
|
|
||||||
|
|
||||||
<dt>2012-09-11</dt> <dd>A new user-contributed script for those who use
|
|
||||||
real-time indexing on laptops: stop or start indexing
|
|
||||||
according to AC power status. See the details on
|
|
||||||
the <a href="http://www.recoll.org/faqsandhowtos/IndexOnAc.html">
|
|
||||||
Wiki</a>. </dd>
|
|
||||||
|
|
||||||
<dt>2012-04-07</dt><dd>We now have a Chinese user manual:
|
|
||||||
Recoll现在有中文手册咯:
|
|
||||||
<a href="http://stupidbeauty.com/Blog/2012/03/recoll%E7%94%A8%E6%88%B6%E6%89%8B%E5%86%8A%E7%BF%BB%E8%AD%AF%EF%BC%8Crecoll-user-manual-2/">
|
|
||||||
Recoll中文手册,HTML</a></dd>
|
|
||||||
|
|
||||||
|
|
||||||
</dl>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<h2>On the side</h2>
|
|
||||||
|
|
||||||
<div class="news">
|
|
||||||
<blockquote>
|
|
||||||
<p>We rent <a href="http://www.metairie-enbor.com/index.html.en">
|
|
||||||
a big country house</a> in the Aude area, in the south of
|
|
||||||
France (<a href="http://www.metairie-enbor.com/acces.html.en">see
|
|
||||||
map on the site</a>). If you are
|
|
||||||
looking for a wonderful country place with a pool to
|
|
||||||
spend holidays with a big bunch of family and/or
|
|
||||||
friends in a nice historical but very quiet area, this may be it.</p>
|
|
||||||
</blockquote>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@ -1,193 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>RECOLL: un outil personnel de recherche textuelle pour
|
|
||||||
Unix et Linux</title>
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll est un logiciel personnel de recherche textuelle pour unix et linux basé sur Xapian, un moteur d'indexation puissant et mature.">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"recherche textuelle,desktop,unix,linux,solaris,open
|
|
||||||
source, free, bois de chauffage">
|
|
||||||
<meta http-equiv="Content-language" content="fr">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
<link rel="shortcut icon" href="favicon.ico" />
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="http://www.recoll.org">Base</a></li>
|
|
||||||
<li><a href="pics/index.html">Copies d'écrans</a></li>
|
|
||||||
<li><a href="download.html">Téléchargements</a></li>
|
|
||||||
<li><a href="doc.html">Documentation</a></li>
|
|
||||||
<li><a href="devel.html">Développement</a></li>
|
|
||||||
</ul>
|
|
||||||
<p class="indexthumb">
|
|
||||||
<a href="pics/index.html"><img width="100" alt=
|
|
||||||
"Imagette de l'écran principal" src=
|
|
||||||
"pics/recoll0-thumb.png"></a>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1><img align="center" src="pics/recoll64.png"/>
|
|
||||||
<a href="http://www.recoll.org/">Recoll</a> est
|
|
||||||
un outil personnel de recherche textuelle pour Unix et Linux</h1>
|
|
||||||
|
|
||||||
<p>Il est basé sur le puissant moteur d'indexation <a href=
|
|
||||||
"http://www.xapian.org">Xapian</a>, pour lequel il offre une
|
|
||||||
interface graphique QT facile d'utilisation, riche, et facile à
|
|
||||||
mettre en oeuvre.</p>
|
|
||||||
|
|
||||||
<p><span class="application">Recoll</span> est un logiciel libre
|
|
||||||
gratuit, dont le code source est disponible sous licence GPL.
|
|
||||||
La dernière version est
|
|
||||||
<a class="important" href="download.html">1.23.1</a>
|
|
||||||
(<a href="release-1.23.html">notes sur la version, en
|
|
||||||
anglais</a>)</p>
|
|
||||||
|
|
||||||
<p>L'interface utilisateur de
|
|
||||||
<span class="application">Recoll</span> est traduite en
|
|
||||||
Français, mais pas encore la documentation, malheureusement,
|
|
||||||
et la plupart des liens de cette page pointent sur des textes
|
|
||||||
en Anglais.</p>
|
|
||||||
|
|
||||||
|
|
||||||
<h2>Caractéristiques: </h2>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li>Installation facile, peu de dépendances. Pas besoin de
|
|
||||||
démon permanent, de serveur http, d'un environnement de bureau
|
|
||||||
particulier ou d'un langage exotique.</li>
|
|
||||||
|
|
||||||
<li>Tourne sur la plupart des
|
|
||||||
<a href="fr/features.html#systems">systèmes</a> fondés sur
|
|
||||||
Unix.</li>
|
|
||||||
|
|
||||||
<li>Interface conçue avec <a href="http://www.trolltech.com">
|
|
||||||
Qt 4 ou 5 selon les plateformes.</a></li>
|
|
||||||
|
|
||||||
<li>Traite la plupart des <a href="fr/features.html#doctypes">
|
|
||||||
types de documents</a> courants, les messages et leurs fichiers
|
|
||||||
attachés. Peut aussi traiter leurs versions comprimées
|
|
||||||
(gzip ou bzip2) de tous ces documents.
|
|
||||||
<a href="features.html#doctypes">Applications externes pour
|
|
||||||
l'extraction du texte</a>.</li>
|
|
||||||
|
|
||||||
<li>Fonctions de recherche puissantes, avec expressions Booléennes,
|
|
||||||
phrases et proximité, wildcards, filtrage sur les types de fichiers
|
|
||||||
ou l'emplacement.</li>
|
|
||||||
|
|
||||||
<li>Multi-langage et multi-jeu de caractères, utilisant
|
|
||||||
Unicode en interne.</li>
|
|
||||||
|
|
||||||
<li><a class="weak" href="fr/features.html">
|
|
||||||
(plus de détails)</a></li>
|
|
||||||
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<p><b><i>Déjà utilisateur ?</i></b> Il est possible qu'il
|
|
||||||
y ait encore quelques astuces qui vous aient échappé. Un coup
|
|
||||||
d'oeil rapide sur la page des <a
|
|
||||||
href="usermanual/RCL.SEARCH.html#RCL.SEARCH.GUI.TIPS"> petites
|
|
||||||
recettes de recherche</a> (en anglais) pourrait s'avérer
|
|
||||||
fructueux ! Également, en anglais,
|
|
||||||
la <a href="faqsandhowtos/index.html">section des questions
|
|
||||||
fréquentes et trucs divers</a>.</p>
|
|
||||||
|
|
||||||
<h2>Nouvelles: </h2>
|
|
||||||
|
|
||||||
<dl>
|
|
||||||
<dt>2017-05-15</dt><dd>Version 1.23.2. Corrige quelques bugs
|
|
||||||
sérieux. Voir les <a href="release-1.23.html">Release notes (en
|
|
||||||
anglais).</a></dd>
|
|
||||||
<dt>2017-03-09</dt><dd>Version 1.23.1.
|
|
||||||
Voir les <a href="release-1.23.html">Release notes (en
|
|
||||||
anglais).</a></dd>
|
|
||||||
|
|
||||||
<dt>2016-11-23</dt><dd>Version 1.22.4.</dd>
|
|
||||||
<dt>2016-06-15</dt><dd>La version 1.22.3 est disponible et va
|
|
||||||
progressivement remplacer 1.21 comme version
|
|
||||||
principale. <a href="release-1.22.html">Notes de version</a>
|
|
||||||
(en anglais).</dd>
|
|
||||||
|
|
||||||
<dt>2016-05-11</dt><dd>Release 1.21.7: corrige un crash bénin
|
|
||||||
mais agaçant au moment de quitter l'interface utilisateur
|
|
||||||
(Fedora 23 / qt5).</dd>
|
|
||||||
|
|
||||||
<dt>2015-11-09</dt>
|
|
||||||
<dd>Recoll indexe Windows ! Il y a encore quelques éléments
|
|
||||||
manquants, comme l'indexation temps-réel, et la traduction
|
|
||||||
en Français, mais ça marche suffisamment bien pour être
|
|
||||||
essayé. Il y a un installeur standard, donc si vous n'aimez
|
|
||||||
pas, c'est facile à désinstaller...
|
|
||||||
Pas de traduction Française pour le moment. Il y
|
|
||||||
a <a href="pages/recoll-windows.html"> quelques
|
|
||||||
explications en Anglais sur l'installation </a>.
|
|
||||||
Si vous l'essayez, dites moi ce que vous en pensez !
|
|
||||||
</dd>
|
|
||||||
|
|
||||||
<dt>2012-10-25</dt><dd> Un problème avec une solution simple
|
|
||||||
peut provoquer
|
|
||||||
des <span class="important">plantages de
|
|
||||||
recollindex</span>.
|
|
||||||
Si vous indexez des messages mail Mozilla/Thunderbird
|
|
||||||
ailleurs qu'à l'endroit standard (~/.thunderbird), vous
|
|
||||||
devriez ajouter les lignes qui suivent à la fin de votre
|
|
||||||
fichier de configuration (~/.recoll/recoll.conf):
|
|
||||||
<pre><tt>
|
|
||||||
[/path/to/my/mozilla/mail]
|
|
||||||
mhmboxquirks = tbird
|
|
||||||
</tt></pre> Changez le chemin d'accès pour le vôtre bien
|
|
||||||
sûr. Sans cette indication, recollindex a des difficultés à
|
|
||||||
déterminer les limites de message dans les fichiers mailbox,
|
|
||||||
et peut arriver à utiliser toute la mémoire de la machine,
|
|
||||||
et à se planter. Dans les cas moins graves (avec des
|
|
||||||
fichiers de taille "raisonnable"), cela provoque aussi une
|
|
||||||
indexation incorrecte des messages.
|
|
||||||
</dd>
|
|
||||||
|
|
||||||
<dt>2010-11-20</dt><dd>Un petit script pour activer/cacher recoll sur un
|
|
||||||
bureau gnome d'un seul coup de clavier:
|
|
||||||
<a href="http://www.recoll.org/faqsandhowtos/HotRecoll.html">
|
|
||||||
recette d'installation</a>.</dd>
|
|
||||||
|
|
||||||
</dl>
|
|
||||||
|
|
||||||
<h2><a name="support">Support</a></h2>
|
|
||||||
|
|
||||||
<p>Si vous avez un problème quelconque avec le logiciel ou son
|
|
||||||
installation, ou une idée de fonctions à ajouter, merci de me
|
|
||||||
<a href=
|
|
||||||
"mailto:jfd@recoll.org">contacter</a>.</p>
|
|
||||||
|
|
||||||
<p>Voir aussi la <a href="devel.html">page sur le
|
|
||||||
développement</a>.</p>
|
|
||||||
<p><a href="BUGS.html">Liste des problèmes connus</a> (en
|
|
||||||
anglais). </p>
|
|
||||||
|
|
||||||
<h2>Remerciements</h2>
|
|
||||||
<p><span class="application">Recoll</span> emprunte beaucoup de code
|
|
||||||
d'autres logiciels libres, et accueille volontiers les
|
|
||||||
contributions en code ou en suggestions, voir la page des
|
|
||||||
<a class="important" href="credits.html">Attributions</a>.</p>
|
|
||||||
|
|
||||||
<h2>Autres</h2>
|
|
||||||
<p>Je loue une
|
|
||||||
<a href="http://www.metairie-enbor.com/index.html.fr">
|
|
||||||
grande maison sympa dans l'Aude</a> :), et nous produisons aussi
|
|
||||||
du <a href="http://www.metairie-enbor.com/bois-de-chauffage.html">
|
|
||||||
bois de chauffage</a>. (Il faut bien que cette page me serve
|
|
||||||
tout de même à <em>quelque chose</em> à moi aussi de temps
|
|
||||||
en temps !).</p>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@ -1,10 +0,0 @@
|
|||||||
.SUFFIXES: .txt .html
|
|
||||||
|
|
||||||
.txt.html:
|
|
||||||
asciidoc $<
|
|
||||||
|
|
||||||
all: recoll-windows.html recoll-windows-faq.html \
|
|
||||||
recoll-webui-install-wsgi.html
|
|
||||||
|
|
||||||
clean:
|
|
||||||
rm -f *.html
|
|
||||||
@ -1,280 +0,0 @@
|
|||||||
= Recoll WebUI Apache and nginx installation from scratch
|
|
||||||
|
|
||||||
NOTE: thanks to Michael L. Wilson for the `nginx` part.
|
|
||||||
|
|
||||||
The https://github.com/koniu/recoll-webui[Recoll WebUI] offers an
|
|
||||||
alternative, WEB-based, interface for querying a Recoll index.
|
|
||||||
|
|
||||||
It can be quite useful to extend the use of a shared index to multiple
|
|
||||||
workstations, without the need for a local Recoll installation and shared
|
|
||||||
data storage.
|
|
||||||
|
|
||||||
The Recoll WebUI is based on the
|
|
||||||
http://bottlepy.org/docs/dev/index.html[Bottle Python framework], which has
|
|
||||||
a built-in WEB server, and the simplest deployment approach is to run it
|
|
||||||
standalone. However the built-in server is restricted to handling one
|
|
||||||
request at a time, which is problematic in multi-user situations,
|
|
||||||
especially because some requests, like extracting a result list into a CSV
|
|
||||||
file, can take a significant amount of time.
|
|
||||||
|
|
||||||
The Bottle framework can work with several multi-threading Python HTTP
|
|
||||||
server libraries, but, given the limitations of the Recoll Python module
|
|
||||||
and the Python interpreter itself, this will not yield optimal performance,
|
|
||||||
and, especially, can't efficiently leverage the now ubiquitous
|
|
||||||
multiprocessors.
|
|
||||||
|
|
||||||
In multi-user situations, you can get better performance and ease of use
|
|
||||||
from the Recoll WebUI by running it under Apache or Nginx rather than as a
|
|
||||||
standalone process. With this approach, a few requests per second can
|
|
||||||
easily be handled even in the presence of long-running ones.
|
|
||||||
|
|
||||||
Neither Recoll nor the WebUI is optimized for high multi-user load, and it
|
|
||||||
would be very unwise to use them as the search interface to a busy WEB
|
|
||||||
site.
|
|
||||||
|
|
||||||
The instructions about using the WebUI under Apache as given in the
|
|
||||||
repository README are a bit terse, and are missing a few details,
|
|
||||||
especially ones which impact performance.
|
|
||||||
|
|
||||||
Here follows the synopsis of three WebUI installations on initially
|
|
||||||
Apache-less Ubuntu (14.04) and DragonFly BSD systems, and for
|
|
||||||
Nginx/BSD. The first should extend easily to other Debian-based systems,
|
|
||||||
the second at least to FreeBSD. rpm-based systems are left as an exercise
|
|
||||||
to the reader, at least for now...
|
|
||||||
|
|
||||||
|
|
||||||
CAUTION: THE CONFIGURATIONS DESCRIBED HAVE NO ACCESS CONTROL. ANYONE WITH
|
|
||||||
ACCESS TO THE NETWORK WHERE THE SERVER IS LOCATED CAN RETRIEVE ANY
|
|
||||||
DOCUMENT.
|
|
||||||
|
|
||||||
link:#nginx[Jump to the nginx section].
|
|
||||||
|
|
||||||
[[apache]]
|
|
||||||
== Apache
|
|
||||||
=== On a Debian/Ubuntu system
|
|
||||||
|
|
||||||
==== Install recoll
|
|
||||||
|
|
||||||
sudo apt-get install recoll python-recoll
|
|
||||||
|
|
||||||
Configure the indexing and check that the normal search works (I spent
|
|
||||||
quite a lot of time trying to understand why the WebUI did not work, when
|
|
||||||
in fact it was the normal recoll configuration which was broken and the
|
|
||||||
regular search did not work either).
|
|
||||||
|
|
||||||
Take care to be logged in as the user you want to run the web search as
|
|
||||||
while you do this.
|
|
||||||
|
|
||||||
|
|
||||||
==== Install the WebUI
|
|
||||||
|
|
||||||
Clone the github repository, or extract the master tar installation, and
|
|
||||||
move it to '/var/www/recoll-webui-master/'. Take care that it is read/execute
|
|
||||||
accessible by your user.
|
|
||||||
|
|
||||||
==== Install Apache and mod-wsgi
|
|
||||||
|
|
||||||
|
|
||||||
sudo apt-get install apache2 libapache2-mod-wsgi
|
|
||||||
|
|
||||||
I then got the following message:
|
|
||||||
|
|
||||||
AH00558: apache2: Could not reliably determine the server's fully qualified domain name, using 127.0.1.1. Set the 'ServerName' directive globally to suppress this message
|
|
||||||
|
|
||||||
To clear it, I added a ServerName directive to the Apache config, maybe you
|
|
||||||
won't need it. Edit '/etc/apache2/sites-available/000-default.conf' and add
|
|
||||||
the following at the top (globally). Things work without this fix anyway,
|
|
||||||
this is just to suppress the error message. You probably need to adjust the
|
|
||||||
address or use a real host name:
|
|
||||||
|
|
||||||
ServerName 192.168.4.6
|
|
||||||
|
|
||||||
|
|
||||||
Edit '/etc/apache2/mods-enabled/wsgi.conf', add the following at the end of
|
|
||||||
the "IfModule" section.
|
|
||||||
|
|
||||||
Change the user ('dockes' in the example) taking care that he is the one who
|
|
||||||
owns the index ('.recoll' is in his home directory).
|
|
||||||
|
|
||||||
WSGIDaemonProcess recoll user=dockes group=dockes \
|
|
||||||
threads=1 processes=5 display-name=%{GROUP} \
|
|
||||||
python-path=/var/www/recoll-webui-master
|
|
||||||
WSGIScriptAlias /recoll /var/www/recoll-webui-master/webui-wsgi.py
|
|
||||||
<Directory /var/www/recoll-webui-master>
|
|
||||||
WSGIProcessGroup recoll
|
|
||||||
Order allow,deny
|
|
||||||
allow from all
|
|
||||||
</Directory>
|
|
||||||
|
|
||||||
NOTE: the Recoll WebUI application is mostly single-threaded, so it is of
|
|
||||||
little use (and may actually be counter-productive in some cases) to
|
|
||||||
specify multiple threads on the WSGIDaemonProcess line. Specify multiple
|
|
||||||
processes instead to put multiple CPUs to work on simultaneous requests.
|
|
||||||
|
|
||||||
|
|
||||||
Then run the following to restart Apache:
|
|
||||||
|
|
||||||
sudo apachectl restart
|
|
||||||
|
|
||||||
The Recoll WebUI should now be accessible on 'http://my.server.com/recoll/'
|
|
||||||
|
|
||||||
NOTE: Take care that you need a '/' at the end of the URL used to access
|
|
||||||
the search (use: 'http://my.server.com/recoll/', not
|
|
||||||
'http://my.server.com/recoll'), else files other than the script itself are
|
|
||||||
not found (the page looks weird and the search does not work).
|
|
||||||
|
|
||||||
CAUTION: THERE IS NO ACCESS CONTROL. ANYONE WITH ACCESS TO THE NETWORK
|
|
||||||
WHERE THE SERVER IS LOCATED CAN RETRIEVE ANY DOCUMENT.
|
|
||||||
|
|
||||||
=== Apache Variant for BSD/ports
|
|
||||||
|
|
||||||
==== Packages
|
|
||||||
|
|
||||||
As root:
|
|
||||||
|
|
||||||
pkg install recoll
|
|
||||||
|
|
||||||
|
|
||||||
Do what you need to do to configure the indexing and check that the normal
|
|
||||||
search works.
|
|
||||||
|
|
||||||
Take care to be logged in as the user you want to run the web search as
|
|
||||||
while you do this.
|
|
||||||
|
|
||||||
pkg install apache24
|
|
||||||
|
|
||||||
Add apache24_enable="YES" in /etc/rc.conf
|
|
||||||
|
|
||||||
pkg install ap24-mod_wsgi4
|
|
||||||
pkg install git
|
|
||||||
|
|
||||||
==== Clone the webui repository
|
|
||||||
|
|
||||||
cd /usr/local/www/apache24/
|
|
||||||
git clone https://github.com/koniu/recoll-webui.git recoll-webui-master
|
|
||||||
|
|
||||||
Important: most input handler helper applications (e.g. 'pdftotext') are
|
|
||||||
installed in '/usr/local/bin' which is not in the PATH as seen by Apache
|
|
||||||
(at least on DragonFly). The simplest way to fix this is to modify the
|
|
||||||
launcher module for the webui app so that it fixes the PATH.
|
|
||||||
|
|
||||||
Edit 'recoll-webui-master/webui-wsgi.py' and add the following line after
|
|
||||||
the 'import os' line:
|
|
||||||
|
|
||||||
os.environ['PATH'] = os.environ['PATH'] + ':' + '/usr/local/bin'
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
==== Configure Apache
|
|
||||||
|
|
||||||
Edit /usr/local/etc/apache24/modules.d/270_mod_wsgi.conf
|
|
||||||
|
|
||||||
Uncomment the LoadModule line, and add the directives to alias /recoll/ to
|
|
||||||
the webui script.
|
|
||||||
|
|
||||||
Change the user (dockes in the example) taking care that he is the one who
|
|
||||||
owns the index (.recoll is in his home directory).
|
|
||||||
|
|
||||||
Contents of the file:
|
|
||||||
|
|
||||||
## $FreeBSD$
|
|
||||||
## vim: set filetype=apache:
|
|
||||||
##
|
|
||||||
## module file for mod_wsgi
|
|
||||||
##
|
|
||||||
## PROVIDE: mod_wsgi
|
|
||||||
## REQUIRE:
|
|
||||||
|
|
||||||
LoadModule wsgi_module libexec/apache24/mod_wsgi.so
|
|
||||||
|
|
||||||
WSGIDaemonProcess recoll user=dockes group=dockes \
|
|
||||||
threads=1 processes=5 display-name=%{GROUP} \
|
|
||||||
python-path=/usr/local/www/apache24/recoll-webui-master/
|
|
||||||
WSGIScriptAlias /recoll /usr/local/www/apache24/recoll-webui-master/webui-wsgi.py
|
|
||||||
|
|
||||||
<Directory /usr/local/www/apache24/recoll-webui-master>
|
|
||||||
WSGIProcessGroup recoll
|
|
||||||
Require all granted
|
|
||||||
</Directory>
|
|
||||||
|
|
||||||
==== Restart Apache
|
|
||||||
|
|
||||||
As root:
|
|
||||||
|
|
||||||
apachectl restart
|
|
||||||
|
|
||||||
|
|
||||||
[[nginx]]
|
|
||||||
== Nginx
|
|
||||||
=== Nginx for BSD/ports
|
|
||||||
|
|
||||||
As root:
|
|
||||||
|
|
||||||
pkg install recoll
|
|
||||||
|
|
||||||
Do what you need to do to configure the indexing and check that the normal
|
|
||||||
search works. Take care to be logged in as the user you want to run the web
|
|
||||||
search as while you do this.
|
|
||||||
|
|
||||||
Install required packages:
|
|
||||||
|
|
||||||
pkg install nginx uwsgi git
|
|
||||||
|
|
||||||
=== Nginx: clone the webui repository
|
|
||||||
|
|
||||||
rm /usr/local/www/nginx
|
|
||||||
mkdir /usr/local/www/nginx
|
|
||||||
cd /usr/local/www/nginx
|
|
||||||
git clone https://github.com/koniu/recoll-webui.git recoll-webui-master
|
|
||||||
|
|
||||||
Important: most input handler helper applications (e.g. 'pdftotext') are
|
|
||||||
installed in '/usr/local/bin' which is not in the PATH as seen by Nginx
|
|
||||||
(at least on DragonFly). The simplest way to fix this is to modify the
|
|
||||||
launcher module for the webui app so that it fixes the PATH.
|
|
||||||
|
|
||||||
Edit 'recoll-webui-master/webui-wsgi.py' and add the following line after
|
|
||||||
the 'import os' line:
|
|
||||||
|
|
||||||
os.environ['PATH'] = os.environ['PATH'] + ':' + '/usr/local/bin'
|
|
||||||
|
|
||||||
Also change the following to find the correct path:
|
|
||||||
|
|
||||||
#os.chdir(os.path.dirname(__file__))
|
|
||||||
os.chdir('/usr/local/www/nginx/recoll-webui-master')
|
|
||||||
|
|
||||||
|
|
||||||
=== Nginx: configure uWSGI
|
|
||||||
|
|
||||||
Assuming the user running the search is "dockes" (change it to your user),
|
|
||||||
|
|
||||||
sysrc uwsgi_uid=$(id -u dockes)
|
|
||||||
sysrc uwsgi_gid=$(id -g dockes)
|
|
||||||
sysrc uwsgi_flags="-M -L --wsgi-file /usr/local/www/nginx/recoll-webui-master/webui-wsgi.py"
|
|
||||||
|
|
||||||
(ALTERNATIVELY)
|
|
||||||
|
|
||||||
Add the following to rc.conf
|
|
||||||
|
|
||||||
uwsgi_uid="dockes"
|
|
||||||
uwsgi_gid="dockes"
|
|
||||||
uwsgi_flags="-M -L --wsgi-file /usr/local/www/nginx/recoll-webui-master/webui-wsgi.py"
|
|
||||||
|
|
||||||
|
|
||||||
=== Configure nginx
|
|
||||||
|
|
||||||
Edit /usr/local/etc/nginx/nginx.conf and set up a proxy to uwsgi service:
|
|
||||||
|
|
||||||
location / {
|
|
||||||
include uwsgi_params;
|
|
||||||
uwsgi_pass unix:///tmp/uwsgi.sock;
|
|
||||||
}
|
|
||||||
|
|
||||||
=== Enable and start both services
|
|
||||||
|
|
||||||
As root:
|
|
||||||
|
|
||||||
sysrc uwsgi_enable=YES #Or uwsgi_enable="YES" (in rc.conf)
|
|
||||||
sysrc nginx_enable=YES #Or nginx_enable="YES" (in rc.conf)
|
|
||||||
|
|
||||||
service uwsgi start
|
|
||||||
service nginx start
|
|
||||||
@ -1,88 +0,0 @@
|
|||||||
= Recoll on Windows tips and tricks
|
|
||||||
Jean-Francois Dockes <jf@dockes.org>
|
|
||||||
:toc:
|
|
||||||
|
|
||||||
== Checking that Python is in the PATH
|
|
||||||
|
|
||||||
Recoll input handlers are the programs which extract the documents text
|
|
||||||
content for indexing. Most of these programs are Python scripts. If Recoll
|
|
||||||
can find documents by file name but not by content, the first thing to
|
|
||||||
check is that you do have the Python interpreter in your PATH.
|
|
||||||
|
|
||||||
NOTE: Only Python 2 is supported at the moment (2.7 and later were
|
|
||||||
tested). This limitation is not caused by the Recoll scripts themselves but
|
|
||||||
by some of the auxiliary libraries (e.g.: the one used for LibreOffice text
|
|
||||||
extraction). If you also have Python 3 installed, you will have to arrange
|
|
||||||
for Recoll to only 'see' the Python 2 version.
|
|
||||||
|
|
||||||
For simple cases, to check that the Python interpreter is in the PATH, the
|
|
||||||
easiest approach is to start a command window and type 'python' in it. You
|
|
||||||
should see messages from the Python interpreter, which you can then
|
|
||||||
exit by typing 'quit()'. If the command interpreter complains about Python
|
|
||||||
not being found, you probably need to adjust the PATH.
|
|
||||||
|
|
||||||
NOTE: To start a command window, type 'command' in the start menu input
|
|
||||||
area and select 'Command Prompt'.
|
|
||||||
|
|
||||||
If the Python interpreter is not found, check that Python 2 is indeed
|
|
||||||
installed. Adding the Python binary to the PATH is an option during
|
|
||||||
installation (so one approach to fix the issue is to just run the
|
|
||||||
installation again).
|
|
||||||
|
|
||||||
You can also edit the environment variable directly:
|
|
||||||
|
|
||||||
- Start the Control Panel
|
|
||||||
- Select 'System and Security'
|
|
||||||
- Select 'System'
|
|
||||||
- Select 'Advanced system settings' in the left panel,
|
|
||||||
- Select 'Environment Variables' at the bottom of the dialog
|
|
||||||
- Edit 'Path' inside 'System variables' and add:
|
|
||||||
`C:\Python27\;C:\Python27\Scripts;` to it.
|
|
||||||
|
|
||||||
== Using an alternate configuration directory
|
|
||||||
|
|
||||||
This tip is useful if you want to manage several configurations, or if you
|
|
||||||
really have some reason to not let the configuration directory stay in its
|
|
||||||
default location ($HOMEDIR/AppData/Local/Recoll). If your concerns are only
|
|
||||||
about storage space, and you do not actually want to manage multiple
|
|
||||||
configuration directories, you can more simply change the index storage
|
|
||||||
location from the GUI 'Index Configuration' panel.
|
|
||||||
|
|
||||||
The easiest approach is to create a shortcut on the desktop and have it
|
|
||||||
start the GUI with a '-c' option. For example, set the shortcut's 'Target'
|
|
||||||
to something like:
|
|
||||||
|
|
||||||
----
|
|
||||||
"C:\Program Files (x86)\Recoll\recoll.exe" -c c:/path/to/my/configdir
|
|
||||||
----
|
|
||||||
|
|
||||||
_Do use forward slashes for the configuration directory path_. This will
|
|
||||||
hopefully be fixed some day.
|
|
||||||
|
|
||||||
You will need to create the configuration directory, Recoll will not do it
|
|
||||||
by itself. You can just leave it empty, Recoll will then propose to start
|
|
||||||
the configuration editor.
|
|
||||||
|
|
||||||
You can find a more complete and general explanation about using shortcuts,
|
|
||||||
for example http://www.rjlsoftware.com/support/faq/sa.cfm?q=6&n=61[on this
|
|
||||||
page].
|
|
||||||
|
|
||||||
|
|
||||||
== File name character case sensitivity
|
|
||||||
|
|
||||||
_This should be fixed as of the November 2016 version. Please report
|
|
||||||
the problem if you still see case sensitivity issues_
|
|
||||||
|
|
||||||
Recoll was born on Unix, on which file names are case-sensitive. At the
|
|
||||||
moment this is also the case for path-related queries on Windows, including
|
|
||||||
the drive letters.
|
|
||||||
|
|
||||||
When filtering results on location (e.g. with a 'dir:' clause), you need to
|
|
||||||
enter all path elements as they appear in the URLs in result lists (and use
|
|
||||||
forward slashes).
|
|
||||||
|
|
||||||
It is also advisable to enter configuration filenames with their actual
|
|
||||||
case (e.g. _topdirs_).
|
|
||||||
|
|
||||||
I am looking into fixing this, but this is made a bit complicated by non-ASCII
|
|
||||||
character sets issues.
|
|
||||||
@ -1,191 +0,0 @@
|
|||||||
= Recoll on Windows
|
|
||||||
Jean-Francois Dockes <jf at dockes.org>
|
|
||||||
:date:
|
|
||||||
|
|
||||||
:recollversion: 1.23.0-2017-01-07-78b8ad
|
|
||||||
:windir: downwin-12e3f
|
|
||||||
|
|
||||||
image:recoll-windows10-thumb.png[link="recoll-windows10.png"]
|
|
||||||
|
|
||||||
Recoll for Windows was built on Windows 7, and tried on Windows 7 and
|
|
||||||
10. It does not work on Windows XP.
|
|
||||||
|
|
||||||
Recoll is free and licensed under the GPL. You will be asked to accept the
|
|
||||||
license during the installation. For a regular user, and in a nutshell, the
|
|
||||||
license means that you are free to do what you want with the program (use,
|
|
||||||
copy, share, etc.). If you are a developer and intend to modify and
|
|
||||||
distribute the program, you probably know the GPL, else you should read it.
|
|
||||||
|
|
||||||
NOTE: As much as I have fun writing software, producing the Windows version is
|
|
||||||
just tedious. If you use Recoll on Windows, please consider contributing to
|
|
||||||
its availability: image:/donations/btn_donate_LG.gif[link="/donations/index.html"]
|
|
||||||
|
|
||||||
Actually I'm tired of nobody ever using the donate button among thousands
|
|
||||||
of downloads, so recoll for windows is gone for now.
|
|
||||||
|
|
||||||
== Note for updating
|
|
||||||
|
|
||||||
Recoll versions 1.23.0-9c5e32-20161216 and 1.23.0-2bfd80-20161115 had been
|
|
||||||
switched to using Xapian 1.4 which has a new and different index
|
|
||||||
format. Due to issues in Xapian 1.4, I have switched back to using Xapian
|
|
||||||
1.2 as of Recoll 1.23.0-2017-01-07-78b8ad.
|
|
||||||
|
|
||||||
This simply means that, if your index was created by one of the above
|
|
||||||
versions, it will have to be recreated from scratch after installing the
|
|
||||||
current Recoll version. I advise explicitly deleting
|
|
||||||
$HOME/AppData/Local/Recoll/xapiandb, as this will avoid leaving around 1.4
|
|
||||||
files which would take space for nothing otherwise.
|
|
||||||
|
|
||||||
== Installation
|
|
||||||
|
|
||||||
- Download and install Python 2.7.10 or 2.7.11 (e.g.
|
|
||||||
https://www.python.org/ftp/python/2.7.11/python-2.7.11.msi[Python
|
|
||||||
2.7.11]). Recoll currently does not work with Python3. *_On the
|
|
||||||
`Customize installation` screen, select "Add python.exe to Path"_*
|
|
||||||
|
|
||||||
- Optional: download and install the 7-zip program from
|
|
||||||
http://www.7-zip.org/. This is only useful if you need to index files
|
|
||||||
compressed with Unix methods (not needed for zip files).
|
|
||||||
|
|
||||||
- Download the
|
|
||||||
http://www.recoll.org/downloads/{windir}/recoll-setup-{recollversion}.exe[Recoll
|
|
||||||
setup file]. - Not possible right now -
|
|
||||||
|
|
||||||
- Execute the setup file. This is a vanilla installer generated by Inno
|
|
||||||
Setup, and it will ask the usual questions.
|
|
||||||
|
|
||||||
//NOTE: The installer needs administrator rights in order to install to
|
|
||||||
//`C:\Program Files`. If you want to install on a machine where you have no
|
|
||||||
//administrator rights, you can use the
|
|
||||||
//http://www.recoll.org/downloads/{windir}/recoll-{recollversion}.7z[installation
|
|
||||||
//directory archive] instead and extract it anywhere, this works just the
|
|
||||||
//same (you will need the free http://www.7-zip.org/[7z] to extract it). If
|
|
||||||
//you are in this case, you can ignore the setup-related steps of the
|
|
||||||
//procedure of course.
|
|
||||||
|
|
||||||
== Configuration
|
|
||||||
|
|
||||||
- Start recoll. It will ask if you want to customize the configuration.
|
|
||||||
The default is to index the content of your user directory. Then start
|
|
||||||
indexing. This can take some time.
|
|
||||||
- The default result list font is particularly ugly. Change it from
|
|
||||||
`Preferences->GUI Configuration->Result List->Result List Font`
|
|
||||||
|
|
||||||
- Have a look at the
|
|
||||||
https://www.lesbonscomptes.com/recoll/usermanual/webhelp/docs/index.html[Recoll
|
|
||||||
manual] !
|
|
||||||
- I have also started a small link:recoll-windows-faq.html[Recoll on
|
|
||||||
MS-Windows FAQ].
|
|
||||||
|
|
||||||
== Support
|
|
||||||
|
|
||||||
Please use the
|
|
||||||
https://opensourceprojects.eu/p/recoll1/tickets/new/[Recoll issues tracker]
|
|
||||||
for reporting problems, or contact me by email: jfd at recoll.org.
|
|
||||||
|
|
||||||
|
|
||||||
== Known problems:
|
|
||||||
|
|
||||||
- Having a drive root (e.g.: c:/) in the topdirs (things to index) list
|
|
||||||
does not work (it indexes nothing). You need to list the sub-directories
|
|
||||||
to index. This will be fixed in a future release.
|
|
||||||
|
|
||||||
- Setting the log level to 4 or higher can cause the GUI to deadlock while
|
|
||||||
displaying results. This will be fixed in a future release.
|
|
||||||
|
|
||||||
- Indexing is very slow, especially when using external commands (e.g. for
|
|
||||||
PDF files). I don't know if this is a case of my doing something stupid,
|
|
||||||
or if the general architecture is really badly suited to Windows. If
|
|
||||||
someone with good Windows programming knowledge reads this, I'd be very
|
|
||||||
interested in a discussion. Windows indexing can be ten times slower than
|
|
||||||
the Linux version. The index formats are compatible, so, if you have
|
|
||||||
shared Linux/Windows data, it's best to process it on Linux.
|
|
||||||
|
|
||||||
- Filtering by directory location ('dir:' clauses) used to be
|
|
||||||
case-sensitive, including drive letters. This is hopefully fixed by the
|
|
||||||
November 2016 version.
|
|
||||||
|
|
||||||
- Also, when filtering the search with a `dir:` clause, an absolute path
|
|
||||||
should be specified as `/c/mydir` instead of `c:/mydir`
|
|
||||||
|
|
||||||
- There is no real-time or scheduled indexing as on Linux. For now, you
|
|
||||||
create and update the index by using the `File` menu (or executing
|
|
||||||
`recollindex.exe` from a command window).
|
|
||||||
|
|
||||||
== Change Log
|
|
||||||
|
|
||||||
Changes in 20161115
|
|
||||||
|
|
||||||
- File path names case sensitivity and other small path issues should be fixed.
|
|
||||||
- Based on Xapian 1.4. New stemming languages are available (e.g. Arabic).
|
|
||||||
- Fixed date display encoding issues.
|
|
||||||
|
|
||||||
Changes in 20160414
|
|
||||||
|
|
||||||
- The setup script has changed back to needing administrator rights,
|
|
||||||
because this is what is convenient for most people. Use the installation
|
|
||||||
directory archive to install in a non-standard location without admin
|
|
||||||
rights.
|
|
||||||
- Fixed a bug which had the whole indexing stop if a script would time out
|
|
||||||
on a specific file (it will very rarely happen that a pathologically bad
|
|
||||||
file can throw an input handler in a loop).
|
|
||||||
|
|
||||||
|
|
||||||
Changes in 20160317
|
|
||||||
|
|
||||||
- Small change to the setup script so that administrative rights are not
|
|
||||||
required.
|
|
||||||
|
|
||||||
Changes/fixes in 20160129
|
|
||||||
|
|
||||||
- Changed the method used for checking that index data is up to date with
|
|
||||||
documents. This will impose a re-indexing of all data, but it was
|
|
||||||
necessary because the previous method was incorrect.
|
|
||||||
- Fixed crash which occurred after changing some configuration parameters.
|
|
||||||
- Warn when editing a temporary copy of a document (e.g. a temp file
|
|
||||||
extracted from a zip archive).
|
|
||||||
|
|
||||||
Changes in 20151202
|
|
||||||
|
|
||||||
- Fixed mbox parsing. This was getting the message separators completely
|
|
||||||
wrong, and taking a lot of time to do it. This should be especially
|
|
||||||
welcome by Thunderbird users.
|
|
||||||
|
|
||||||
- Fixed email attachment processing. A fault in the code which saved
|
|
||||||
attachment data to disk for further processing resulted in a practical
|
|
||||||
fuzzing experiment on the input processors. Especially, frequent crashes
|
|
||||||
in the image tag extractor caused very annoying Windows popups about
|
|
||||||
a Python error.
|
|
||||||
|
|
||||||
Fixed in 20151115 and later
|
|
||||||
|
|
||||||
- A relatively rare crash which seemed to occur mostly on some email
|
|
||||||
messages
|
|
||||||
- Forgotten MIME settings for .cs, .js and .css
|
|
||||||
|
|
||||||
Fixed in 20151112 and later
|
|
||||||
|
|
||||||
- Forgotten dll prevented the unrtf program from working, so no rtf indexing.
|
|
||||||
|
|
||||||
Fixed in 20151109 (hopefully?)
|
|
||||||
|
|
||||||
- The GUI sometimes crashes when you click `Preview` or `Open`. This does
|
|
||||||
not occur often, and usually for one of the first tries after starting
|
|
||||||
the program. Don't despair. This seems to be fixed in the latest version
|
|
||||||
(20151109), but I am not 100% certain that it is gone.
|
|
||||||
|
|
||||||
++++
|
|
||||||
<h2 id="comments">Comments</h2>
|
|
||||||
|
|
||||||
<div id="disqus_thread"></div>
|
|
||||||
<script type="text/javascript">
|
|
||||||
var disqus_shortname = 'lesbonscomptes';
|
|
||||||
(function() {
|
|
||||||
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
|
|
||||||
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
|
|
||||||
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
|
|
||||||
})();
|
|
||||||
</script>
|
|
||||||
<noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
|
|
||||||
<a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
|
|
||||||
++++
|
|
||||||
@ -1,416 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>RECOLL indexing performance and index sizes</title>
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content=
|
|
||||||
"text/html; charset=iso-8859-1">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="rightlinks">
|
|
||||||
<ul>
|
|
||||||
<li><a href="index.html">Home</a></li>
|
|
||||||
<li><a href="pics/index.html">Screenshots</a></li>
|
|
||||||
<li><a href="download.html">Downloads</a></li>
|
|
||||||
<li><a href="doc.html">Documentation</a></li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1>Recoll: Indexing performance and index sizes</h1>
|
|
||||||
|
|
||||||
<p>The time needed to index a given set of documents, and the
|
|
||||||
resulting index size depend on many factors.</p>
|
|
||||||
|
|
||||||
<p>The index size depends almost only on the size of the
|
|
||||||
uncompressed input text, and you can expect it to be roughly
|
|
||||||
of the same order of magnitude. Depending on the type of file,
|
|
||||||
the proportion of text to file size varies very widely, going
|
|
||||||
from close to 1 for pure text files to a very small factor
|
|
||||||
for, e.g., metadata tags in mp3 files.</p>
|
|
||||||
|
|
||||||
<p>Estimating indexing time is a much more complicated issue,
|
|
||||||
depending on the type and size of input and on system
|
|
||||||
performance. There is no general way to determine what part of
|
|
||||||
the hardware should be optimized. Depending on the type of
|
|
||||||
input, performance may be bound by I/O read or write
|
|
||||||
performance, CPU single-processing speed, or combined
|
|
||||||
multi-processing speed.</p>
|
|
||||||
|
|
||||||
<p>It should be noted that Recoll performance will not be an
|
|
||||||
issue for most people. The indexer can process 1000 typical
|
|
||||||
PDF files per minute, or 500 Wikipedia HTML pages per second
|
|
||||||
on medium-range hardware, meaning that the initial indexing of
|
|
||||||
a typical dataset will need a few dozen minutes at
|
|
||||||
most. Further incremental index updates will be much faster
|
|
||||||
because most files will not need to be processed again.</p>
|
|
||||||
|
|
||||||
<p>However, there are Recoll installations with
|
|
||||||
terabyte-sized datasets, on which indexing can take days. For
|
|
||||||
such operations (or even much smaller ones), it is very
|
|
||||||
important to know what kind of performance can be expected,
|
|
||||||
and what aspects of the hardware should be optimized.</p>
|
|
||||||
|
|
||||||
<p>In order to provide some reference points, I have run a
|
|
||||||
number of benchmarks on medium-sized datasets, using typical
|
|
||||||
mid-range desktop hardware, and varying the indexing
|
|
||||||
configuration parameters to show how they affect the results.</p>
|
|
||||||
|
|
||||||
<p>The following may help you check that you are getting typical
|
|
||||||
performance for your indexing, and give some indications about
|
|
||||||
what to adjust to improve it.</p>
|
|
||||||
|
|
||||||
<p>From time to time, I receive a report about a system becoming
|
|
||||||
unusable during indexing. As far as I know, with the default
|
|
||||||
Recoll configuration, and barring an exceptional issue (bug),
|
|
||||||
this is always due to a system problem (typically bad hardware
|
|
||||||
such as a disk doing retries). The tests below were mostly run
|
|
||||||
while I was using the desktop, which never became
|
|
||||||
unusable. However, some tests rendered it less responsive and
|
|
||||||
this is noted with the results.</p>
|
|
||||||
|
|
||||||
<p>The following text refers to the indexing parameters without
|
|
||||||
further explanation. Here follow links to more explanation about the
|
|
||||||
<a href="http://www.lesbonscomptes.com/recoll/idxthreads/threadingRecoll.html#recoll.idxthreads.multistage">processing
|
|
||||||
model</a> and
|
|
||||||
<a href="https://www.lesbonscomptes.com/recoll/usermanual/webhelp/docs/RCL.INSTALL.CONFIG.RECOLLCONF.PERFS.html">configuration
|
|
||||||
parameters</a>.</p>
|
|
||||||
|
|
||||||
|
|
||||||
<p>All tests were run without generating the stemming database or
|
|
||||||
aspell dictionary. These phases are relatively short and there
|
|
||||||
is nothing which can be optimized about them.</p>
|
|
||||||
|
|
||||||
<h2>Hardware</h2>
|
|
||||||
|
|
||||||
<p>The tests were run on what could be considered a mid-range
|
|
||||||
desktop PC:
|
|
||||||
<ul>
|
|
||||||
<li>Intel Core I7-4770T CPU: 2.5 Ghz, 4 physical cores, and
|
|
||||||
hyper-threading for a total of 8 hardware threads</li>
|
|
||||||
<li>8 GBytes of RAM</li>
|
|
||||||
<li>Asus H87I-Plus motherboard, Samsung 850 EVO SSD storage</li>
|
|
||||||
</ul>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<p>This is usually a fanless PC, but I did run a fan on the
|
|
||||||
external case fins during some of the tests (esp. PDF
|
|
||||||
indexing), because the CPU was running a bit too hot.</p>
|
|
||||||
|
|
||||||
|
|
||||||
<h2>Indexing PDF files</h2>
|
|
||||||
|
|
||||||
|
|
||||||
<p>The tests were run on 18000 random PDFs harvested on
|
|
||||||
Google, with a total size of around 30 GB, using Recoll 1.22.3
|
|
||||||
and Xapian 1.2.22. The resulting index size was 1.2 GB.</p>
|
|
||||||
|
|
||||||
<h3>PDF: storage</h3>
|
|
||||||
|
|
||||||
<p>Typical PDF files have a low text to file size ratio, and a
|
|
||||||
lot of data needs to be read for indexing. With the test
|
|
||||||
configuration, the indexer needs to read around 45 MBytes / S
|
|
||||||
from multiple files. This means that input storage makes a
|
|
||||||
difference and that you need an SSD or a fast array for
|
|
||||||
optimal performance.</p>
|
|
||||||
|
|
||||||
<table border=1>
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>Storage</th>
|
|
||||||
<th>idxflushmb</th>
|
|
||||||
<th>thrTCounts</th>
|
|
||||||
<th>Real Time</th>
|
|
||||||
</tr>
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td>NFS drive (gigabit)</td>
|
|
||||||
<td>200</td>
|
|
||||||
<td>6/4/1</td>
|
|
||||||
<td>24m40</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>local SSD</td>
|
|
||||||
<td>200</td>
|
|
||||||
<td>6/4/1</td>
|
|
||||||
<td>11m40</td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
|
|
||||||
<h3>PDF: threading</h3>
|
|
||||||
|
|
||||||
<p>Because PDF files are bulky and complicated to process, the
|
|
||||||
dominant step for indexing them is input processing. PDF text
|
|
||||||
extraction is performed by multiple instances of
|
|
||||||
the <i>pdftotext</i> program, and parallelisation works very
|
|
||||||
well.</p>
|
|
||||||
|
|
||||||
<p>The following table shows the indexing times with a variety
|
|
||||||
of threading parameters.</p>
|
|
||||||
|
|
||||||
<table border=1>
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>idxflushmb</th>
|
|
||||||
<th>thrQSizes</th>
|
|
||||||
<th>thrTCounts</th>
|
|
||||||
<th>Time R/U/S</th>
|
|
||||||
</tr>
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td>200</td>
|
|
||||||
<td>2/2/2</td>
|
|
||||||
<td>2/1/1</td>
|
|
||||||
<td>19m21</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>200</td>
|
|
||||||
<td>2/2/2</td>
|
|
||||||
<td>10/10/1</td>
|
|
||||||
<td>10m38</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>200</td>
|
|
||||||
<td>2/2/2</td>
|
|
||||||
<td>100/10/1</td>
|
|
||||||
<td>11m</td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<p>10/10/1 was the best value for thrTCounts for this test. The
|
|
||||||
total CPU time was around 78 mn.</p>
|
|
||||||
|
|
||||||
<p>The last line shows the effect of a ridiculously high thread
|
|
||||||
count value for the input step, which is not much. Using
|
|
||||||
slightly lower values than the optimum has not much impact
|
|
||||||
either. The only thing which really degrades performance is
|
|
||||||
configuring fewer threads than available from the hardware.</p>
|
|
||||||
|
|
||||||
<p>With the optimal parameters above, the peak recollindex
|
|
||||||
resident memory size is around 930 MB, to which we should add
|
|
||||||
ten instances of pdftotext (10MB typical), and of the
|
|
||||||
rclpdf.py Python input handler (around 15 MB each). This means
|
|
||||||
that the total resident memory used by indexing is around 1200
|
|
||||||
MB, quite a modest value in 2016.</p>
|
|
||||||
|
|
||||||
|
|
||||||
<h3>PDF: Xapian flushes</h3>
|
|
||||||
|
|
||||||
<p>idxflushmb has practically no influence on the indexing time
|
|
||||||
(tested from 40 to 1000), which is not too surprising because
|
|
||||||
the Xapian index size is very small relative to the input
|
|
||||||
size, so that the cost of Xapian flushes to disk is not very
|
|
||||||
significant. The value of 200 used for the threading tests
|
|
||||||
could be lowered in practice, which would decrease memory
|
|
||||||
usage and not change the indexing time significantly.</p>
|
|
||||||
|
|
||||||
<h3>PDF: conclusion</h3>
|
|
||||||
|
|
||||||
<p>For indexing PDF files, you need many cores and a fast
|
|
||||||
input storage system. Neither single-thread performance nor
|
|
||||||
amount of memory will be critical aspects.</p>
|
|
||||||
|
|
||||||
<p>Running the PDF indexing tests had no influence on the system
|
|
||||||
"feel", I could work on it just as if it were quiescent.</p>
|
|
||||||
|
|
||||||
|
|
||||||
<h2>Indexing HTML files</h2>
|
|
||||||
|
|
||||||
<p>The tests were run on an (old) French Wikipedia dump: 2.9
|
|
||||||
million HTML files stored in 42000 directories, for an
|
|
||||||
approximate total size of 41 GB (average file size
|
|
||||||
14 KB).
|
|
||||||
|
|
||||||
<p>The files are stored on a local SSD. Just reading them with
|
|
||||||
find+cpio takes close to 8 mn.</p>
|
|
||||||
|
|
||||||
<p>The resulting index has a size of around 30 GB.</p>
|
|
||||||
|
|
||||||
<p>I was too lazy to extract a 3-million-entry tar file on a
|
|
||||||
spinning disk, so all tests were performed with the data
|
|
||||||
stored on a local SSD.</p>
|
|
||||||
|
|
||||||
<p>For this test, the indexing time is dominated by the Xapian
|
|
||||||
index updates. As these are single threaded, only the flush
|
|
||||||
interval has a real influence.</p>
|
|
||||||
|
|
||||||
<table border=1>
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>idxflushmb</th>
|
|
||||||
<th>thrQSizes</th>
|
|
||||||
<th>thrTCounts</th>
|
|
||||||
<th>Time R/U/S</th>
|
|
||||||
</tr>
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td>200</td>
|
|
||||||
<td>2/2/2</td>
|
|
||||||
<td>2/1/1</td>
|
|
||||||
<td>88m</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>200</td>
|
|
||||||
<td>2/2/2</td>
|
|
||||||
<td>6/4/1</td>
|
|
||||||
<td>91m</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>200</td>
|
|
||||||
<td>2/2/2</td>
|
|
||||||
<td>1/1/1</td>
|
|
||||||
<td>96m</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>100</td>
|
|
||||||
<td>2/2/2</td>
|
|
||||||
<td>1/2/1</td>
|
|
||||||
<td>120m</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>100</td>
|
|
||||||
<td>2/2/2</td>
|
|
||||||
<td>6/4/1</td>
|
|
||||||
<td>121m</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>40</td>
|
|
||||||
<td>2/2/2</td>
|
|
||||||
<td>1/2/1</td>
|
|
||||||
<td>173m</td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
|
|
||||||
<p>The indexing process becomes quite big (resident size around
|
|
||||||
4GB), and the combination of high I/O load and high memory
|
|
||||||
usage makes the system less responsive at times (but not
|
|
||||||
unusable). As this happens principally when switching
|
|
||||||
applications, my guess would be that some program pages
|
|
||||||
(e.g. from the window manager and X) get flushed out, and take
|
|
||||||
time being read in, during which time the display appears
|
|
||||||
frozen.</p>
|
|
||||||
|
|
||||||
<p>For this kind of data, single-threaded CPU performance and
|
|
||||||
storage write speed can make a difference. Multithreading does
|
|
||||||
not help.</p>
|
|
||||||
|
|
||||||
<h2>Adjusting hardware to improve indexing performance</h2>
|
|
||||||
|
|
||||||
<p>I think that the following multi-step approach has a good
|
|
||||||
chance to improve performance:
|
|
||||||
<ul>
|
|
||||||
<li>Check that multithreading is enabled (it is, by default
|
|
||||||
with recent Recoll versions).</li>
|
|
||||||
<li>Increase the flush threshold until the machine begins to
|
|
||||||
have memory issues. Maybe add memory.</li>
|
|
||||||
<li>Store the index on an SSD. If possible, also store the
|
|
||||||
data on an SSD. Actually, when using many threads, it is
|
|
||||||
probably almost more important to have the data on an
|
|
||||||
SSD.</li>
|
|
||||||
<li>If you have many files which will need temporary copies
|
|
||||||
(email attachments, archive members, compressed files): use
|
|
||||||
a memory temporary directory. Add memory.</li>
|
|
||||||
<li>More CPUs...</li>
|
|
||||||
</ul>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<p>At some point, the index updating and writing may become the
|
|
||||||
bottleneck (this depends on the data mix, very quickly with
|
|
||||||
HTML or text files). As far as I can think, the only possible
|
|
||||||
approach is then to partition the index. You can query the
|
|
||||||
multiple Xapian indices either by using the Recoll external
|
|
||||||
index capability, or by actually merging the results with
|
|
||||||
xapian-compact.</p>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<h5>Old benchmarks</h5>
|
|
||||||
|
|
||||||
<p>To provide a point of comparison for the evolution of
|
|
||||||
hardware and software...</p>
|
|
||||||
|
|
||||||
<p>The following very old data was obtained (around 2007?) on a
|
|
||||||
machine with a 1800 MHz AMD Duron CPU, 768 MB of RAM, and a
|
|
||||||
7200 RPM 160 GBytes IDE disk, running Suse 10.1.</p>
|
|
||||||
|
|
||||||
<p><b>recollindex</b> (version 1.8.2 with xapian 1.0.0) is
|
|
||||||
executed with the default flush threshold value.
|
|
||||||
The process memory usage is the one given by <b>ps</b></p>
|
|
||||||
|
|
||||||
<table border=1>
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>Data</th>
|
|
||||||
<th>Data size</th>
|
|
||||||
<th>Indexing time</th>
|
|
||||||
<th>Index size</th>
|
|
||||||
<th>Peak process memory usage</th>
|
|
||||||
</tr>
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td>Random pdfs harvested on Google</td>
|
|
||||||
<td>1.7 GB, 3564 files</td>
|
|
||||||
<td>27 mn</td>
|
|
||||||
<td>230 MB</td>
|
|
||||||
<td>225 MB</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>Ietf mailing list archive</td>
|
|
||||||
<td>211 MB, 44,000 messages</td>
|
|
||||||
<td>8 mn</td>
|
|
||||||
<td>350 MB</td>
|
|
||||||
<td>90 MB</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td>Partial Wikipedia dump</td>
|
|
||||||
<td>15 GB, one million files</td>
|
|
||||||
<td>6H30</td>
|
|
||||||
<td>10 GB</td>
|
|
||||||
<td>324 MB</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<!-- DB: ndocs 3564 lastdocid 3564 avglength 6460.71 -->
|
|
||||||
<td>Random pdfs harvested on Google<br>
|
|
||||||
Recoll 1.9, <em>idxflushmb</em> set to 10</td>
|
|
||||||
<td>1.7 GB, 3564 files</td>
|
|
||||||
<td>25 mn</td>
|
|
||||||
<td>262 MB</td>
|
|
||||||
<td>65 MB</td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<p>Notice how the index size for the mail archive is bigger than
|
|
||||||
the data size. Myriads of small pure text documents will do
|
|
||||||
this. The factor of expansion would be even much worse with
|
|
||||||
compressed folders of course (the test was on uncompressed
|
|
||||||
data).</p>
|
|
||||||
|
|
||||||
<p>The last test was performed with Recoll 1.9.0 which has an
|
|
||||||
adjustable flush threshold (<em>idxflushmb</em> parameter), here
|
|
||||||
set to 10 MB. Notice the much lower peak memory usage, with no
|
|
||||||
performance degradation. The resulting index is bigger though,
|
|
||||||
the exact reason is not known to me, possibly because of
|
|
||||||
additional fragmentation.</p>
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
||||||
@ -1,2 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
onlylist=1 photalb . .
|
|
||||||
@ -1,44 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Recoll screenshots</title>
|
|
||||||
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux
|
|
||||||
based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search, desktop search, unix, linux">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
|
|
||||||
<link type="text/css" rel="stylesheet" href="../styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1>Recoll Screenshots</h1>
|
|
||||||
<li><a href="../index.html">Back to Recoll home</a></li>
|
|
||||||
|
|
||||||
<table>
|
|
||||||
<tr>
|
|
||||||
<td align="center"><a href="recoll0.html"><img src="recoll0-thumb.png"></a></td>
|
|
||||||
<td align="center"><a href="result-table.html"><img src="result-table-thumb.png"></a></td>
|
|
||||||
<td align="center"><a href="recoll1.html"><img src="recoll1-thumb.png"></a></td>
|
|
||||||
<td align="center"><a href="recoll2.html"><img src="recoll2-thumb.png"></a></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td align="center"><a href="recoll3.html"><img src="recoll3-thumb.png"></a></td>
|
|
||||||
<td align="center"><a href="recoll4.html"><img src="recoll4-thumb.png"></a></td>
|
|
||||||
<td align="center"><a href="recoll5.html"><img src="recoll5-thumb.png"></a></td>
|
|
||||||
<td align="center"><a href="recoll_chinese.html"><img src="recoll_chinese-thumb.png"></a></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td align="center"><a href="recoll-HTML_search_results.html"><img src="recoll-HTML_search_results-thumb.png"></a></td>
|
|
||||||
</tr></table>
|
|
||||||
</body></html>
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Recoll screenshots</title>
|
|
||||||
|
|
||||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
|
||||||
<meta name="Author" content="Jean-Francois Dockes">
|
|
||||||
<meta name="Description" content=
|
|
||||||
"recoll is a simple full-text search system for unix and linux
|
|
||||||
based on the powerful and mature xapian engine">
|
|
||||||
<meta name="Keywords" content=
|
|
||||||
"full text search, desktop search, unix, linux">
|
|
||||||
<meta http-equiv="Content-language" content="en">
|
|
||||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
|
||||||
<meta name="robots" content="All,Index,Follow">
|
|
||||||
|
|
||||||
<link type="text/css" rel="stylesheet" href="../styles/style.css">
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="content">
|
|
||||||
|
|
||||||
<h1>Recoll Screenshots</h1>
|
|
||||||
<li><a href="../index.html">Back to Recoll home</a></li>
|
|
||||||
|
|
||||||
|
Before Width: | Height: | Size: 5.6 KiB |
@ -1,13 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Photo</title>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p><a href="recoll-HTML_search_results.html">Prev</a> <a href="../index.html">Up</a>
|
|
||||||
<a href="smile.html">Next</a>
|
|
||||||
<a href="mario.png">Image</a></p>
|
|
||||||
<p></p>
|
|
||||||
<p><img height="90%" src="mario.png"></p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
Before Width: | Height: | Size: 1.8 KiB |
@ -1,9 +0,0 @@
|
|||||||
recoll0.png
|
|
||||||
result-table.png
|
|
||||||
recoll1.png
|
|
||||||
recoll2.png
|
|
||||||
recoll3.png
|
|
||||||
recoll4.png
|
|
||||||
recoll5.png
|
|
||||||
recoll_chinese.png
|
|
||||||
recoll-HTML_search_results.png
|
|
||||||
|
Before Width: | Height: | Size: 178 KiB |
@ -1,40 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Photo</title>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p><a href="recoll_chinese.html">Prev</a> <a href=".">Up</a>
|
|
||||||
<a href="recoll0.html">Next</a>
|
|
||||||
<a href="recoll-HTML_search_results.png">Image</a></p>
|
|
||||||
<p>A customized result list, thanks to Michael Croes. The html code follows,
|
|
||||||
it should be pasted into the
|
|
||||||
<i>Preferences->Query Configuration->Result paragraph format string</i> entry.
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<table border="1" bgcolor="lightyellow">
|
|
||||||
<tr>
|
|
||||||
<td rowspan="4" width="40px" align="center"
|
|
||||||
valign="center">
|
|
||||||
<img src="%I" width="32" height="32">
|
|
||||||
<p><b>%R</b></p>
|
|
||||||
<p><a href="P%N">Aperçu</a></p>
|
|
||||||
</td>
|
|
||||||
<th colspan="3" bgcolor="lightgrey">%T</th>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td align="center">%M</td>
|
|
||||||
<td align="center">%D</td>
|
|
||||||
<td align="center">%S</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td colspan="3"><a href="E%N">%U</a></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td colspan="3">%A</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
</pre></p>
|
|
||||||
<p><img height="90%" src="recoll-HTML_search_results.png"></p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
Before Width: | Height: | Size: 62 KiB |
@ -1,28 +0,0 @@
|
|||||||
A customized result list, thanks to Michael Croes. The html code follows,
|
|
||||||
it should be pasted into the
|
|
||||||
<i>Preferences->Query Configuration->Result paragraph format string</i> entry.
|
|
||||||
|
|
||||||
<pre>
|
|
||||||
<table border="1" bgcolor="lightyellow">
|
|
||||||
<tr>
|
|
||||||
<td rowspan="4" width="40px" align="center"
|
|
||||||
valign="center">
|
|
||||||
<img src="%I" width="32" height="32">
|
|
||||||
<p><b>%R</b></p>
|
|
||||||
<p><a href="P%N">Aperçu</a></p>
|
|
||||||
</td>
|
|
||||||
<th colspan="3" bgcolor="lightgrey">%T</th>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td align="center">%M</td>
|
|
||||||
<td align="center">%D</td>
|
|
||||||
<td align="center">%S</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td colspan="3"><a href="E%N">%U</a></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td colspan="3">%A</td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
</pre>
|
|
||||||
|
Before Width: | Height: | Size: 124 KiB |
@ -1,13 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Photo</title>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p><a href=".">Prev</a> <a href=".">Up</a>
|
|
||||||
<a href="result-table.html">Next</a>
|
|
||||||
<a href="recoll0.png">Image</a></p>
|
|
||||||
<p>Search results.</p>
|
|
||||||
<p><img height="90%" src="recoll0.png"></p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
Before Width: | Height: | Size: 128 KiB |
@ -1,2 +0,0 @@
|
|||||||
Search results.
|
|
||||||
|
|
||||||
|
Before Width: | Height: | Size: 154 KiB |
@ -1,13 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Photo</title>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p><a href="result-table.html">Prev</a> <a href=".">Up</a>
|
|
||||||
<a href="recoll2.html">Next</a>
|
|
||||||
<a href="recoll1.png">Image</a></p>
|
|
||||||
<p>A result list with a preview window open.</p>
|
|
||||||
<p><img height="90%" src="recoll1.png"></p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
Before Width: | Height: | Size: 181 KiB |
@ -1,4 +0,0 @@
|
|||||||
A result list with a preview window open.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Before Width: | Height: | Size: 25 KiB |
@ -1,13 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Photo</title>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p><a href="recoll1.html">Prev</a> <a href=".">Up</a>
|
|
||||||
<a href="recoll3.html">Next</a>
|
|
||||||
<a href="recoll2.png">Image</a></p>
|
|
||||||
<p>The two tabs in the advanced search dialog.</p>
|
|
||||||
<p><img height="90%" src="recoll2.png"></p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
Before Width: | Height: | Size: 46 KiB |
@ -1 +0,0 @@
|
|||||||
The two tabs in the advanced search dialog.
|
|
||||||
|
Before Width: | Height: | Size: 55 KiB |
@ -1,14 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Photo</title>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p><a href="recoll2.html">Prev</a> <a href=".">Up</a>
|
|
||||||
<a href="recoll4.html">Next</a>
|
|
||||||
<a href="recoll3.png">Image</a></p>
|
|
||||||
<p>A result list from which the native application (firefox)
|
|
||||||
was started by clicking the Edit link.</p>
|
|
||||||
<p><img height="90%" src="recoll3.png"></p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
Before Width: | Height: | Size: 166 KiB |
@ -1,2 +0,0 @@
|
|||||||
A result list from which the native application (firefox)
|
|
||||||
was started by clicking the Edit link.
|
|
||||||
|
Before Width: | Height: | Size: 65 KiB |
@ -1,14 +0,0 @@
|
|||||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<title>Photo</title>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<p><a href="recoll3.html">Prev</a> <a href=".">Up</a>
|
|
||||||
<a href="recoll5.html">Next</a>
|
|
||||||
<a href="recoll4.png">Image</a></p>
|
|
||||||
<p>The document history window looks a little like a result list
|
|
||||||
I'm afraid...</p>
|
|
||||||
<p><img height="90%" src="recoll4.png"></p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
Before Width: | Height: | Size: 54 KiB |