moved website
@ -45,8 +45,8 @@ debdir=debian
|
||||
# Note: no new releases for lucid: no webkit. Or use old debianrclqt4 dir.
|
||||
# No new releases for trusty either because of risk of kio compat (kio
|
||||
# won't build)
|
||||
series="xenial yakkety zesty"
|
||||
series=
|
||||
series="xenial yakkety zesty artful"
|
||||
series=artful
|
||||
|
||||
if test "X$series" != X ; then
|
||||
check_recoll_orig
|
||||
@ -77,8 +77,8 @@ done
|
||||
|
||||
### KIO. Does not build on trusty from recoll 1.23 because of the need
|
||||
### for c++11
|
||||
series="xenial yakkety zesty"
|
||||
#series=
|
||||
series="xenial yakkety zesty artful"
|
||||
series=
|
||||
|
||||
debdir=debiankio
|
||||
topdir=kio-recoll-${RCLVERS}
|
||||
@ -146,8 +146,8 @@ for series in $series ; do
|
||||
done
|
||||
|
||||
### Unity Scope
|
||||
series="trusty xenial yakkety"
|
||||
series=
|
||||
series="trusty xenial yakkety zesty artful"
|
||||
#series=
|
||||
|
||||
debdir=debianunityscope
|
||||
if test ! -d ${debdir}/ ; then
|
||||
|
||||
@ -20,8 +20,8 @@ alink="#0000FF">
|
||||
<div class="titlepage">
|
||||
<div>
|
||||
<div>
|
||||
<h1 class="title"><a name="idp56557776" id=
|
||||
"idp56557776"></a>Recoll user manual</h1>
|
||||
<h1 class="title"><a name="idm44986984150384" id=
|
||||
"idm44986984150384"></a>Recoll user manual</h1>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
@ -109,13 +109,14 @@ alink="#0000FF">
|
||||
multiple indexes</a></span></dt>
|
||||
|
||||
<dt><span class="sect2">2.1.3. <a href=
|
||||
"#idp62130176">Document types</a></span></dt>
|
||||
"#idm44986952097312">Document types</a></span></dt>
|
||||
|
||||
<dt><span class="sect2">2.1.4. <a href=
|
||||
"#idp62154272">Indexing failures</a></span></dt>
|
||||
"#idm44986952072736">Indexing
|
||||
failures</a></span></dt>
|
||||
|
||||
<dt><span class="sect2">2.1.5. <a href=
|
||||
"#idp62161280">Recovery</a></span></dt>
|
||||
"#idm44986952065728">Recovery</a></span></dt>
|
||||
</dl>
|
||||
</dd>
|
||||
|
||||
@ -1017,8 +1018,9 @@ alink="#0000FF">
|
||||
<div class="titlepage">
|
||||
<div>
|
||||
<div>
|
||||
<h3 class="title"><a name="idp62130176" id=
|
||||
"idp62130176"></a>2.1.3. Document types</h3>
|
||||
<h3 class="title"><a name="idm44986952097312" id=
|
||||
"idm44986952097312"></a>2.1.3. Document
|
||||
types</h3>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@ -1131,8 +1133,8 @@ indexedmimetypes = application/pdf
|
||||
<div class="titlepage">
|
||||
<div>
|
||||
<div>
|
||||
<h3 class="title"><a name="idp62154272" id=
|
||||
"idp62154272"></a>2.1.4. Indexing
|
||||
<h3 class="title"><a name="idm44986952072736" id=
|
||||
"idm44986952072736"></a>2.1.4. Indexing
|
||||
failures</h3>
|
||||
</div>
|
||||
</div>
|
||||
@ -1172,8 +1174,8 @@ indexedmimetypes = application/pdf
|
||||
<div class="titlepage">
|
||||
<div>
|
||||
<div>
|
||||
<h3 class="title"><a name="idp62161280" id=
|
||||
"idp62161280"></a>2.1.5. Recovery</h3>
|
||||
<h3 class="title"><a name="idm44986952065728" id=
|
||||
"idm44986952065728"></a>2.1.5. Recovery</h3>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@ -1778,7 +1780,7 @@ thrQSizes = -1 -1 -1
|
||||
|
||||
<p>A current pointer to the extension can be found, along
|
||||
with up-to-date instructions, on the <a class="ulink" href=
|
||||
"http://bitbucket.org/medoc/recoll/wiki/IndexWebHistory"
|
||||
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/IndexWebHistory"
|
||||
target="_top">Recoll wiki</a>.</p>
|
||||
|
||||
<p>A copy of the indexed WEB pages is retained by Recoll in
|
||||
@ -3057,7 +3059,7 @@ MimeType=*/*
|
||||
thumbnails.</p>
|
||||
|
||||
<p>There are also <a class="ulink" href=
|
||||
"http://bitbucket.org/medoc/recoll/wiki/ResultsThumbnails.wiki"
|
||||
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/ResultsThumbnails.wiki"
|
||||
target="_top">some pointers about thumbnail
|
||||
generation</a> on the <span class=
|
||||
"application">Recoll</span> wiki.</p>
|
||||
@ -5898,7 +5900,7 @@ dir:recoll dir:src -dir:utils -dir:common
|
||||
<li class="listitem">
|
||||
<p>If you use a recent version of Ubuntu Linux, you
|
||||
may find the <a class="ulink" href=
|
||||
"http://bitbucket.org/medoc/recoll/wiki/UnityLens"
|
||||
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/UnityLens"
|
||||
target="_top">Ubuntu Unity Lens</a> module
|
||||
useful.</p>
|
||||
</li>
|
||||
@ -5932,7 +5934,7 @@ dir:recoll dir:src -dir:utils -dir:common
|
||||
"application">libwnck</span> window manager interface
|
||||
library, which will allow you to do just this. The
|
||||
detailed instructions are on <a class="ulink" href=
|
||||
"http://bitbucket.org/medoc/recoll/wiki/HotRecoll"
|
||||
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/HotRecoll"
|
||||
target="_top">this wiki page</a>.</p>
|
||||
</div>
|
||||
|
||||
@ -6642,9 +6644,9 @@ or
|
||||
comments inside the file.</p>
|
||||
|
||||
<p>You can also have a look at the <a class="ulink" href=
|
||||
"http://bitbucket.org/medoc/recoll/wiki/HandleCustomField"
|
||||
target="_top">example on the Wiki</a>, detailing how one
|
||||
could add a <span class="emphasis"><em>page
|
||||
"https://www.lesbonscomptes.com/recoll/faqsandhowtos/HandleCustomField"
|
||||
target="_top">example in the FAQs area</a>, detailing how
|
||||
one could add a <span class="emphasis"><em>page
|
||||
count</em></span> field to pdf documents for displaying
|
||||
inside result lists.</p>
|
||||
</div>
|
||||
@ -8978,7 +8980,7 @@ thesame = "some string with spaces"
|
||||
function similar to skippedNames, but works
|
||||
independently. Can be redefined for subdirectories.
|
||||
Supported by recoll 1.20 and newer. See
|
||||
https://bitbucket.org/medoc/recoll/wiki/Filtering%20out%20Zip%20archive%20members</p>
|
||||
https://www.lesbonscomptes.com/recoll/faqsandhowtos/Filtering%20out%20Zip%20archive%20members</p>
|
||||
</dd>
|
||||
|
||||
<dt><a name=
|
||||
|
||||
1091
website/BUGS.html
@ -1,945 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Recoll changes</title>
|
||||
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux
|
||||
based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search, desktop search, unix, linux">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html">Home</a></li>
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
<li><a href="doc.html">Documentation</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1>Recoll journal of user-visible changes </h1>
|
||||
|
||||
<p>Newer releases are described in their release notes document:</p>
|
||||
<p>
|
||||
<a href="release-1.20.html">1.20</a>
|
||||
<a href="release-1.19.html">1.19</a>
|
||||
<a href="release-1.18.html">1.18</a>
|
||||
<a href="release-1.17.html">1.17</a>
|
||||
<a href="release-1.16.html">1.16</a>
|
||||
<a href="release-1.15.html">1.15</a>
|
||||
<a href="release-1.14.4.html">1.14.4</a>
|
||||
</p>
|
||||
|
||||
<h2><a name="1.14.3">1.14.3</a></h2>
|
||||
<ul>
|
||||
<li>Get rid of permanent filter subprocess at the end of a GUI
|
||||
indexing pass.</li>
|
||||
<li>Add new filter for indexing GNU info files.</li>
|
||||
<li>Index the file name from a zip or chm internal path.</li>
|
||||
<li>Add hotrecoll.py script to help with one-key recoll
|
||||
activation/hiding. Move focus to search entry when unminimized.</li>
|
||||
<li>Handle bad mbox format from Thunderbird.</li>
|
||||
<li>Catch exception which was causing stderr messages while
|
||||
indexing encrypted zip files.</li>
|
||||
<li>Change result list "Edit" links to "Open" for consistency
|
||||
with menus.</li>
|
||||
<li>Change the type of character set conversion occurring
|
||||
when using "Copy file path" from the result list.
|
||||
Should work in more cases than the previous approach (but
|
||||
will still fail sometimes).</li>
|
||||
<li>Update lyx filter.</li>
|
||||
<li>Fix problems with white space in file name in several
|
||||
input filters.</li>
|
||||
<li>Support mutagen versions older than 1.17.</li>
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.14.2">1.14.2</a></h2>
|
||||
<p>Note: most of the changes are in release 1.14.0. Release 1.14.1 fixed 2
|
||||
bugs. Release 1.14.2 fixes the help browser which was broken
|
||||
by 1.14.1. Sigh ...</p>
|
||||
<ul>
|
||||
<li><a href="usermanual/usermanual.html#RCL.SEARCH.LANG">
|
||||
date selection in queries</a>.</li>
|
||||
<li>Pure negative queries (ie: <i>-someterm date:P10D/</i>).</li>
|
||||
<li>Autosuffs: option to automatically turn words into <tt>ext:</tt>
|
||||
clauses (ie: <i>xls</i> -> <i>ext:xls</i>) (GUI preferences
|
||||
panel).</li>
|
||||
<li>Allow extracting arbitrary mail headers and use them as
|
||||
index/search fields (configured in the <tt>fields</tt>
|
||||
file).</li>
|
||||
<li><tt>nonumbers</tt> configuration parameter: disable
|
||||
indexing of all numbers, useful for some data files with lots
|
||||
of numerical data.</li>
|
||||
<li>Shortcuts for the results page: <tt>PageUp/Down</tt> can
|
||||
be used even when the focus is in the search
|
||||
entry. <tt>Shift+Home</tt>: back to first page of results.
|
||||
<tt>Ctrl+Shift+s</tt>: return focus to the search
|
||||
entry. </li>
|
||||
<li>Add full screen mode for small devices.</li>
|
||||
<li>Added -i option to recollq to specify extra indexes.</li>
|
||||
<li>Removed use of id3lib for extracting mp3 tags. A Python filter
|
||||
based on mutagen now handles all audio formats
|
||||
(mp3/flac/ogg). <i>If you are currently indexing audio
|
||||
files, you need to install mutagen, Recoll will not use
|
||||
id3lib or the Flac/Ogg tools any more</i>. </li>
|
||||
<li>Filter for <b>fictionbook</b> (.fb2) documents.</li>
|
||||
<li>Cleaned up the Python samples and made recollq.py a usable
|
||||
clone of recollq.</li>
|
||||
<li>Errors when opening additional indexes for a query are now
|
||||
fatal. They could easily go unnoticed before.</li>
|
||||
<li>Proper LARGEFILE support.</li>
|
||||
<li>Use <b>xsltproc</b> instead of misc dirty tricks to
|
||||
extract text from most current XML-based documents (except
|
||||
those in which the XML is too broken).</li>
|
||||
<li>Implement <tt>configure --enable-pic</tt> and use it for
|
||||
the KIO slave and Python and PHP modules.</li>
|
||||
</ul>
|
||||
|
||||
<p>Bugs also fixed in the 1.13 branch:</p>
|
||||
<ul>
|
||||
<li>The <tt>filename</tt> (transcoded file name) field
|
||||
could not be stored, so it could not be displayed in the
|
||||
result list. Can now be displayed as %(filename).</li>
|
||||
<li>Html files would always be indexed even when filtered
|
||||
out by <tt>indexedmimetypes</tt></li>
|
||||
<li>Preview: toggling between main text and metadata
|
||||
display would confuse the text format.</li>
|
||||
<li>Restore <tt>indexallfilenames=0</tt> functionality.</li>
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.13.04">1.13.04</a></h2>
|
||||
<ul>
|
||||
<li>Provide a set of configuration defaults so that compilation has a
|
||||
chance to succeed on unknown systems.</li>
|
||||
<li>Install icon to the pixmaps directory.</li>
|
||||
<li>Fixes stemming, which was broken for all previous 1.13
|
||||
releases.</li>
|
||||
<li><a href="BUGS.html#b_1_13_02">Bugs fixed between 1.13.02
|
||||
and 1.13.04.</a></li>
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.13.02">1.13.02</a></h2>
|
||||
<ul><li>This version has a single fix to work around a problem in the
|
||||
Qt 4.6.1 uic utility. If you are not using Qt 4.6.1 and are
|
||||
currently running Recoll 1.13.01, you do not need to
|
||||
upgrade.</li></ul>
|
||||
|
||||
<h2><a name="1.13.01">1.13.01</a></h2>
|
||||
|
||||
<ul>
|
||||
<li>Recoll has a new class of persistent external filters
|
||||
with the capability to process several documents, or
|
||||
multi-document files, in the same instance. Benefits: much
|
||||
faster image tag indexing, and new file formats. Except for
|
||||
the Perl image tag filter (because of ExifTool), the new
|
||||
filters are written in Python.</li>
|
||||
|
||||
<li>New file formats: chm (microsoft help), zip archives, .ics
|
||||
calendar files. Individual pages in chm files are indexed and
|
||||
can be previewed. Zip is quite convenient for maildir
|
||||
archives (for example).</li>
|
||||
|
||||
<li>Recoll can now use the output of the Beagle Firefox plugin
|
||||
to index visited web pages and bookmarks. This is only usable
|
||||
if Beagle itself is not running, else Recoll and Beagle will be
|
||||
fighting for the same queue.</li>
|
||||
|
||||
<li>Big text files (like application logs) can now be paged for
|
||||
indexing, avoiding excess memory usage during indexing and
|
||||
improving the usability at query time. They can also be
|
||||
altogether skipped by setting a maximum size configuration
|
||||
parameter. These parameters have default values (1 MB and 20
|
||||
MB) which change Recoll behaviour compared to previous
|
||||
versions. You can set <i>textfilepagekbs</i>
|
||||
and <i>textfilemaxmbs</i> to -1 in the configuration to
|
||||
restore the old behaviour.</li>
|
||||
|
||||
<li>A cache was implemented for mbox message header offsets. This
|
||||
speeds up message previews for big mbox files.</li>
|
||||
|
||||
<li>Miscellaneous usability improvements:
|
||||
<ul>
|
||||
<li>Allow using page-up/down and shift-home to scroll the
|
||||
result list while the focus is in the search entry. </li>
|
||||
<li>Make 'Use desktop preferences' the default for new
|
||||
Recoll installations, and make this choice more
|
||||
prominent in the external viewer dialog.</li>
|
||||
<li>^P starts the print dialog on a preview window.</li>
|
||||
<li>If a search has no result, alternate spellings are
|
||||
suggested. This feature is still a bit raw and will be
|
||||
improved.</li>
|
||||
<li>If the text of a document is empty, preview will switch to
|
||||
displaying the document fields.</li>
|
||||
<li>New entry in the result list contextual menu for opening
|
||||
the parent document of a result list hit with its native
|
||||
application. Useful, for example, for pages inside chm files.</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li>Indentation is now preserved when displaying text documents
|
||||
inside the preview window. This is particularly welcome for
|
||||
program source files.</li>
|
||||
|
||||
<li>Allow substituting arbitrary fields in the result
|
||||
paragraph, using a %(fieldname) syntax</li>
|
||||
|
||||
<li>The real-time indexing monitor will now accumulate
|
||||
modifications during 30 S before indexing.</li>
|
||||
|
||||
<li>The indexer can now split camelCase words, allowing search on
|
||||
component terms. This is not enabled by default as it can
|
||||
confuse phrase searches (ie: "MySQL manual" is matched by
|
||||
phrase queries for "my sql manual" and "MySQL manual"
|
||||
but not "mysql manual"). Use "configure --enable-camelcase"
|
||||
to activate it.
|
||||
</li>
|
||||
|
||||
<li>The ipath is now printed by default after the url in the
|
||||
default result list format.</li>
|
||||
|
||||
<li><i>recoll_noindex</i> and <i>skippedNames</i> can now be
|
||||
changed at any point in the tree (only for topdirs previously).</li>
|
||||
|
||||
<li>Allow using location/application sensitivity in external viewer
|
||||
choice. This uses several new functions:
|
||||
<ul>
|
||||
<li>Allow the substitution of arbitrary document fields inside
|
||||
external viewer command line arguments.</li>
|
||||
<li>Allow field values to be set on all documents
|
||||
in a file system subtree. For example, you can
|
||||
set an application tag (ie: rclaptg = gnus) on all mailbox
|
||||
files under a specific directory.</li>
|
||||
<li>New syntax in mimeview for including the rclaptg field in
|
||||
viewer choice
|
||||
(<i>mimetype</i>|<i>tagvalue</i> = ...).</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li>Allow specifying a specific default character set for mail
|
||||
messages. This is mainly useful for readpst dumps. All
|
||||
reasonable non-ascii messages specify their character set.</li>
|
||||
|
||||
<li>Added a --without-gui configure option. Removes all X11 and
|
||||
Qt dependencies and only compiles the command-line interface.</li>
|
||||
|
||||
<li>Improved the kio_recoll build. There is no need to run
|
||||
configure manually in the main directory any more. Ubuntu
|
||||
packages for kio_recoll are now built on the
|
||||
<a href="http://launchpad.net/~recoll-backports/+archive/ppa">
|
||||
recoll-backports PPA on launchpad.net</a>.</li>
|
||||
|
||||
</ul>
|
||||
|
||||
|
||||
<h2><a name="1.12.4">1.12.4</a></h2>
|
||||
<p>Bugs fixed:</p>
|
||||
<ul>
|
||||
<li>Qt4 version only: the search inside the preview window
|
||||
could become unbearably slow for big documents (quadratically
|
||||
so), and could not be interrupted (Qt bug). The Qt3 version of
|
||||
the code was included in the preview tool to restore good
|
||||
performance. This bug is the main reason for this release.</li>
|
||||
</ul>
|
||||
<p>Build system improvements:</p>
|
||||
<ul>
|
||||
<li>Perform minimal base package configuration inside the kio
|
||||
cmake code to permit building it from scratch (without a build
|
||||
of the main code). Mainly useful for builds on the Ubuntu
|
||||
PPA.</li>
|
||||
<li>Implement a --without-gui option to build a pure
|
||||
command-line version with no Qt or X11 dependencies.</li>
|
||||
<li>Ensure that the user's PATH settings determine where we
|
||||
look first for qmake in all cases.</li>
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.12.3">1.12.3</a></h2>
|
||||
<p>This is a bug fix release.</p>
|
||||
<ul>
|
||||
<li>Fix the sort tool which had been broken since 1.11 with
|
||||
some (or all?) qt3 versions.</li>
|
||||
<li>Catch two Xapian exceptions which could crash the GUI when a query
|
||||
was run while the index was being updated.</li>
|
||||
<li>Ensure that the result list right-click pop up menu will appear even
|
||||
when the click is inside a table.</li>
|
||||
<li>Fix the way we retrieve the Xapian library version to avoid
|
||||
GUI compilation problems.</li>
|
||||
<li>Inside the real-time indexer: only use the main thread to test that
|
||||
the X11 server is still alive. Multithreaded calls to x11IsAlive()
|
||||
would sometimes crash the process because of an X11 error.</li>
|
||||
<li>Define filter timeout so that a looping filter (ie: rclps trying to
|
||||
index loop.ps) will not completely stop the indexing. Default value:
|
||||
20mn. Add loop.ps to skippedNames.</li>
|
||||
<li>Improve filter subprocesses management. Some could previously be
|
||||
left around after recollindex was killed. Improve cancellation
|
||||
request acknowledgment by recollindex (two ^C were sometimes
|
||||
necessary to make it terminate).</li>
|
||||
<li>Signals SIGUSR1 and SIGUSR2 are now blocked in addition to
|
||||
INTR/TERM/QUIT.</li>
|
||||
<li>Extended attributes indexing now works for all file types.</li>
|
||||
<li>Ensure that queries started from the command line are handled as
|
||||
normal ones (they previously could not be sorted).</li>
|
||||
<li>Improve man page indexing: do not index section header terms.</li>
|
||||
</ul>
|
||||
|
||||
|
||||
<h2><a name="1.12.1">1.12.1</a></h2>
|
||||
<p>This is a very minor release, mainly to fix compilation
|
||||
issues and a few very minor bugs. No need to upgrade if
|
||||
you don't experience these.</p>
|
||||
<ul>
|
||||
<li>Fixed compilation errors for new gcc and gnu libc.</li>
|
||||
<li>Use groff html output in rclman to get rid of control
|
||||
characters in output (improve manual pages indexing). Fix
|
||||
8bit character issues in file names in rcllyx.</li>
|
||||
<li>Fixed command line arguments processing problem with
|
||||
"recoll -q"</li>
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.12.0">1.12.0</a></h2>
|
||||
<ul>
|
||||
<li>Recoll now implements a KIO slave to allow searching
|
||||
directly from KDE applications. This does not affect the
|
||||
main application and is not enabled by default (go to the
|
||||
kde/kio/recoll source directory for build
|
||||
instructions). </li>
|
||||
<li>Recoll now computes md5 checksums for all indexed
|
||||
documents and optionally collapses duplicate entries inside
|
||||
the result list. This needs a full reindex to become
|
||||
effective for older documents already in the index. The
|
||||
option to activate collapsing is in the <i>Query
|
||||
Configuration</i>.</li>
|
||||
<li>Typing F1 anywhere in the GUI should bring up the
|
||||
appropriate section of the manual in the application
|
||||
configured for viewing HTML documents.</li>
|
||||
<li>The result list right click menu now has an entry to
|
||||
save the document to a file. This is only enabled for
|
||||
documents contained inside another file (ie, messages inside
|
||||
an mbox folder, or attachments), and is especially useful for
|
||||
extracting an attachment with no associated external
|
||||
editor.</li>
|
||||
<li>The preview window now has a right-click menu, with an
|
||||
entry to toggle between viewing the main text or all the
|
||||
metadata for the document. This is most useful in the case
|
||||
where the search match actually occurred in a field not
|
||||
visible in the main text (ie: author or HTML title).</li>
|
||||
<li>Words glued by an underscore character like
|
||||
<i>compound_word</i> are now split during indexing, and
|
||||
will be found when queried either as themselves or in a
|
||||
search for the components.</li>
|
||||
<li>There is now a size limit over which no attempt will be made to
|
||||
uncompress/identify/index compressed files. Not active by
|
||||
default, to be set in the <i>Indexing Configuration</i>.</li>
|
||||
<li>Added support for fetching field values from extended file
|
||||
attributes. This is not enabled by default, use
|
||||
<i>configure --enable-xattr</i>. You'll also need to
|
||||
set up a map from the attributes names to the Recoll field
|
||||
names (see comment at the end of the <i>fields</i>
|
||||
configuration file).</li>
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.11.4">1.11.4</a></h2>
|
||||
<ul>
|
||||
|
||||
<li>Bugs fixed:
|
||||
check the <a href="BUGS.html#b_1_11_1">list</a>.</li>
|
||||
|
||||
<li>The right-click menu "Copy" commands inside the result list
|
||||
now copy to the clipboard in addition to the main selection,
|
||||
enabling subsequent ^v commands.</li>
|
||||
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.11.0">1.11.0</a></h2>
|
||||
|
||||
<p><i>Recoll release 1.11 has relatively extensive changes that have
|
||||
necessitated a modification of the index format. Hence installing this
|
||||
release implies a full re-indexing, which is enforced by the
|
||||
software.</i></p>
|
||||
|
||||
<ul>
|
||||
<li>Filtering on category (message/text/media etc.) as a function of
|
||||
the main window for quick access.</li>
|
||||
|
||||
<li>Use html for preview when available (ex: html files or "colorized"
|
||||
python) instead of converting to text. This can be turned off in the
|
||||
preferences. </li>
|
||||
|
||||
<li>New Python query and index interfaces. The Python query
|
||||
interface will be used for building a Xesam adapter for
|
||||
Recoll when the specification is stabilized, and could be
|
||||
useful for other things, such as indexing contents from an
|
||||
RDBMS (see
|
||||
<a href="usermanual/usermanual.html#RCL.PROGRAM.PYTHONAPI">
|
||||
the manual</a> for details). Restructured and cleaned up
|
||||
internal Recoll interfaces.</li>
|
||||
|
||||
<li>Improved filter framework. Can now process either html or text output
|
||||
from the filters, and more easily execute "raw" commands instead of
|
||||
Recoll scripts. Avoided wasteful repeated execution of filters for
|
||||
which the helper application is missing.</li>
|
||||
|
||||
<li>Query language now closer to Xesam specification, (but
|
||||
still far from a
|
||||
complete implementation). See the Recoll manual and
|
||||
<a href="http://www.xesam.org/main/XesamUserSearchLanguage">
|
||||
http://www.xesam.org/main/XesamUserSearchLanguage</a> </li>
|
||||
|
||||
<li>Much improved configuration for fields. Fields like
|
||||
"author" can now be specified as storable (displayable in
|
||||
results) and/or indexed (searchable). Added alias facility
|
||||
for translating from user-level names to internal.</li>
|
||||
|
||||
<li>Added "recipient" as an indexed/searchable field for emails.</li>
|
||||
|
||||
<li>rcltext filter for processing text such as C code for which no specific
|
||||
processing is needed when indexing but a specific viewer is
|
||||
desired.</li>
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.10.6">1.10.6</a></h2>
|
||||
<ul>
|
||||
<li>Fix a simple and mildly nasty bug that would cause the
|
||||
indexer to stop
|
||||
indexing an mbox on encountering a specific but not exceptional error
|
||||
condition (like a few dozen errors while indexing attachments for which
|
||||
no filter was installed).</li>
|
||||
</ul>
|
||||
|
||||
|
||||
<h2><a name="1.10.5">1.10.5</a></h2>
|
||||
<ul>
|
||||
<li>Ensure that file names indexed as terms don't overflow the maximum term
|
||||
size.</li>
|
||||
|
||||
<li> Handle non-standard date format in mbox separator lines sometimes
|
||||
generated by thunderbird.
|
||||
|
||||
<li> Use attachment file names to help identify a better mime type for
|
||||
parts only described as application/octet-stream
|
||||
|
||||
<li> For Phrase/Near searches, highlight all term groups in preview, not just
|
||||
the first
|
||||
|
||||
<li> Added Open XML filters
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.10.2">1.10.2</a></h2>
|
||||
<ul>
|
||||
|
||||
<li>Fixed openSuse 11 compile issues.
|
||||
|
||||
<li>Fixed bug in interpreting email mime structure, which resulted in base-64
|
||||
decoding errors.
|
||||
|
||||
<li>Fixed "Prev" button in preview window. Would actually go forward when
|
||||
walking the search terms.
|
||||
|
||||
<li> Allow setting the highlight color for search terms in result list and
|
||||
preview (yes: feature change, should have waited for major release...)
|
||||
|
||||
<li> Added svg filter
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.10.1">1.10.1</a></h2>
|
||||
<ul>
|
||||
|
||||
<li> Ensure that in case the data of a file can't be indexed because of some
|
||||
error, at least the file name is indexed.
|
||||
|
||||
<li> Improve query language to support OR queries of terms with field
|
||||
specifications (ie: title:someterm OR author:someauthor).
|
||||
|
||||
<li> Fix filename search to split patterns on white space, so that
|
||||
a "*.jpg *.jpeg" search does what's expected. Means you now need to use
|
||||
double-quotes if there is actual embedded white space.
|
||||
|
||||
<li> Jump directly to the external editor choice dialog instead of opening
|
||||
preferences when an external viewer is not found.
|
||||
|
||||
<li> Allow stopping indexing through menu action (only works with qt4 for now).
|
||||
|
||||
<li> Create an "indexedmimetypes" configuration variable to allow explicitly
|
||||
restricting the file types which do get indexed.
|
||||
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.10.0">1.10.0</a></h2>
|
||||
<ul>
|
||||
|
||||
<li> Added a GUI dialog to configure the indexing parameters.
|
||||
|
||||
<li> Added better support for indexing CJK text (Chinese, Japanese, Korean).
|
||||
Please note that:
|
||||
- You will need a full reindex to take good advantage of this. (You
|
||||
*don't* need to reindex if you don't need to search CJK, even if there
|
||||
is some in your index).
|
||||
- When entering CJK search terms, words (single or multiple characters)
|
||||
should be separated with white space.
|
||||
- The specific CJK processing can be turned off by setting the nocjk
|
||||
variable to true in the configuration file (this may make sense if you
|
||||
have a mixed cjk/other document base and don't want to index the cjk
|
||||
part, as it will save some disk space and a minuscule amount of cpu).
|
||||
|
||||
<li> Changed the way Recoll handles searches including composite words (like
|
||||
an email address). The new approach looks saner, but could have
|
||||
side-effects, please report any problems in this area.
|
||||
|
||||
<li> The query language got a new "dir:" specifier to filter results on location.
|
||||
|
||||
<li> New rclimg perl filter for better indexing of picture tags, thanks to
|
||||
Cedric Scott. This depends on Exiftool.
|
||||
<li> New rcltex filter.
|
||||
|
||||
<li> Changed and improved how the preview window local search finds the
|
||||
query terms, this does not involve weird characters any more. The
|
||||
display is cleaner and cut and paste works better.
|
||||
|
||||
<li> Fixed the fact that a newline-separated word list in simple search would
|
||||
wrongly trigger a phrase search.
|
||||
|
||||
<li> Fixed the way we input text to the preview textedit (the old way would
|
||||
sometimes confuse the window into displaying tags instead of acting on
|
||||
them).
|
||||
|
||||
<li> Fixed transcoding to utf-8 for text/plain email attachments
|
||||
|
||||
<li> Improved mbox From_ line detection
|
||||
|
||||
<li> Added indexedmimetypes variables to allow restricting the list of indexed
|
||||
mime types.
|
||||
|
||||
<li> KDE kicker applet: start a recoll search from the panel and get a
|
||||
Recoll window. This is a clone from the find_applet, originally meant to
|
||||
start a Tracker search. Not so useful presently because it will start a
|
||||
new Recoll instance for every search. Not part of the main source (the
|
||||
configure script is a whopping 1MB...), linked from the download page.
|
||||
<li> Added recoll command line options to define a query and execute it
|
||||
immediately when the program starts. This is used in practice from the
|
||||
applet and could be used from other programs. There is also a new
|
||||
option to not start the GUI and print the results to stdout.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.9.0">1.9.0</a></h2>
|
||||
<ul>
|
||||
<li> Incompatible change: the icon image reference is now part of the result
|
||||
list paragraph format string:
|
||||
- If you had a standard config, you need do nothing.
|
||||
- If you had a custom format string, you need to add
|
||||
&lt;img src="%I" align="left"&gt; at its beginning to get the same result as
|
||||
before.
|
||||
- If you had unchecked the "show icons" option, you need to remove the
|
||||
above string from the paragraph format to make the icons go away.
|
||||
Changes to the format string are performed in the
|
||||
"Preferences->Query Configuration->User Interface" dialog tab.
|
||||
|
||||
<li> New filters: wordperfect, abiword and kword, rcljpeg, rclflac, rclogg
|
||||
(contributed filters). The jpeg and audio filters should be extended to
|
||||
make use of the new field indexing/search capability (hint :) )
|
||||
|
||||
<li> When searching for an empty string inside the preview window, position
|
||||
the window to the next occurrence of a primary search term.
|
||||
|
||||
<li> Added ext: and mime: selectors to the query language.
|
||||
|
||||
<li> Added an adjustable flush threshold during indexing: should help control
|
||||
memory usage. See the idxflushmb configuration variable.
|
||||
|
||||
<li> Added a check for file system free space. Indexing will stop if the
|
||||
threshold is reached. See the maxfsoccuppc configuration parameter.
|
||||
|
||||
<li> Added 'followLinks' configuration option to have the indexer follow
|
||||
symbolic links while walking the tree (the default is false).
|
||||
|
||||
<li> Allow symbolic links as 'topdirs' members. These are always followed.
|
||||
|
||||
<li> Add preference option to remember sort tool state between program
|
||||
invocations (it is reset to inactive by default)
|
||||
|
||||
<li> Added File menu entry to erase document history.
|
||||
|
||||
<li> Bound the space and backspace keys to PgUp/PgDown in preview.
|
||||
|
||||
<li> (Hopefully) Improved abstract (keyword in context) generation
|
||||
|
||||
<li> Added support for arbitrary fields. Filters can now produce any number of
|
||||
fields which will be selectively searchable through the query
|
||||
language. This could be useful, for example, for the mp3 and jpeg filters
|
||||
(but it is not currently used).
|
||||
|
||||
<li> Improved qt4 build: no more need for --enable-qt4. Note: the qt4 build
|
||||
still needs the qt3 support library.
|
||||
|
||||
<li> Changed the icon to an ugly one. The previous one was nicer but looked
|
||||
too much like Xapian's.
|
||||
|
||||
<li> Added some kind of support for a stopword list.
|
||||
|
||||
<li> Have email attachments inherit date and author from their parent message
|
||||
(instead of mail folder).
|
||||
|
||||
<li> Fix bus error on rclmon exit
|
||||
|
||||
<li> Better handling of aspell errors inside rclmon
|
||||
|
||||
<li> Fixed a number of qt4 glitches: selection and keyboard shortcuts.
|
||||
|
||||
<li> New query configuration parameter to set the maximum text size beyond
|
||||
which text won't be highlighted before preview (takes too much time). This
|
||||
was a fixed value in 1.8.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.8.2">1.8.2 2007-05-19</a></h2>
|
||||
<ul>
|
||||
<li> Fixed method name for compatibility with xapian 1.0.0
|
||||
<li> Add .beagle to default list of skipped names (avoids indexing beagle
|
||||
document cache...)
|
||||
<li> Fix configure.ac to use $libdir instead of /usr/lib
|
||||
<li> Fix recollinstall to properly copy translations and pictures for qt4
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.8.1">1.8.1 2007-02-20</a></h2>
|
||||
<ul>
|
||||
<li> Add a small query language with some field-based searches (author, title,
|
||||
etc.)
|
||||
<li> Add wildcard handling everywhere. *, ?, [] can be used in any
|
||||
search. Warning: using a wild card at the left of a term can make
|
||||
for a very slow search.
|
||||
<li> Allow skipping specific paths during indexing (in addition to file name
|
||||
patterns)
|
||||
<li> Improved external index choice dialog, accessible from the top-level menu.
|
||||
<li> Many small bugs fixed: stemming language choice ignored in term explorer,
|
||||
qt4 preview window reentrancy crashes, issues with saving the default
|
||||
advanced search file, type filter, display more clearly missing helper
|
||||
errors, etc.
|
||||
<li> Option to use the desktop defaults (with xdg-open) to choose the native
|
||||
viewer for files (instead of recoll's mimeview).
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.7.6">1.7.6 2007-01-30</a></h2>
|
||||
<ul>
|
||||
<li> Fixes an issue with the openoffice filter on debian systems.
|
||||
<li> Adds Scribus and Lyx filters.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.7.5">1.7.5 2007-01-15</a></h2>
|
||||
<ul>
|
||||
<li> Fixes two email indexing bugs in 1.7.3, which would bail out from an
|
||||
mbox folder on the first attachment filtering error, and would decline
|
||||
to handle multipart/signed bodies. You may need to run a full indexing
|
||||
pass (recollindex -z), to force reindexing of old folders.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.7.3">1.7.3 2007-01-09</a></h2>
|
||||
<ul>
|
||||
<li> Email attachments are now indexed.
|
||||
<li> Right-click menu option to access the parent document of an embedded
|
||||
result (ie from mail attachment to parent message), or the parent folder
|
||||
of a given file (which is opened with the application configured for
|
||||
directories)
|
||||
<li> The sort tool has been improved: no need to restart the query after sort
|
||||
criteria change.
|
||||
<li> Support for real-time indexing with inotify is now enabled by default
|
||||
when appropriate.
|
||||
<li> Recoll now warns when the configured native viewer can not be found and
|
||||
starts an interface for choosing another one.
|
||||
<li> Categories (text, presentation, spreadsheets, etc.) can be used instead
|
||||
of raw mime types when filtering on file types in advanced search.
|
||||
<li> The port to qt4 is functional and can be enabled with configure --enable-qt4
|
||||
<li> 'autophrase' option improved and may now actually be useful.
|
||||
<li> Improved highlighting (again...)
|
||||
<li> Display term frequencies in term explorer.
|
||||
<li> Recollindex -e to remove data from index for listed files.
|
||||
<li> Directory names now indexed. Directories can be 'edited' with the
|
||||
configured application (rox by default)
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.6.3">1.6.3</a></h2>
|
||||
<ul>
|
||||
<li> Fixed problem with bad detection of mbox message boundaries.
|
||||
Upgrading can change the message numbering in some cases, and you should
|
||||
perform a full index update (recollindex -z) after installing
|
||||
the new version.
|
||||
<li> Fixed problem with execution of external viewer for files with
|
||||
single-quotes in the name.
|
||||
</ul>
|
||||
<h2><a name="1.6.2">1.6.2</a></h2>
|
||||
<ul>
|
||||
<li> Minor solaris compilation glitches only.
|
||||
</ul>
|
||||
<h2><a name="1.6.1">1.6.1</a></h2>
|
||||
<ul>
|
||||
<li> Term explorer: a multimode wildcard-regexp-spell/phonetic tool to search
|
||||
the index for terms. This uses aspell for the orthographic/phonetic part.
|
||||
<li> A more dynamic advanced search window. You now have a choice of the top
|
||||
level conjunction (OR/AND) and of any number of clauses, including NEAR
|
||||
and PHRASE clauses with an adjustable proximity parameter.
|
||||
<li> User-settable format for the result-list entries, which use an HTML
|
||||
string with %xx printf-like replacements (accessible from the user
|
||||
preferences).
|
||||
<li> Real time monitoring/indexing support. This is not configured by
|
||||
default, and must be specified at build time (configure --help).
|
||||
<li> Improved phrase/group highlighting in abstracts and preview
|
||||
<li> Better sample selection for synthetic abstracts.
|
||||
<li> Improved performance of the text splitter, good for indexing and previewing.
|
||||
<li> Shift+click link to open new preview window instead of tab in existing
|
||||
window.
|
||||
<li> The key sequence for term completion in the simple search entry was
|
||||
changed from CTRL+TAB to "Escape Space" to avoid interaction with window
|
||||
managers.
|
||||
<li> Improved recall for phrases with composite words like email addresses.
|
||||
|
||||
|
||||
Updating from 1.2 to 1.3 or 1.4 or 1.5:
|
||||
<li>--------------------------------------
|
||||
From version 1.3 up, there is a new feature to search specifically for file
|
||||
names (with wildcard processing). If you want to take full advantage of
|
||||
this, you should perform a full reindex after installing the new version
|
||||
(ie: use recollindex -z, or delete ~/.recoll/xapiandb).
|
||||
Also, we now use the central copies of configuration files for default
|
||||
values, and the user ones only for overrides. Your old configuration files
|
||||
will still work, but, you may want to remove them if they are unmodified,
|
||||
or keep only the modified parameters.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.5.9 ">1.5.9 </a></h2>
|
||||
<ul>
|
||||
<li> Fix bad timezone conversion in email dates. Display timezone in result
|
||||
list dates.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.5.8">1.5.8</a></h2>
|
||||
<ul>
|
||||
<li> Fix stored and displayed dates which used to come from the file's ctime,
|
||||
now use mtime (which was already used for deciding re-indexing).
|
||||
<li> Fix problem with some weird MIME messages (with null boundaries) which
|
||||
crashed the indexer.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.5.6">1.5.6</a></h2>
|
||||
<ul>
|
||||
<li> Small fixes dealing with the build process or compiler issues.
|
||||
1.5.6 has updated ukrainian and russian messages.
|
||||
Otherwise no functional changes, and no need to upgrade from 1.5.1
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.5.1">1.5.1</a></h2>
|
||||
<ul>
|
||||
<li> Fix serious bug with non ascii strings in simple search history
|
||||
<li> Improve synthetic abstracts: remove size limitations, handle overlapping
|
||||
extracts, avoid printing several terms from the same position.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.5.0">1.5.0 2006-09-20</a></h2>
|
||||
<ul>
|
||||
|
||||
<li> Added support for powerpoint and excel files, with the catdoc package.
|
||||
<li> Allow viewing consecutive documents from the result list inside a single
|
||||
preview window using the shift-arrow-up and shift-arrow-down keys.
|
||||
<li> Colorize search terms in abstracts in the result list.
|
||||
<li> A number of elements are now remembered between program invocations:
|
||||
sort criteria, list of ignored file types (always starts inactive),
|
||||
subtree restriction, better handling of the recent searches listbox, the
|
||||
buildAbstract and replaceAbstract settings are not forgotten any more.
|
||||
<li> New option to automatically add a phrase to simple searches.
|
||||
<li> Possibility to adjust the length and context width for synthetic abstracts.
|
||||
<li> Handle weird html better.
|
||||
<li> When indexing mail messages, walk the full mime tree instead of staying
|
||||
at the top level, index all text parts and attachment file names.
|
||||
<li> Add -c <confdir> option to recoll and recollindex to specify the
|
||||
configuration directory on the command line
|
||||
<li> Better synchronization between the active preview and the highlighted
|
||||
paragraph inside the list
|
||||
<li> Improved recall for some special cases of stemming.
|
||||
<li> Much better handling of email dates, allowing better email sorting by
|
||||
date (previously the message date was quite often the date when the file
|
||||
was indexed).
|
||||
<li> Store the external database lists in the configuration directory, not the
|
||||
qt preferences.
|
||||
<li> Ensure dialogs are sized according to font size
|
||||
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.4.3">1.4.3 2006-05-07</a></h2>
|
||||
<ul>
|
||||
<li> Multiple search databases.
|
||||
<li> Optionally auto-search when a word is entered in the simple search
|
||||
field.
|
||||
<li> Show possible term completions in simple search by typing CTRL+TAB
|
||||
<li> Add 'more like this' option to result list right-click menu, to look for
|
||||
documents related to the current result.
|
||||
<li> Double-click in preview or result list adds the selected word to the
|
||||
simple search text field.
|
||||
<li> The simple search text entry field is now a combobox and remembers
|
||||
previous searches.
|
||||
<li> Additional OR field in complex search.
|
||||
<li> Improved indexing cancellability (interrupting recollindex or closing
|
||||
recoll with an indexing thread active), and status reporting.
|
||||
<li> Fixed filters to handle file paths with embedded spaces.
|
||||
<li> Misc small bug and memory leaks fixes.
|
||||
<li> More compact result list.
|
||||
<li> Set mode 0700 on .recoll directory by default
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.3.3">1.3.3 2006-04-04</a></h2>
|
||||
<ul>
|
||||
<li> Implement specific search on file names with wildcard
|
||||
support. Indexing can optionally process all file names or only those
|
||||
with mime types supported for normal indexing. UPDATING: you need a
|
||||
full re-indexing to take advantage of this.
|
||||
<li> Use links and a right-click popup menu to replace confusing use of
|
||||
mouse clicks and double-clicks inside the result list.
|
||||
<li> The 'example' configuration files are now used as default, and are not
|
||||
copied any more to the user directory during installation. Overrides can
|
||||
be set in the personal files for any value that the user wishes to
|
||||
modify, with unchanged formats and file names (so that the files from
|
||||
previous versions remain valid, but you may wish to trim them of values
|
||||
that duplicate the central ones).
|
||||
<li> Use NLS information (LC_CTYPE, LANG) to determine default charset when
|
||||
possible.
|
||||
<li> Mp3 file indexing, either filenames only or also id3 tags if id3info is
|
||||
available. c/c++ ext edit. Use gnuclient instead of xemacs for text files.
|
||||
<li> Russian and Ukrainian translations and many improvement ideas thanks to
|
||||
Michael Shigorin.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.2.3">1.2.3 2006-03-03</a></h2>
|
||||
<ul>
|
||||
<li> Added support for dvi (with dvips), and djvu (with DjVuLibre).
|
||||
<li> Ensure that configure and make use the same qt version.
|
||||
<li> Fix sorted sequence title display.
|
||||
<li> Discriminate fatal errors and missing docs while loading a doc list.
|
||||
<li> Improved and cleaned up way to position a preview on the first search term.
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.2.2">1.2.2 2006-02-02</a></h2>
|
||||
<ul>
|
||||
<li> Fix minor compilation glitches (FreeBSD 4, QT 3.1, xapian-config problem)
|
||||
|
||||
</ul>
|
||||
<h2><a name="1.2.0">1.2.0 2006-02-01</a></h2>
|
||||
<ul>
|
||||
<li> Improved preview loading: don't highlight very big documents (over 1Mb),
|
||||
allow cancellation while loading.
|
||||
<li> Abstracts generated in the result list by looking at search term
|
||||
contexts. This can slow down result list display for big documents, and
|
||||
can be turned off in the preferences menu.
|
||||
<li> Wrap query detail line displayed when clicking on result list header.
|
||||
<li> Text splitting cleanup with less spurious terms should result in
|
||||
slightly smaller databases.
|
||||
<li> Slightly improved presentation in preview, esp. line breaks.
|
||||
<li> Color icons...
|
||||
<li> Let the user select the html browser used for help display.
|
||||
<li> autoconf/Makefile change: allow building UI from inside the qtgui
|
||||
directory.
|
||||
<li> autoconf/Makefile: improved search and diagnostics for qt/qmake.
|
||||
<li> Internal code cleanup for maintainability: text splitting, user
|
||||
interface.
|
||||
<li> Added prototype kio_slave to show result inside Konqueror, doesn't seem
|
||||
particularly useful.</li>
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.1.0">1.1.0 2006-01-12</a></h2>
|
||||
<ul>
|
||||
<li> A much better user manual, which can be browsed from the help menu.
|
||||
<li> man pages for recoll, recollindex, recoll.conf
|
||||
<li> User/query interface configuration dialog.
|
||||
<li> Click on result list header will display the exact boolean search which
|
||||
was used.
|
||||
<li> recollindex can be used to create stem expansion databases independently
|
||||
of a full indexing pass.
|
||||
<li> Misc user interface improvements, like an 'all terms' checkbox for
|
||||
simple search.
|
||||
<li> Fixed case-insensitivity issues. Probably needs more testing.
|
||||
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.0.16">1.0.16 2006-01-05</a></h2>
|
||||
<ul>
|
||||
<li> Minor installation tweaks for rpm compatibility
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.0.15 ">1.0.15 </a></h2>
|
||||
<ul>
|
||||
<li> Fix problems with prefix != /usr/local
|
||||
<li> Remove '.*' from the default list of ignored file/dir names: this
|
||||
prevented mozilla/thunderbird mail indexing.
|
||||
<li> Fix some 64 bits issues
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.0.14">1.0.14</a></h2>
|
||||
<ul>
|
||||
<li> Small changes for FreeBSD 4 compilation.
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.0.13">1.0.13</a></h2>
|
||||
<ul>
|
||||
<li> Install of recollinstall program not done or needed any more.
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.0.12">1.0.12</a></h2>
|
||||
<ul>
|
||||
<li> Fixed nasty html parsing bug introduced in 1.0.9 Html parsing failed
|
||||
whenever the document charset name differed from the default only in
|
||||
character case or punctuation.
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.0.11">1.0.11</a></h2>
|
||||
<ul>
|
||||
<li> Create personal configuration on first start.
|
||||
<li> Use qt toolbars.
|
||||
<li> Also index terms in file paths.
|
||||
<li> Tool for sorting on dates or mime types.
|
||||
<li> Fixed pdf filter which was broken by more recent xpdf
|
||||
<li> Filters now installed/executed from /usr/local
|
||||
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.0.10">1.0.10</a></h2>
|
||||
<ul>
|
||||
<li> Added tool to manage the history of consulted documents.
|
||||
<li> Try harder to convert email messages with wrongly declared charsets.
|
||||
<li> Add option to reset the database before indexing (easier than rm -rf).
|
||||
<li> Small gui improvements.
|
||||
<li> Install partial french translation as a tease for future translators...
|
||||
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.0.9">1.0.9</a></h2>
|
||||
<ul>
|
||||
<li> Fixed 2 really annoying bugs in 1.0.8: wouldn't preview 2nd document
|
||||
from same file + spurious db close when filter could not be executed.
|
||||
|
||||
</ul>
|
||||
|
||||
<h2><a name="1.0.8">1.0.8</a></h2>
|
||||
<ul>
|
||||
<li> Add support for rtf and gaim logs
|
||||
<li> Optionally show icons to indicate mime types in result list
|
||||
<li> Better (but imperfect) feedback during the preview
|
||||
loading for big files
|
||||
<li> Remember main window geometry when closing
|
||||
<li> Fix stem expansion in advanced search
|
||||
<li> Some autoconf
|
||||
<li> Option to use the system's 'file' command as a final step of
|
||||
identification for suffix-less or unknown files.
|
||||
<li> Typo had removed support for .Z compression
|
||||
<li>Use more appropriate conjunction operators when
|
||||
computing the advanced search query (OP_AND_MAYBE,
|
||||
OP_FILTER instead of OP_AND)
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@ -1,17 +0,0 @@
|
||||
#!/bin/sh
|
||||
set -x
|
||||
docdir=/home/dockes/projets/fulltext/recoll/src/doc/user/
|
||||
|
||||
#(cd $docdir;make) || exit 1
|
||||
|
||||
test -d usermanual || mkdir usermanual || exit 1
|
||||
cd usermanual
|
||||
|
||||
thisdir=`pwd`
|
||||
(cd $docdir; find . -name templates -prune -o -print | cpio -vudp $thisdir)
|
||||
|
||||
mv usermanual.pdf recoll_user_manual.pdf
|
||||
# The freebsd tool chain generates a link to book.html in the index. Too
|
||||
# lazy to check if this can be changed
|
||||
cp -p usermanual.html book.html
|
||||
#cp usermanual.html index.html
|
||||
@ -1,80 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>RECOLL: credits</title>
|
||||
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux
|
||||
based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search, desktop search, unix, linux">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html">Home</a></li>
|
||||
<li><a href="pics/index.html">Screenshots</a></li>
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
<li><a href="doc.html">User manual</a></li>
|
||||
<li><a href="index.html#support">Support</a></li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h3><a name="credits">Credits</a></h3>
|
||||
|
||||
<p>First of all, many thanks to the users who provided criticism
|
||||
and ideas to make <span class="application">Recoll</span> go
|
||||
forward ! Please
|
||||
<a href="mailto:jfd@recoll.org">
|
||||
contact me</a> if you have something to suggest.</p>
|
||||
|
||||
<p><span class="application">Recoll</span> borrows
|
||||
from the following projects. I tried to include the relevant
|
||||
copyright attributions with the code. Any omission is
|
||||
unintentional and will be fixed as soon as notified. </p>
|
||||
|
||||
<ul>
|
||||
<li><a href="http://www.xapian.org">Xapian</a>: The database module
|
||||
(core) is used unmodified, and quite a lot of code has been
|
||||
borrowed from Omega, the web-based search application (ie:
|
||||
the html parser, plus miscellaneous bits and ideas). </li>
|
||||
<li><a href="http://estraier.sourceforge.net/">Estraier</a>:
|
||||
Some of the input handlers still have bits of Estraier code
|
||||
in them.</li>
|
||||
<li><a href="http://www.senga.org/">Unac</a>: for accent
|
||||
removal. This package is unmaintained and the (quite modified)
|
||||
code is carried with the <span class="application">Recoll</span>
|
||||
source.</li>
|
||||
<li><a href="http://www.gnu.org/software/libiconv/">Iconv</a>, for
|
||||
character set conversion.</li>
|
||||
<li><a href="http://www.bincimap.org/">Binc IMAP</a> for MIME
|
||||
parsing code. The original package is unmaintained and the
|
||||
relevant code is carried with the <span
|
||||
class="application">Recoll</span> source.</li>
|
||||
<li>The icons mainly come from the <a
|
||||
href="http://www.everaldo.com/">Crystal SVG</a> KDE set.</li>
|
||||
</ul>
|
||||
|
||||
<ul>
|
||||
<li>I fear that bugs found elsewhere are mostly mine:
|
||||
<a href="mailto:jfd@recoll.org">jfd@recoll.org</a></li>
|
||||
</ul>
|
||||
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@ -1,630 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>RECOLL: result list customisation tips</title>
|
||||
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux
|
||||
based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search, desktop search, unix, linux">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
|
||||
|
||||
<style type="text/css">
|
||||
/* Photo-Caption PZ3 CSS v080630
|
||||
* copyright: http://randsco.com/copyright
|
||||
* www.randsco.com
|
||||
*/
|
||||
|
||||
.PZ3-l { float:left; margin-right:10px; }
|
||||
.PZ3-r { float:right; margin-left:10px; direction:rtl; }
|
||||
html>/**/body .PZ3-r { position:relative; }
|
||||
|
||||
.PZ3zoom { border:1px solid #369; }
|
||||
.PZ3zoom a,.PZ3zoom a:visited { display:block;
|
||||
padding:0; overflow:hidden; text-decoration:none;
|
||||
height:100%; width:100%; }
|
||||
html>/**/body .PZ3-r a { right:0; }
|
||||
|
||||
.PZ3zoom a:hover { position:absolute;
|
||||
z-index:999; padding:0; background:none;
|
||||
cursor:default; height:auto; width:auto;
|
||||
overflow:visible; border:1px solid #369;
|
||||
margin:-1px 0 0 -1px; }
|
||||
html>body .PZ3zoom a:hover { margin:-1px -1px 0 -1px; }
|
||||
|
||||
.PZ3zoom a img { border:0; height:100%; width:100%; }
|
||||
.PZ3zoom a:hover img { height:auto; width:auto;
|
||||
border:0; }
|
||||
|
||||
a:hover .PZ3cap,
|
||||
a:hover .PZ31cap { display:block;
|
||||
direction:ltr; font:10pt verdana,sans-serif;
|
||||
margin-top:-3px; background:#369; color:#fff;
|
||||
text-align:left; }
|
||||
a:hover .PZ3cap { padding:3px 5px; }
|
||||
.PZ3inr { display:block; padding:2px 5px; }
|
||||
|
||||
.noCap a:hover .PZ3cap,
|
||||
.noCap a:hover .PZ31cap { display:none; }
|
||||
.noBdr,.noBdr a:hover { border:0; }
|
||||
.Lnk a:hover { cursor:pointer; }
|
||||
|
||||
/* End Photo-Caption Zoom CSS */
|
||||
</style>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html">Home</a></li>
|
||||
<li><a href="pics/index.html">Screenshots</a></li>
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
<li><a href="doc.html">User manual</a></li>
|
||||
<li><a href="index.html#support">Support</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1>Recoll result list customising examples</h1>
|
||||
|
||||
<p>The Recoll result list is actually made of html text
|
||||
displayed inside a Qt Widget. In all Recoll versions, you
|
||||
can specify the format for the list entries: what data is
|
||||
displayed for each hit document and how. This used to include
|
||||
"almost full" support for HTML capabilities, with a few
|
||||
restrictions due to the Qt QTextBrowser object. The details
|
||||
are described in the
|
||||
<a href="http://www.recoll.org/usermanual/usermanual.html#RCL.SEARCH.GUI.CUSTOM.RESLIST">
|
||||
Recoll manual</a>.</p>
|
||||
|
||||
<p>As of Recoll 1.17, the result list is a WebKit object by
|
||||
default (WebKit is the basis for several major browsers),
|
||||
which yields full CSS and even Javascript support.</p>
|
||||
|
||||
<h2>New in Recoll 1.17: the WebKit result list</h2>
|
||||
|
||||
|
||||
<p>For newer Recoll versions, you can specify the
|
||||
individual result format, as for previous versions. You can
|
||||
also define code to be included in the HTML
|
||||
header (ie: CSS or Javascript), using
|
||||
<tt>Preferences->Query Configuration->Result List->Edit result page html header insert</tt></p>
|
||||
|
||||
<p>This, plus the full Javascript and CSS support in WebKit,
|
||||
opens a world of possibilities for result list formatting and
|
||||
even behaviour.</p>
|
||||
|
||||
<p>The examples which follow are probably not generally
|
||||
very useful but they show the kinds of things you can do, if
|
||||
you can use Javascript/CSS which is not my case.</p>
|
||||
|
||||
<h3>Using the icons as links</h3>
|
||||
<p>You can now make the list icons links that activate the
|
||||
preview or open action (or the document url which you can then
|
||||
drag/drop to other windows). Using images as links did
|
||||
not work with QTextBrowser.</p>
|
||||
|
||||
<h3>Alternating result backgrounds</h3>
|
||||
<p>Using the following Javascript inside the header will yield
|
||||
alternating backgrounds for the results:</p>
|
||||
|
||||
<pre>
|
||||
<script type="text/javascript">
|
||||
function altRows() {
|
||||
var rows = document.getElementsByClassName("rclresult");
|
||||
for (i = 0; i < rows.length; i++) {
|
||||
if (i % 2 == 0) {
|
||||
rows[i].style.backgroundColor = "#d4e3e5";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
window.onload = function() {
|
||||
altRows();
|
||||
}
|
||||
</script>
|
||||
</pre>
|
||||
|
||||
|
||||
<h3>Zooming the paragraph font size</h3>
|
||||
<p>If you are using a format with small fonts, it may be useful
|
||||
to be able to zoom the text when the mouse hovers over it. A
|
||||
very basic way to do this -<em>with the standard paragraph
|
||||
format, which is a table</em>- would be to include the following
|
||||
code in the header:</p>
|
||||
<pre>
|
||||
<style type="text/css">
|
||||
table:hover {font-size: 130%;}
|
||||
</style>
|
||||
</pre>
|
||||
|
||||
<p>Of course, the selector should be adapted to your own
|
||||
result format. You should know that every result will be
|
||||
enclosed by Recoll inside a <tt><div
|
||||
class="rclresult" rcldocnum="nn"></tt> element.</p>
|
||||
|
||||
<h3>Zooming the thumbnails</h3>
|
||||
|
||||
<p>Recoll 1.17 and newer will display document
|
||||
thumbnails instead of the type icon if the thumbnail exists in
|
||||
the standard Freedesktop location. The icons/thumbnails are
|
||||
64x64 pixels in size, which is a bit small. The standard
|
||||
thumbnail files are actually 128x128, which is much more
|
||||
detailed. Using them statically would consume too much list
|
||||
space though. Using CSS, you can get them to expand when the
|
||||
mouse is over them. Recipe:</p>
|
||||
|
||||
<blockquote>
|
||||
<p>Retrieve the CSS code
|
||||
from <a href="http://randsco.com/_miscPgs/cssZoomPZ3.html">randsco
|
||||
pure CSS photo-caption zoom</a>, and include it inside the
|
||||
result list html header by using the "Edit result page html
|
||||
header insert" from the GUI preferences. Don't forget to
|
||||
enclose the CSS code between <code><style type="text/css">
|
||||
</style></code> tags.</p>
|
||||
|
||||
<p>Use something like the following result paragraph format
|
||||
(only the code around the img tag is relevant, the rest can be
|
||||
what you want):</p>
|
||||
|
||||
<pre>
|
||||
<!--
|
||||
<table><tr><td>
|
||||
<div class="PZ3zoom PZ3-l noBdr noCap noLnk" style="width:64px;height:64px;">
|
||||
<a href="%U"> <img src='%I' width='64'></a>
|
||||
</div>
|
||||
</td><td>
|
||||
%R %S %L <b>%T</b><br>%M %D <i>%U</i> %i<br>%A %K
|
||||
</td></tr></table>
|
||||
-->
|
||||
<table><tr><td>
|
||||
|
||||
<div class="PZ3zoom PZ3-l noBdr noCap noLnk" style="width:64px;height:64px;">
|
||||
<a href="%U"> <img src='%I' width='64'></a>
|
||||
</div>
|
||||
|
||||
</td><td>
|
||||
%R %S %L &nbsp;&nbsp;<b>%T</b><br>%M&nbsp;%D&nbsp;&nbsp;&nbsp;<i>%U</i>&nbsp;%i<br>%A %K
|
||||
</td></tr></table>
|
||||
</pre>
|
||||
|
||||
</blockquote>
|
||||
<div class="PZ3zoom PZ3-r noCap noLnk" style="width:100px;height:40px;">
|
||||
<a href="resparpics/pz3.png" onclick="return false">
|
||||
<img src="resparpics/pz3.png" alt="hover zoom" />
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<p>Et voilà! The icons will grow to their full size when the mouse is
|
||||
over them.</p>
|
||||
|
||||
<h2>Alternate icons theme</h2>
|
||||
<p>There is an alternate set of icons
|
||||
at <a href="http://kde-look.org/content/show.php?content=145669">
|
||||
kde-look.org</a>. If you are running KDE desktop, it should
|
||||
be more consistent with the rest of your applications.</p>
|
||||
<p>You do not need to replace the standard Recoll set of icons
|
||||
to use it, just extract it somewhere, and use
|
||||
the <tt>iconsdir</tt> variable in <i>~/.recoll/recoll.conf</i> to
|
||||
point Recoll to it. e.g.:
|
||||
<blockquote><pre>
|
||||
<tt>iconsdir = /path/to/my/icons</tt>
|
||||
</pre></blockquote>
|
||||
</p>
|
||||
|
||||
<h2>Result list paragraph format samples (for all versions)</h2>
|
||||
|
||||
<p>Here follow some sample formats. Most of them were contributed by
|
||||
kind users, and I'll be happy to show their names if they so
|
||||
wish (abstaining by default).</p>
|
||||
|
||||
<h3>Recoll 1.15 default</h3>
|
||||
<pre>
|
||||
|
||||
<!--
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src='%I'></td>
|
||||
<td>%R %S %L <b>%T</b><br>
|
||||
%M %D <i>%U</i><br>
|
||||
%A %K
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
-->
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src='%I'></td>
|
||||
<td>%R %S %L&nbsp;&nbsp;<b>%T</b><br>
|
||||
%M&nbsp;%D&nbsp;&nbsp;&nbsp;<i>%U</i><br>
|
||||
%A %K
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</pre>
|
||||
|
||||
<br clear="all">
|
||||
<img src="resparpics/default.png"/>
|
||||
|
||||
<h3>Alternating bands, bigger previews, and custom paragraph
|
||||
typesetting</h3>
|
||||
|
||||
<p>Paul, the author, gives the following description for his
|
||||
result list formatting:
|
||||
<blockquote>
|
||||
It uses the "Alternating Results Background" from that page,
|
||||
plus my own layout which incorporates a larger view of image
|
||||
files. The 'large image' is scaled down from the actual
|
||||
image, rather than a scaled up version of the thumbnail.
|
||||
</blockquote>
|
||||
</p>
|
||||
|
||||
<p>The header fragment has the javascript for
|
||||
alternating backgrounds, and the CSS code:</p>
|
||||
|
||||
<pre>
|
||||
<!-- Custom Header -->
|
||||
<script type="text/javascript">
|
||||
function altRows() {
|
||||
var rows = document.getElementsByClassName("rclresult");
|
||||
for (i = 0; i < rows.length; i++) {
|
||||
if (i % 2 == 0) {
|
||||
rows[i].style.backgroundColor = "#f0f0f0";
|
||||
}
|
||||
}
|
||||
}
|
||||
window.onload = function() {
|
||||
altRows();
|
||||
}
|
||||
</script>
|
||||
|
||||
<style type="text/css">
|
||||
.thumbnail {
|
||||
display:block;
|
||||
position:relative;
|
||||
padding: 4px;
|
||||
width: auto; /* set width of thumbnail image in 'paragraph' code - not here */
|
||||
border:none;
|
||||
z-index:0;
|
||||
}
|
||||
.thumbnail:hover {
|
||||
border:none;
|
||||
background-color: transparent;
|
||||
z-index: 50;
|
||||
}
|
||||
.thumbnail span {
|
||||
position: absolute;
|
||||
left: -9999px;
|
||||
visibility: hidden;
|
||||
}
|
||||
.thumbnail span img {
|
||||
max-width:256px; /* set 'large image' max width/height - advise keeping these */
|
||||
max-height:256px; /* the same to avoid inadvertently changing the aspect ratio */
|
||||
width:auto; /* leave set to auto */
|
||||
height:auto; /* leave set to auto */
|
||||
background-color: gray;
|
||||
padding: 1px;
|
||||
border: 1px solid black;
|
||||
}
|
||||
.thumbnail:hover span {
|
||||
visibility: visible;
|
||||
top: 4px; /* top/left positions 'large image' relative to top left */
|
||||
left: 88px; /* of parent thumbnail (plus padding) */
|
||||
}
|
||||
</style>
|
||||
<!-- End of Custom Header -->
|
||||
</pre>
|
||||
|
||||
<p>And the paragraph format:</p>
|
||||
|
||||
<pre>
|
||||
<!-- Custom Paragraph -->
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<a class="thumbnail" href="#">
|
||||
<img src="%I" width="64px" height="auto"> <!-- set width of thumbnail -->
|
||||
<span>
|
||||
<img src="%U">
|
||||
</span>
|
||||
</a>
|
||||
<td>
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<div>
|
||||
<b>%T</b></br>
|
||||
%L</br>
|
||||
<p><font color="grey">%A </font><font color="#CD6688"><i>%K</i></font></p>
|
||||
<font color="green"><font size=1>
|
||||
%U</br>
|
||||
%R — %S—%D — %M
|
||||
</font></font></br>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
<!-- End Custom Paragraph -->
|
||||
</pre>
|
||||
|
||||
<p>Result:</p>
|
||||
<br clear="all">
|
||||
<img src="resparpics/pip.png"/>
|
||||
|
||||
<h3>A simpler format, suggested in Bitbucket issue #69</h3>
|
||||
|
||||
<pre>
|
||||
<!--
|
||||
<img src="%I" align="left">%R %L <b>%T</b><br>
|
||||
<i><font color="#808080">%U</font></i> %i<br>
|
||||
%A %K
|
||||
-->
|
||||
<img src="%I" align="left">%R %L&nbsp;&nbsp;<b>%T</b><br>
|
||||
&nbsp;&nbsp;<i><font color="#808080">%U</font></i>&nbsp;%i<br>
|
||||
%A %K
|
||||
</pre>
|
||||
<br clear="all">
|
||||
<img src="resparpics/issue73.png"/>
|
||||
|
||||
|
||||
<h3>Simple+table</h3>
|
||||
|
||||
<p>Same format, but using a table to avoid text flowing into the icon
|
||||
area.</p>
|
||||
|
||||
<pre>
|
||||
<!--
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="%I" align="left"></td>
|
||||
<td>%R %L <b>%T</b><br>
|
||||
<i><font color="#808080">%U</font></i> %i<br>
|
||||
%A %K
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
-->
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="%I" align="left"></td>
|
||||
<td>%R %L&nbsp;&nbsp;<b>%T</b><br>
|
||||
&nbsp;&nbsp;<i><font color="#808080">%U</font></i>&nbsp;%i<br>
|
||||
%A %K
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
</pre>
|
||||
|
||||
<br clear="all">
|
||||
<img src="resparpics/issue73+table.png"/>
|
||||
|
||||
|
||||
|
||||
<h3>Using a small font to make the size/date details less obtrusive</h3>
|
||||
|
||||
<pre>
|
||||
<!--
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="%I" align="left"></td>
|
||||
<td><table bgcolor="#bababa">
|
||||
<tr><td><div>
|
||||
<font face="Tahoma, sans-serif"><u><b><a href="P%N">%T</a></b></u><br>
|
||||
<font color=#008000>%L</font><br>
|
||||
<font color=#510101>%A %K</font><br>
|
||||
<font color=#0100FF>%U</font>
|
||||
<p align="right"><font size=1><font color=#000000>%S
|
||||
- %D
|
||||
- %M</font></p>
|
||||
</div></td></tr>
|
||||
</table></td>
|
||||
</tr>
|
||||
</table>
|
||||
-->
|
||||
<table>
|
||||
<tr>
|
||||
<td><img src="%I" align="left"></td>
|
||||
<td><table bgcolor="#bababa">
|
||||
<tr><td><div>
|
||||
<font face="Tahoma, sans-serif"><u><b><a href="P%N">%T</a></b></u><br>
|
||||
<font color=#008000>%L</font><br>
|
||||
<font color=#510101>%A %K</font><br>
|
||||
<font color=#0100FF>%U</font>
|
||||
<p align="right"><font size=1><font color=#000000>%S
|
||||
&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp; %D
|
||||
&nbsp;&nbsp;&nbsp;-&nbsp;&nbsp;&nbsp; %M</font></p>
|
||||
</div></td></tr>
|
||||
</table></td>
|
||||
</tr>
|
||||
</table>
|
||||
</pre>
|
||||
|
||||
<br clear="all">
|
||||
<img src="resparpics/detailSmallGreyTable.png"/>
|
||||
|
||||
|
||||
<h3>A very structured table</h3>
|
||||
|
||||
<pre>
|
||||
<!--
|
||||
<table border="1" bgcolor="lightyellow">
|
||||
<tr>
|
||||
<td rowspan="4" width="40px" align="center" valign="center">
|
||||
<img src="%I" width="32" height="32">
|
||||
<p><b>%R</b></p>
|
||||
<p><a href="P%N">Aperçu</a></p>
|
||||
</td>
|
||||
<th colspan="3" bgcolor="lightgrey">%T</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center">%M</td>
|
||||
<td align="center">%D</td>
|
||||
<td align="center">%S</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3"><a href="E%N">%U</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3">%A</td>
|
||||
</tr>
|
||||
</table>
|
||||
-->
|
||||
<table border="1" bgcolor="lightyellow">
|
||||
<tr>
|
||||
<td rowspan="4" width="40px" align="center" valign="center">
|
||||
<img src="%I" width="32" height="32">
|
||||
<p><b>%R</b></p>
|
||||
<p><a href="P%N">Aperçu</a></p>
|
||||
</td>
|
||||
<th colspan="3" bgcolor="lightgrey">%T</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center">%M</td>
|
||||
<td align="center">%D</td>
|
||||
<td align="center">%S</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3"><a href="E%N">%U</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3">%A</td>
|
||||
</tr>
|
||||
</table>
|
||||
</pre>
|
||||
<br clear="all">
|
||||
<img src="resparpics/structuredTable.png"/>
|
||||
|
||||
|
||||
<h3>Web-like from the user manual</h3>
|
||||
|
||||
<pre>
|
||||
<!--
|
||||
<u><b><a href="P%N">%T</a></b></u><br>
|
||||
%U<br>
|
||||
%A <font color=#008000>%S</font> - <a href="E%N">Edit</a>
|
||||
-->
|
||||
<u><b><a href="P%N">%T</a></b></u><br>
|
||||
%U<br>
|
||||
%A <font color=#008000>%S</font> - <a href="E%N">Edit</a>
|
||||
</pre>
|
||||
<br clear="all">
|
||||
<img src="resparpics/weblike.png"/>
|
||||
|
||||
|
||||
<h3>Clean-Looking from the user manual</h3>
|
||||
|
||||
<pre>
|
||||
<!--
|
||||
<table>
|
||||
<tr><td><img src="%I" align="left"></td>
|
||||
<td>%L <font color="#900000">%R</font> <b>%T</b><br>
|
||||
%S <font color="#808080"><i>%U</i></font>
|
||||
<table bgcolor="#e0e0e0">
|
||||
<tr><td><div>%A</div> %K </td></tr>
|
||||
</table></td>
|
||||
</table>
|
||||
-->
|
||||
<table>
|
||||
<tr><td><img src="%I" align="left"></td>
|
||||
<td>%L <font color="#900000">%R</font> <b>%T</b><br>
|
||||
%S <font color="#808080"><i>%U</i></font>
|
||||
<table bgcolor="#e0e0e0">
|
||||
<tr><td><div>%A</div> %K </td></tr>
|
||||
</table></td>
|
||||
</table>
|
||||
|
||||
</pre>
|
||||
<br clear="all">
|
||||
<img src="resparpics/clean.png"/>
|
||||
|
||||
|
||||
|
||||
<h3>Another clean and nice one, using both a bit of header code and a
|
||||
custom paragraph format</h3>
|
||||
|
||||
<p>This one also uses the custom icons set from
|
||||
<a href="http://kde-look.org/content/show.php?content=145669">
|
||||
this kde-look page</a>.</p>
|
||||
|
||||
<p>The header code:</p>
|
||||
|
||||
<pre>
|
||||
<style type="text/css">
|
||||
body {
|
||||
color: rgb(0, 0, 0);
|
||||
background-color: rgb(224, 224, 224);
|
||||
}
|
||||
</style>
|
||||
</pre>
|
||||
|
||||
<p>The paragraph code:</p>
|
||||
|
||||
<pre>
|
||||
<table style="background-color: white; width: 950px;"
|
||||
border-style="none" border-color:="" border="0">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td rowspan="4"
|
||||
style="width: 68px; text-align: center; background-color: rgb(238, 238, 238);">
|
||||
<img src="%I" height="32" width="32">
|
||||
<p style="font-family: sans-serif;"><b>%R</b></p>
|
||||
<p style="font-family: sans-serif; color: rgb(0, 153, 0);"><br>
|
||||
</p>
|
||||
</td>
|
||||
<td style="vertical-align: top;"><br>
|
||||
</td>
|
||||
<th
|
||||
style="font-family: sans-serif; background-color: white; text-align: left;"
|
||||
colspan="3" bgcolor="lightgrey">%T</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="vertical-align: top; width: 11px;"><br>
|
||||
</td>
|
||||
<td
|
||||
style="text-align: center; font-family: sans-serif; background-color: rgb(249, 249, 249);">%M</td>
|
||||
<td
|
||||
style="text-align: center; font-family: sans-serif; background-color: rgb(249, 249, 249);">%D</td>
|
||||
<td
|
||||
style="font-family: sans-serif; text-align: right; background-color: rgb(249, 249, 249);">%S</td>
|
||||
</tr>
|
||||
<tr style="font-family: sans-serif; color: rgb(0, 153, 0);">
|
||||
<td style="vertical-align: top;"><br>
|
||||
</td>
|
||||
<td colspan="3"><a href="E%N">%U</a></td>
|
||||
</tr>
|
||||
<tr style="font-family: sans-serif;" 8="">
|
||||
<td style="vertical-align: top;"><br>
|
||||
</td>
|
||||
<td colspan="3">%A</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<br>
|
||||
<br>
|
||||
</pre>
|
||||
|
||||
<br clear="all">
|
||||
<img src="resparpics/christopher.png"/>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@ -1,109 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>RECOLL: a personal text search system for
|
||||
Unix/Linux</title>
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content=
|
||||
"text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html">Home</a></li>
|
||||
<li><a href="pics/index.html">Screenshots</a></li>
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
<li><a href="doc.html">Documentation</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1>Contributing to Recoll development and availability</h1>
|
||||
|
||||
<p>If you are not a software developer, or have no time
|
||||
available for testing the application or thinking about how it
|
||||
could be improved, there is always the possibility of
|
||||
contributing a donation, which will be much appreciated !<br/>
|
||||
<a href="/donations/index.html">
|
||||
<img src="/donations/btn_donate_LG.gif" /></a>
|
||||
</p>
|
||||
|
||||
<p>If you wish to become involved in the development of <span
|
||||
class="application">Recoll</span>, please send me an <a
|
||||
href="mailto:jfd@recoll.org">email</a>.</p>
|
||||
|
||||
<h1><a name="translation">Translation</a></h1>
|
||||
|
||||
<p>More translations are good! If you are a non-English speaker
|
||||
(and understand English, which can probably be assumed, since you
|
||||
are reading this), you can take a little time to translate
|
||||
the GUI messages file.</p>
|
||||
<p>The newest versions of the message files can be found
|
||||
in <a href="translations">this directory</a>. There
|
||||
is an empty one (the xx one), the others are partially
|
||||
translated, just needing an update for the new messages.</p>
|
||||
<p>Updating the files can easily be done with
|
||||
the <span class="application">Qt Linguist</span>. Contact me
|
||||
for more directions if needed.</p>
|
||||
|
||||
<h1><a name="development">Development</a></h1>
|
||||
|
||||
<p>The Recoll source repository is on
|
||||
<a href="https://opensourceprojects.eu/p/recoll1/code/">opensourceprojects.eu</a>.
|
||||
Use git to clone it and hack away.</p>
|
||||
|
||||
<p>Apart from possible tickets in the
|
||||
<a href="https://opensourceprojects.eu/p/recoll1/tickets/">tracking
|
||||
system</a>, these are the general areas where help or
|
||||
ideas are particularly welcome:</p>
|
||||
<ul>
|
||||
<li>A better GUI design (both the ergonomics and the
|
||||
appearance). Adding missing shortcuts or fixing the menu
|
||||
accelerators for example is easy and useful.</li>
|
||||
|
||||
<li>More support for the more advanced <span class=
|
||||
"application">Xapian</span> concepts like relevance
|
||||
feedback.</li>
|
||||
|
||||
<li>More filters for less common or less obviously
|
||||
useful file types.</li>
|
||||
|
||||
<li>Integration with the <span class="application">KDE</span>
|
||||
desktop.</li>
|
||||
|
||||
<li>Integration with some mail user agent. We need a way to
|
||||
jump from a message preview to the message in thread context
|
||||
inside the MUA.</li>
|
||||
|
||||
<li>Etc. :)</li>
|
||||
</ul>
|
||||
|
||||
<h1><a name="problemreport">Problem reporting</a></h1>
|
||||
|
||||
<p>Once in a while it will happen that a Recoll program will
|
||||
crash (either the "recoll" graphical interface or the
|
||||
"recollindex" command line indexing command).</p>
|
||||
|
||||
<p>Reporting crashes is very useful. It can help others, and it
|
||||
can get your own problem to be solved.</p>
|
||||
|
||||
<p>You will find help and information about producing a useful
|
||||
problem report on this
|
||||
<a href="faqsandhowtos/ProblemSolvingData.html">
|
||||
Howto page</a>.</p>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@ -1,71 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>Recoll documentation</title>
|
||||
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux
|
||||
based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search, desktop search, unix, linux">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html">Home</a></li>
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
<li>Documentation</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1>Recoll user manual</h1>
|
||||
|
||||
<ul>
|
||||
<li><a href="usermanual/webhelp/docs/index.html">English, HTML, many
|
||||
pages, nicer format (needs javascript).</a></li>
|
||||
<li><a href="usermanual/usermanual.html">English, HTML, one page</a></li>
|
||||
<li><a href="http://stupidbeauty.com/Blog/2012/03/recoll%E7%94%A8%E6%88%B6%E6%89%8B%E5%86%8A%E7%BF%BB%E8%AD%AF%EF%BC%8Crecoll-user-manual-2/">
|
||||
中文,HTML</a></li>
|
||||
<li><a href="usermanual/recoll_user_manual.pdf">English, PDF</a></li>
|
||||
<li><a href="http://mcz.altervista.org/Pagine/usermanual-italian.html">
|
||||
Italian (rather old)</a></li>
|
||||
</ul>
|
||||
|
||||
<p><br></p>
|
||||
|
||||
<h1>Faqs and Howtos</h1>
|
||||
|
||||
<p>You will find a number of useful tips for common
|
||||
issues and extensions on the
|
||||
<a href="faqsandhowtos/index.html">
|
||||
Faqs and Howtos section</a>.</p>
|
||||
|
||||
<h1>Other documentation</h1>
|
||||
|
||||
<ul>
|
||||
<li><a href="recoll_XMP/index.html">Indexing PDF
|
||||
XMP-metadata</a>: a nice example of customizing a Recoll
|
||||
configuration and the PDF filter to use additional
|
||||
metadata, by Jeffrey Dick.</li>
|
||||
<li><a href="perfs.html">Index size and indexing performance
|
||||
data.</a></li>
|
||||
<li><a href="custom.html">Result list format samples.</a></li>
|
||||
<li><a href="idxthreads/threadingRecoll.html">Lessons learned
|
||||
while modifying Recoll indexing to be multithreaded</a>.</li>
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@ -1,462 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Recoll download</title>
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description"
|
||||
content="recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content="full text search, desktop search, unix, linux">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
|
||||
<script type="text/javascript">
|
||||
function showdiv(viewid)
|
||||
{
|
||||
var ids = ["general", "bugs", "source", "packages",
|
||||
"windows", "ports", "filters", "translations"];
|
||||
for (var i = 0; i < ids.length; i++) {
|
||||
document.getElementById(ids[i]).style.display = "none";
|
||||
}
|
||||
document.getElementById(viewid).style.display = "block";
|
||||
}
|
||||
</script>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html.en">Home</a></li>
|
||||
<li><b>Downloads</b></li>
|
||||
<li><a href="doc.html">Documentation</a></li>
|
||||
<li><a href="usermanual/usermanual.html#RCL.INSTALL">Installation</a></li>
|
||||
<li><a href="support.html">Support</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
<h1>Recoll downloads</h1>
|
||||
|
||||
<div class="intrapage">
|
||||
|
||||
<table width="100%">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><a href="#general" onmouseover="showdiv('general')">
|
||||
General</a></td>
|
||||
<td><a href="#source" onmouseover="showdiv('source')">
|
||||
Source</a></td>
|
||||
<td><a href="#packages" onmouseover="showdiv('packages')">
|
||||
Linux Packages (.rpm and .deb)</a></td>
|
||||
<td><a href="#windows" onmouseover="showdiv('windows')">
|
||||
Windows</a></td>
|
||||
<td><a href="BUGS.html" onmouseover="showdiv('bugs')">
|
||||
Known bugs</a></td>
|
||||
<td><a href="#ports" onmouseover="showdiv('ports')">
|
||||
Mac ports</a></td>
|
||||
<td><a href="filters/filters.html" onmouseover="showdiv('filters')">
|
||||
Updated Filters</a></td>
|
||||
<td><a href="#translations" onmouseover="showdiv('translations')">
|
||||
Translations</a></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div id="general">
|
||||
<h2><a name="general">General information</a></h2>
|
||||
|
||||
<p>The current version is 1.23.2. <a href="release-1.23.html">Release
|
||||
notes</a>.</p>
|
||||
|
||||
<p>Recoll <a href="usermanual/usermanual.html#RCL.INSTALL">Installation
|
||||
/ building manual</a>.</p>
|
||||
|
||||
<p>The indexing filters used for some document types may need external
|
||||
packages not installed on your system by default, and not installed
|
||||
automatically with Recoll: <a href="features.html#doctypes">take a
|
||||
look at the list</a> and decide what you need to install.</p>
|
||||
|
||||
<p>The Recoll term explorer tool in phonetic mode (marginally useful and
|
||||
optional) uses the <b>aspell</b> package, version 0.60
|
||||
(utf-8 support) or newer.</p>
|
||||
|
||||
<p>If you find problems with this page, the package or its
|
||||
installation, <em>please</em> <a href="mailto:jfd@recoll.org">report
|
||||
them</a>.</p>
|
||||
|
||||
<h4>What do the release numbers mean?</h4>
|
||||
|
||||
<p>The Recoll releases are numbered X.Y.Z. The X would only
|
||||
change for really major modifications like a big change in
|
||||
the index format, and possibly won't ever reach 2.</p>
|
||||
|
||||
<p>Y is for functional modifications. These may bring bugs, so
|
||||
if you don't need the new features, you may want to wait a
|
||||
little, and especially skip the first release (X.Y.0), at
|
||||
least for a few weeks.</p>
|
||||
|
||||
<p>Z changes for <a href="BUGS.html">bug fixes</a> only, and
|
||||
moving from X.Y.Z to X.Y.Z+u should in general involve
|
||||
little risk of regression. But, <em>any</em> change can
|
||||
bring problems, so if you are not affected by the corrected
|
||||
bugs (check the <a href="release-1.21.html">release
|
||||
file</a>), there is probably no necessity to upgrade
|
||||
anyway.</p>
|
||||
</div>
|
||||
|
||||
<div id="bugs">
|
||||
<h2><a name="bugs">Known bugs</a></h2>
|
||||
<p>There is a <a href="BUGS.html">history of known bugs</a>, sorted
|
||||
by fix release. Also see
|
||||
the <a href="https://opensourceprojects.eu/p/recoll1/tickets/new/">
|
||||
Recoll issue tracker</a>.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div id="source">
|
||||
<h2><a name="source">Source</a></h2>
|
||||
|
||||
<h3>Current release distribution: 1.23.2:</h3>
|
||||
<!-- Attention: source packages must remain here, not in a
|
||||
subdirectory, because of all the places they're referred from
|
||||
(package watches) -->
|
||||
|
||||
<p><a href="recoll-1.23.2.tar.gz">recoll-1.23.2.tar.gz</a>.</p>
|
||||
<p><a href="release-1.23.html">Release notes</a>.</p>
|
||||
|
||||
<h3>Previous release: 1.22.4:</h3>
|
||||
<p><a href="recoll-1.22.4.tar.gz">recoll-1.22.4.tar.gz</a>.</p>
|
||||
<p><a href="release-1.22.html">Release notes</a>.</p>
|
||||
|
||||
|
||||
<!--
|
||||
<h3>Snapshot</h3>
|
||||
<p>I sometimes release a source tarfile when I consider that the
|
||||
current development version is stable enough. The current
|
||||
snapshot contains commits up to 2240 (see
|
||||
<a href="https://opensourceprojects.eu/p/recoll1/code/commit_browser">
|
||||
the changelog</a>, and a synthetic abstract in the
|
||||
current <a href="release-1.16.html">1.16 release notes</a>).
|
||||
<p><a href="betarecoll-2240.tar.gz">betarecoll-2240.tar.gz</a>.</p>
|
||||
-->
|
||||
|
||||
<h3>Ubuntu Unity Lens and Scope</h3>
|
||||
|
||||
<p>You will probably get these from the <a href="#ubuntu">PPA</a>, but
|
||||
here are the source files. These are not included in the main tar file
|
||||
any more. For any Recoll version after 1.19 (choose based on the
|
||||
Ubuntu version, not the Recoll one):
|
||||
|
||||
<blockquote>
|
||||
<a href="recoll-lens-1.19.10.3543.tar.gz">
|
||||
recoll-lens-1.19.10.3543.tar.gz</a> (Ubuntu up to 13.04
|
||||
Raring)<br>
|
||||
|
||||
<a href="unity-scope-recoll-1.20.2.4.tar.gz">
|
||||
unity-scope-recoll-1.20.2.4.tar.gz</a> (Ubuntu 13.10 and
|
||||
later).<br>
|
||||
|
||||
</blockquote>
|
||||
|
||||
For Recoll 1.18:
|
||||
<a href="recoll-lens-1.18.1.2997.tar.gz">
|
||||
recoll-lens-1.18.1.2997.tar.gz</a><br>
|
||||
For Recoll 1.17:
|
||||
<a href="recoll-lens-1.17.2.2697.tar.gz">
|
||||
recoll-lens-1.17.2.2697.tar.gz</a>
|
||||
|
||||
</p>
|
||||
|
||||
<h3>Prerequisites for building from source:</h3>
|
||||
<ul>
|
||||
<li>C++ compiler. Be aware that its absence sometimes
|
||||
manifests itself by quite cryptic messages.</li>
|
||||
|
||||
<li><p>Xapian core development libraries. Most Linux
|
||||
distributions carry them in their package repository. Or
|
||||
you will find source and binary packages on
|
||||
the <a href="http://www.xapian.org/download.php">Xapian
|
||||
download page</a>.
|
||||
<br>
|
||||
<p><em>Note on building Xapian for older CPUs:</em> The build
|
||||
configurations for Xapian releases 1.0.21 and 1.2.1 or
|
||||
newer enable the use of SSE2 floating point
|
||||
instructions. These instructions are not available in
|
||||
CPUs older than Intel Pentium 4 or AMD Athlon 64. When
|
||||
building for such a CPU, you need to add the
|
||||
--disable-sse flag to the Xapian library configure
|
||||
command. If this is not done, the problem signals itself
|
||||
by "Illegal instruction" crashes (SIGILL) in recollindex
|
||||
and recoll. </p>
|
||||
</li>
|
||||
<li>Qt development files: Qt 4.4, 5.3 or newer (5.2 not ok).</li>
|
||||
<li>Qt WebKit development files: these are quite often
|
||||
distributed apart from the main Qt libraries. It is
|
||||
possible to configure Recoll not to use Qt WebKit (see
|
||||
configure --help).</li>
|
||||
<li>zlib development files.</li>
|
||||
<li>X11 development files.</li>
|
||||
<li>Python development package: you can avoid needing this
|
||||
by configuring with --disable-python-module.</li>
|
||||
</ul>
|
||||
|
||||
<h3>Source repository:</h3>
|
||||
|
||||
<p>The <span class="application">Recoll</span> source
|
||||
repository is hosted
|
||||
on <a href="https://opensourceprojects.eu/p/recoll1/code/">
|
||||
opensourceprojects.eu</a>. The trunk is usually a bit on the
|
||||
bleeding edge, but there is always a maintenance branch for
|
||||
the current production version.</p>
|
||||
|
||||
<h3>Instructions for building</h3>
|
||||
|
||||
<p>Normally, it's just:</p>
|
||||
<div class="code">./configure; make; make install</div>
|
||||
<p>If a bit more detail is needed,
|
||||
<a href="http://www.recoll.org/usermanual/usermanual.html#RCL.INSTALL.BUILDING">
|
||||
there is some in the manual</a>.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div id="packages">
|
||||
<h2><a name="packages">Packages</a></h2>
|
||||
|
||||
<p>Packages or ports for Recoll are available in the standard
|
||||
repositories for many distributions.</p>
|
||||
|
||||
<p>However they are often a bit older or built with older
|
||||
Xapian releases. Here follow some pointers to find newer
|
||||
packages for some distributions. In most cases, you will
|
||||
just need to use an alternate repository.</p>
|
||||
|
||||
<h3><a name="debian">Debian</a></h3>
|
||||
|
||||
<p>The Debian Recoll packages are not always up to date in
|
||||
stable distributions. Debian Wheezy and Jessie have Recoll
|
||||
1.17.3, which is ancient (it was an accident for
|
||||
Jessie). Stretch has 1.22.4 which is largely ok.</p>
|
||||
|
||||
<p>I am maintaining a repository for newer versions of the packages.
|
||||
The repository currently has recoll 1.23.x for Jessie and
|
||||
Stretch, Intel 32 and 64 bits, and armhf, and slightly older
|
||||
1.22 packages for Wheezy. There is a separate
|
||||
repository for Raspbian Jessie, which is <em>not</em>
|
||||
compatible with vanilla Debian.</p>
|
||||
|
||||
<p>To add the Debian or Raspbian repository to your sources:</p>
|
||||
|
||||
<ul>
|
||||
|
||||
<li>See <a href="../pages/signatures.html">here</a> for the
|
||||
keys used to sign the repository. You will need to import
|
||||
them to suppress <b>apt-get</b> messages about unverified
|
||||
signatures (the method is described on the
|
||||
<a href="../pages/signatures.html">same page</a>).</li>
|
||||
|
||||
<li>Create and edit <span class="filename">
|
||||
/etc/apt/sources.list.d/recoll.list</span>
|
||||
and add the following lines:<br>
|
||||
for Debian wheezy (debian 7.x, recoll 1.22.3):<br>
|
||||
<div class="code">
|
||||
deb http://www.lesbonscomptes.com/recoll/debian/ wheezy main
|
||||
deb-src http://www.lesbonscomptes.com/recoll/debian/ wheezy main
|
||||
</div>
|
||||
for Debian jessie (debian 8.x):<br>
|
||||
<div class="code">
|
||||
deb http://www.lesbonscomptes.com/recoll/debian/ jessie main
|
||||
deb-src http://www.lesbonscomptes.com/recoll/debian/ jessie main
|
||||
</div>
|
||||
for Debian stretch (debian 9.x):<br>
|
||||
<div class="code">
|
||||
deb http://www.lesbonscomptes.com/recoll/debian/ stretch main
|
||||
deb-src http://www.lesbonscomptes.com/recoll/debian/ stretch main
|
||||
</div>
|
||||
for Raspbian jessie (raspbian 8.x):<br>
|
||||
<div class="code">
|
||||
deb http://www.lesbonscomptes.com/recoll/raspbian/ jessie main
|
||||
deb-src http://www.lesbonscomptes.com/recoll/raspbian/ jessie main
|
||||
</div>
|
||||
<li>Then:
|
||||
<div class="code">
|
||||
sudo apt-get update
|
||||
sudo apt-get install recoll python-recoll python3-recoll
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<p>If you prefer to manually install the packages, they are here:
|
||||
<a href="debian/pool/main/r/recoll/">
|
||||
debian/pool/main/r/recoll/</a><br/>
|
||||
</p>
|
||||
|
||||
<h3><a name="ubuntu">Ubuntu</a></h3>
|
||||
|
||||
<p>There are Personal Package Archives on launchpad.net for
|
||||
<a href="https://launchpad.net/~recoll-backports/+archive/recoll-1.15-on">
|
||||
Recoll, kio-recoll and recoll-lens</a>. These were built
|
||||
from the latest versions, for the current set of supported Ubuntu
|
||||
versions. Procedure:</p>
|
||||
<div class="code">
|
||||
sudo add-apt-repository ppa:recoll-backports/recoll-1.15-on
|
||||
sudo apt-get update
|
||||
sudo apt-get install recoll
|
||||
</div>
|
||||
|
||||
<p>The packages in the PPA now have a separate package for the Python
|
||||
extension, like the standard ones, so there should be no more
|
||||
conflict issues while switching from the PPA to the normal
|
||||
repositories and back.</p>
|
||||
|
||||
<h3><a name="mint">Linux Mint</a></h3>
|
||||
|
||||
<p>The Ubuntu PPA works perfectly for Mint 13 (and probably other releases
|
||||
too). Just follow the instructions for Ubuntu.</p>
|
||||
|
||||
<h3>RPMS</h3>
|
||||
|
||||
<p>You'll need to install the Xapian, Qt, Qt-Webkit and zlib development
|
||||
packages if you want to use the source rpms.</p>
|
||||
|
||||
<h3>Fedora</h3>
|
||||
|
||||
<p>Recoll is present in the standard Fedora package repositories starting from
|
||||
F-12. Recoll packages in Fedora are usually fairly up to
|
||||
date. Please get in touch if you have a need for a Recoll package
|
||||
for Fedora.</p>
|
||||
|
||||
<h3>CentOS 7.1</h3>
|
||||
|
||||
<p><a href="https://fedoraproject.org/wiki/EPEL">EPEL</a> now
|
||||
has a package for Recoll. It is currently in
|
||||
the <a href="https://fedoraproject.org/wiki/EPEL/testing">testing
|
||||
section</a>, but it should hopefully move on
|
||||
shortly. If you install the test package (which runs just
|
||||
fine as far as I can see), please add feedback to
|
||||
the
|
||||
<a href="https://bodhi.fedoraproject.org/updates/FEDORA-EPEL-2017-ede90eda56">
|
||||
package page</a>.</p>
|
||||
|
||||
<p>If EPEL does not work for you, there are still a few
|
||||
<a href="downloads/centos71">pre-EPEL packages
|
||||
here</a>. They will be deleted shortly, except if someone
|
||||
provides me with a good reason to keep them. There are
|
||||
only x86_64 binaries, use the source rpm for other
|
||||
archs. As base CentOS does not seem to have the Qt WebKit
|
||||
module, the Recoll build uses QTextBrowser instead of a
|
||||
WebKit QWebView, so no Javascript or advanced CSS in the
|
||||
result list or snippets window for you (the EPEL package
|
||||
uses WebKit, so this is another way it is better).</p>
|
||||
|
||||
|
||||
<h3>OpenSUSE</h3>
|
||||
|
||||
<p>Recoll is in the KDE:Extra repository. You just need to add the
|
||||
repository to your software
|
||||
sources (Yast2->software->Software repositories).<br>
|
||||
<a href="http://download.opensuse.org/repositories/KDE:/Extra/">
|
||||
Repository list (supported Suse versions)</a>.
|
||||
After adding the appropriate repository to your software sources,
|
||||
you will be able to install recoll and kio_recoll from the software
|
||||
management interface. The Xapian dependency will also be satisfied
|
||||
from the build service repository. Some of the older repositories do
|
||||
not build antiword, just tell the software manager to "break" recoll
|
||||
by installing anyway, and get antiword somewhere else.</p>
|
||||
|
||||
|
||||
</div> <!-- Packages -->
|
||||
|
||||
<div id="windows">
|
||||
<h2><a name="windows">Microsoft Windows Setup Files</a></h2>
|
||||
|
||||
<p>The port of Recoll to Windows is still a bit experimental and
|
||||
lacking things like real-time indexing or spelling
|
||||
suggestions. However it works well enough to be useful. More info
|
||||
and links to the setup
|
||||
files <a href="pages/recoll-windows.html">here</a>.</p>
|
||||
</div> <!-- windows -->
|
||||
|
||||
<div id="ports">
|
||||
<h2><a name="ports">Ports</a></h2>
|
||||
|
||||
<h3>Mac port</h3>
|
||||
|
||||
<p>It seems that Recoll will sometimes find data that Spotlight misses
|
||||
(especially inside pdfs apparently, which is probably more to the credit of
|
||||
poppler than recoll itself).</p>
|
||||
|
||||
<p>Recoll is in MacPorts and really easy to install:</p>
|
||||
<ol>
|
||||
<li><a href="https://trac.macports.org/wiki/InstallingMacPorts">Install
|
||||
MacPorts</a>.</li>
|
||||
<li>Type "sudo port install recoll"</li>
|
||||
</ol>
|
||||
|
||||
<p>Recoll is then available from the command line and as an icon in the usual
|
||||
MacPorts applications place.</p>
|
||||
</div>
|
||||
|
||||
<div id="filters">
|
||||
<h2><a name="filters">Updated filters</a></h2>
|
||||
<p><a href="filters/filters.html">new or updated filters</a>
|
||||
sometimes become available after a release. As a rule, all
|
||||
filters are compatible with all Recoll versions. Any
|
||||
compatibility problem will be explicitly mentioned.</p>
|
||||
</div>
|
||||
|
||||
<div id="translations">
|
||||
<h2><a name="translations">Translations</a></h2>
|
||||
|
||||
<p>Most of the translations for 1.22/23 are incomplete. The source
|
||||
translation files are included in the source release. If
|
||||
your language has some english messages left and you want to
|
||||
take a shot at fixing the problem, you can send the results
|
||||
to <a href="mailto:jfd@recoll.org">me</a> and earn my
|
||||
gratefulness (and your less multilingual
|
||||
compatriot's)...</p>
|
||||
|
||||
<p>You can use the <em>.ts</em> file to alter the translations
|
||||
if you wish (use Qt's <em>linguist</em> tool to edit the
|
||||
source file, then <em>lrelease</em> to produce
|
||||
the <em>.qm</em> file.). The <em>.qm</em> file should be copied
|
||||
to <span class="filename">/usr/[local/]share/recoll/translations</span>
|
||||
</p>
|
||||
|
||||
<p><a href="translations/recoll_xx.ts">recoll_xx.ts</a> is a blank
|
||||
Recoll 1.22 message file, handy to work on a new translation. You can
|
||||
also <a href="translations/">list the directory</a> to see all the
|
||||
translation files (same as those in the maintenance source branch on
|
||||
opensourceprojects.eu).</p>
|
||||
|
||||
<h3>Updated 1.22 translations that became available after the
|
||||
release:</h3>
|
||||
|
||||
<p>Greek translation by Dimitrios Glentadakis:
|
||||
<a href="translations/recoll_el.ts">recoll_el.ts</a>
|
||||
<a href="translations/recoll_el.qm">recoll_el.qm</a><br/>
|
||||
</p>
|
||||
<p>Dutch translation by Leslie Scheelings:
|
||||
<a href="translations/recoll_nl.ts">recoll_nl.ts</a>
|
||||
<a href="translations/recoll_nl.qm">recoll_nl.qm</a><br/>
|
||||
</p>
|
||||
|
||||
<p>Danish translation by Morten Langlo:
|
||||
<a href="translations/recoll_da.ts">recoll_da.ts</a>
|
||||
<a href="translations/recoll_da.qm">recoll_da.qm</a><br/>
|
||||
</p>
|
||||
|
||||
<p>Note that, if you are running an older release, you may find updated
|
||||
messages by looking inside the appropriate maintenance
|
||||
branch in
|
||||
<a href="https://opensourceprojects.eu/p/recoll1/code/">
|
||||
the source repository</a>.</p>
|
||||
|
||||
</div> <!-- translations -->
|
||||
</div> <!-- content -->
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@ -1,35 +0,0 @@
|
||||
== Extending the Recoll Firefox visited web page indexing mechanism to other browsers
|
||||
|
||||
The *Recoll* _Web Queue_ function allows using WEB browser plug-ins
|
||||
originally designed for indexing visited WEB pages with *Beagle* (rip). The
|
||||
browser plug-ins work very simply by creating copies of the visited pages
|
||||
in a designated directory. Two files are created for each page, one for the
|
||||
contents, the other for the metadata.
|
||||
|
||||
When activated, *Recoll* will visit the queue directory and index each HTML
|
||||
page and its associated metadata. There is more detail about the mechanism
|
||||
on the [[IndexWebHistory|page about the Recoll Web queue]], but mostly, you
|
||||
just need to go to the _Indexing Preferences_ in the *recoll* GUI, open the
|
||||
_Web history_ panel and check the top button.
|
||||
|
||||
Franck, a *Recoll* and *Elinks* user from New Zealand, designed a method
|
||||
and wrote a script to index the *Elinks* WEB history in this fashion.
|
||||
|
||||
The script works by using *wget* to fetch the visited page into the queue
|
||||
directory. This means that it would be reusable to index arbitrary WEB
|
||||
pages in contexts other than *Elinks* visits.
|
||||
|
||||
Recipe for *Elinks* and Recoll 1.18 and later:
|
||||
|
||||
* Retrieve the
|
||||
link:https://www.recoll.org/files/elinks_recoll.sh[elinks_recoll.sh] shell
|
||||
script and make it executable (`chmod a+x elinks_recoll.sh`).
|
||||
* In the Elinks Keyboard shortcut manager (k)/Main, add a shortcut to pass
|
||||
the current URL to an external command, e.g. _Ctrl-P_.
|
||||
* In the Options manager (o) /Document/Uri Passing, add an action named for
|
||||
example _ToIndex_
|
||||
* Modify the ToIndex action to execute `/path/to/the/script/elinks_recoll.sh %c`
|
||||
* Save, you are done
|
||||
|
||||
For Recoll 1.17, the method is analogous, but the script is named
|
||||
link:https://www.recoll.org/files/elinks_recoll.sh[elinks_beagle.sh].
|
||||
@ -1,82 +0,0 @@
|
||||
== Recoll input handlers
|
||||
|
||||
In the end, Recoll indexes plain UTF-8 text, remembering where it came
|
||||
from.
|
||||
|
||||
But of course, this is not what the source data looks like.
|
||||
The text content of the original documents is encoded in many fashions
|
||||
(ie pdf, ms-word, html, etc.), and it can also be stored in quite
|
||||
involved ways (inside archives, email attachments ...).
|
||||
|
||||
For getting to the data and converting it to plain text, Recoll uses a set
|
||||
of modules which it calls input handlers (or filters), which either operate
|
||||
on the storage structure (ie: a zip handler), or the storage format (ie a
|
||||
pdf to text translator), or both. In addition, there is a tentative notion
|
||||
of a higher level storage backend which we will ignore for now (for
|
||||
reference there are currently two of those: the file system and the web
|
||||
history cache).
|
||||
|
||||
The basic task of filters is to take a document as input and produce a
|
||||
series of subdocuments as output. The subdocument's format is defined
|
||||
either dynamically (as part of the output data), or statically, in the
|
||||
filter definition.
|
||||
|
||||
=== Simple filters
|
||||
|
||||
These are executed by the **mh_exec** recoll module. They are the vast
|
||||
majority.
|
||||
|
||||
These filters are very simple. They are designed to perform a simple task
|
||||
with minimal interface, they mostly don't know anything about each other,
|
||||
and they don't know much about their context. This makes writing a filter
|
||||
quite easy as there is not much to learn about their environment.
|
||||
|
||||
Only one output document is produced and the format is fixed.
|
||||
|
||||
In practice the filter, which is most generally a shell-script (but could
|
||||
be any executable program), takes a file name on the command line and
|
||||
outputs an html or plain text document on standard output, then exits.
|
||||
|
||||
For example, the pdf filter takes one pdf file name as input on the command
|
||||
line and produces one html document on stdout. The fact that the output is
|
||||
html is statically defined in a configuration file.
|
||||
|
||||
For filters which produce plain text, the output character set information
|
||||
is in general defined in the configuration file. Else it will be obtained
|
||||
from the locale (hoping that it makes sense).
|
||||
|
||||
Filters that output html can produce metadata information in the html
|
||||
header (ie author etc.). Filters that output plain text can only output
|
||||
main text data, no metadata fields.
|
||||
|
||||
Besides the file name, there is one other piece of input information, which
|
||||
is in the form of an environment variable, and can be safely ignored:
|
||||
+RECOLL_FILTER_FORPREVIEW+. This indicates if the filter is being used
|
||||
for previewing or for indexing data. Some filters will elect to suppress
|
||||
repetitive parts of the output text when indexing to avoid distorting the
|
||||
term statistics. For example, the man filter suppresses the section
|
||||
headers (NAME, SYNOPSIS...) when indexing.
|
||||
|
||||
=== Multiple input filters
|
||||
|
||||
These filters are more complex, but still quite easy to write, especially
|
||||
if you can use Python, because they can then use a common module which
|
||||
manages the communication with the indexer.
|
||||
|
||||
Newer Recoll versions have converted many previously 'simple' filters to
|
||||
this kind as part of the port to Windows.
|
||||
|
||||
These filters are executed by the *mh_execm* Recoll module.
|
||||
|
||||
They are persistent (one instance will persist through a whole indexing
|
||||
pass), and will index successive multiple input files (the point being to
|
||||
avoid startup performance penalty), and possibly multiple documents per
|
||||
input file if this makes sense for their input format (ie: zip archive, chm
|
||||
help file).
|
||||
|
||||
They use a simple communication protocol over a pipe with the main recoll
|
||||
or recollindex process, with file names and a few other parameters being
|
||||
sent as input, and decoded data and attributes being sent in return.
|
||||
|
||||
The shared Python module is 'filters/rclexecm.py'. You can look at 'rclzip'
|
||||
or 'rclaudio' for reasonably straightforward examples.
|
||||
@ -1,62 +0,0 @@
|
||||
== Installing a filter for a new document type
|
||||
|
||||
It will sometimes happen that a newer Recoll release has support for a
|
||||
document type which would be useful to you, but which your older release
|
||||
does not support.
|
||||
|
||||
It is in general easy to import support from the newer to the older
|
||||
release: the Recoll input handler interface is very stable, so things should just
|
||||
work.
|
||||
|
||||
Input Handler updates are generally described on the Recoll web site
|
||||
link:https://www.recoll.org/filters/filters.html[new filters pages]. They
|
||||
may include notes about which versions need the new input handler, or specifics
|
||||
about installing it.
|
||||
|
||||
An up to date copy of input handlers and configuration files is also kept
|
||||
link:https://www.recoll.org/filters/[at the same location].
|
||||
|
||||
We will take an example to make things more concrete: Tomboy and Gnote
|
||||
files are directly supported by Recoll 1.19, but not in older Recoll
|
||||
releases. The *rclxml* handler is needed to process them.
|
||||
|
||||
The following procedure will allow you to retrofit support:
|
||||
|
||||
- Retrieve the *rclxml* input handler from:
|
||||
link:https://www.lesbonscomptes.com/recoll/filters/rclxml[]
|
||||
|
||||
- Copy it to '/usr/share/recoll/filters' and make it executable:
|
||||
`chmod +x rclxml`
|
||||
The input handler needs *xsltproc*, but this is probably already on your
|
||||
system (else get it with the package manager).
|
||||
|
||||
- Edit '~/.recoll/mimemap', add the following line:
|
||||
`.note = application/x-gnote`
|
||||
- Edit '~/.recoll/mimeconf', add the following lines:
|
||||
+
|
||||
----
|
||||
[index]
|
||||
application/x-gnote = exec rclxml
|
||||
----
|
||||
- Edit '~/.recoll/mimeview', add the following lines:
|
||||
+
|
||||
----
|
||||
[view]
|
||||
application/x-gnote = tomboy %f
|
||||
----
|
||||
|
||||
- The easiest way to make sure the files are indexed with the new input
|
||||
handlers may then be to just run a full indexing pass (`recollindex -z`).
|
||||
|
||||
Notes:
|
||||
|
||||
- The MIME type which is used is not crucial, you could prefer to use,
|
||||
e.g., +application/x-tomboy+ instead, it just has to be consistent. To
|
||||
avoid future trouble, it's better to use the type used by newer Recoll
|
||||
releases though.
|
||||
- The 'mimeview' entry is necessary even if you are using the desktop
|
||||
preferences to open files. The value will not be used, but it has to be
|
||||
there.
|
||||
|
||||
|
||||
|
||||
@ -1,34 +0,0 @@
|
||||
== Filtering out Zip archive members ==
|
||||
|
||||
The *rclzip* Zip archive extraction input handler does not use the general
|
||||
configuration variables which define what file system objects should be
|
||||
skipped, but it has an equivalent internal function.
|
||||
|
||||
The name-skipping code depends on a recent member of the Recoll Python
|
||||
package. This will become standard for release 1.20, but for earlier
|
||||
releases, you need to do two things to use this function:
|
||||
|
||||
- Fetch 'python/recoll/recoll/rclconfig.py' and 'filters/rclzip' from the
|
||||
source repository.
|
||||
- Copy both to '/usr/share/recoll/filters' and make 'rclzip' executable.
|
||||
|
||||
You can then set a variable named +zipSkippedNames+ inside
|
||||
'recoll.conf'. +zipSkippedNames+ should be a space-separated list of
|
||||
patterns which will be passed to the Python fnmatch() function. The +/+
|
||||
characters are not special (matched as any character).
|
||||
|
||||
You can't use embedded spaces in patterns (no double-quote quoting for now)
|
||||
|
||||
This can be redefined for file system directories using the usual section
|
||||
indicators (Zip archives in different file-system directories can have
|
||||
different skip lists).
|
||||
|
||||
Example:
|
||||
|
||||
----
|
||||
zipSkippedNames = *.txt
|
||||
[/path/to/the/dir]
|
||||
zipSkippedNames = somedir/*/*.html
|
||||
----
|
||||
|
||||
|
||||
@ -1,60 +0,0 @@
|
||||
== Recoll GUI keyboard navigation
|
||||
|
||||
Using Recoll without the mouse is not completely straightforward, but it is
|
||||
mostly feasible. Here follows a description of the usable shortcuts.
|
||||
|
||||
=== Anywhere
|
||||
|
||||
`Ctrl+q` should exit Recoll from anywhere.
|
||||
|
||||
=== Main window and result list ===
|
||||
|
||||
When Recoll starts up, the focus is in the simple search entry. The main
|
||||
window tab order is as follows:
|
||||
|
||||
* Clear
|
||||
* Search
|
||||
* Search type combo
|
||||
* Search entry (Initial focus)
|
||||
* Result list (scrolling etc)
|
||||
* Result list 1st link
|
||||
* Result list next links...
|
||||
* Back to Clear
|
||||
|
||||
Each result list entry has 3 links: the icon link is not active, but its
|
||||
value is the URL, so that it can be dragged and dropped to another
|
||||
application. The 2 other links are _Preview_ and _Open_ and can be
|
||||
activated by typing _Enter_.
|
||||
|
||||
Typing _Ctrl+Shift+s_ anywhere in the main window should return the focus to the search entry. So will _Ctrl+l_ in future versions (for compatibility with WEB browser usage).
|
||||
|
||||
For pure keyboard usage, you can improve this by:
|
||||
|
||||
- Disabling the icon link: use _Preferences->GUI configuration->Result
|
||||
List->Edit result paragraph_ and remove the `<a href='%U'>` and `</a>`
|
||||
around the `<img...>` tag.
|
||||
- Making the active link more visible by adding the following code to the
|
||||
result page HTML header insert (same preferences tab). Feel free to
|
||||
adjust the color :=) :
|
||||
|
||||
----
|
||||
<style type="text/css">
|
||||
a:focus {background-color: red;}
|
||||
</style>
|
||||
----
|
||||
|
||||
=== Result table
|
||||
|
||||
The same _Ctrl+Shift+s_ will return the focus to the search entry when
|
||||
working with the result table.
|
||||
|
||||
_Ctrl+r_ will move the focus from the entry to the spreadsheet. When in
|
||||
there the arrow keys will navigate the lines.
|
||||
|
||||
When a line is selected:
|
||||
|
||||
* _Ctrl+o_ will _Open_ the document.
|
||||
* _Ctrl+Shift+o_ will _Open_ the document and exit Recoll.
|
||||
* _Ctrl+d_ (detail) will start a _Preview_
|
||||
|
||||
_Esc_ will deselect the current line so that mouse hovering will work again.
|
||||
@ -1,69 +0,0 @@
|
||||
== Generating a custom field and using it to sort results
|
||||
|
||||
We are going to show how to generate a custom field from a Recoll filter,
|
||||
and use it for sorting results. The example chosen comes from an actual
|
||||
user request: sorting results on pdf page counts.
|
||||
|
||||
The details here are obsolete, as the +pdf+ input handler is now a quite
|
||||
different python program, but the general idea is still relevant.
|
||||
|
||||
The page count from a pdf file can be displayed by the pdfinfo command
|
||||
(xpdf or poppler tools).
|
||||
|
||||
We first modify a copy of the rclpdf filter
|
||||
('/usr/[local/]share/recoll/filters/rclpdf'), to compute the pdf page count,
|
||||
and output the value as an html meta field. This is a not very interesting
|
||||
bit of shell/awk magic. Another approach would be to just rewrite the
|
||||
rclpdf filter in your favorite scripting language (ie: perl, python...), as
|
||||
all it does is execute pdftotext and pdfinfo and output html, nothing
|
||||
complicated. Here follows the rclpdf modification as a pseudo patch:
|
||||
|
||||
----
|
||||
# compute the page count and format it so that it's alphabetically sortable
|
||||
+set `pdfinfo "$infile" | egrep ^Pages:`
|
||||
+pages=`printf "%04d" $2`
|
||||
[skip...]
|
||||
# Pass the page count value to awk
|
||||
-awk 'BEGIN'\
|
||||
+awk -v Pages="$pages" 'BEGIN'\
|
||||
[skip...]
|
||||
# Inside the awk program startup section: compute the "meta" field line
|
||||
+ pagemeta = "<meta name=\"pdfpages\" content=\"" Pages "\">\n"
|
||||
[skip...]
|
||||
# Then print it as part of the header:
|
||||
+ $0 = part1 charsetmeta pagemeta part2
|
||||
[skip...]
|
||||
----
|
||||
|
||||
You can execute your own version of rclpdf by modifying '~/.recoll/mimeconf':
|
||||
|
||||
----
|
||||
[index]
|
||||
application/pdf = exec /path/to/my/own/rclpdf
|
||||
----
|
||||
|
||||
At this point, recollindex would receive and extract a +pdfpages+ field,
|
||||
but it would not know what to do with it. We are going to tell it to store
|
||||
the value inside the document data record so that it can be displayed in
|
||||
the results, and sorted on. For this we modify the '~/.recoll/fields' file:
|
||||
|
||||
----
|
||||
[stored]
|
||||
pdfpages=
|
||||
----
|
||||
|
||||
That's it ! After reindexing, you can now display +pdfpages+ inside the
|
||||
result list (add a +%(pdfpages)+ value to the paragraph format), and display
|
||||
+pdfpages+ inside the result table (right-click the table header), and sort
|
||||
the results on page count (click the column header).
|
||||
|
||||
Note that +pdfpages+ has not been defined as searchable (this would not make
|
||||
much sense). For this, you'd have to define a prefix and add it to the
|
||||
[prefixes] fields file section:
|
||||
|
||||
----
|
||||
[prefixes]
|
||||
pdfpages = XYPDFP
|
||||
----
|
||||
|
||||
Have a look at the comments inside the 'fields' file for more information.
|
||||
@ -1,13 +0,0 @@
|
||||
== Welcome to the Recoll Faqs and Recipes
|
||||
|
||||
link:FaqsAndHowTos.html[FAQs and Howtos] are stored here, but
|
||||
the main source for Recoll user documentation is
|
||||
link:https://www.recoll.org/doc.html[the _Recoll user manual_] on the
|
||||
link:https://www.recoll.org/[Recoll Web site] where you will also find a
|
||||
lot of other Recoll information, source code tarballs and contact
|
||||
information.
|
||||
|
||||
If you want to make your problem report as useful as possible, you may want
|
||||
to take a look at link:ProblemSolvingData.html[this page].
|
||||
|
||||
link:WikiIndex.html[Full file index]
|
||||
@ -1,79 +0,0 @@
|
||||
== Recoll hotkey: starting / hiding recoll with a keyboard shortcut
|
||||
|
||||
Type a key (ie: F12) and have recoll appear or disappear. On the first
|
||||
occurrence, recoll is started if it's not already running. Further
|
||||
occurrences toggle recoll between visible and minimized states. Never
|
||||
thought this would be useful until someone asked for it. Can't do without
|
||||
it anymore :)
|
||||
|
||||
This works well with both Gnome and KDE, but is implemented using a gnome
|
||||
library (*libwnck*) and its python interface, which you may have to install
|
||||
on your system if you are a pure KDE user. The library most probably exists
|
||||
in the package repositories for your distribution, so this should not be
|
||||
too complicated.
|
||||
|
||||
This should also work with other window managers, because it is based on a
|
||||
standard window manager interface extension (EWMH) that most modern window
|
||||
managers implement.
|
||||
|
||||
=== Installing the script (all desktops):
|
||||
|
||||
- You will need the libwnck library and its python interface. These are
|
||||
usually part of a gnome installation, otherwise check and possibly
|
||||
install them. For OpenSuse, the library should already be there but you
|
||||
need to install gnome-python-desktop.
|
||||
- Download the
|
||||
link:https://www.recoll.org/files/hotrecoll.py[http://www.recoll.org/files/hotrecoll.py
|
||||
script]. If you have a recent recoll installation (1.14.3 and
|
||||
further), it's already in the recoll filters directory
|
||||
('/usr/[local/]share/recoll/filters')
|
||||
- Copy the script to some permanent place (ie: '~/bin') and make it
|
||||
executable (you can leave it in the filters dirs if it's there). In a
|
||||
shell window: `chmod +x hotrecoll.py`.
|
||||
- You can check that the script works (or not) by executing it on the
|
||||
command line. It does not need an argument. Recoll should appear or
|
||||
disappear every time you execute the script. A few warning messages may
|
||||
be considered normal. If the script says that it does not find the wnck
|
||||
library or some other module, you'll have to install them.
|
||||
|
||||
=== Installing the keyboard shortcut (Gnome):
|
||||
|
||||
- _System->Preferences->Keyboard shortcuts_, or execute
|
||||
*gnome-keybinding-properties*
|
||||
- Click add, Name, ie: StartRecoll, Action: /path/to/hotrecoll.py
|
||||
- This will add the shortcut to the "Custom shortcuts" section. You can
|
||||
then click in the "Shortcut" column for "StartRecoll", and type any key
|
||||
combination (ie: push F12) to assign a key shortcut.
|
||||
|
||||
=== Installing the keyboard shortcut (KDE):
|
||||
|
||||
Under KDE installing a global custom keyboard shortcut like we need is most
|
||||
helpfully not under "Keyboard Shortcuts" but under "Input Actions".
|
||||
|
||||
- _Kmenu -> Configure Desktop -> Input Actions -> Edit -> New -> Global
|
||||
Shortcut -> Command/Url_
|
||||
- A new Action appears, named _New Action_. You can rename it something
|
||||
like +hotrecoll+ for clarity.
|
||||
- Click the _Trigger_ tab, click the input area and press your preferred
|
||||
key combination (ie: F12)
|
||||
- Click the _Action_ tab, and enter +hotrecoll.py+ (if it's in your PATH),
|
||||
or else the full path to the command (e.g.:
|
||||
'/usr/share/recoll/filters/hotrecoll.py').
|
||||
- Click _Apply_.
|
||||
|
||||
=== Installing the keyboard shortcut (XFCE):
|
||||
|
||||
Open the settings manager, and add the shortcut in the
|
||||
_Application Shortcuts_ panel inside the _Keyboard_ tool.
|
||||
|
||||
|
||||
=== Other environments
|
||||
|
||||
Many window managers have a way to set up a keyboard shortcut for running
|
||||
an arbitrary command. You'll need to look at the documentation for yours,
|
||||
or search the web for a solution.
|
||||
|
||||
An alternative independent of the environment would be to use the XBindKeys
|
||||
utility. See this link:http://www.linux.com/archive/feed/59494[linux.com
|
||||
article] for helpful instructions.
|
||||
|
||||
@ -1,33 +0,0 @@
|
||||
== Indexing arbitrary mail headers
|
||||
|
||||
By default the Recoll mail handler only processes a subset of email headers
|
||||
(+From+, +To+, +Cc+, +Date+, +Subject+). It is possible to index additional
|
||||
headers by specifying them inside the 'fields' configuration file, inside
|
||||
the configuration directory (typically '~/.recoll/').
|
||||
|
||||
Lengthy explanations are not really needed here, and I'll just show an
|
||||
example (duplicated from the configuration section of the manual):
|
||||
|
||||
----
|
||||
[prefixes]
|
||||
# Index mailmytag contents (with the given prefix)
|
||||
mailmytag = XMTAG
|
||||
|
||||
[stored]
|
||||
# Store mailmytag inside the document data record (so that it can be
|
||||
# displayed - as %(mailmytag) - in result lists).
|
||||
mailmytag =
|
||||
|
||||
[mail]
|
||||
# Extract the X-My-Tag mail header, and use it internally with the
|
||||
# mailmytag field name
|
||||
x-my-tag = mailmytag
|
||||
|
||||
----
|
||||
|
||||
Limitations:
|
||||
|
||||
- The mail filter will only process the first instance for a header
|
||||
occurring several times.
|
||||
- No decoding will take place (ie for non-ascii headers which would have
|
||||
some kind of encoding).
|
||||
@ -1,32 +0,0 @@
|
||||
== Indexing Mozilla calendar data
|
||||
|
||||
Mozilla calendar programs (*Sunbird*, *Lightning*) do not store their
|
||||
data in +ics+ files natively. They use an *SQLite* database (the
|
||||
'storage.sdb' file inside the profile). This means that calendar data
|
||||
cannot be indexed directly.
|
||||
|
||||
To get Recoll to index calendar data, you need to export it to an +ics+
|
||||
file. This can be done manually, from the application menus, or, by
|
||||
installing the
|
||||
link:https://addons.mozilla.org/en-US/sunbird/addon/3740[Automatic Export
|
||||
extension].
|
||||
|
||||
The extension can be configured to export the data when exiting the
|
||||
program, or at regular time intervals. You can even set up a command to be
|
||||
executed after the export. If you are not using real time indexing, this
|
||||
can usefully be *recollindex*.
|
||||
|
||||
In _Tools->Add Ons->Automatic Export preferences_, in the _Start an
|
||||
application after export_ subpanel, set _Path of application_ to
|
||||
'/usr/[local/]bin/recollindex' and _Parameters of application_ to
|
||||
something like _-i;/home/me/path/to/nameofexportedcal.ics_
|
||||
|
||||
This will ensure that the calendar is indexed every time it is exported
|
||||
(this is not necessary though, you can let the next batch indexing pass
|
||||
take care of it).
|
||||
|
||||
It may happen that the exported data has some syntax errors which will
|
||||
prevent indexing with the *rclics* filter which was distributed up to
|
||||
Recoll 1.13.04 (included). You may get an updated filter from the
|
||||
link:https://www.recoll.org/download.html[Recoll download page].
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
== Laptops: starting or stopping indexing according to AC power status
|
||||
|
||||
For people using real time indexing on a laptop, kind user "The Doctor"
|
||||
contributed a script to automatically start and stop indexing according to
|
||||
power status. The script can be found here:
|
||||
link:https://opensourceprojects.eu/p/recoll1/code/ci/144da4a5caa2b39d23d9d7cf262f03b6d80a4739/tree/src/desktop/recoll_index_on_ac.sh[recoll_index_on_ac.sh]
|
||||
|
||||
To use it, you need to copy it somewhere (e.g.: '/usr/bin', but any place
|
||||
will do), make it executable (`chmod a+x recoll_index_on_ac.sh`), and edit
|
||||
'~/.config/autostart/recollindex.desktop'
|
||||
|
||||
Change the following line:
|
||||
|
||||
Exec=recollindex -w 60 -m
|
||||
|
||||
to something like the following (depending where you copied the script):
|
||||
|
||||
Exec=/usr/bin/recoll_index_on_ac.sh
|
||||
|
||||
You may also want to change
|
||||
'/usr/share/recoll/examples/recollindex.desktop', otherwise your change
|
||||
will be reverted the next time you toggle real time indexing through the
|
||||
GUI. And, yes, sorry about it, _this_ change will be lost on the next
|
||||
Recoll update, so save a copy.
|
||||
@ -1,11 +0,0 @@
|
||||
== Indexing Outlook archives ==
|
||||
|
||||
Recoll has no direct support for indexing Microsoft Outlook data, because,
|
||||
if you are a Windows user, you probably are not a good customer for Linux
|
||||
desktop indexing...
|
||||
|
||||
However, if you have a need to index Outlook data at some point, I can
|
||||
recommend the excellent link:http://www.five-ten-sg.com/libpst/[libpst]
|
||||
library and its link:http://www.five-ten-sg.com/libpst/rn01re01.html[readpst]
|
||||
utility. Using this you can very easily convert the Outlook data into MH or
|
||||
mbox format, and then index the result with Recoll.
|
||||
@ -1,29 +0,0 @@
|
||||
== Indexing Web history with the Firefox extension ==
|
||||
|
||||
Note: this document is valid for Recoll versions from 1.18.
|
||||
|
||||
The link:http://sourceforge.net/projects/recollfirefox/[Recoll Firefox
|
||||
extension]
|
||||
works together with Recoll to index the Web pages that you visit. The
|
||||
extension is based on an older one which was initially written for the
|
||||
Beagle indexer.
|
||||
|
||||
The extension works by copying the data for the visited pages to a queue
|
||||
directory ('~/.recollweb/ToIndex' by default), from which they are
|
||||
indexed and removed by Recoll, and then stored in a local cache.
|
||||
|
||||
The extension is now hosted on the Mozilla add-ons site, so you can install
|
||||
it very simply in Firefox: link:https://addons.mozilla.org/fr/firefox/addon/recoll-indexer-1/[Recoll Firefox add-on page].
|
||||
|
||||
This feature can be enabled in the Recoll GUI index configuration panel
|
||||
(Web history section), or by editing the configuration file (set
|
||||
+processwebqueue+ to 1).
|
||||
|
||||
Please remember that Recoll only stores a limited amount of cached web data
|
||||
(adjustable from the GUI Index Configuration section), and that old pages
|
||||
will be purged from the index. Pages that you want to archive permanently
|
||||
need to be saved elsewhere, as they will otherwise eventually disappear
|
||||
from the Recoll results.
|
||||
|
||||
Recoll will index +.maff+ files, which may be a better choice for archival
|
||||
usage.
|
||||
@ -1,9 +0,0 @@
|
||||
.SUFFIXES: .txt .html
|
||||
|
||||
.txt.html:
|
||||
asciidoc $<
|
||||
|
||||
all: $(addsuffix .html,$(basename $(wildcard *.txt)))
|
||||
|
||||
clean:
|
||||
rm *.html
|
||||
@ -1,97 +0,0 @@
|
||||
== Creating and using multiple indexes
|
||||
|
||||
=== Why would you want to do this ?
|
||||
|
||||
- Easy adjustment of search areas: you can filter results by using the
|
||||
directory filter in the advanced search panel, but, if you have
|
||||
separate well defined places where you store different kinds of data,
|
||||
it is easier to maintain separate indexes and use the External indexes
|
||||
dialog to switch them on or off, and it will also yield much better
|
||||
search performance.
|
||||
- Shared indexes: it may be useful to maintain one or several indexes
|
||||
for shared data, and separate personal indexes for each user. Indexes
|
||||
can be shared over the network.
|
||||
- Creating separate indexes for removable volumes.
|
||||
|
||||
=== How to do it
|
||||
|
||||
As an example we'll suppose that you have Recoll installed and indexing
|
||||
your home directory, and that you would like to have a separate index for
|
||||
'/usr/share/doc'.
|
||||
|
||||
You need to create a separate configuration for the new index, then add it
|
||||
to the external indexes list in the user interface, and activate it as
|
||||
needed.
|
||||
|
||||
. Create a directory for the new index, and create an empty configuration
|
||||
file
|
||||
+
|
||||
----
|
||||
cd
|
||||
mkdir .recoll-sharedoc
|
||||
touch .recoll-sharedoc/recoll.conf
|
||||
----
|
||||
. Either edit the new configuration by hand or start recoll to use the GUI
|
||||
configuration editor.
|
||||
+
|
||||
----
|
||||
cd .recoll-sharedoc
|
||||
echo "topdirs = /usr/share/doc" > recoll.conf
|
||||
# OR
|
||||
recoll -c ~/.recoll-sharedoc
|
||||
----
|
||||
+
|
||||
If using the GUI, click _Cancel_ when asked, to start the configuration
|
||||
editor.
|
||||
|
||||
. Perform initial indexing. If you chose the GUI route, indexing will
|
||||
start as soon as you leave the configuration editor. Else, on the
|
||||
command line:
|
||||
+
|
||||
----
|
||||
recollindex -c ~/.recoll-sharedoc
|
||||
----
|
||||
. Optionally set up *cron* to perform nightly indexing, use +crontab -e+
|
||||
and insert a line like the following:
|
||||
+
|
||||
----
|
||||
45 20 * * * recollindex -c ~/.recoll-sharedoc
|
||||
----
|
||||
+
|
||||
This would start the indexing at 20:45. `crontab -e` will use the *vi*
|
||||
editor by default, you can change this by using the EDITOR
|
||||
environment variable. Example: `EDITOR=kate crontab -e`
|
||||
Your favorite desktop may also have a dedicated tool to add crontab entries.
|
||||
|
||||
. Start recoll and choose the _Preferences->External index dialog_ menu
|
||||
entry, then click the Browse button (near the bottom), and select the
|
||||
new index Xapian database directory '~/.recoll-sharedoc/xapiandb'
|
||||
Then click _Add index_.
|
||||
|
||||
. You can then activate or deactivate the new index by clicking the box
|
||||
in front of the directory name in the list.
|
||||
|
||||
When adding an index shared by multiple users, it may be helpful to use the
|
||||
RECOLL_EXTRA_DBS environment variable instead of editing individual
|
||||
configurations, see the manual for more details.
|
||||
|
||||
=== Paths adjustments
|
||||
|
||||
When sharing indexes over a network, in most cases, the indexed data will
|
||||
be accessible through different paths on the different hosts. This will
|
||||
prevent the Preview and Open functions from working because the paths they get
|
||||
from the index do not match the ones which are usable from the local
|
||||
host.
|
||||
|
||||
For example my home directory is accessed as '/home/me' on my home
|
||||
machine, and as '/net/myhost/home/me' on other hosts. By default, trying
|
||||
to access a result from a remote host would use the first path, when the
|
||||
second is the one that would work.
|
||||
|
||||
As of release 1.19 **Recoll** has a facility to perform index-dependent
|
||||
path translations. This facility is accessible from the _external index
|
||||
dialog_ in the GUI preferences. Paths translations can be set for the main
|
||||
index if no index is selected (rarely useful), or for the selected
|
||||
additional index.
|
||||
link:../usermanual/webhelp/docs/RCL.SEARCH.PTRANS.html[See
|
||||
the manual] for more detail.
|
||||
@ -1,77 +0,0 @@
|
||||
== Interfacing Recoll and Mutt
|
||||
|
||||
It is possible to either use Mutt as a Recoll search result viewer, or
|
||||
start Recoll from the Mutt search.
|
||||
|
||||
=== Starting Mutt to view Recoll search results
|
||||
|
||||
This method and the associated
|
||||
link:http://www.recoll.org/files/recoll2mutt[recoll2mutt script] were kindly
|
||||
contributed by Morten Langlo.
|
||||
|
||||
This allows finding mail messages in recoll and then calling *mutt*
|
||||
or *mutt-kz* to read or process the mail.
|
||||
|
||||
Installation:
|
||||
|
||||
- Copy the link:http://www.recoll.org/files/recoll2mutt[recoll2mutt script]
|
||||
somewhere in your PATH, and make it executable.
|
||||
- In the **recoll** GUI menus:
|
||||
_Preferences->GUI configuration->User interface->Choose editor applications_
|
||||
change the entry for "message/rfc822" to: +recoll2mutt %f+
|
||||
|
||||
The script has options for setting a number of parameters, you may not need
|
||||
to set any of them, the defaults are:
|
||||
|
||||
- -c mutt
|
||||
- -F .muttrc
|
||||
- -m Mail
|
||||
- -x "-fn 10*20 -geometry 115x40"
|
||||
|
||||
Example:
|
||||
|
||||
----
|
||||
recoll2mutt -c mutt-kz -F .mutt_kzrc -m Mail -x "-fn 10*20 -geometry 115x40" %f
|
||||
----
|
||||
|
||||
The option +-x+ is passed to *xterm*, which is used to call *mutt* or
|
||||
*mutt-kz*.
|
||||
|
||||
The script works for both _mbox_ and _maildir_ mail boxes, and it
|
||||
expects the configuration file for mutt and the mail directory to reside in
|
||||
your $HOME and the spool file to be '/var/spool/mail/$USER' if it is
|
||||
not in your mail directory. But it is easy to change the values in the
|
||||
script if you need to.
|
||||
|
||||
*mutt* is opened with the right mailbox and limit set to _Date_ and
|
||||
_Sender_. In theory you could set limit to _Message-Id_, but very often
|
||||
*mutt* reports, that there are invalid patterns in _Message-Id_, so do it
|
||||
safe, even though all emails in the opened mail box with the same date from
|
||||
the sender are shown.
|
||||
|
||||
|
||||
=== Starting Recoll from the Mutt search
|
||||
|
||||
This will work only when using maildir storage (messages in individual
|
||||
files). It will not work with mailbox files. The latter would probably be
|
||||
possible by extracting the individual result messages using the Python
|
||||
interface, but I did not try.
|
||||
|
||||
The classic way to interface Mutt and a search application is to create a
|
||||
shortcut to an external command which creates a temporary Maildir
|
||||
containing the search results.
|
||||
|
||||
There is such a script for Recoll, you will find it link:https://bitbucket.org/medoc/recoll/raw/41d41799dbac4c69a34db985b3ab9f1597c9c742/src/python/samples/mutt-recoll.py[here].
|
||||
|
||||
Copy the script somewhere in your PATH, and make it executable, then add
|
||||
the following line to your '.muttrc':
|
||||
|
||||
|
||||
----
|
||||
|
||||
macro index S "<enter-command>unset wait_key<enter><shell-escape>mutt-recoll.py -G<enter><change-folder-readonly>~/.cache/mutt_results<enter>" \
|
||||
"search mail (using recoll)"
|
||||
|
||||
----
|
||||
|
||||
Obviously, you can replace the 'S' letter with whatever will suit you (e.g:/)
|
||||
@ -1,85 +0,0 @@
|
||||
== Unix and non-ASCII file names, a summary of issues
|
||||
|
||||
Unix/Linux file and directory names are binary byte C strings. Only the
|
||||
null byte and the slash character (/) are forbidden inside a name,
|
||||
nowhere does the kernel interpret the strings as meaningful or
|
||||
printable.
|
||||
|
||||
In the old times, all utilities that would display to the user were
|
||||
ASCII-based, and people would use pure printable ASCII file names (even
|
||||
using space characters inside names was a cause for trouble). Non
|
||||
alphanumeric characters were exclusively used for playing tricks on
|
||||
colleagues. And all was well.
|
||||
|
||||
Then the devil came under the guise of accented 8 bit characters. The
|
||||
system has no problem with them, file names are still binary C strings, but
|
||||
the utilities have to display them or take them as input, and, because
|
||||
there is no encoding specification stored with the file names, they can
|
||||
only do this according to the character encoding taken from the user's
|
||||
current locale.
|
||||
|
||||
For example fr_FR.UTF-8, and fr_FR.ISO8859-1 could be used simultaneously
|
||||
on the same system (by different users), but they are completely
|
||||
incompatible: ISO-8859-1 strings are illegal when viewed in an UTF-8 locale
|
||||
(will display as interrogation points or some other conventional error
|
||||
marker). UTF-8 strings will display as gibberish in an ISO-8859-1 locale.
|
||||
|
||||
This means that the file names created by an UTF-8 user are displayed as
|
||||
garbage to the ISO-8859 one...
|
||||
|
||||
If you ever change your locale, your old files are still there and named
|
||||
the same (in the binary sense), but the names display badly and you have
|
||||
great trouble inputting them. If you add distributed (NFS) file system
|
||||
issues, things become totally unmanageable. Also think about archives sent
|
||||
from another system with a different encoding.
|
||||
|
||||
For what concerns Recoll:
|
||||
|
||||
- The file names inside recoll.conf are not transcoded, they are taken as
|
||||
binary strings (mostly, only +\n+ and +space+ are a bit special), and
|
||||
passed as is to the system. So if you edit 'recoll.conf' with a text
|
||||
editor, inside the same locale that is or has been used for file names,
|
||||
you'll be fine.
|
||||
- There was a bug in the GUI configuration tool, up to 1.12, it should
|
||||
transcode between the internal Qt format and locale-dependent strings,
|
||||
but it doesn't or does it badly.
|
||||
- There is also an exception for the +unac_except_trans+ variable, this
|
||||
*has* to be UTF-8, so if the rest of the file uses another encoding,
|
||||
you'll need to edit two separate files and concatenate them.
|
||||
|
||||
As of version 1.13, Recoll uses local8Bit()/fromLocal8Bit() to convert
|
||||
recoll.conf file names from/to QStrings (it uses UTF-8 for all string
|
||||
values which are not file names).
|
||||
|
||||
The Qt file dialog is broken (at least was, I have not checked this on
|
||||
recent versions). It should consider file paths as almost-binary data, not
|
||||
QStrings, but doesn't. In consequence, things are even more broken than
|
||||
necessary as seen from there:
|
||||
|
||||
With LANG="C", no non-ASCII paths can be used at all:
|
||||
|
||||
- Strings read from recoll.conf are stripped of 8bit characters before display.
|
||||
- Directory entries with 8bit characters are not displayed at all in the
|
||||
selection dialog.
|
||||
|
||||
With LANG="fr_FR.UTF-8", only UTF-8 paths can be used:
|
||||
|
||||
- Strings read from recoll.conf are damaged when converted to QString
|
||||
(except those that were actually UTF-8)
|
||||
- Only the UTF-8 directory entries are displayed in the selection dialog.
|
||||
|
||||
|
||||
With LANG="fr_FR.iso8859-1", everything works ok.
|
||||
|
||||
- Strings read from recoll.conf are displayed with weird characters if
|
||||
they use another encoding such as UTF-8, but are correctly maintained
|
||||
and can be read back from the dialogs and rewritten without damage.
|
||||
- Directory entries with 8 bit characters are displayed weirdly (normal),
|
||||
but can be manipulated without trouble (this includes utf-8 names of
|
||||
course).
|
||||
|
||||
In conclusion, only the iso-8859 locales can be used for handling mixed
|
||||
encoding situations. This is a possible workaround for people who need it.
|
||||
|
||||
More data about path encoding issues:
|
||||
http://www.dwheeler.com/essays/fixing-unix-linux-filenames.html
|
||||
@ -1,71 +0,0 @@
|
||||
== Starting native applications
|
||||
|
||||
It is sometimes difficult to start a native application on a result
|
||||
document, especially when the result comes from a container file (ie: email
|
||||
folder file, chm file).
|
||||
|
||||
The problem is that native applications usually expect at most a file name
|
||||
on the command line, and sometimes not even that (emailers).
|
||||
|
||||
The _Open parent documents_ link in the result list right click menu is
|
||||
sometimes useful in this situation (e.g.: +chm+ files).
|
||||
|
||||
In some other cases it may help that Recoll does make a lot of data
|
||||
available to the application. This data may have to be pre-processed in a
|
||||
script before calling the actual application.
|
||||
|
||||
Details about configuring how the native application or script are called
|
||||
are given with the
|
||||
link:http://www.recoll.org/usermanual/usermanual.html#RCL.INSTALL.CONFIG.MIMEVIEW[description of the mimeview configuration file]
|
||||
|
||||
Information about
|
||||
link:http://www.recoll.org/usermanual/usermanual.html#RCL.INSTALL.CONFIG.FIELDS[configuring
|
||||
customised fields] may also be useful in combination.
|
||||
|
||||
=== Example
|
||||
|
||||
This is a simple example, because it does not need to use special
|
||||
fields. It just shows how to solve a simple issue by using an intermediary
|
||||
script. The problem is due to the fact that thunderbird's +-file+ option
|
||||
won't open a file if the extension is not '.eml'. Jorge, the kind Recoll
|
||||
user who supplied the example stores his email in Maildir++ format, the
|
||||
file names have no extension, so an intermediary script is necessary to get
|
||||
thunderbird to open them:
|
||||
|
||||
Note that this only works with messages stored in Maildir or MH format (one
|
||||
message per file). As far as I know, there is no way to get Thunderbird to
|
||||
open an arbitrary mbox file.
|
||||
|
||||
The 'recoll-thunderbird-open-file' script:
|
||||
|
||||
----
|
||||
#!/bin/sh
|
||||
cp $1 /tmp/$$.eml
|
||||
thunderbird -file /tmp/$$.eml
|
||||
----
|
||||
|
||||
Create the file in an editor, save it somewhere, and make it executable
|
||||
(`chmod +x recoll-thunderbird-open-file`).
|
||||
|
||||
The mail line in the '~/.recoll/mimeview' file:
|
||||
|
||||
----
|
||||
[view]
|
||||
message/rfc822 = recoll-thunderbird-open-file %f
|
||||
----
|
||||
|
||||
If the place where you saved the script is not in your PATH, you will need
|
||||
to use the full path instead of just the script name, as in
|
||||
|
||||
----
|
||||
[view]
|
||||
message/rfc822 = /home/me/somewhere/recoll-thunderbird-open-file %f
|
||||
----
|
||||
|
||||
You should then be able to open the messages in Thunderbird, which is
|
||||
useful, for example, to handle the attachments.
|
||||
|
||||
With recent Recoll versions, if using the normal option of letting the
|
||||
Desktop choose the _Open_ application to use (_Use Desktop default_),
|
||||
you should also add +message/rfc822+ to the exceptions, and the whole
|
||||
thing is probably more easily done from the Recoll GUI.
|
||||
@ -1,30 +0,0 @@
|
||||
== Preventing indexing in a directory
|
||||
|
||||
=== Why would you want to do this ?
|
||||
|
||||
By default, recollindex (or the indexing thread inside the recoll QT user
|
||||
interface) will process your home directory and most of its subdirectories,
|
||||
with the exception of some well known places (thumbnails, beagle and web
|
||||
browser caches, etc.)
|
||||
|
||||
You may want to prevent indexing in some directories where you don't expect
|
||||
interesting search results. This will avoid polluting the search result
|
||||
lists, speed up indexing times and make the index smaller.
|
||||
|
||||
=== How to do it
|
||||
|
||||
There are two ways to block indexing at certain points: either by listing
|
||||
specific paths, or by directory name pattern matches.
|
||||
|
||||
- Blocking specific paths: this is controlled by the 'skippedPaths'
|
||||
variable in the main configuration file. You can adjust the value either
|
||||
by editing the file or by using the indexing configuration dialog:
|
||||
_Preferences->Indexing configuration->Global parameters->Skipped paths_
|
||||
- Using pattern matches: these are listed in the skippedNames variable in
|
||||
the main configuration file. You can adjust the value either by editing
|
||||
the file or by using the GUI: _Preferences->Indexing configuration->Local
|
||||
parameters->Skipped names_
|
||||
|
||||
The
|
||||
link:../usermanual/webhelp/docs/RCL.INSTALL.CONFIG.RECOLLCONF.WHATDOCS.html[configuration
|
||||
section] of the manual has a bit more detail about the two variables.
|
||||
@ -1,157 +0,0 @@
|
||||
== Gathering useful data for asking help about or reporting a Recoll issue
|
||||
|
||||
Once in a while it will happen that a Recoll program will either signal an
|
||||
error, or even crash (either the *recoll* graphical interface or the
|
||||
*recollindex* command line indexing command).
|
||||
|
||||
Reporting errors and crashes is very useful. It can help others, and it can
|
||||
get your own problem solved.
|
||||
|
||||
Any problem report should include the exact Recoll and system versions.
|
||||
|
||||
If at all possible, reading the following and performing part of the
|
||||
suggested steps will be useful. This is not a condition for obtaining help
|
||||
though ! If you have any problem and have a difficulty with the following,
|
||||
just contact the mailing list or the developers (see contacts on
|
||||
link:https://www.recoll.org/support.html[the Recoll site support page]).
|
||||
|
||||
If the problem concerns indexing, and was initially found using the
|
||||
*recoll* GUI, you should try to reproduce it using the
|
||||
*recollindex* command-line indexer, which is much simpler and easier to
|
||||
debug.
|
||||
|
||||
There are then two sources of useful information to diagnose the issue: the
|
||||
debug log file and, possibly, in case of a crash, a stack trace.
|
||||
|
||||
Crash and other problem reports are of very high value to me, and I am
|
||||
willing to help you with any of the steps described below if it is not
|
||||
familiar to you. I do realize that not everybody is a programmer or a
|
||||
system administrator.
|
||||
|
||||
=== Obtaining information from the log file
|
||||
|
||||
All Recoll commands write a varying amount of information to a common log file.
|
||||
|
||||
_All commands use the same log, and the file is reset every time a command
|
||||
is started: so it is important to make a copy right after the problem
|
||||
occurs (for example, do not start *recoll* after a *recollindex*
|
||||
crash, this would reset the log). A workaround for this issue is to let the
|
||||
messages go to the default +stderr+, and redirect this._
|
||||
|
||||
By default, the messages are output to +stderr+, and you probably don't even
|
||||
see them if Recoll is started from the desktop. In this case, you need to
|
||||
set the parameters so that output goes to a file, and the appropriate
|
||||
verbosity level is set. When using the command-line, you may actually
|
||||
prefer to redirect stderr to avoid the log-truncating issue described
|
||||
above.
|
||||
|
||||
You can set the log parameters from the GUI _Indexing parameters_
|
||||
section or by editing the '~/.recoll/recoll.conf' file: set the
|
||||
+loglevel+ and +logfilename+ parameters. E.g.:
|
||||
|
||||
----
|
||||
loglevel = 6
|
||||
logfilename = /tmp/recolltrace
|
||||
----
|
||||
|
||||
The log file can become very big if you need a big indexing run to
|
||||
reproduce the problem. Choose a file system with enough space available
|
||||
(possibly a few gigabytes).
|
||||
|
||||
Then run the sequence that leads to the problem, and make a copy of the log
|
||||
file just after. If the log is too big, it will usually be sufficient to
|
||||
use the last 500 lines or so (tail -500).
|
||||
|
||||
==== Single file indexing issues
|
||||
|
||||
When the problem concerns, or can be reproduced with, a single file it is
|
||||
very cumbersome to have to run a full indexing pass to reproduce it. There
|
||||
are two ways around this:
|
||||
|
||||
- Set up an ad hoc configuration with only the file of interest, or its
|
||||
parent directory:
|
||||
----
|
||||
cd
|
||||
mkdir recoll-test
|
||||
cd recoll-test
|
||||
echo /path/to/my/file/or/its/parent/dir > recoll.conf
|
||||
echo 'loglevel = 6' >> recoll.conf
|
||||
echo 'logfilename = /tmp/recolltrace' >> recoll.conf
|
||||
recollindex -z -c .
|
||||
----
|
||||
- Use the -e and -i options to recollindex to erase/reindex a single
|
||||
file. Set up the log, then:
|
||||
----
|
||||
recollindex -e /path/to/my/file
|
||||
recollindex -i /path/to/my/file
|
||||
----
|
||||
|
||||
When using the second approach, you must take care that the path used is
|
||||
consistent with the paths listed/used in the configuration (ie: if '/home' is
|
||||
a link to '/usr/home', and '/usr/home/me' is used in the configuration
|
||||
+topdirs+, `recollindex -i /home/me/myfile` will not work, you need
|
||||
to use `recollindex -i /usr/home/me/myfile`.
|
||||
|
||||
|
||||
=== Obtaining a stack trace
|
||||
|
||||
If the program actually crashes, and in order to maximize usefulness, a
|
||||
crash report should also include a so-called stack trace, something that
|
||||
indicates what the program was doing when it crashed. Getting a useful
|
||||
stack trace is not very difficult, but it may need a little work on your
|
||||
part (which will then enable me to do my part of the work).
|
||||
|
||||
If your distribution includes a separate package for Recoll debugging
|
||||
symbols, it probably also has a page on its web site explaining how to use
|
||||
them to get a stack trace. You should follow these instructions. If there
|
||||
is no debugging package, you should follow the instructions below. A little
|
||||
familiarity with the command line will be necessary.
|
||||
|
||||
==== Compiling and installing a debugging version
|
||||
|
||||
- Obtain the recoll source for the version you are using (www.recoll.org),
|
||||
and extract the source tree.
|
||||
- Follow the
|
||||
link:http://www.lesbonscomptes.com/recoll/usermanual/rcl.install.building.html[instructions
|
||||
for building Recoll from source] with the following modifications:
|
||||
- Before running configure, edit the mk/localdefs.in file and remove the
|
||||
-O2 option(s).
|
||||
- When running configure, specify the standard installation location for
|
||||
your system as a prefix (to avoid ending up with two installed versions,
|
||||
which would almost certainly end in confusion). On Linux this would
|
||||
typically be: `configure --prefix=/usr`
|
||||
- When installing, arrange for the installed executables not to be stripped
|
||||
of debugging symbols by specifying a value for the STRIP environment
|
||||
variable (ie: *echo* or *ls*): `sudo make install STRIP=ls`
|
||||
|
||||
==== Getting a core dump
|
||||
|
||||
You will need to run the operation that caused the crash inside a writable
|
||||
directory, and tell the system that you accept core dumps. The commands
|
||||
need to be run in a shell inside a terminal window. E.g.:
|
||||
|
||||
----
|
||||
cd
|
||||
ulimit -c unlimited
|
||||
recoll #(or recollindex or whatever you want to run).
|
||||
----
|
||||
|
||||
Hopefully, you will succeed in getting the command to crash, and you will
|
||||
get a core file. A possible approach then would be to make both the
|
||||
executable and the core files available to me by uploading it to a file
|
||||
sharing site (the core file may be quite big). You should be aware though
|
||||
that the core file may contain some of the data that was being indexed,
|
||||
which may be a privacy issue. Another approach is to generate the stack
|
||||
trace yourself.
|
||||
|
||||
=== Using gdb to get a stack trace
|
||||
|
||||
- Install gdb if it is not already on the system.
|
||||
- Run gdb on the command that crashed and the core file (depending on the
|
||||
system, the core file may be named "core" or something else, like
|
||||
recollindex.core, or core.pid), ie: `gdb /usr/bin/recollindex core`
|
||||
- Inside gdb, you need to use different commands to get a stack trace for
|
||||
recoll and recollindex. For recollindex you can use the bt command. For
|
||||
recoll use `thread apply all bt full`
|
||||
- Copy/paste the output to your report email :), and quit gdb ("q").
|
||||
|
||||
@ -1,61 +0,0 @@
|
||||
== Starting native applications ==
|
||||
|
||||
Another example of using an intermediary script for an application with a
|
||||
command line syntax which can't be directly defined in mimeview.
|
||||
|
||||
We use a script to preprocess and adapt the options before calling the
|
||||
actual command.
|
||||
|
||||
Details about configuring how the native application or script are called
|
||||
are given with the
|
||||
link:http://www.recoll.org/usermanual/usermanual.html#RCL.INSTALL.CONFIG.MIMEVIEW[description
|
||||
of the mimeview configuration file].
|
||||
|
||||
*qpdfview* (link:http://launchpad.net/qpdfview[web site]) is a very
|
||||
lightweight tabbed PDF viewer with great search performance and result
|
||||
highlighting.
|
||||
|
||||
It does support parsing the search term and page number from the command
|
||||
line with the following syntax:
|
||||
|
||||
----
|
||||
qpdfview --unique "%f"#%p --search "%s"
|
||||
----
|
||||
|
||||
However, qpdfview will not launch if either %p or %s are empty in the
|
||||
command above. To accommodate for that, Recoll user Florian has written a
|
||||
small wrapper shell script:
|
||||
|
||||
----
|
||||
#!/bin/bash
|
||||
|
||||
qpdfviewpath=qpdfview
|
||||
|
||||
if [ -z $2 ]
|
||||
then
|
||||
page=""
|
||||
|
||||
else
|
||||
page="#"$2""
|
||||
fi
|
||||
|
||||
if [ -z $3 ]
|
||||
then
|
||||
search=""
|
||||
|
||||
else
|
||||
search="--search "$3""
|
||||
fi
|
||||
|
||||
$qpdfviewpath --unique "$1"$page $search >&0 2>&0 &
|
||||
----
|
||||
|
||||
|
||||
The corresponding handler line for Recoll would be (depending on how you
|
||||
name the script and where you store it):
|
||||
|
||||
----
|
||||
qpdfviewwrapper %f %p %s
|
||||
----
|
||||
|
||||
|
||||
@ -1,18 +0,0 @@
|
||||
== Querying Recoll from a C program
|
||||
|
||||
The easiest way to query Recoll from a C or C++ program is to execute an
|
||||
external search command (`recollq` or `recoll -t`).
|
||||
|
||||
I have written a simple C module which deals with the related housekeeping
|
||||
and presents an easy to use API to the rest of the code. You will find it
|
||||
here:
|
||||
|
||||
https://bitbucket.org/medoc/recoll-capi
|
||||
|
||||
It is a bit experimental and will only work with recoll 1.20 for now
|
||||
(because it uses a new option for recollq). However it would be trivial to
|
||||
modify for working with 1.19, get in touch with me if you need this.
|
||||
|
||||
The other approach is to link with the Recoll library. This has no official
|
||||
API, but in practice, the internal one is fairly stable, and if you want to
|
||||
choose this approach, you should start from the code in recollq.cpp
|
||||
@ -1,58 +0,0 @@
|
||||
== Replacing the Category filter controls
|
||||
|
||||
The document category filter controls normally appear at the top of the
|
||||
*recoll* GUI, either as checkboxes just above the result list, or as a
|
||||
dropbox in the tool area.
|
||||
|
||||
By default, they are labeled _Media_, _Message_, _Spreadsheet_, _Text_,
|
||||
etc. and each map to a document category.
|
||||
|
||||
The mapping used to be fixed. You could change the number and composition
|
||||
of categories by redefining them inside the 'mimeconf' configuration
|
||||
file (you still can), but the filters always used document categories.
|
||||
|
||||
Categories can also be selected from the query language by using an
|
||||
+rclcat:+ selector. E.g.: _rclcat:message_.
|
||||
|
||||
As of Recoll release 1.17, the filters are not hard-wired any more. They
|
||||
map to query language fragments. This means that you can freely redefine
|
||||
what they do.
|
||||
|
||||
The associations are configured inside the 'mimeconf' file, in the
|
||||
+[guifilters]+ section. Most GUI parameters are stored in the *Qt*
|
||||
configuration file, so this is not entirely consistent, and you will have
|
||||
to bear with my laziness here.
|
||||
|
||||
A simple example will hopefully make things clearer. If you add the
|
||||
following to your '~/.recoll/mimeconf' file:
|
||||
|
||||
----
|
||||
[guifilters]
|
||||
|
||||
Big Books = dir:"~/My Books" size>10K
|
||||
My Docs = dir:"~/My Documents"
|
||||
Small Books = dir:"~/My Books" size<10K
|
||||
System Docs = dir:/usr/share/doc
|
||||
|
||||
----
|
||||
|
||||
You will have four filter checkboxes, labelled _Big Books_, _My Docs_, etc.
|
||||
|
||||
The text after the equal sign must be a valid query language fragment, and
|
||||
will be translated to a *Recoll* query and combined with the rest of the
|
||||
query with an AND conjunction.
|
||||
|
||||
Any name text before a colon character will be erased in the display, but
|
||||
used for sorting. You can use this to display the checkboxes in any order
|
||||
you like. For example, the following would do exactly the same as above,
|
||||
but ordering the checkboxes in the reverse order.
|
||||
|
||||
----
|
||||
[guifilters]
|
||||
|
||||
d:Big Books = dir:"~/My Books" size>10K
|
||||
c:My Docs = dir:"~/My Documents"
|
||||
b:Small Books = dir:"~/My Books" size<10K
|
||||
a:System Docs = dir:/usr/share/doc
|
||||
|
||||
----
|
||||
@ -1,23 +0,0 @@
|
||||
== Result list thumbnails and how to create them
|
||||
|
||||
Recoll will display thumbnails for the results if the images exist in the
|
||||
standard location ('$HOME/.thumbnails' or '$HOME/.cache/thumbnails' depending
|
||||
on the xdg version).
|
||||
|
||||
But it will not create thumbnails, mainly because it is very hard to do
|
||||
portably.
|
||||
|
||||
Thumbnails are most commonly created when you visit a directory with your
|
||||
file manager, but visiting the whole file tree just to create thumbnails is
|
||||
a bit tedious.
|
||||
|
||||
One simple trick to create thumbnails from the recoll GUI is to visit the
|
||||
parent directory for a result by using the _Open parent document/folder_
|
||||
entry in the right-click menu.
|
||||
|
||||
You can also find tools for the systematic creation of thumbnails for a
|
||||
directory tree. Three such tools are discussed on this
|
||||
link:http://askubuntu.com/questions/199110/how-can-i-instruct-nautilus-to-pre-generate-pdf-thumbnails[askubuntu.com discussion]
|
||||
|
||||
Also please note that no thumbnails can currently be generated or displayed
|
||||
for embedded documents (attachments, archive members, etc.).
|
||||
@ -1,61 +0,0 @@
|
||||
== User configuration backup
|
||||
|
||||
=== Why you would want to do this
|
||||
|
||||
If you are going to reinstall your system, and have some custom
|
||||
configuration, you may save some time by making a backup of your
|
||||
configuration and restoring it on the new system, rather than going through
|
||||
the menus to recreate it.
|
||||
|
||||
=== How to do it
|
||||
|
||||
==== Index/search configuration
|
||||
|
||||
The main recoll configuration data is normally kept inside '~/.recoll' or
|
||||
whatever *$RECOLL_CONFDIR* is set to.
|
||||
|
||||
This directory contains both configuration files and generated index
|
||||
data. In a standard configuration, the following files and directories
|
||||
contain generated data:
|
||||
|
||||
- 'xapiandb' contains the Xapian index, which normally consumes most of the
|
||||
total space.
|
||||
- 'aspdict.en.rws' contains the aspell dictionary used for spelling
|
||||
corrections.
|
||||
- 'mboxcache' contains cached offset data for email messages inside mbox
|
||||
folders.
|
||||
- 'webcache' contains saved web pages. This is more than a cache as
|
||||
destroying it will purge the corresponding data during the next
|
||||
indexing.
|
||||
|
||||
The other files are either very small or contain configuration data.
|
||||
|
||||
If you want to only save configuration, using minimum space, you can
|
||||
destroy the above files and directories (with the possible exception of
|
||||
'webcache'). Then taking a copy of the '.recoll' directory and adding the
|
||||
GUI configuration data described in the next section will get you a full
|
||||
configuration data backup.
|
||||
|
||||
==== GUI configuration
|
||||
|
||||
The parameters set from the _Query configuration_ Qt menus are stored in
|
||||
Qt standard places:
|
||||
|
||||
- '~/.qt/recollrc' for Qt 3.x
|
||||
- '~/.config/Recoll.org/recoll.conf' for Qt 4 and later
|
||||
|
||||
|
||||
==== Other data
|
||||
|
||||
If you wish to save index data in addition to the customisation files,
|
||||
which only makes sense if the document access paths do not change after
|
||||
reinstallation, you can just take a backup of the full '.recoll'
|
||||
directory, taking care that the storage locations for some data elements
|
||||
can be changed (and so may not be inside '.recoll'):
|
||||
|
||||
- The index data is normally kept inside '~/.recoll/xapiandb', but the
|
||||
location of this directory can be modified by the +dbdir+
|
||||
configuration parameter if it is set (check 'recoll.conf').
|
||||
- If you use the Firefox Recoll plugin, the WEB history cache is normally
|
||||
kept inside '~/.recoll/webcache', but the location can be modified by
|
||||
the +webcachedir+ configuration parameter.
|
||||
@ -1,109 +0,0 @@
|
||||
== Building and Installing the Ubuntu Unity Recoll Lens
|
||||
|
||||
Important preliminary notes:
|
||||
|
||||
- This only makes sense for Ubuntu versions using the Unity environment:
|
||||
Natty (11.04), Oneiric (11.10), Precise (12.04), and later.
|
||||
- _Remember that you still need to use the recoll GUI (or the recollindex
|
||||
command) to get the indexing going!_
|
||||
- The Lens is artificially limited to showing at most 20 results. Use the
|
||||
recoll GUI for more complete capabilities (or edit rclsearch.py, change
|
||||
the "if actual_results >= 20:" line).
|
||||
|
||||
|
||||
=== The Lens with Recoll 1.17 and later
|
||||
|
||||
If you are willing to install or upgrade to Recoll version 1.17, all
|
||||
necessary packages are on the Recoll PPA, you just need to add the
|
||||
repository to your system sources and add or upgrade the packages: *_This
|
||||
is the recommended approach!_*
|
||||
|
||||
----
|
||||
sudo add-apt-repository ppa:recoll-backports/recoll-1.15-on
|
||||
sudo apt-get update
|
||||
sudo apt-get install recoll-lens recoll
|
||||
----
|
||||
|
||||
This document may still be useful if you want to modify the lens source
|
||||
code.
|
||||
|
||||
=== The Lens with older Recoll versions
|
||||
|
||||
If, for some reason, you wish to test the Lens with an older Recoll
|
||||
version, read the following.
|
||||
|
||||
Please note that such an installation is somewhat crippled: you will not be
|
||||
able to display results for embedded documents (emails inside an mbox,
|
||||
attachments etc.). This requires a recoll command line option which is only
|
||||
available in 1.17.
|
||||
|
||||
The Lens is based on the Recoll Python module which is not built by default
|
||||
for versions prior to 1.17, so you will first need to pull the Recoll
|
||||
source code (for your version), then untar and proceed with the
|
||||
configure/build instructions below.
|
||||
|
||||
The following uses --prefix=/usr. I have no real reason to believe
|
||||
that this would not work with /usr/local (lenses are also searched there by
|
||||
default). If you confirm that things work with another prefix, please drop
|
||||
me a line.
|
||||
|
||||
When doing this over a previous Recoll compilation, run a "make clean" to
|
||||
get rid of the non-PIC objects.
|
||||
|
||||
Note that the following instructions change nothing to your existing Recoll
|
||||
installation, they only install the Python module and the Unity Lens,
|
||||
recoll, recollindex etc. are unaffected.
|
||||
|
||||
'/TOP/OF/RECOLL/SRC' designates the top of the recoll source tree.
|
||||
|
||||
=== Configure and build the recoll library and python module, install the module
|
||||
|
||||
The following needs the development packages for Xapian, Python and zlib.
|
||||
|
||||
----
|
||||
cd /TOP/OF/RECOLL/SRC
|
||||
# May fail if no previous build was performed
|
||||
make clean
|
||||
|
||||
# the gui/x11 disabling is just here to avoid having to install the
|
||||
# development libraries for Qt.
|
||||
configure --prefix=/usr --enable-pic --without-x --disable-qtgui
|
||||
make
|
||||
|
||||
cd python/recoll
|
||||
python setup.py build
|
||||
sudo python setup.py install
|
||||
----
|
||||
|
||||
=== Build and install the Unity Lens
|
||||
|
||||
----
|
||||
cd /TOP/OF/RECOLL/SRC
|
||||
cd desktop/unity-lens-recoll
|
||||
configure --prefix=/usr --sysconfdir=/etc
|
||||
sudo make install
|
||||
|
||||
----
|
||||
|
||||
Voilà, it should work...
|
||||
|
||||
Try to start the Dash, you should see the Recoll checkerboard (or
|
||||
whatever...) in the Lens list.
|
||||
|
||||
The Recoll Lens expects a Recoll query language string, so you can use
|
||||
field searches, directory, size, and date filtering (see the
|
||||
link:http://www.lesbonscomptes.com/recoll/usermanual/rcl.search.lang.html[Recoll
|
||||
manual] for a description of the query language).
|
||||
|
||||
If you want to disable the Lens, I think that you just have to delete
|
||||
'/usr/share/unity/lenses/recoll'
|
||||
|
||||
Other installed files:
|
||||
|
||||
----
|
||||
/usr/libexec/unity-recoll-daemon
|
||||
/usr/share/dbus-1/services/unity-lens-recoll.service
|
||||
/usr/share/doc/unity-lens-recoll
|
||||
/usr/share/unity-lens-recoll
|
||||
----
|
||||
|
||||
@ -1,68 +0,0 @@
|
||||
== Using the _Open With_ context menu in recoll 1.20 and newer
|
||||
|
||||
Recoll versions 1.20 and newer have an _Open With_ entry in the result list
|
||||
context menu (the thing which pops up on a right click).
|
||||
|
||||
This allows choosing the application used to edit the document, instead of
|
||||
using the default one.
|
||||
|
||||
The list of applications is built from the desktop files found inside
|
||||
'/usr/share/applications'. For each application on the system, these
|
||||
files list the MIME types that the application can process.
|
||||
|
||||
If the application which you would want listed does not appear, the most
|
||||
probable cause is that it has no desktop file, which could happen due to a
|
||||
number of reasons.
|
||||
|
||||
This can be fixed very easily: just add a +.desktop+ file to
|
||||
'/usr/share/applications', starting from an existing one as a template.
|
||||
|
||||
As an example, based on an original idea from Recoll user +florianbw+,
|
||||
the following describes setting up a script for editing a PDF document
|
||||
title found in the recoll result list.
|
||||
|
||||
The script uses the *zenity* shell script dialog box tool to let you
|
||||
enter the new title, and then executes *exiftool* to actually change
|
||||
the document.
|
||||
|
||||
----
|
||||
#!/bin/sh
|
||||
|
||||
PDF=$1
|
||||
TITLE=`exiftool -Title -s3 "$PDF"`
|
||||
|
||||
RES=`zenity --entry \
|
||||
--title="Change PDF Title" \
|
||||
--text="Enter the Title:" \
|
||||
--entry-text "$TITLE"`
|
||||
|
||||
if [ "$RES" != "" ]; then
|
||||
echo -n "Changing title to $RES ... " && \
|
||||
exiftool -Title="$RES" "$PDF" && \
|
||||
recollindex -i "$PDF" && echo "Done!"
|
||||
else
|
||||
echo "No title entered"
|
||||
fi
|
||||
----
|
||||
|
||||
Name it, for example, 'pdf-edit-title.sh', and make it executable
|
||||
(`chmod a+x pdf-edit-title.sh`).
|
||||
|
||||
Then create a file named 'pdf-edit-title.desktop' inside
|
||||
'/usr/share/applications'. The file name does not need to be the same as the
|
||||
script's, this is just to make things clearer:
|
||||
|
||||
----
|
||||
[Desktop Entry]
|
||||
Name=PDF Title Editor
|
||||
Comment=Small script based on exiftool used to edit a pdf document title
|
||||
Exec=/home/dockes/bin/pdf-edit-title.sh %F
|
||||
Type=Application
|
||||
MimeType=application/pdf;
|
||||
----
|
||||
|
||||
You're done ! Restart Recoll, perform a search and right-click on a PDF
|
||||
result: you should see an entry named _PDF Title Editor_ in the _Open
|
||||
With_ list. Click on it, and you will be able to edit the title.
|
||||
|
||||
|
||||
@ -1,100 +0,0 @@
|
||||
== Using the log file to investigate indexing issues
|
||||
|
||||
All *Recoll* processes print trace messages. By default these go to the
|
||||
standard error output, and you may not ever see them (in the case, for
|
||||
example, of the *recoll* GUI started from the desktop interface).
|
||||
|
||||
There are a number of potential issues with indexing that may need
|
||||
investigation, such as:
|
||||
|
||||
- A file can't be found by searching even if it appears that it should have
|
||||
been indexed (this could happen because the file is not selected at all or
|
||||
because a filter program crashes).
|
||||
- The indexing process gets stuck and never finishes.
|
||||
- The indexing process ends up with an error.
|
||||
- The indexing process seems to be using too much system capacity.
|
||||
|
||||
The right way to approach these problems is to use the *recollindex*
|
||||
command line tool (instead of the *recoll* GUI), and to set up the
|
||||
trace log to provide information about what indexing is actually doing.
|
||||
|
||||
Trace log parameters can be set either from the GUI _Preferences->Indexing
|
||||
Configuration->Global Parameters_ panel, or by editing the configuration
|
||||
file '~/.recoll/recoll.conf'. You should set the following parameters:
|
||||
|
||||
----
|
||||
loglevel = 6
|
||||
logfilename = stderr
|
||||
thrQSizes = -1 -1 -1
|
||||
----
|
||||
|
||||
We use _stderr_ instead of an actual file in order to capture direct filter
|
||||
messages (such as a *python* stack trace) along with normal
|
||||
*recollindex* messages.
|
||||
|
||||
The last line sets recollindex for single-threaded operation, which will
|
||||
make the log much more readable.
|
||||
|
||||
You should then check that no *recoll* or *recollindex* process is
|
||||
currently running, and kill any you find.
|
||||
|
||||
Then, if this is an issue about an identified file, try indexing it only:
|
||||
|
||||
----
|
||||
recollindex -i myunfindablefile.xxx > /tmp/myindexlog 2>&1
|
||||
----
|
||||
|
||||
If this is a general issue with indexing (process not finishing properly),
|
||||
just start it:
|
||||
|
||||
----
|
||||
recollindex > /tmp/myindexlog 2>&1
|
||||
----
|
||||
|
||||
Usually, having a look at the trace will allow you to see what is wrong (e.g.:
|
||||
a configuration issue or missing filter), and solve the problem.
|
||||
|
||||
In case of indexer misbehaviour (e.g. using too much memory), you should run
|
||||
_tail -f_ on the log to see what is going on.
|
||||
|
||||
If this is not enough, please
|
||||
link:https://opensourceprojects.eu/p/recoll1/tickets/new/[open a tracker
|
||||
issue] and attach or link to the log data, or just email me (jfd at
|
||||
recoll.org).
|
||||
|
||||
*recollindex* and *recollindex -i* usually have the same criteria to
|
||||
include a file or not (but see the _Path gotcha_ note below). It may
|
||||
happen that they behave differently, so it may sometimes be useful to run a
|
||||
full *recollindex* even for a specific file, but this will produce a
|
||||
big log file.
|
||||
|
||||
When you are done, it is better to reset the verbosity to a reasonable
|
||||
level (e.g.: +2+ : just errors, +3+ : information, listing indexed files).
|
||||
|
||||
=== Note: the path gotcha
|
||||
|
||||
*recollindex -i* will only index files under the directories defined by the
|
||||
+topdirs+ configuration variable (your home directory by
|
||||
default). Unfortunately, the test is done on the file path text, ignoring
|
||||
possible symbolic links. If you give a simple file name as a parameter to
|
||||
*recollindex -i* and there are symbolic links inside the +topdirs+
|
||||
entries, the comparison may fail. For example, if your home directory is
|
||||
'/home/me/' and '/home/' is a link to '/usr/home/', *recollindex -i
|
||||
somefilename* will actually try to index '/usr/home/me/somefilename', and
|
||||
fail (because '/usr/home/me/' is not a subdirectory of '/home/me/'). This
|
||||
will manifest itself in the log by a message like the following.
|
||||
|
||||
----
|
||||
:4:../index/fsindexer.cpp:149:FsIndexer::indexFiles: skipping [/usr/home/me/somefile] (ntd)
|
||||
----
|
||||
|
||||
If this happens, give a full path consistent with what is found in the
|
||||
configuration file (e.g.: _recollindex -i /home/me/somefile_).
|
||||
|
||||
=== File system occupation
|
||||
|
||||
One of the possible reasons for failed indexing is a +maxfsoccup+
|
||||
parameter set too low. This is the value of file system occupation, not
|
||||
free space, where indexing will stop. It is set from the GUI indexing
|
||||
configuration or by editing 'recoll.conf'. A value of 0 implies no
|
||||
checking, but a very low, non-zero, value will just prevent indexing.
|
||||
@ -1,65 +0,0 @@
|
||||
== Recoll Wiki file index
|
||||
link:ElinksWeb.html[Extending the Recoll Firefox visited web page indexing mechanism to other browsers]
|
||||
|
||||
link:FaqsAndHowTos.html[Faqs and Howtos]
|
||||
|
||||
link:FilterArch.html[Recoll input filters ]
|
||||
|
||||
link:FilterRetrofit.html[Installing a filter for a new document type]
|
||||
|
||||
link:FilteringOutZipArchiveMembers.html[Filtering out Zip archive members]
|
||||
|
||||
link:GUIKeyboard.html[Recoll GUI keyboard navigation]
|
||||
|
||||
link:HandleCustomField.html[Generating a custom field and using it to sort results]
|
||||
|
||||
link:Home.html[Welcome to the Recoll Wiki]
|
||||
|
||||
link:HotRecoll.html[Recoll hotkey: starting / hiding recoll with a keyboard shortcut]
|
||||
|
||||
link:IndexMailHeader.html[Indexing arbitrary mail headers ]
|
||||
|
||||
link:IndexMozillaCalendari.html[Indexing Mozilla calendar data ]
|
||||
|
||||
link:IndexOnAc.html[Laptops: automatically starting or stopping indexing according to AC power status]
|
||||
|
||||
link:IndexOutlook.html[Indexing Outlook archives]
|
||||
|
||||
link:IndexWebHistory.html[Indexing Web history with the Firefox extension ]
|
||||
|
||||
link:MultipleIndexes.html[Creating and using multiple indexes]
|
||||
|
||||
link:MuttAndRecoll.html[Interfacing Recoll and Mutt]
|
||||
|
||||
link:NonAsciiFileNames.html[Unix and non-ASCII file names, a summary of issues]
|
||||
|
||||
link:OpenHelperScript.html[Starting native applications ]
|
||||
|
||||
link:PreventIndexingDir.html[Preventing indexing in a directory]
|
||||
|
||||
link:ProblemSolvingData.html[Gathering useful data for asking help about or reporting a Recoll issue]
|
||||
|
||||
link:QpdfviewHelperScript.html[Starting native applications ]
|
||||
|
||||
link:QueryFromC.html[Querying Recoll from a C program]
|
||||
|
||||
link:ReplaceCategories.html[Replacing the Category filter controls]
|
||||
|
||||
link:ResultsThumbnails.html[Result list thumbnails and how to create them]
|
||||
|
||||
link:SavingConfig.html[User configuration backup]
|
||||
|
||||
link:UnityLens.html[Building and Installing the Ubuntu Unity Recoll Lens]
|
||||
|
||||
link:UsingOpenWith.html[Using the Open With context menu in recoll 1.20 and newer]
|
||||
|
||||
link:WhyIsMyFileNotIndexed.html[Using the log file to investigate indexing issues]
|
||||
|
||||
link:XDGBase.html[XDG: Tidying Recoll data storage]
|
||||
|
||||
link:ZDevCaseAndDiacritics1.html[Character case and diacritic marks (1), issues with stemming]
|
||||
|
||||
link:ZDevCaseAndDiacritics2.html[Character case and diacritic marks (2), user interface]
|
||||
|
||||
link:ZDevCaseAndDiacritics3.html[Character case and diacritic marks (3), implementation]
|
||||
|
||||
@ -1,42 +0,0 @@
|
||||
== XDG: Tidying Recoll data storage ==
|
||||
|
||||
The default storage structure of Recoll configuration and index data is
|
||||
quite at odds with what is recommended by the
|
||||
link:http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html[XDG
|
||||
Base Directory Specification], the reason being that it predates said spec.
|
||||
|
||||
By default, Recoll stores all its data in a single directory: '$HOME/.recoll'
|
||||
|
||||
This is not going to change, because it would be quite disturbing for
|
||||
current users.
|
||||
|
||||
However, the location of this directory can be modified using the
|
||||
+$RECOLL_CONFDIR+ environment variable.
|
||||
|
||||
Furthermore all significant Recoll data categories can be moved away from
|
||||
the configuration directory (maybe to '$HOME/.cache'), by setting
|
||||
configuration variables:
|
||||
|
||||
* _dbdir_ defines the location for storing the Xapian
|
||||
index. This could be set to, e.g., '$HOME/.cache/recoll/xapiandb'. It is
|
||||
quite recommended that
|
||||
this directory be dedicated to Xapian (don't store other things in
|
||||
there).
|
||||
* _mboxcachedir_ defines the location for caching access speedup information
|
||||
about mail folders in mbox format. e.g. '$HOME/.cache/recoll/mboxcache'
|
||||
* New in 1.22: you can use _aspellDictDir_ to define the storage
|
||||
location for the aspell spelling approximation
|
||||
dictionary. E.g. '$HOME/.cache/recoll'
|
||||
* _webcachedir_ may be used to define where the visited web pages
|
||||
archive is stored. E.g. '$HOME/.cache/recoll/webcache'. This is only used
|
||||
if you activate the Firefox plugin and web history indexing. You may
|
||||
want to think a bit more about where to store it, because, contrary to
|
||||
the above, this is not discardable data: your Recoll Web history goes
|
||||
away if you delete it.
|
||||
|
||||
If you use multiple Recoll configurations, each will have to be customized.
|
||||
|
||||
Once these are put away, there are still a few modifiable files in the
|
||||
configuration directory, for example the 'recoll.pid' and 'history'
|
||||
files, but these are small files. Moving 'recoll.pid' away would be a
|
||||
serious headache because it is used by scripts.
|
||||
@ -1,143 +0,0 @@
|
||||
== Character case and diacritic marks (1), issues with stemming
|
||||
|
||||
=== Case and diacritics in Recoll
|
||||
|
||||
Recoll versions up to 1.17 almost fully ignore character case and diacritic
|
||||
marks.
|
||||
|
||||
All terms are converted to lower case and unaccented before they are
|
||||
written to the index. There are only two exceptions:
|
||||
|
||||
* File paths (as used in _dir:_ clauses) are not converted. This might
|
||||
be a bug or a feature, but the main reason is that we don't know how they
|
||||
are encoded.
|
||||
* It is possible to specify that some characters will keep their diacritic
|
||||
marks, because the entity formed by the character and the diacritic mark
|
||||
is considered to be a different letter, not a modified one. This is
|
||||
highly dependent on the language. For example, in Swedish, +å+ should
|
||||
be preserved, not turned into +a+.
|
||||
|
||||
As a necessary consequence, the same transformations are applied to search
|
||||
terms, and it is impossible to search for a specific capitalization of a
|
||||
word (+US+ is looked for as +us+), or a specific accented form
|
||||
(+café+ will be looked for as +cafe+).
|
||||
|
||||
However, there are some cases where you would like to be more specific:
|
||||
|
||||
* Searching for +US+ or +us+ should probably return different results.
|
||||
* Diacritics are seldom significant in English, but we can find a
|
||||
few examples anyway: +sake+ and +saké+, +mate+ and +maté+. Of
|
||||
course, there are many more cases in languages which use more diacritics.
|
||||
|
||||
On the other hand, accents are often mistyped or forgotten (résumé, résume,
|
||||
resume?), and capitalization is most often insignificant, so that it is
|
||||
very important to retain the capability to ignore accent and character
|
||||
case differences, and that the discrimination can be easily switched on or
|
||||
off for each search (or even for specific terms).
|
||||
|
||||
This text and other pages which will follow will discuss issues in adding
|
||||
character case and diacritics sensitivity to Recoll, under the assumption
|
||||
that the main index will contain the raw source terms instead of
|
||||
case-folded and unaccented ones.
|
||||
|
||||
The following will use the _unaccent_ neologism to mean _remove
|
||||
diacritic marks_ (and not only accents).
|
||||
|
||||
English examples are used when possible, but given the limited use of
|
||||
diacritics in English, some French will probably creep in.
|
||||
|
||||
=== Diacritics and stemming
|
||||
|
||||
Stemming is the process by which we extend a search to terms related by
|
||||
grammatical inflexion, for example singular/plural, verb tenses, etc. For
|
||||
example a search for +floor+ is normally expanded by Recoll to +floors,
|
||||
floored, flooring, ...+
|
||||
|
||||
In practice Recoll has a separate data structure that has stemmed terms
|
||||
(stems) as keys pointing to a list of expansion terms
|
||||
+floor -> (floor,floors,floorings,...)+
|
||||
|
||||
Stemming should be applied to terms before they are stripped of
|
||||
diacritics. Accents may have a grammatical significance, and the accent may
|
||||
change how the term is stemmed. For example, in French the +âmes+ suffix
|
||||
generally marks a past conjugation but +ames+ does not. The standard
|
||||
Xapian French stemmer will turn +évitâmes+ (avoided) into an +évit+ stem,
|
||||
but +évitames+ will be turned into +évitam+ (stripping
|
||||
plural and feminine suffixes).
|
||||
|
||||
When the search is set to ignore diacritics, this poses a specific problem:
|
||||
if the user enters the search term without accents (which is correct
|
||||
because the system is supposed to ignore them), there is no guarantee that
|
||||
the term will be correctly expanded by stemming.
|
||||
|
||||
The diacritic mismatch breaks the family relationship between the stem
|
||||
siblings, and this is independent of the type of index: it will happen with
|
||||
an index where diacritics are stripped just as with a raw one.
|
||||
|
||||
The simpler case where diacritics in the original term only affect
|
||||
diacritics in the stem also necessitates specific processing, but it is
|
||||
easier to work around.
|
||||
|
||||
Two examples illustrating these issues follow.
|
||||
|
||||
==== The simple case: diacritics in the term only affect diacritics in the stem
|
||||
|
||||
Let's imagine that the document set contains the term +éviter+
|
||||
(infinitive of +to avoid+), but not +évite+ (present). The only term in
|
||||
the actual index is then +éviter+.
|
||||
|
||||
The user enters an unaccented +evite+, counting on the
|
||||
diacritics-insensitive search mode to deal with the accents. As +évite+
|
||||
is not present in the index, we have no way to guess that +evite+ is
|
||||
really +évite+.
|
||||
|
||||
The stemmer will turn +evite+ into +evit+. There is no way that this
|
||||
can be related to +éviter+, and this legitimate result can't be found.
|
||||
|
||||
There is a way around this: we can compute a separate
|
||||
stem expansion dictionary for unaccented terms. This dictionary, to be used
|
||||
with diacritic-insensitive searches only, contains the relationship
|
||||
between +evit+ and +eviter+ (as +éviter+ is in the index). We can
|
||||
then relate +eviter+ and +éviter+ because they differ only by accents,
|
||||
and the search will find the document with +éviter+.
|
||||
|
||||
==== The bad case: diacritics in the term change the stem beyond diacritics
|
||||
|
||||
Some grammatically significant accents will cause unexpectedly missing
|
||||
search results when using a supposedly diacritics-insensitive search mode.
|
||||
|
||||
Let's imagine that the document set contains the term +éviter+
|
||||
(infinitive of +to avoid+), but not +évitâmes+ (past). So the stemming
|
||||
expansion table has an entry for +évit+ -> +éviter+.
|
||||
|
||||
If the user enters an unaccented +evitames+, she would expect to find the
|
||||
documents containing +éviter+ in the results, because the latter term is
|
||||
a stemming sibling of +évitâmes+ and the search is supposedly not
|
||||
influenced by diacritics, so that +evitames+ and +évitâmes+ should be
|
||||
equivalent.
|
||||
|
||||
However, our search is now in trouble, because +évitâmes+ is not in any
|
||||
document, so that there is no data in the index which would inform us about
|
||||
how to transform the input term into something that differs only by accents
|
||||
but would yield a correct input for the stemmer.
|
||||
|
||||
If we try to feed the raw user input to the stemmer, it will propose
|
||||
an +evitam+ stem, which will not work, because the stem that actually
|
||||
exists is +évit+, and +evitam+ can not be related to +éviter+.
|
||||
|
||||
The only palliative approach I can think of would be a spelling correction
|
||||
of the input, performed independently of the actual index contents, which
|
||||
would notice that +évitames+ is not a French word and propose a change or an
|
||||
expansion to +évitâmes+, which would correctly stem to +évit+ and allow
|
||||
us to find +éviter+.
|
||||
|
||||
This issue is not specific to Recoll or indeed to the fact that the index
|
||||
retains accents or not. As far as I can see, it is an intrinsic bad
|
||||
interaction between diacritics insensitivity and stemming.
|
||||
|
||||
It is also interesting to note that this case becomes less probable when
|
||||
the data set becomes bigger, because more term inflexions will then be
|
||||
present in the index.
|
||||
|
||||
We'll next think about an link:ZDevCaseAndDiacritics2.html[appropriate
|
||||
interface].
|
||||
@ -1,122 +0,0 @@
|
||||
== Character case and diacritic marks (2), user interface
|
||||
|
||||
In a link:ZDevCaseAndDiacritics1.html[previous document], we discussed some
|
||||
of the problems which arise when mixing case/diacritics sensitivity and
|
||||
stemming.
|
||||
|
||||
As of version 1.18, Recoll can create two types of indexes:
|
||||
* _Dumb_ indexes contain terms which are lowercased and stripped of
|
||||
diacritics. Searches using such an index are naturally case- and
|
||||
diacritics- insensitive: search terms are stripped before processing.
|
||||
* _Raw_ indexes contain terms which are just like they were found in the
|
||||
source document. Searching such an index is naturally sensitive to case
|
||||
and diacritics, and can be made insensitive by further processing.
|
||||
|
||||
The following explains how users can control these Recoll features.
|
||||
|
||||
=== Controlling the type of index we create: stripped or raw
|
||||
|
||||
The kind of index that recoll creates is determined by:
|
||||
|
||||
* A build-time *configure* switch: _--enable-stripchars_. If this is
|
||||
set, the code for case and diacritics sensitivity is not compiled in and
|
||||
recoll will work like the previous versions: unaccented and casefolded
|
||||
index, no runtime options for case or diacritics sensitivity
|
||||
|
||||
* An indexing configuration switch (in recoll.conf): if Recoll was built
|
||||
with _--disable-stripchars_, this will provide a dynamic way to return
|
||||
to the "traditional" index. The case and diacritics code will be present
|
||||
but inactive. Normally, a recoll installation with this switch set
|
||||
should behave exactly like one built with _--enable-stripchars_. When
|
||||
using multiple indexes, this switch MUST be consistent between
|
||||
indexes. There is no support whatsoever for mixing raw and dumb indexes.
|
||||
The option is named _indexStripChars_, and it is not settable from the
|
||||
GUI to avoid errors. This is something that would typically be set once
|
||||
and for all for a given installation. We need to decide what the default
|
||||
value will be for 1.18.
|
||||
|
||||
* A number of query time switches. Using these it is also possible to
|
||||
perform a search insensitive to case and diacritics on a raw index. Note
|
||||
however, that, given the complexity of the issues involved, I give no
|
||||
guarantee at this time that this will yield exactly the same results as
|
||||
searching a dumb index. Details about query time behaviour follow.
|
||||
|
||||
|
||||
=== Controlling stem, case and diacritics expansion: user query interface
|
||||
|
||||
Recoll versions up to 1.17 were insensitive to case and diacritics. We only
|
||||
needed to give the user a way to control stem expansion. This was done in
|
||||
three ways:
|
||||
|
||||
* Globally, by setting a menu option.
|
||||
* Globally, by setting the stemming language value to empty.
|
||||
* On a term by term basis by Capitalizing the term, or, in query language
|
||||
mode only, by using an 'l' clause modifier (_"term"l_).
|
||||
|
||||
After switching to an unstripped index, capable of case and diacritic
|
||||
sensitivity, we need ways to control what processing is performed among:
|
||||
|
||||
* Case expansion.
|
||||
* Diacritics expansion.
|
||||
* Stem expansion.
|
||||
|
||||
The default mode will be compatible with the previous version, because
|
||||
this is most generally what we want to do: ignore case and diacritics,
|
||||
expand stems.
|
||||
|
||||
There are two easy approaches for controlling the parameters:
|
||||
* Global options set in the GUI menus or as *recollq* command line
|
||||
switches.
|
||||
* Per-clause options set by modifiers in the query language.
|
||||
|
||||
We would like, however, to let the user entry automatically override the
|
||||
defaults in a sensible way. For example:
|
||||
|
||||
* If a term is entered with diacritics, diacritic sensitivity is turned on
|
||||
(for this term only).
|
||||
* If a term is entered with upper-case characters, case sensitivity is
|
||||
turned on. In this case, we turn off stem expansion, because it makes
|
||||
really no sense with case sensitivity.
|
||||
|
||||
With this method we are stuck with 3 problems (only if the global mode is
|
||||
set to insensitive, and we're not using the query language):
|
||||
|
||||
* Turning off stemming without turning on case sensitivity.
|
||||
* Searching for an all lower-case term in case-sensitive mode.
|
||||
* Searching for a term without diacritics in diacritic-sensitive mode.
|
||||
|
||||
The two latter issues are relatively marginal and can be worked around easily
|
||||
by switching to query language mode or using negative clauses in the
|
||||
advanced search.
|
||||
|
||||
However, we need to be able to turn stemming off while remaining
|
||||
insensitive to case, and we need to stay reasonably compatible with the
|
||||
previous versions. This means that a term which has a capital first letter
|
||||
but is otherwise lowercase will turn stemming off, but not case sensitivity
|
||||
on.
|
||||
|
||||
So we're left with how to search for such a term in a case-sensitive way,
|
||||
and for this, you'll have to use global options or the query language.
|
||||
|
||||
The modified method is:
|
||||
|
||||
* If a term is entered with diacritics, diacritic sensitivity is turned on
|
||||
(for this term only).
|
||||
* If the first letter in a term is upper-case and the rest is lower-case,
|
||||
we turn stem expansion off, but we do not become case-sensitive
|
||||
* If any letter in a term except the first is upper-case, case sensitivity
|
||||
is turned on. Stem expansion is also turned off (even if the first
|
||||
letter is lower-case), because it makes really no sense with case
|
||||
sensitivity.
|
||||
* To search for an all lower-case or capitalized term in a case-sensitive
|
||||
way, use the query language: "Capitalized"C, "lowercase"C
|
||||
* Use the query language and the "D" modifier to turn on diacritics
|
||||
sensitivity.
|
||||
|
||||
It can be noted that some combinations of choices do not make sense and
|
||||
they are not allowed by Recoll: for example, diacritics or case sensitivity
|
||||
do not make sense with stem expansion (which cannot preserve diacritics in
|
||||
any meaningful general way).
|
||||
|
||||
The link:ZDevCaseAndDiacritics3.html[next page] describes the actual
|
||||
implementation in Recoll 1.18.
|
||||
@ -1,67 +0,0 @@
|
||||
== Character case and diacritic marks (3), implementation
|
||||
|
||||
In previous pages, we discussed link:ZDevCaseAndDiacritics1.html[diacritics
|
||||
and stemming], and an link:ZDevCaseAndDiacritics2.html[appropriate
|
||||
interface] for switchable search sensitivity to diacritics and character
|
||||
case.
|
||||
|
||||
So you are in this mood again and you don't want to type accents (maybe you're
|
||||
stuck with a QWERTY American English keyboard), or conversely you
|
||||
want to resume looking for your résumé, and you've told Recoll as much,
|
||||
using the appropriate interface. What happens then ?
|
||||
|
||||
The second case is easy if the index is raw, and mostly impossible if it is
|
||||
stripped. So we'll concentrate on the first case: how to achieve case and
|
||||
diacritics insensitivity on a raw index ?
|
||||
|
||||
Recoll uses three expansion tables:
|
||||
|
||||
* The first table has stripped and lowercased terms as keys and raw terms as
|
||||
data: +mate -> (mate, maté, MATE,...)+.
|
||||
|
||||
* The second table has lowercased stems as keys and original lowercase terms
|
||||
as data (when using multiple languages, there are several such tables):
|
||||
+évit -> (éviter, évite, évitâmes, ...)+.
|
||||
|
||||
* The third table has stripped and lowercased stems as keys and stripped
|
||||
lowercased terms as data:
|
||||
+evit -> (eviter, evite, evitons)+ and +evitam -> (evitames, ...)+
|
||||
|
||||
The first table can be used for full case and diacritics expansion or for
|
||||
only one of those, by post-filtering the results of full expansion (e.g. if
|
||||
we only want diacritics expansion, we filter by stripping diacritics from
|
||||
each result term and check that it's identical to the input). For example
|
||||
if we have +mate -> (mate, maté, MATE, MATÉ)+ in the table and want to
|
||||
only perform case expansion for an input of +maté+, we apply case folding
|
||||
to the initial output and keep only +maté+, as +mate+ differs from the
|
||||
input.
|
||||
|
||||
We only perform stemming expansion when case and diacritics sensitivity is
|
||||
off. It is performed using the second and third tables, both on the
|
||||
lowercased and lowercased/stripped output of the first step, and each term
|
||||
in the output stemming is expanded again for case (using the first table).
|
||||
|
||||
A full example of the expansion occurring during an insensitive search
|
||||
for +resume+ using French stemming on a mixed English/French index
|
||||
follows. An important thing to remember is that the result of each
|
||||
expansion is a function of the terms actually present in the index, not
|
||||
some arbitrary computation (and so, of course, many of the possible but
|
||||
absent variations are missing).
|
||||
|
||||
# The case and diacritics expansion of +resume+ yields +RESUME Resume
|
||||
Résumé resumé résume résumé resume+
|
||||
|
||||
# The Stem expansion input list (lower-cased) is:
|
||||
+resume resumé résume résumé+, and the output is:
|
||||
+resum resume resumenes resumer resumes resumé resumée résum résumait
|
||||
résumant résume résumer résumerai résumerait résumes résumez résumé résumée
|
||||
résumées résumés+
|
||||
|
||||
# Each of the above terms is then fed to case and diacritics expansion (first
|
||||
table), for the final output:
|
||||
+resume résumé Résumé résumer résume Resume résumés RESUME resumes
|
||||
resumer résumant resúmenes resumé résumait résumes résumée resumee
|
||||
résumerait Résumez résumerai RÉSUMÉES Resumée Resumes résumées+.
|
||||
|
||||
A Xapian OR query is finally constructed from the expanded term list.
|
||||
|
||||
@ -1,67 +0,0 @@
|
||||
== Recoll Faqs and Howtos file index
|
||||
link:ElinksWeb.html[Extending the Recoll Firefox visited web page indexing mechanism to other browsers]
|
||||
|
||||
link:FilterArch.html[Recoll input handlers]
|
||||
|
||||
link:FilterRetrofit.html[Installing a filter for a new document type]
|
||||
|
||||
link:FilteringOutZipArchiveMembers.html[Filtering out Zip archive members]
|
||||
|
||||
link:GUIKeyboard.html[Recoll GUI keyboard navigation]
|
||||
|
||||
link:HandleCustomField.html[Generating a custom field and using it to sort results]
|
||||
|
||||
link:Home.html[Welcome to the Recoll Faqs and Recipes]
|
||||
|
||||
link:HotRecoll.html[Recoll hotkey: starting / hiding recoll with a keyboard shortcut]
|
||||
|
||||
link:IndexMailHeader.html[Indexing arbitrary mail headers]
|
||||
|
||||
link:IndexMozillaCalendari.html[Indexing Mozilla calendar data]
|
||||
|
||||
link:IndexOnAc.html[Laptops: starting or stopping indexing according to AC power status]
|
||||
|
||||
link:IndexOutlook.html[Indexing Outlook archives]
|
||||
|
||||
link:IndexWebHistory.html[Indexing Web history with the Firefox extension ]
|
||||
|
||||
link:MultipleIndexes.html[Creating and using multiple indexes]
|
||||
|
||||
link:MuttAndRecoll.html[Interfacing Recoll and Mutt]
|
||||
|
||||
link:NonAsciiFileNames.html[Unix and non-ASCII file names, a summary of issues]
|
||||
|
||||
link:OpenHelperScript.html[Starting native applications]
|
||||
|
||||
link:PreventIndexingDir.html[Preventing indexing in a directory]
|
||||
|
||||
link:ProblemSolvingData.html[Gathering useful data for asking help about or reporting a Recoll issue]
|
||||
|
||||
link:QpdfviewHelperScript.html[Starting native applications ]
|
||||
|
||||
link:QueryFromC.html[Querying Recoll from a C program]
|
||||
|
||||
link:ReplaceCategories.html[Replacing the Category filter controls]
|
||||
|
||||
link:ResultsThumbnails.html[Result list thumbnails and how to create them]
|
||||
|
||||
link:SavingConfig.html[User configuration backup]
|
||||
|
||||
link:UnityLens.html[Building and Installing the Ubuntu Unity Recoll Lens]
|
||||
|
||||
link:UsingOpenWith.html[Using the _Open With_ context menu in recoll 1.20 and newer]
|
||||
|
||||
link:WhyIsMyFileNotIndexed.html[Using the log file to investigate indexing issues]
|
||||
|
||||
link:WikiIndex.html[Recoll Wiki file index]
|
||||
|
||||
link:XDGBase.html[XDG: Tidying Recoll data storage]
|
||||
|
||||
link:ZDevCaseAndDiacritics1.html[Character case and diacritic marks (1), issues with stemming]
|
||||
|
||||
link:ZDevCaseAndDiacritics2.html[Character case and diacritic marks (2), user interface]
|
||||
|
||||
link:ZDevCaseAndDiacritics3.html[Character case and diacritic marks (3), implementation]
|
||||
|
||||
link:index.html[Faqs and Howtos]
|
||||
|
||||
@ -1,41 +0,0 @@
|
||||
== Faqs and Howtos
|
||||
|
||||
link:..[Back to recoll.org top page]
|
||||
|
||||
link:faqsindex.html[Full file index]
|
||||
|
||||
=== Indexing
|
||||
* link:WhyIsMyFileNotIndexed.html[Why is this file not indexed ? Investigating indexing issues]
|
||||
* link:PreventIndexingDir.html[Preventing the indexing of a directory]
|
||||
* link:IndexOnAc.html[Starting/stopping the indexer depending on power/battery status]
|
||||
* link:IndexMozillaCalendari.html[Indexing Mozilla Sunbird / Lightning calendar data]
|
||||
* link:MultipleIndexes.html[Creating and using multiple indexes]
|
||||
* link:IndexWebHistory.html[Indexing Web history with the Firefox browser extension]
|
||||
* link:ElinksWeb.html[Extending the Web queue mechanism to other browsers and general WEB indexing]
|
||||
* link:IndexMailHeader.html[Indexing arbitrary mail headers]
|
||||
* link:IndexOutlook.html[Indexing Outlook archives]
|
||||
* link:HandleCustomField.html[Generating a custom field and using it to sort results]
|
||||
* link:http://www.recoll.org/recoll_XMP/index.html[An example of filter/field customisation, using XMP metadata with PDFs]
|
||||
* link:FilteringOutZipArchiveMembers.html[Filtering out Zip archive members]
|
||||
|
||||
=== Searching
|
||||
* link:GUIKeyboard.html[Recoll GUI keyboard navigation]
|
||||
* link:HotRecoll.html[On the desktop: using a keyboard shortcut for starting/hiding recoll]
|
||||
* link:OpenHelperScript.html[Handling issues for starting native apps, esp. email clients - getting Thunderbird to open message files]
|
||||
* link:QpdfviewHelperScript.html[Another example open helper script - using qpdfview to open pdf and postscript files, with support for page and search options]
|
||||
* link:UsingOpenWith.html[Using the new Open With menu in recoll 1.20 with a custom
|
||||
app]
|
||||
* link:ReplaceCategories.html[Replacing the document category filters]
|
||||
* link:ResultsThumbnails.html[Result list thumbnails and how to create them]
|
||||
* link:MuttAndRecoll.html[Interfacing Recoll and Mutt]
|
||||
* link:QueryFromC.html[Querying from a C program]
|
||||
|
||||
=== Administration and miscellaneous
|
||||
* link:http://www.recoll.org/pages/recoll-webui-install-wsgi.html[Installation of the Recoll WebUI with Apache]
|
||||
* link:FilterRetrofit.html[Installing a filter for a new document type]
|
||||
* link:UnityLens.html[Building and Installing the Ubuntu Unity Recoll Lens]
|
||||
* link:SavingConfig.html[Recoll configuration backup]
|
||||
* link:XDGBase.html[Tidying Recoll data storage]
|
||||
* link:ProblemSolvingData.html[Collecting diagnostic information]
|
||||
* link:NonAsciiFileNames.html[Unix and non-ascii file names]
|
||||
* link:FilterArch.html[Recoll filters]
|
||||
@ -1,20 +0,0 @@
|
||||
#!/bin/sh
|
||||
WIDX=faqsindex.txt
|
||||
|
||||
echo "== Recoll Faqs and Howtos file index" > $WIDX
|
||||
for f in *.txt; do
|
||||
if test "$f" = $WIDX ; then continue; fi
|
||||
h="`basename $f .txt`.html"
|
||||
title=`head -1 "$f" | sed -e 's/=//g' -e 's/^ *//' -e 's/ *$//' -e 's/
//g'`
|
||||
echo 'link:'$h'['$title']' >> $WIDX
|
||||
echo >> $WIDX
|
||||
done
|
||||
|
||||
exit 0
|
||||
# Check and display what files are in the index but not in the contents table:
|
||||
|
||||
grep \| FaqsAndHowTos.txt | awk -F\| '{print $1}' | sed -e 's/\* \[\[//' -e 's/.wiki//' |sort > ctfiles.tmp
|
||||
grep '\[\[' WikiIndex.txt | awk -F\| '{print $1}' | sed -e 's/\[\[//' -e 's/.wiki//' -e 's/.md//' | sort > ixfiles.tmp
|
||||
echo 'diff ContentFiles IndexFiles:'
|
||||
diff ctfiles.tmp ixfiles.tmp
|
||||
rm ctfiles.tmp ixfiles.tmp
|
||||
|
Before Width: | Height: | Size: 318 B |
@ -1,490 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>RECOLL: a personal text search system for
|
||||
Unix/Linux</title>
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content=
|
||||
"text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html">Home</a></li>
|
||||
|
||||
<li><a href="pics/index.html">Screenshots</a></li>
|
||||
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
|
||||
<li><a href="doc.html">Documentation</a></li>
|
||||
|
||||
<li><a href="support.html">Support</a></li>
|
||||
|
||||
<li><a href="devel.html">Development</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
<h1>Recoll features</h1>
|
||||
|
||||
<div class="intrapage">
|
||||
<table width=100%>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><a href="#systems">Supported systems</a></td>
|
||||
<td><a href="#doctypes">Document types</a></td>
|
||||
<td><a href="#other">Other features</a></td>
|
||||
<td><a href="#integration">Desktop and web integration</a></td>
|
||||
<td><a href="#stemming">Stemming</a></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<h2><a name="general">General features</a></h2>
|
||||
<ul>
|
||||
<li>Easy installation, few dependencies. No database daemon,
|
||||
web server, desktop environment or exotic language necessary.</li>
|
||||
<li>Will run on most Unix-based <a href="features.html#systems">
|
||||
systems</a>, and on MS-Windows too.</li>
|
||||
<li>Qt 4 GUI, plus command line, Unity Lens, KIO and krunner
|
||||
interfaces.</li>
|
||||
|
||||
<li>Searches most common
|
||||
<a href="features.html#doctypes">document types</a>, emails and
|
||||
their attachments. Transparently handles decompression
|
||||
(gzip, bzip2).</li>
|
||||
|
||||
<li>Powerful query facilities, with boolean searches,
|
||||
phrases, proximity, wildcards, filter on file types and directory
|
||||
tree.</li>
|
||||
|
||||
<li>Multi-language and multi-character set with Unicode based
|
||||
internals.</li>
|
||||
|
||||
<li>Extensive documentation, with a
|
||||
complete <a href="usermanual/usermanual.html">user
|
||||
manual</a> and manual pages for each command.</li>
|
||||
|
||||
</ul>
|
||||
|
||||
<h2><a name="systems">Supported systems</a></h2>
|
||||
|
||||
<p><span class="application">Recoll</span> has been compiled and
|
||||
tested on Linux, MS-Windows 7-10, MacOS X and Solaris (initial
|
||||
versions Redhat 7, Fedora Core 5, Suse 10, Gentoo, Debian 3.1,
|
||||
Solaris 8). It should compile and run on all subsequent releases
|
||||
of these systems and probably a few others too.</p>
|
||||
|
||||
<p>Qt versions from 4.7 and later</p>
|
||||
|
||||
<h2><a name="doctypes">Document types</a></h2>
|
||||
|
||||
<p><span class="application">Recoll</span> can index many document
|
||||
types (along with their compressed versions). Some types are
|
||||
handled internally (no external application needed). Other types
|
||||
need a separate application to be installed to extract the
|
||||
text. Types that only need very common utilities
|
||||
(awk/sed/groff/Python etc.) are listed in the native section.</p>
|
||||
|
||||
<p>The MS-Windows installer includes the supporting application,
|
||||
the only additional package you will need is the Python language
|
||||
installation.</p>
|
||||
|
||||
<p>Many formats are processed
|
||||
by <span class="application">Python</span> scripts. The Python
|
||||
dependency will not always be mentioned. In general, Recoll
|
||||
expects Python 2.x to be available (many, but not all, scripts
|
||||
are compatible with Python 3). Formats which are processed
|
||||
using <span class="application">Python</span> and its standard
|
||||
library are listed in the <i>native</i> section.</p>
|
||||
|
||||
<h4>File types indexed natively</h4>
|
||||
|
||||
<ul>
|
||||
<li><span class="application">text</span>.</li>
|
||||
<li><span class="application">html</span>.</li>
|
||||
<li><span class="application">maildir</span>,
|
||||
<span class="application">mh</span>, and
|
||||
<span class="application">mailbox</span> (
|
||||
<span class="application">Mozilla</span>,
|
||||
<span class="application">Thunderbird</span> and
|
||||
<span class="application">Evolution</span> mail ok).
|
||||
<em><b>Evolution note</b>: be sure to remove <tt>.cache</tt> from
|
||||
the <tt>skippedNames</tt> list in the GUI <tt>Indexing
|
||||
preferences/Local Parameters/</tt> pane if you want to
|
||||
index local copies of Imap mail.</em>
|
||||
</li>
|
||||
|
||||
<li><span class="application">gaim</span> and
|
||||
<span class="application">purple</span> log files.</li>
|
||||
|
||||
<li><span class="application">Scribus</span> files.</li>
|
||||
|
||||
<li><span class="application">Man pages</span> (needs
|
||||
<span class="application">groff</span>).</li>
|
||||
|
||||
<li><span class="application">Dia</span> diagrams.</li>
|
||||
<li><span class="application">Excel</span>
|
||||
and <span class="application">Powerpoint</span>
|
||||
for <span class="application">Recoll</span> versions 1.19.12
|
||||
and later.</li>
|
||||
|
||||
<li><span class="application">Tar</span> archives. Tar file
|
||||
indexing is disabled by default (because tar archives don't
|
||||
typically contain the kind of documents that people search
|
||||
for), you will need to enable it explicitly, like with the
|
||||
following in your
|
||||
<span class="filename">$HOME/.recoll/mimeconf</span> file:
|
||||
<pre>
|
||||
[index]
|
||||
application/x-tar = execm rcltar
|
||||
</pre>
|
||||
</li>
|
||||
|
||||
<li><span class="application">Zip</span> archives.</li>
|
||||
<li><span class="application">Konqueror webarchive</span>
|
||||
format with Python (uses the <tt>tarfile</tt> standard
|
||||
library module).</li>
|
||||
|
||||
<li><span class="application">Mimehtml web archive
|
||||
format</span> (support based on the mail
|
||||
filter, which introduces some mild weirdness, but still
|
||||
usable).</li>
|
||||
</ul>
|
||||
|
||||
|
||||
|
||||
<h4>File types indexed with external helpers</h4>
|
||||
|
||||
<p>Many document types need the <span class="command">iconv</span>
|
||||
command in addition to the applications specifically listed.</p>
|
||||
|
||||
<h5>The XML ones</h5>
|
||||
|
||||
<p>The following types need <span class="command">
|
||||
xsltproc</span> from the <b>libxslt</b> package for recoll
|
||||
versions before 1.22, and in addition, python-libxslt1 and
|
||||
python-libxml2 for 1.22 and newer.
|
||||
Quite a few also need <span class="command">unzip</span>:</p>
|
||||
|
||||
<ul>
|
||||
<li><span class="application">Abiword</span> files.</li>
|
||||
|
||||
<li><span class="application">Fb2</span> ebooks.</li>
|
||||
|
||||
<li><span class="application">Kword</span> files.</li>
|
||||
|
||||
<li><span class="application">Microsoft Office Open XML</span>
|
||||
files.</li>
|
||||
|
||||
<li><span class="application">OpenOffice</span> files.</li>
|
||||
|
||||
<li><span class="application">SVG</span> files.</li>
|
||||
<li><span class="application">Gnumeric</span> files.</li>
|
||||
<li><span class="application">Okular</span> annotations files.</li>
|
||||
|
||||
</ul>
|
||||
|
||||
<h5>Other formats</h5>
|
||||
|
||||
<p>The following need miscellaneous helper programs to decode
|
||||
the internal formats.</p>
|
||||
|
||||
<ul>
|
||||
<li><span class="application">pdf</span> with the <span class=
|
||||
"command">pdftotext</span> command, which comes with
|
||||
<a href="http://poppler.freedesktop.org/">poppler</a>,
|
||||
(the package name is quite often <tt>poppler-utils</tt>). <br/>
|
||||
Note: the older <span class="command">pdftotext</span> command
|
||||
which comes with <span class="application">xpdf</span> is
|
||||
not compatible with <span class="application">
|
||||
Recoll</span><br/>
|
||||
|
||||
<em>New in 1.21</em>: if the <span class="application">
|
||||
tesseract</span> OCR application, and the
|
||||
<span class="command">pdftoppm</span> command are available
|
||||
on the system, the <span class="command">rclpdf</span>
|
||||
filter has the capability to run OCR. See the comments at
|
||||
the top of <span class="command">rclpdf</span> (usually
|
||||
found
|
||||
in <span class="filename">/usr/share/recoll/filters</span>)
|
||||
for how to enable this and configuration details.<br/>
|
||||
<em>Opening PDFs at the right page</em>: the default
|
||||
configuration uses <span class="command">evince</span>,
|
||||
which has options for direct page access and pre-setting the
|
||||
search strings (hits will be highlighted). There is an
|
||||
example line in the default mimeview for doing the same
|
||||
thing with <span class="command">qpdfview</span>
|
||||
(<span class="literal">qpdfview --search %s %f#%p</span>).
|
||||
Okular does not have a search string option (but it does
|
||||
have a page number one).
|
||||
</li>
|
||||
|
||||
<li><span class="application">msword</span> with <a href=
|
||||
"http://www.winfield.demon.nl/">antiword</a>. It is also useful to
|
||||
have <a href="http://wvware.sourceforge.net/">wvWare</a> installed
|
||||
as it may be used as a fallback for some files which antiword
|
||||
does not handle.</li>
|
||||
|
||||
<li><span class="application">Wordperfect</span> with the
|
||||
<span class="command">wpd2html</span> command from <a href=
|
||||
"http://libwpd.sourceforge.net">libwpd</a>. On some distributions,
|
||||
the command may come with a package named <span
|
||||
class="literal">libwpd-tools</span> or such, not the base <span
|
||||
class="literal">libwpd</span> package.</li>
|
||||
|
||||
<li><span class="application">Lyx</span> files (needs
|
||||
<span class="application">Lyx</span> to be installed).</li>
|
||||
|
||||
<li><span class="application">Powerpoint</span> and <span
|
||||
class="application">Excel</span> with the <a href=
|
||||
"http://vitus.wagner.pp.ru/software/catdoc/">catdoc</a>
|
||||
utilities up to recoll 1.19.12. Recoll 1.19.12 and later use
|
||||
internal Python filters for Excel and Powerpoint, and catdoc
|
||||
is not needed at all (catdoc did not work on many semi-recent
|
||||
Excel and Powerpoint files).</li>
|
||||
|
||||
<li><span class="application">CHM (Microsoft help)</span> files
|
||||
with <span class="command">Python,
|
||||
<a href="http://gnochm.sourceforge.net/pychm.html">pychm</a>
|
||||
and <a href="http://www.jedrea.com/chmlib/">chmlib</a></span>.</li>
|
||||
|
||||
<li><span class="application">GNU info</span> files
|
||||
with <span class="command">Python</span> and the
|
||||
<span class="command">info</span> command.</li>
|
||||
|
||||
<li><span class="application">EPUB</span> files
|
||||
with <span class="command">Python</span> and this
|
||||
<a href="http://pypi.python.org/pypi/epub/">Python epub</a>
|
||||
decoding module, which is packaged on Fedora, but not Debian.</li>
|
||||
|
||||
<li><span class="application">Rar</span> archives (needs <span
|
||||
class="command">Python</span>), the
|
||||
<a href="http://pypi.python.org/pypi/rarfile/">rarfile</a> Python
|
||||
module and the <a
|
||||
href="http://www.rarlab.com/rar_add.htm">unrar</a>
|
||||
utility. The Python module is packaged by Fedora, not by Debian.</li>
|
||||
|
||||
<li><span class="application">7zip</span> archives (needs
|
||||
<span class="command">Python</span> and
|
||||
the <a href="https://pypi.python.org/pypi/pylzma">pylzma
|
||||
module</a>). This is a recent addition, and you need to
|
||||
download the filter from
|
||||
the <a href="filters/filters.html">filters pages</a> for
|
||||
all Recoll versions prior to 1.21.</li>
|
||||
|
||||
<li><span class="application">iCalendar</span>(.ics) files
|
||||
(needs <span class="command">Python, <a href=
|
||||
"http://pypi.python.org/pypi/icalendar/2.1">icalendar</a></span>).</li>
|
||||
|
||||
<li><span class="application">Mozilla calendar data</span> See
|
||||
<a href="faqsandhowtos/IndexMozillaCalendari.html">
|
||||
the Howto</a> about this.</li>
|
||||
|
||||
<li><span class="application">postscript</span> with <a href=
|
||||
"http://www.gnu.org/software/ghostscript/ghostscript.html">
|
||||
ghostscript</a> and <a href=
|
||||
"http://www.cs.wisc.edu/~ghost/doc/pstotext.htm">pstotext</a>.
|
||||
Pstotext 1.9 has a serious issue with special characters in
|
||||
file names, and you should either use the version packaged for
|
||||
your system which is probably patched, or apply the Debian
|
||||
patch which is stored <a href=
|
||||
"files/pstotext-1.9_4-debian.patch">here</a> for
|
||||
convenience. See http://packages.debian.org/squeeze/pstotext
|
||||
and http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=356988
|
||||
for references/explanations.
|
||||
<blockquote>
|
||||
To make things a bit easier, I also
|
||||
store <a href="files/pstotext-1.9-patched.tar.gz">an
|
||||
already patched version</a>. I added an
|
||||
install target to the Makefile... This installs to
|
||||
/usr/local, use <i>make install PREFIX=/usr</i> to
|
||||
change. So all you need is:
|
||||
<pre>
|
||||
tar xvzf pstotext-1.9-patched.tar.gz
|
||||
cd pstotext-1.9-patched
|
||||
make
|
||||
make install
|
||||
</pre>
|
||||
</blockquote>
|
||||
</li>
|
||||
|
||||
|
||||
<li><span class="application">RTF</span> files with
|
||||
<a href="http://www.gnu.org/software/unrtf/unrtf.html">
|
||||
unrtf</a>. Please note that up to version 0.21.3,
|
||||
<span class="command">unrtf</span> mostly does not work with
|
||||
non western-european character sets. Many serious problems
|
||||
(crashes with serious security implications and infinite
|
||||
loops) were fixed in unrtf 0.21.8, so you really want to use
|
||||
this or a newer release. Building Unrtf from source is quick
|
||||
and easy.</li>
|
||||
|
||||
<li><span class="application">TeX</span> with <span class=
|
||||
"command">untex</span>. If there is no untex package for
|
||||
your distribution, <a href="untex/untex-1.3.jf.tar.gz">a
|
||||
source package is stored on this site</a> (as untex has no
|
||||
obvious home). Will also work with <a href=
|
||||
"http://www.cs.purdue.edu/homes/trinkle/detex/">detex</a>
|
||||
if this is installed.</li>
|
||||
|
||||
<li><span class="application">dvi</span> with <a href=
|
||||
"http://www.radicaleye.com/dvips.html">dvips</a>.</li>
|
||||
|
||||
<li><span class="application">djvu</span> with <a href=
|
||||
"http://djvu.sourceforge.net">DjVuLibre</a>.</li>
|
||||
|
||||
<li><span class="application">Audio file tags</span>.
|
||||
Recoll releases 1.14 and later use a Python filter based
|
||||
on <a href="http://code.google.com/p/mutagen/">mutagen</a>
|
||||
for all audio types.</li>
|
||||
|
||||
<li><span class="application">Image file tags</span> with <a href=
|
||||
"http://www.sno.phy.queensu.ca/~phil/exiftool/">exiftool</a>.
|
||||
This is a perl program, so you also need perl on the
|
||||
system. This works with about any possible image file and
|
||||
tag format (jpg, png, tiff, gif etc.).</li>
|
||||
|
||||
<li><span class="application">Midi karaoke files</span> with
|
||||
Python, the
|
||||
<a href="http://pypi.python.org/pypi/midi/0.2.1">
|
||||
midi module</a>, and some help
|
||||
from <a href="http://chardet.feedparser.org/">chardet</a>. There
|
||||
is probably a <tt>python-chardet</tt> package for your distribution,
|
||||
but you will quite probably need to build the midi
|
||||
package. This is easy but see the <a href="helpernotes.html#midi">
|
||||
notes here</a>.
|
||||
</li>
|
||||
|
||||
<li><span class="application">MediaWiki dump files</span>:
|
||||
Thomas Levine has written a handler for these, you will find
|
||||
it here:
|
||||
<a href="https://bitbucket.org/tlevine/recoll/src/0127be78bffdd8a294067966a3ba7b2663d7b0cf/src/filters/rclmwdump?at=default&fileviewer=file-view-default">rclmwdump</a>.</li>
|
||||
|
||||
</ul>
|
||||
|
||||
<h2><a name="other">Other features</a></h2>
|
||||
|
||||
<ul>
|
||||
<li>Can use a Firefox extension to index visited Web pages
|
||||
history. See <a href="faqsandhowtos/IndexWebHistory.html">the
|
||||
Howto</a> for more detail.</li>
|
||||
|
||||
<li>Processes all email attachments, and more generally any
|
||||
realistic level of container imbrication (the "msword attachment to
|
||||
a message inside a mailbox in a zip" thingy...) .</li>
|
||||
|
||||
<li>Multiple selectable databases.</li>
|
||||
|
||||
<li>Powerful query facilities, with boolean searches,
|
||||
phrases, filter on file types and directory tree.</li>
|
||||
|
||||
<li>Xesam-compatible query language.</li>
|
||||
|
||||
<li>Wildcard searches (with a specific and faster function
|
||||
for file names).</li>
|
||||
|
||||
<li>Support for multiple charsets. Internal processing and
|
||||
storage uses Unicode UTF-8.</li>
|
||||
|
||||
<li><a href="#stemming">Stemming</a> performed at query
|
||||
time (can switch stemming language after indexing).</li>
|
||||
|
||||
<li>Easy installation. No database daemon, web server or
|
||||
exotic language necessary.</li>
|
||||
|
||||
<li>An indexer which runs either as a batch, cron'able
|
||||
program, or as a real-time indexing daemon, depending on
|
||||
preference.</li>
|
||||
</ul>
|
||||
|
||||
<h2><a name="integration">Desktop and web integration</a></h2>
|
||||
|
||||
<p>The <span class="application">Recoll</span> GUI has many
|
||||
features that help to specify an efficient search and to manage
|
||||
the results. However it may sometimes be preferable to use a
|
||||
simpler tool with a better integration with your desktop
|
||||
interfaces. Several solutions exist:</p>
|
||||
<ul>
|
||||
<li>The <span class="application">Recoll</span> KIO module
|
||||
allows starting queries and viewing results from the
|
||||
Konqueror browser or KDE applications <em>Open</em> dialogs.</li>
|
||||
<li>The <a href="http://kde-apps.org">recollrunner</a> krunner
|
||||
module allows integrating Recoll search results into a
|
||||
krunner query.</li>
|
||||
<li>The Ubuntu Unity Recoll Lens (or Scope for newer Unity
|
||||
versions) lets you access Recoll search
|
||||
from the Unity Dash. More
|
||||
slightly obsolete information <a href="faqsandhowtos/UnityLens.html">
|
||||
here</a>. </li>
|
||||
<li>The <a href="http://github.com/medoc92/recoll-webui">Recoll
|
||||
Web UI</a> lets you query a Recoll index from a web browser</li>
|
||||
</ul>
|
||||
<p>Recoll also has
|
||||
<a href="usermanual/usermanual.html#RCL.PROGRAM.PYTHONAPI">
|
||||
<span class="application">Python</span></a> and
|
||||
<span class="application">PHP</span> modules which can allow
|
||||
easy integration with web or other applications.</p>
|
||||
|
||||
<h2><a name="stemming"></a>Stemming</h2>
|
||||
|
||||
<p>Stemming is a process which transforms inflected words
|
||||
into their most basic form. For example, <i>flooring</i>,
|
||||
<i>floors</i>, <i>floored</i> would probably all be
|
||||
transformed to <i>floor</i> by a stemmer for the English
|
||||
language.</p>
|
||||
|
||||
<p>In many search engines, the stemming process occurs during
|
||||
indexing. The index will only contain the stemmed form of
|
||||
words, with exceptions for terms which are detected as being
|
||||
probably proper nouns (ie: capitalized). At query time, the
|
||||
terms entered by the user are stemmed, then matched against
|
||||
the index.</p>
|
||||
|
||||
<p>This process results in a smaller index, but it has the
|
||||
grave drawback of irrevocably losing information during
|
||||
indexing.</p>
|
||||
|
||||
<p>Recoll works in a different way. No stemming is performed
|
||||
at indexing time, so that all information gets into the index.
|
||||
The resulting index is bigger, but most people probably don't
|
||||
care much about this nowadays, because they have a 100Gb disk
|
||||
95% full of binary data <em>which does not get
|
||||
indexed</em>.</p>
|
||||
|
||||
<p>At the end of an indexing pass, Recoll builds one or
|
||||
several stemming dictionaries, where all word stems are
|
||||
listed in correspondence to the list of their
|
||||
derivatives.</p>
|
||||
|
||||
<p>At query time, by default, user-entered terms are stemmed,
|
||||
then matched against the stem database, and the query is
|
||||
expanded to include all derivatives. This will yield search
|
||||
results analogous to those obtained by a classical engine.
|
||||
The benefit of this approach is that stem expansion can be
|
||||
controlled instantly at query time in several ways:</p>
|
||||
|
||||
<ul>
|
||||
<li>It can be selectively turned-off for any query term by
|
||||
capitalizing it (<i>Floor</i>).</li>
|
||||
|
||||
<li>The stemming language (ie: english, french...) can be
|
||||
selected (this supposes that several stemming databases
|
||||
have been built, which can be configured as part of the
|
||||
indexing, or done later, in a reasonably fast way).</li>
|
||||
</ul>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@ -1,242 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>Recoll updated filters</title>
|
||||
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux
|
||||
based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search, desktop search, unix, linux">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
|
||||
<link type="text/css" rel="stylesheet" href="../styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="../index.html">Home</a></li>
|
||||
<li><a href="../download.html">Downloads</a></li>
|
||||
<li><a href="../usermanual/index.html">User manual</a></li>
|
||||
<li><a href="../usermanual/RCL.INSTALL.html">Installation</a></li>
|
||||
<li><a href="../index.html#support">Support</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1>Updated filters for Recoll</h1>
|
||||
|
||||
<p>The following describes new and updated filters, which will be
|
||||
part of the next release, but can be installed on an older
|
||||
release if you need them.</p>
|
||||
|
||||
<p>For updated filters, you just need to copy the script to the
|
||||
filters directory which may be typically either <span
|
||||
class="filename">/usr/share/recoll/filters</span>, or <span
|
||||
class="filename">/usr/local/share/recoll/filters</span>. Please check
|
||||
that the script is executable after copying it, and make it so if
|
||||
needed (chmod a+x <i>scriptname</i>)</p>
|
||||
|
||||
<p>For new filters, you'll need to copy the script file as
|
||||
above, possibly install the supporting application, and usually
|
||||
edit the
|
||||
<span class="filename">mimemap</span>,
|
||||
<span class="filename">mimeview</span> and
|
||||
<span class="filename">mimeconf</span> files, either in the
|
||||
shared directory
|
||||
(<span class="filename">
|
||||
/usr[/local]/share/recoll/examples</span>), or
|
||||
in your personal configuration directory
|
||||
(<span class="filename">$HOME/.recoll</span> or
|
||||
<span class="filename">$RECOLL_CONFDIR</span>).</p>
|
||||
|
||||
<p>Alternatively, you can replace your system files with
|
||||
these updated and complete versions:
|
||||
<a href="mimemap">mimemap</a>
|
||||
<a href="mimeconf">mimeconf</a>
|
||||
<a href="mimeview">mimeview</a>.</p>
|
||||
|
||||
<p>There is a slightly more detailed description of the filter
|
||||
installation procedure on the
|
||||
<a href="http://www.recoll.org/faqsandhowtos/FilterRetrofit.html">
|
||||
Recoll Wiki</a>.</p>
|
||||
|
||||
<p>The following entries are in reverse chronological order. Each
|
||||
lists the latest Recoll release on which the update makes sense
|
||||
(newer releases have an up to date version of the filter).</p>
|
||||
|
||||
<p>However, if you are running a Recoll version older than 1.17,
|
||||
you should really upgrade.</p>
|
||||
|
||||
<h2>PDF documents</h2>
|
||||
<p>Fixed <a href="rclpdf">rclpdf</a> filter, compatible with
|
||||
newer poppler pdftotext versions, which now properly escape
|
||||
text inside the html &lt;head&gt; section (but not the body,
|
||||
curiously).</p>
|
||||
|
||||
<h2>Scribus documents</h2>
|
||||
<p>An improved <a href="rclscribus">rclscribus</a> filter,
|
||||
thanks to Morten Langlo.</p>
|
||||
|
||||
<h2>7zip archives</h2>
|
||||
<p>A new <a href="rcl7z">rcl7z</a> filter by François Botha
|
||||
for 7zip archives. Needs the
|
||||
<a href="https://pypi.python.org/pypi/pylzma">pylzma Python
|
||||
module</a>. </p>
|
||||
|
||||
<h2>Attachments to PDF documents (1.20 and older)</h2>
|
||||
|
||||
<p>A new <a href="rclmpdf">rclmpdf</a> filter for processing
|
||||
PDF files with attachments. This replaces the old <b>rclpdf</b>
|
||||
filter. You need to add it to ~/.recoll/mimeconf until it is
|
||||
made standard (this is still a bit experimental, and a big
|
||||
change from the previous filter):
|
||||
<pre><tt>
|
||||
[index]
|
||||
application/pdf = execm rclmpdf
|
||||
</tt></pre>
|
||||
Note the <tt>execm</tt> instead of <tt>exec</tt>. </p>
|
||||
|
||||
<h2><a name="soff1">Open/Libre-Office documents (1.19 and older)</a></h2>
|
||||
|
||||
<p><a href="rclsoff">rclsoff</a>: the previous version did not
|
||||
produce white space between input tab-separated words, leading
|
||||
to search failures.</p>
|
||||
|
||||
|
||||
<h2>Purple logs (1.20 and older)</h2>
|
||||
|
||||
<p>New <a href="rclpurple">rclpurple</a> filter for Pidgin and
|
||||
other chat applications log files. Handles newer log
|
||||
formats. </p>
|
||||
|
||||
<h2>PowerPoint documents (1.19 and older)</h2>
|
||||
|
||||
<p>The <b>rclppt</b> filter was based on <b>catppt</b>, but this
|
||||
seems to fail quite often on newer PPT
|
||||
documents. The new version is based on code from
|
||||
the <b>libreoffice</b> <b>mso-dump</b> project. It is both
|
||||
reasonably fast and quite thorough.
|
||||
</p>
|
||||
|
||||
<p>Installation:<ul>
|
||||
<li>As <tt>recollindex</tt> was executing <b>catppt</b>
|
||||
directly in the default configuration, you will also need to add
|
||||
the following to
|
||||
the <tt>mimeconf</tt> file (e.g.: ~/.recoll/mimeconf):
|
||||
<pre>
|
||||
[index]
|
||||
application/vnd.ms-powerpoint = exec rclppt
|
||||
</pre>
|
||||
</li>
|
||||
<li>Copy the 3 following files to the Recoll filters directory (e.g:
|
||||
<i>/usr/share/recoll/filters</i>) and make sure
|
||||
that <tt>ppt-dump.py</tt> and <tt>rclppt</tt> are executable.
|
||||
<ul>
|
||||
<li><a href="rclppt">rclppt</a></li>
|
||||
<li><a href="ppt-dump.py">ppt-dump.py</a></li>
|
||||
<li><a href="msodump.zip">msodump.zip</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</p>
|
||||
|
||||
<h2>EPUB documents (1.17 and older)</h2>
|
||||
|
||||
<p>New <a href="rclepub">rclepub</a> filter for EPUB documents.
|
||||
This needs
|
||||
the <a href="http://pypi.python.org/pypi/epub/0.5.0">
|
||||
python epub decoding module</a>. </p>
|
||||
|
||||
<h2>CHM files (1.17.1 and older)</h2>
|
||||
<p><a href="rclchm">rclchm</a>. The previous version of the
|
||||
filter mishandled files which had encoded internal URLs (not
|
||||
very frequent, but happens).</p>
|
||||
|
||||
<h2>Updated Open Document filter (1.17 and older)</h2>
|
||||
|
||||
<p>The <a href="rclsoff">new filter</a> will correctly handle
|
||||
exported Google Docs documents and also Open/LibreOffice ones in
|
||||
some cases. The previous filters concatenated all the text
|
||||
inside the exported Google docs without any spacing...</p>
|
||||
|
||||
<h2>TAR archives (1.17 and older)</h2>
|
||||
|
||||
<p>New <a href="rcltar">rcltar</a> filter for tar archives. The
|
||||
indexing of tar archives is disabled by default in the sample
|
||||
configuration (stored here). This is an <tt>execm</tt>
|
||||
filter !. You'll need to add an <br>
|
||||
<tt>application/x-tar = execm rcltar</tt><br>
|
||||
line in the [index] section of your
|
||||
$HOME/.recoll/mimeconf to enable it, not an <tt>exec</tt> one.</p>
|
||||
|
||||
<h2>XML files (1.17 and older)</h2>
|
||||
|
||||
<p>By default, the current recoll version does not index xml
|
||||
content (except for known formats like dia, svg etc.). This
|
||||
new <a href="rclxml">rclxml</a> filter will extract the data
|
||||
from any xml file. Only text data is extracted, no attribute
|
||||
values. The other option is to treat xml file as plain text
|
||||
one (see comment in mimeconf), and index everything, including
|
||||
a lot of garbage.</p>
|
||||
|
||||
<h2>DIA files (1.16 and older)</h2>
|
||||
<p><a href="rcldia">rcldia</a> is a new filter
|
||||
for <a href="http://projects.gnome.org/dia/">Dia</a> files,
|
||||
contributed by Stefan Friedel.</p>
|
||||
|
||||
|
||||
<h2>Okular annotations (1.16 and older)</h2>
|
||||
<p><a href="rclokulnote">rclokulnote</a>. Okular lets you create
|
||||
annotations for PDF documents and stores them in xml format
|
||||
somewhere under ~/.kde. This filter does not do a nice job to
|
||||
format the data, but will at least let you find it...</p>
|
||||
|
||||
<h2>Gnumeric (1.16 and older)</h2>
|
||||
<p><a href="rclgnm">rclgnm</a>. Needs xsltproc and
|
||||
gunzip. As <tt>.gnumeric</tt> was in the list of
|
||||
explicitly ignored suffixes, you can't just add the mime
|
||||
and indexer script lines to your local mimemap and mimeconf, you
|
||||
also need to define recoll_noindex in the local mimemap (to
|
||||
override the system one which
|
||||
contains <tt>.gnumeric</tt>). The simplest approach may be to
|
||||
just replace the system files with those above.</p>
|
||||
|
||||
<h2>Rar archive support (1.15 and older)</h2>
|
||||
<p><a href="rclrar">rclrar</a>. This is up to date in Recoll
|
||||
1.16.2 but may be added to Recoll 1.15. It needs the Python
|
||||
rarfile module. </p>
|
||||
|
||||
<h2>Mimehtml support (1.15)</h2>
|
||||
<p>This is based on the internal mail filter, you just need to
|
||||
download and install the configuration files (mimemap and
|
||||
mimeconf). Will only work with 1.15 and later.</p>
|
||||
|
||||
<h2>Konqueror webarchive (.war) filter (1.15)</h2>
|
||||
<p><a href="rclwar">rclwar</a></p>
|
||||
|
||||
<h2>Updated zip archive filter (1.15)</h2>
|
||||
<p>The filter is corrected to handle utf-8 paths in zip archives:
|
||||
<a href="rclzip">rclzip</a>. Up to date in Recoll 1.16, but
|
||||
may be useful with Recoll 1.15</p>
|
||||
|
||||
<h2>Updated audio tag filter (1.14)</h2>
|
||||
<p>The mutagen-based rclaudio filter delivered with recoll 1.14.2
|
||||
used a very recent mutagen interface which will only work with
|
||||
mutagen versions after 1.17 (probably. at least works with 1.19,
|
||||
doesn't with 1.15).
|
||||
You can download the <a href="rclaudio">corrected script
|
||||
here. Not useful with Recoll 1.5 or 1.6</a>.
|
||||
</p>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@ -1,211 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>RECOLL: un outil personnel de recherche textuelle pour
|
||||
Unix et Linux</title>
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll est un logiciel personnel de recherche textuelle pour unix et linux basé sur Xapian, un moteur d'indexation puissant et mature.">
|
||||
<meta name="Keywords" content=
|
||||
"recherche textuelle,desktop,unix,linux,solaris,open source,free">
|
||||
<meta http-equiv="Content-language" content="fr">
|
||||
<meta http-equiv="content-type" content=
|
||||
"text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
<link type="text/css" rel="stylesheet" href="../styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="../index.html">Base</a></li>
|
||||
<li><a href="../pics/index.html">Copies d'écrans</a></li>
|
||||
<li><a href="../download.html">Téléchargements</a></li>
|
||||
<li><a href="../doc.html">Documentation</a></li>
|
||||
<li><a href="../index.html#support">Support</a></li>
|
||||
<li><a href="../devel.html">Développement</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1 class="intro">Caractéristiques de Recoll</h1>
|
||||
|
||||
<dl>
|
||||
<dt><a name="systems">Systèmes</a></dt>
|
||||
<dd><span class="application">Recoll</span> a été compilé et
|
||||
testé sur FreeBSD, Linux, Darwin, Solaris (versions
|
||||
FreeBSD 5/6, Fedora Core 5/6, Suse 10.1, Gentoo,
|
||||
Debian 3.1, Ubuntu Edgy, Solaris 8/9, mais d'autres versions
|
||||
récentes conviennent sans doute également).</dd>
|
||||
|
||||
<dd>Versions de QT: 3.2, 3.3 et 4.2</dd>
|
||||
|
||||
<dt><a name="doctypes">Types de documents</a></dt>
|
||||
<dd>Recoll peut traiter les types de documents suivants, ainsi
|
||||
que des fichiers compressés du même type:
|
||||
|
||||
<dl>
|
||||
<dt>En interne</dt>
|
||||
|
||||
<dd>
|
||||
<ul>
|
||||
<li><var class="literal">text</var>.</li>
|
||||
|
||||
<li><var class="literal">html</var>.</li>
|
||||
|
||||
<li><span class="application">OpenOffice</span>
|
||||
(avec l'aide de la commande <b>unzip</b>).</li>
|
||||
|
||||
<li><span class="application">Abiword</span>.</li>
|
||||
|
||||
<li><span class="application">Kword</span>.</li>
|
||||
|
||||
<li><var class="literal">maildir</var>,
|
||||
<var class="literal">mh</var> et <var
|
||||
class="literal">mailbox</var> (<span class=
|
||||
"application">Mozilla</span>, <span class=
|
||||
"application">Thunderbird</span>, <span class=
|
||||
"application">Evolution</span> et sans doute
|
||||
d'autres).</li>
|
||||
|
||||
<li>Fichiers de conversation <span class="application">
|
||||
gaim</span>.</li>
|
||||
<li><span class="application">Lyx</span> (qui doit
|
||||
être présent).</li>
|
||||
|
||||
<li><span class="application">Scribus</span>.</li>
|
||||
|
||||
</ul>
|
||||
</dd>
|
||||
|
||||
<dt>Avec des paquets externes</dt>
|
||||
|
||||
<dd>
|
||||
<ul>
|
||||
<li><var class="literal">pdf</var> avec <a href=
|
||||
"http://www.foolabs.com/xpdf/">xpdf</a>.</li>
|
||||
|
||||
<li><var class="application">Wordperfect</var> avec <a href=
|
||||
"http://libwpd.sourceforge.net">libwpd</a>.</li>
|
||||
|
||||
<li><var class="literal">postscript</var> avec
|
||||
<a href="http://www.gnu.org/software/ghostscript/ghostscript.html">
|
||||
ghostscript</a> et
|
||||
<a href="http://www.cs.wisc.edu/~ghost/doc/pstotext.htm">
|
||||
pstotext</a>.</li>
|
||||
|
||||
<li><span class="application">msword</span> avec <a href=
|
||||
"http://www.winfield.demon.nl/">antiword</a>.</li>
|
||||
|
||||
<li><span class="application">Powerpoint</span> et
|
||||
<span class="application">Excel</span> avec les utilitaires
|
||||
<a href="http://www.45.free.net/~vitus/software/catdoc/">
|
||||
catdoc</a>.</li>
|
||||
|
||||
<li><var class="literal">rtf</var> avec <a href=
|
||||
"http://www.gnu.org/software/unrtf/unrtf.html">unrtf</a>.</li>
|
||||
|
||||
<li><var class="literal">dvi</var> avec
|
||||
<a href="http://www.radicaleye.com/dvips.html">dvips</a>.
|
||||
</li>
|
||||
|
||||
<li><var class="literal">djvu</var> avec
|
||||
<a href="http://djvulibre.djvuzone.org/doc/index.html">
|
||||
DjVuLibre</a>. </li>
|
||||
|
||||
<li>Tags <var class="literal">mp3</var> avec
|
||||
<a href="http://id3lib.sourceforge.net/">
|
||||
id3info (id3lib)</a>. </li>
|
||||
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd>
|
||||
|
||||
<dt>Autres caractéristiques</dt>
|
||||
<dd>
|
||||
<ul>
|
||||
<li>Index multiples interrogeables ensemble ou séparément.</li>
|
||||
|
||||
<li>Fonctions de recherche puissantes, avec expressions
|
||||
booléennes, phrases et proximité, caractères jokers,
|
||||
filtrage sur les types de fichiers ou l'emplacement.</li>
|
||||
|
||||
<li>Fonction spécifique de recherche de noms de fichiers.</li>
|
||||
|
||||
<li>Support de jeux de caractères multiples. Les traitements
|
||||
internes et l'index utilisent l'encodage Unicode UTF-8.</li>
|
||||
|
||||
<li>L'extraction des racines de mots <a href="#Stemming">
|
||||
Stemming</a> est effectuée au moment de la recherche
|
||||
(permet de changer de langue après l'indexation).</li>
|
||||
|
||||
<li>Installation facile. Pas de processus permanent, de
|
||||
serveur web ou environnement exotique.</li>
|
||||
|
||||
<li>Un indexeur qui peut fonctionner soit comme un
|
||||
processus léger dans l'interface de consultation, comme un
|
||||
programme batch externe intégrable par
|
||||
<span class="application">cron</span>, ou comme un processus
|
||||
permanent pour l'indexation au fil de l'eau.</li>
|
||||
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
|
||||
<h2><a name="Stemming"></a>Lemmatisation</h2>
|
||||
|
||||
<p><em>Note: je serais preneur d'une traduction française
|
||||
agréable pour "stemming".</em></p>
|
||||
<p>La lemmatisation transforme un mot dérivé vers sa racine.
|
||||
Par exemple, <i>aimer</i>, <i>aimerai</i>, <i>aimait</i>,
|
||||
<i>aimez</i> etc. seraient transformés en <i>aim</i> en
|
||||
français. Une recherche de l'un quelconque des dérivés peut
|
||||
automatiquement être étendue vers tous les autres</p>
|
||||
|
||||
<p>Certains moteurs de recherche appliquent la transformation
|
||||
pendant l'indexation. L'index ne stocke que les racines des
|
||||
mots, avec des exceptions pour les termes qui sont reconnus
|
||||
comme des noms propres (capitalisation). Au moment de la
|
||||
recherche, les termes de la requête sont également transformés
|
||||
avant comparaison à l'index.</p>
|
||||
|
||||
<p>Cette approche permet un index plus petit, mais elle perd
|
||||
irrévocablement de l'information pendant l'indexation.</p>
|
||||
|
||||
<p>Recoll fonctionne différemment. Les termes sont indexés sans
|
||||
transformation. L'index résultant est plus gros, ce qui n'a
|
||||
probablement pas beaucoup d'importance à une époque de disques
|
||||
de 100 Go principalement remplis d'information multimédia
|
||||
<em>non indexée</em>.</p>
|
||||
|
||||
<p>À la fin de l'indexation, Recoll construit un ou plusieurs
|
||||
dictionnaires de transformation (pour différents langages), où
|
||||
toutes les racines sont listées avec leurs transformations
|
||||
possibles.</p>
|
||||
|
||||
|
||||
<p>Au moment de la recherche, par défaut, les termes de
|
||||
l'utilisateur sont transformés, et étendus aux dérivés par
|
||||
utilisation du dictionnaire.
|
||||
Les résultats obtenus sont analogues à ceux de
|
||||
l'autre méthode. L'avantage est que l'expansion peut être
|
||||
contrôlée au moment de la recherche:
|
||||
<ul>
|
||||
<li>On peut la supprimer pour n'importe quel terme de la
|
||||
requête, (en le faisant débuter par une capitale:
|
||||
<em>Aime</em> par exemple pour chercher la ville d'Aime la
|
||||
Plagne). </li>
|
||||
<li>Le langage de transformation peut également être changé,
|
||||
en supposant que plusieurs dictionnaires de transformation
|
||||
aient été construits lors de l'indexation.</li>
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@ -1,74 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>RECOLL: a personal text search system for
|
||||
Unix/Linux</title>
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content=
|
||||
"text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html">Home</a></li>
|
||||
|
||||
<li><a href="features.html#doctypes">Back to document types</a></li>
|
||||
|
||||
<li><a href="pics/index.html">Screenshots</a></li>
|
||||
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
|
||||
<li><a href="doc.html">User manual</a></li>
|
||||
|
||||
<li><a href="index.html#support">Support</a></li>
|
||||
|
||||
<li><a href="devel.html">Development</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
<h1>Notes about building/using specific external helper
|
||||
applications</h1>
|
||||
|
||||
<h2><a name="midi">The Python midi module</a></h2>
|
||||
<p>The normal procedure for building a Python module
|
||||
applies:</p>
|
||||
<pre><tt>
|
||||
tar xvzf midi-0.2.1.tar.gz
|
||||
cd midi-0.2.1
|
||||
python setup.py build
|
||||
sudo python setup.py install
|
||||
</tt></pre>
|
||||
|
||||
<p>However, the midi module includes an alsa driver interface
|
||||
which needs Swig to build and probably does not build at all
|
||||
on recent Linux versions (the last version for the package
|
||||
dates from 2006). Recoll does not need midi sequencer hardware
|
||||
:), so if you don't need it for other purposes, you can disable
|
||||
the Alsa interface by editing setup.py and changing the
|
||||
platform name at line 37 (the Alsa thing is only tried on
|
||||
Linux):</p>
|
||||
|
||||
|
||||
<pre><tt>
|
||||
37c37
|
||||
< if platform.startswith('linux'):
|
||||
---
|
||||
> if platform.startswith('NONE'):
|
||||
</tt></pre>
|
||||
|
||||
<p>The package should then build and install just fine.</p>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@ -1,57 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>RECOLL: building id3lib with gcc 4.4</title>
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content=
|
||||
"text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html">Home</a></li>
|
||||
<li><a href="features.html">Features</a></li>
|
||||
<li><a href="pics/index.html">Screenshots</a></li>
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
<li><a href="doc.html">User manual</a></li>
|
||||
<li><a href="index.html#support">Support</a></li>
|
||||
<li><a href="devel.html">Development</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h2>Compiling id3lib with recent gcc versions (2010-06-29)</h2>
|
||||
<p>Recoll uses a program installed by the id3lib package for
|
||||
indexing mp3 files. Id3lib has not been updated for some time and
|
||||
will not compile with gcc versions after 4.4 because of gcc
|
||||
incompatibilities.</p>
|
||||
<p><a href="files/id3lib-3.8.3-gcc44.patch">Here is a minuscule
|
||||
patch</a> to help compiling id3lib. To use it:</p>
|
||||
<ul>
|
||||
<li>Download the patch (right-click the link and use 'Save As').</li>
|
||||
<li>Extract the id3 lib source
|
||||
(<tt>tar xvzf id3lib-3.8.3.tar.gz</tt>).</li>
|
||||
<li>Change your current directory to the top of the id3lib source
|
||||
tree and apply the patch:<br>
|
||||
<tt>cd id3lib-3.8.3<br>
|
||||
patch -p1 < /path/to/the/saved/patch</tt></li>
|
||||
<li>Run autoconf (you may have to install it, but your package
|
||||
manager can certainly do it for you).</li>
|
||||
<li>Run <tt>make</tt> and <tt>make install</tt>.</li>
|
||||
</ul>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@ -1,7 +0,0 @@
|
||||
.SUFFIXES: .txt .html
|
||||
|
||||
.txt.html:
|
||||
asciidoc $<
|
||||
|
||||
all: threadingRecoll.html forkingRecoll.html xapDocCopyCrash.html
|
||||
|
||||
|
Before Width: | Height: | Size: 30 KiB |
@ -1,224 +0,0 @@
|
||||
= Recoll command execution performance
|
||||
:Author: Jean-François Dockès
|
||||
:Email: jfd@recoll.org
|
||||
:Date: 2015-05-22
|
||||
|
||||
== Abstract
|
||||
|
||||
== Introduction
|
||||
|
||||
The Recoll indexer, *recollindex*, is a big process which executes many
|
||||
others, mostly for extracting text from documents. Some of the executed
|
||||
processes are quite short-lived, and the time used by the process execution
|
||||
machinery can actually dominate the time used to translate data. This
|
||||
document explores possible approaches to improving performance without
|
||||
adding excessive complexity or damaging reliability.
|
||||
|
||||
Studying fork/exec performance is not exactly a new venture, and there are
|
||||
many texts which address the subject. While researching, though, I found
|
||||
out that not so many were accurate and that a lot of questions were left as
|
||||
an exercise to the reader.
|
||||
|
||||
== Issues with fork
|
||||
|
||||
The traditional way for a Unix process to start another is the
|
||||
+fork()+/+exec()+ system call pair.
|
||||
|
||||
+fork()+ duplicates the process address space and resources (open files
|
||||
etc.), then duplicates the thread of execution, ending up with 2 mostly
|
||||
identical processes.
|
||||
|
||||
+exec()+ then replaces part of the newly executing process with an address
|
||||
space initialized from an executable file, inheriting some of the resources
|
||||
under various conditions.
|
||||
|
||||
This was all fine with the small processes of the first Unix systems, but
|
||||
as time progressed, processes became bigger and the copy-before-discard
|
||||
operation was found to waste significant resources. It was optimized using
|
||||
two methods (at very different points in time):
|
||||
|
||||
- The first approach was to supplement +fork()+ with the +vfork()+ call, which
|
||||
is similar but does not duplicate the address space: the new process
|
||||
thread executes in the old address space. The old thread is blocked
|
||||
until the new one calls +exec()+ and frees up access to the memory
|
||||
space. Any modification performed by the child thread persists when
|
||||
the old one resumes.
|
||||
|
||||
- The more modern approach, which coexists with +vfork()+, was to replace
|
||||
the full duplication of the memory space with duplication of the page
|
||||
descriptors only. The pages in the new process are marked copy-on-write
|
||||
so that the new process has write access to its memory without
|
||||
disturbing its parent. This approach was supposed to make +vfork()+
|
||||
obsolete, but the operation can still be a significant resource consumer
|
||||
for big processes mapping a lot of memory, so that +vfork()+ is still
|
||||
around. Programs can have big memory spaces not only because they have
|
||||
huge data segments (rare), but just because they are linked to many
|
||||
shared libraries (more common).
|
||||
|
||||
NOTE: Orders of magnitude: a *recollindex* process will easily grow into a
|
||||
few hundred megabytes of virtual space. It executes the small and
|
||||
efficient *antiword* command to extract text from *ms-word* files. While
|
||||
indexing multiple such files, *recollindex* can spend '60% of its CPU time'
|
||||
doing `fork()`/`exec()` housekeeping instead of useful work (this is on Linux,
|
||||
where `fork()` uses copy-on-write).
|
||||
|
||||
Apart from the performance cost, another issue with +fork()+ is that a big
|
||||
process can fail executing a small command because of the temporary need to
|
||||
allocate twice its address space. This is a much discussed subject which we
|
||||
will leave aside because it generally does not concern *recollindex*, which
|
||||
in typical conditions uses a small portion of the machine virtual memory,
|
||||
so that a temporary doubling is not an issue.
|
||||
|
||||
The Recoll indexer is multithreaded, which may introduce other issues. Here
|
||||
is what happens to threads during the +fork()+/+exec()+ interval:
|
||||
|
||||
- +fork()+:
|
||||
* The parent process threads all go on their merry way.
|
||||
* The child process is created with only one thread active, duplicated
|
||||
from the one which called +fork()+
|
||||
- +vfork()+
|
||||
* The parent process thread calling +vfork()+ is suspended, the others
|
||||
are unaffected.
|
||||
* The child is created with only one thread, as for +fork()+.
|
||||
This thread shares the memory space with the parent ones, without
|
||||
having any means to synchronize with them (pthread locks are not
|
||||
supposed to work across processes): caution needed !
|
||||
|
||||
NOTE: for a multithreaded program using the classical pipe method to
|
||||
communicate with children, the sequence between the `pipe()` call and the
|
||||
parent `close()` of the unused side is a candidate for a critical section:
|
||||
if several threads can interleave in there, child processes may inherit
|
||||
descriptors which 'belong' to other `fork()`/`exec()` operations, which may
|
||||
in turn be a problem or not depending on how descriptor cleanup is
|
||||
performed in the child (if no cleanup is performed, pipes may remain open
|
||||
at both ends which will prevent seeing EOFs etc.). Thanks to StackExchange
|
||||
user Celada for explaining this to me.
|
||||
|
||||
For multithreaded programs, both +fork()+ and +vfork()+ introduce possibilities
|
||||
of deadlock, because the resources held by a non-forking thread in the
|
||||
parent process can't be released in the child because the thread is not
|
||||
duplicated. This used to happen from time to time in *recollindex* because
|
||||
of an error logging call performed if the +exec()+ failed after the +fork()+
|
||||
(e.g. command not found).
|
||||
|
||||
With +vfork()+ it is also possible to trigger a deadlock in the parent by
|
||||
(inadvertently) modifying data in the child. This could happen just
|
||||
link:http://www.oracle.com/technetwork/server-storage/solaris10/subprocess-136439.html[because
|
||||
of dynamic linker operation] (which, seriously, should be considered a
|
||||
system bug).
|
||||
|
||||
|
||||
In general, the state of program data in the child process is a semi-random
|
||||
snapshot of what it was in the parent, and the official word about what you
|
||||
can do is that you can only call
|
||||
link:http://man7.org/linux/man-pages/man7/signal.7.html[async-safe library
|
||||
functions] between +fork()+ and +exec()+. These are functions which are
|
||||
safe to call from a signal handler because they are either reentrant or
|
||||
can't be interrupted by a signal. A notable missing entry in the list is
|
||||
`malloc()`.
|
||||
|
||||
These are normally not issues for programs which only fork to execute
|
||||
another program (but the devil is in the details as demonstrated by the
|
||||
logging call issue...).
|
||||
|
||||
One of the approaches often proposed for working around this mine-field is
|
||||
to use an auxiliary small process to execute any command needed by the main
|
||||
one. The small process can just use +fork()+/+exec()+ with no performance
|
||||
issues. This has the drawback of complicating communication a lot if
|
||||
data needs to be transferred one way or another.
|
||||
|
||||
////
|
||||
Passing descriptors around
|
||||
http://stackoverflow.com/questions/909064/portable-way-to-pass-file-descriptor-between-different-processes
|
||||
http://www.normalesup.org/~george/comp/libancillary/
|
||||
http://stackoverflow.com/questions/28003921/sending-file-descriptor-by-linux-socket/
|
||||
|
||||
The process would then be:
|
||||
- Tell slave to fork/exec cmd (issue with cmd + args format)
|
||||
- Get fds
|
||||
- Tell slave to wait, recover status.
|
||||
////
|
||||
|
||||
== The posix_spawn() Linux non-event
|
||||
|
||||
Given the performance issues of `fork()` and tricky behaviour of `vfork()`,
|
||||
a "simpler" method for starting a child process was introduced by Posix:
|
||||
`posix_spawn()`.
|
||||
|
||||
The `posix_spawn()` function is a black box, externally equivalent to a
|
||||
`fork()`/`exec()` sequence, and has parameters to specify the usual
|
||||
house-keeping performed at this time (file descriptors and signals
|
||||
management etc.). Hiding the internals gives the system a chance to
|
||||
optimize the performance and avoid `vfork()` pitfalls like the `ld.so`
|
||||
lockup described in the Oracle article.
|
||||
|
||||
The Linux posix_spawn() is implemented by a `fork()`/`exec()` pair by default.
|
||||
|
||||
`vfork()` is used either if specified by an input flag or no
|
||||
signal/scheduler/process_group changes are requested. There must be a
|
||||
reason why signal handling changes would preclude `vfork()` usage, but I
|
||||
could not find it (signal handling data is stored in the kernel task_struct).
|
||||
|
||||
The Linux glibc `posix_spawn()` currently does nothing that user code could
|
||||
not do. Still, using it would probably be a good future-proofing idea, but
|
||||
for a significant problem: there is no way to specify closing all open
|
||||
descriptors bigger than a specified value (closefrom() equivalent). This is
|
||||
available on Solaris and quite necessary in fact, because we have no way to
|
||||
be sure that all open descriptors have the CLOEXEC flag set.
|
||||
|
||||
So, no `posix_spawn()` for us (support was implemented inside
|
||||
*recollindex*, but the code is normally not used).
|
||||
|
||||
== The chosen solution
|
||||
|
||||
The previous version of +recollindex+ used to use +vfork()+ if it was running
|
||||
a single thread, and +fork()+ if it ran multiple ones.
|
||||
|
||||
After another careful look at the code, I could see few issues with
|
||||
using +vfork()+ in the multithreaded indexer, so this was committed.
|
||||
|
||||
The only change necessary was to get rid of an implementation of the
|
||||
lacking Linux +closefrom()+ call (used to close all open descriptors above a
|
||||
given value). The previous Recoll implementation listed the +/proc/self/fd+
|
||||
directory to look for open descriptors but this was unsafe because of
|
||||
possible memory allocations in +opendir()+ etc.
|
||||
|
||||
== Test results
|
||||
|
||||
.Indexing 12500 small .doc files
|
||||
[options="header"]
|
||||
|===============================
|
||||
|call |real |user |sys
|
||||
|fork |0m46.025s |0m26.574s |0m39.494s
|
||||
|vfork |0m18.223s |0m17.753s |0m1.736s
|
||||
|spawn/fork| 0m45.726s|0m27.082s| 0m40.575s
|
||||
|spawn/vfork|0m18.915s|0m18.681s|0m3.828s
|
||||
|recoll 1.18|1m47.589s|0m21.537s|0m29.458s
|
||||
|================================
|
||||
|
||||
No surprise here, given the implementation of +posix_spawn()+, it gets the
|
||||
same times as the +fork()+/+vfork()+ options.
|
||||
|
||||
The tests were performed on an Intel Core i5 750 (4 cores, 4 threads).
|
||||
|
||||
It would be painful to play it safe and discard the 60% reduction in
|
||||
execution time offered by using +vfork()+, so this was adopted for Recoll
|
||||
1.21. To this day, no problems were discovered, but, still crossing
|
||||
fingers...
|
||||
|
||||
The last line in the table is just for the fun: *recollindex* 1.18
|
||||
(single-threaded) needed almost 6 times as long to process the same
|
||||
files...
|
||||
|
||||
////
|
||||
Objections to vfork:
|
||||
sigaction locks
|
||||
https://bugzilla.redhat.com/show_bug.cgi?id=193631
|
||||
Is Linux vfork thread-safe ? Quoting interesting comments from Solaris
|
||||
implementation: No answer to the issues cited though.
|
||||
https://sourceware.org/bugzilla/show_bug.cgi?id=378
|
||||
Aussi:
|
||||
http://blog.famzah.net/2009/11/20/fork-gets-slower-as-parent-process-use-more-memory/
|
||||
http://blog.famzah.net/2009/11/20/a-much-faster-popen-and-system-implementation-for-linux/
|
||||
Avec un workaround basé sur clone (donc linux-only). Tried it but crashes.
|
||||
////
|
||||
|
Before Width: | Height: | Size: 35 KiB |
|
Before Width: | Height: | Size: 19 KiB |
@ -1,406 +0,0 @@
|
||||
= Converting Recoll indexing to multithreading
|
||||
:Author: Jean-François Dockès
|
||||
:Email: jfd@recoll.org
|
||||
:Date: 2012-12-03
|
||||
|
||||
== Abstract
|
||||
|
||||
This relates lessons learned while modifying *Recoll* indexing to be
|
||||
multithreaded. I am by no means a threaded applications expert, so that a
|
||||
few of the observations I made while doing this may be of use to other
|
||||
novices.
|
||||
|
||||
== Introduction
|
||||
|
||||
http://www.recoll.org[*Recoll*] is a document indexing application, it
|
||||
allows you to find documents by specifying search terms.
|
||||
|
||||
The documents need to be _indexed_ for searches to be fast. In a nutshell,
|
||||
we convert the different document formats to text, then split the text into
|
||||
terms and remember where those occur. This is a time-consuming operation.
|
||||
|
||||
Up to version 1.18 *Recoll* indexing is single-threaded: routines which
|
||||
call each other sequentially.
|
||||
|
||||
In most personal indexer contexts, it is also CPU-bound. There is a lot of
|
||||
conversion work necessary for turning those PDF (or other) files into
|
||||
appropriately cleaned up pure text, then split it into terms and update the
|
||||
index. Given the relatively modest amount of data, and the speed of
|
||||
storage, I/O issues are secondary.
|
||||
|
||||
Looking at the _CPU idle_ *top* output stuck at 75% on my quad-core CPU,
|
||||
while waiting for the indexing to finish, was frustrating, and I was
|
||||
tempted to find a way to keep those other cores at temperature and shorten
|
||||
the waiting.
|
||||
|
||||
For some usages, the best way to accomplish this may be to just partition
|
||||
the index and independently start indexing on different configurations,
|
||||
using multiple processes to better utilize the available processing power.
|
||||
|
||||
This is not a universal solution though, as it is complicated to set up,
|
||||
not optimal in general for indexing performance, and not always optimal
|
||||
either at query time.
|
||||
|
||||
The most natural way to improve indexing times is to increase CPU
|
||||
utilization by using multiple threads inside an indexing process.
|
||||
|
||||
Something similar had been done with earlier versions of the *Recoll* GUI,
|
||||
which had an internal indexing thread. This had been a frequent source of
|
||||
trouble though, and linking the GUI and indexing process lifetimes was a
|
||||
bad idea, so, in recent versions, the indexing is always performed by an
|
||||
external process. Still, this experience had brought to light most of the
|
||||
problem areas, and prepared the code for further work.
|
||||
|
||||
It should be noted that, as `recollindex` is both _nice_'d and _ionice_'d
|
||||
as a lowest priority process, it will only use free computing power on the
|
||||
machine, and will step down as soon as anything else wants to work.
|
||||
|
||||
****
|
||||
|
||||
The only case where you may notice that the indexing is at work
|
||||
is when the machine is short on memory and things (such as
|
||||
your Web browser) get swapped-out while you are not actively using
|
||||
them. You then notice a long delay when you want to start, because they
|
||||
need to be swapped back in. There is little which can be done about
|
||||
this. Setting _idxflushmb_ to a low value may help in some cases (depending
|
||||
on the document sizes). May I also suggest in this case that, if your
|
||||
machine can take more memory, it may be a good idea to procure some, as
|
||||
memory is nowadays quite cheap, and memory-starved machines are not fun.
|
||||
|
||||
****
|
||||
|
||||
In general, augmenting the machine utilisation by `recollindex` just does
|
||||
not change its responsiveness. My PC has an Intel Pentium Core i5 750 (4
|
||||
cores, no hyperthreading), which is far from being a high performance CPU
|
||||
(nowadays...), and I often forget that I am running indexing tests, it is
|
||||
just not noticeable. The machine does have a lot of memory though (12GB).
|
||||
|
||||
|
||||
== The Recoll indexing processing flow
|
||||
|
||||
image::nothreads.png["Basic flow", float="right"]
|
||||
|
||||
There are 4 main steps in the `recollindex` processing pipeline:
|
||||
|
||||
. Find the file
|
||||
. Convert it to text
|
||||
. Process the text (split, strip etc.) and create a *Xapian* document
|
||||
. Update the index
|
||||
|
||||
The first step, walking the file system (or some other data source), is
|
||||
usually much faster than the others, and we just leave it alone to be
|
||||
performed by the main thread. It outputs file names (and the associated
|
||||
*POSIX* _stat_ data).
|
||||
|
||||
The last step, *Xapian* index updating, can only be single-threaded.
|
||||
|
||||
The first idea is to change the indexing pipeline so that each step is
|
||||
performed by an independent worker thread, passing its output to the next
|
||||
thread, in assembly-line fashion.
|
||||
|
||||
In order to achieve this, we need to decouple the different phases. They
|
||||
are normally linked by procedure calls, which we replace with a job
|
||||
control object: the 'WorkQueue'.
|
||||
|
||||
=== The WorkQueue
|
||||
|
||||
|
||||
The _WorkQueue_ object is implemented by a reasonably simple class, which
|
||||
manages an input queue on which clients append jobs, and a set of worker
|
||||
threads, which retrieve and perform the jobs, and whose lifetimes are
|
||||
managed by the _WorkQueue_ object. The implementation is straightforward
|
||||
with *POSIX* threads synchronization functions and C++ *STL* data
|
||||
structures.
|
||||
|
||||
In practice it proved quite simple to modify existing code to create a job
|
||||
object and put it on the queue, instead of calling the downstream routine
|
||||
with the job parameters, _while keeping the capacity to call the downstream
|
||||
routine directly_. The kind of coupling is determined either by compilation
|
||||
flags (for global disabling/enabling of multithreading), or according to
|
||||
configuration data, which allows experimenting with different threads
|
||||
arrangements just by changing parameters in a file, without recompiling.
|
||||
|
||||
Each _WorkQueue_ accepts two parameters: the length of the input queue
|
||||
(before a client will block when trying to add a job), and the number of
|
||||
worker threads. Both parameters can be set in the *Recoll* configuration
|
||||
file for each of the three queues used in the indexing pipeline. Setting
|
||||
the queue length to -1 will disable the corresponding queue (using a direct
|
||||
call instead).
|
||||
|
||||
unfloat::[]
|
||||
|
||||
|
||||
== The Assembly Line
|
||||
|
||||
image::assembly.png["Assembly line", float="right"]
|
||||
|
||||
So the first idea is to create 3 explicit threads to manage the file
|
||||
conversion, the term generation, and the *Xapian* index update. The first
|
||||
thread prepares a file, passes it on to the term generation thread, and
|
||||
immediately goes back to work on the next file, etc.
|
||||
|
||||
The presumed advantage of this method is that the different stages, which
|
||||
perform disjointed processing, should share little, so that we can hope to
|
||||
minimize the changes necessitated by the threads interactions.
|
||||
|
||||
However some changes to the code were needed to make this work (and a few
|
||||
bugs were missed, which only became apparent at later stages, confirming
|
||||
that the _low interaction_ idea was not completely false).
|
||||
|
||||
=== Converting to multithreading: what to look for
|
||||
|
||||
I am probably stating the obvious here, but when preparing a program for
|
||||
multi-threading, problems can only arise where non-constant data is
|
||||
accessed by different threads.
|
||||
|
||||
Once you have solved the core problems posed by the obvious data that needs
|
||||
to be shared, you will be left to deal with less obvious, hidden,
|
||||
interactions inside the program.
|
||||
|
||||
Classically this would concern global or static data, but in a C++ program,
|
||||
class members will be a concern if a single object can be accessed by
|
||||
several threads.
|
||||
|
||||
Hunting for static data inside a program of non trivial size is not always
|
||||
obvious. Two approaches can be used: hunting for the _static_ keyword in
|
||||
source code, or looking at global and static data symbols in *nm* output.
|
||||
|
||||
Once found, there are mostly three types of static/global data:
|
||||
|
||||
* Things that need to be eliminated: for example, routines can be made
|
||||
reentrant by letting the caller supply a storage buffer instead of using
|
||||
an internal static one (which was a bad idea in the first place
|
||||
anyway).
|
||||
* Things that need to be protected: sometimes, the best approach is just
|
||||
to protect the access with a mutex lock. It is trivial to encapsulate
|
||||
the locks in C++ objects to use the "Resource Acquisition is
|
||||
Initialization" idiom, easily making sure that locks are freed when
|
||||
exiting the critical section. Recoll used to include a basic home-made
|
||||
implementation, but now lets C++11 work for it.
|
||||
* Things which can stay: this is mostly initialization data such as value
|
||||
tables which are computed once, and then stay logically constant during
|
||||
program execution. In order to be sure of a correct single-threaded
|
||||
initialization, it is best to explicitly initialize the modules or
|
||||
functions that use this kind of data in the main thread when the program
|
||||
starts.
|
||||
|
||||
=== Assembly line approach: the results
|
||||
|
||||
Unfortunately, the assembly line approach yields very modest improvements
|
||||
when used inside *Recoll* indexing. The reason is that this method needs
|
||||
stages of equivalent complexity to be efficient. If one of the stages
|
||||
dominates the others, its thread will be the only one active at any time,
|
||||
and little will be gained.
|
||||
|
||||
What is especially problematic is that the balance between tasks must not
|
||||
only exist on average, but also for the majority of individual jobs.
|
||||
|
||||
For *Recoll* indexing, even if the data preparation and index update steps
|
||||
are often of the same order of magnitude _on average_, their balance
|
||||
depends a lot on the kind of data being processed, so that things are
|
||||
usually unbalanced at any given time: the index update thread is mostly
|
||||
idle while processing PDF files, and the data preparation has little to do
|
||||
when working on HTML or plain text.
|
||||
|
||||
In practice, very modest indexing time improvements from 5% to 15% were
|
||||
achieved with this method.
|
||||
|
||||
[[recoll.idxthreads.multistage]]
|
||||
== The next step: multi-stage parallelism
|
||||
|
||||
image::multipara.png["Multi-stage parallelism", float="right"]
|
||||
|
||||
Given the limitations of the assembly line approach, the next step in the
|
||||
transformation of *Recoll* indexing was to enable full parallelism wherever
|
||||
possible.
|
||||
|
||||
Of the four processing steps (see figures), two are not candidates for
|
||||
parallelization:
|
||||
|
||||
* File system walking is so fast compared to the other steps that using
|
||||
several threads would make no sense (it would also quite probably become
|
||||
IO bound if we tried anyway).
|
||||
* The *Xapian* library index updating code is not designed for
|
||||
multi-threading and must stay protected from multiple accesses.
|
||||
|
||||
The two other steps are good candidates.
|
||||
|
||||
Most of the work to make *Recoll* code reentrant had been performed for the
|
||||
previous transformation. Going full-parallel only implied protecting the
|
||||
data structures that needed to be shared by the threads performing a given
|
||||
processing step.
|
||||
|
||||
Just for the anecdotal value, a list of the elements that needed mutexes:
|
||||
|
||||
- Filter subprocesses cache: some file conversion subprocesses may be
|
||||
expensive (starting a Python process is no piece of cake), so they are
|
||||
cached for reuse after they are done translating a file. The shared cache
|
||||
needs protection.
|
||||
- Status updates: an object used to update the current file name and indexing
|
||||
status to a shared file.
|
||||
- Missing store: the list of missing helper programs
|
||||
- The readonly *Xapian* database object: a Xapian::Database object which is
|
||||
used for checking the validity of current index data against a file's
|
||||
last modification date.
|
||||
- Document existence map: a bit array used to store an existence bit about
|
||||
every document, and purge the disappeared at the end of the indexing
|
||||
pass. This is accessed both from the file conversion and database update
|
||||
code, so it also needed protection in the previous assembly line
|
||||
approach.
|
||||
- Mbox offsets cache. Used to store the offsets of individual messages
|
||||
inside *mbox* files.
|
||||
- *iconv* control blocks: these are cached for reuse in several places, and
|
||||
need protection. Actually, it might be better in multithreading context
|
||||
to just suppress the reuse and locking. Rough tests seem to indicate that
|
||||
the impact on overall performance is small, but this might change with
|
||||
higher parallelism (or not...).
|
||||
|
||||
The *Recoll* configuration also used to be managed by a single shared
|
||||
object, which is mutable as values may depend on what area of the
|
||||
file-system we are exploring, so that the object is stateful and updated as
|
||||
we change directories. The choice made here was to duplicate the object
|
||||
where needed (each indexing thread gets its own). This gave rise to the
|
||||
sneakiest bug in the whole transformation (see further down).
|
||||
|
||||
Having a dynamic way to define the threads configuration makes it easy to
|
||||
experiment. For example, the following data defines the configuration that
|
||||
was finally found to be best overall on my hardware:
|
||||
|
||||
thrQSizes = 2 2 2
|
||||
thrTCounts = 4 2 1
|
||||
|
||||
This is using 3 queues of depth 2, 4 threads working on file conversion, 2
|
||||
on text splitting and other document processing, and 1 on Xapian updating
|
||||
(no choice here).
|
||||
|
||||
unfloat::[]
|
||||
|
||||
== Bench results
|
||||
|
||||
So the big question after all the work: was it worth it ? I could only get
|
||||
a real answer when the program stopped crashing, so this took some time and
|
||||
a little faith, but the answer is positive, as far as I'm
|
||||
concerned. Performance has improved significantly and this was a fun
|
||||
project.
|
||||
|
||||
|
||||
.Results on a variety of file system areas:
|
||||
[options="header", width="70%"]
|
||||
|=======================
|
||||
|Area |Seconds before |Seconds after| Percent Improvement| Speed Factor
|
||||
|home |12742 | 6942 | 46%| 1.8
|
||||
|mail |2700 | 1563 | 42% | 1.7
|
||||
|projets | 5022 | 1970 | 61% | 2.5
|
||||
|pdf | 2164 | 770 | 64% | 2.8
|
||||
|otherhtml | 5593 | 4014| 28% | 1.4
|
||||
|=======================
|
||||
|
||||
.Characteristics of the data
|
||||
[options="header", width="70%"]
|
||||
|=======================
|
||||
|Area | Files MB | Files | DB MB | Documents
|
||||
|home | 64106 | 44897 | 1197 | 104797
|
||||
|mail | 813 | 232 | 663 | 47267
|
||||
|projets | 2056 | 34504 | 549 | 40281
|
||||
|pdf | 1123 | 1139 | 111 | 1139
|
||||
|otherhtml | 3442 | 223007 | 2080 | 221890
|
||||
|=======================
|
||||
|
||||
_home_ is my home directory. The high megabyte value is due to a number of
|
||||
very big and not indexed *VirtualBox* images. Otherwise, it's a wide
|
||||
mix of source files, email, miscellaneous documents and ebooks.
|
||||
|
||||
_mail_ is my mail directory, full of *mbox* files.
|
||||
|
||||
_projets_ mostly holds source files, and a number of documents.
|
||||
|
||||
_pdf_ holds random *pdf* files harvested on the internets. The performance
|
||||
is quite spectacular, because most of the processing time goes to
|
||||
converting them to text, and this is done in parallel. Probably could be
|
||||
made a bit faster with more cores, until we hit the *Xapian* update speed
|
||||
limit.
|
||||
|
||||
_otherhtml_ holds a myriad of small html files, mostly from
|
||||
*wikipedia*. The improvement is not great here because a lot of time is
|
||||
spent in the single-threaded *Xapian* index update.
|
||||
|
||||
The tests were made with queue depths of 2 on all queues, and 4 threads
|
||||
working on the file conversion step, 2 on the term generation.
|
||||
|
||||
== A variation: linear parallelism
|
||||
|
||||
Once past the assembly-line idea, another possible transformation would be
|
||||
to get rid of the two downstream queues, and just create a job for each
|
||||
file and let it go to the end (using a mutex to protect accesses to the
|
||||
writable *Xapian* database).
|
||||
|
||||
With the current *Recoll* code, this can be defined by the following
|
||||
parameters (one can also use a deeper front queue, this changes little):
|
||||
|
||||
thrQSizes = 2 -1 -1
|
||||
thrTCounts = 4 0 0
|
||||
|
||||
In practice, the performance is close to the one for the multistage
|
||||
version.
|
||||
|
||||
If we were to hard-code this approach, this would be a simpler
|
||||
modification, necessitating fewer changes to the code, but it has a slight
|
||||
inconvenience: when working on a single big multi-document file, no
|
||||
parallelism at all can be obtained. In this situation, the multi-stage
|
||||
approach brings us back to the assembly-line behaviour, so the improvements
|
||||
are not great, but they do exist.
|
||||
|
||||
|
||||
|
||||
== Miscellany
|
||||
|
||||
=== The big gotcha: my stack dump staring days
|
||||
|
||||
Overall, debugging the modified program was reasonably
|
||||
straightforward. Data access synchronization issues mostly provoke dynamic
|
||||
data corruption, which can be beastly to debug. I was lucky enough that
|
||||
most crashes occurred in the code that was actually related to the
|
||||
corrupted data, not in some randomly located and unrelated dynamic memory
|
||||
user, so that the issues were reasonably easy to find.
|
||||
|
||||
One issue though kept me working for a few days. The indexing process kept
|
||||
crashing randomly at an interval of a few thousand documents, segfaulting
|
||||
on a bad pointer. An access to the configuration data structure seemed to
|
||||
be involved, but, as each thread was supposed to have its own copy, I was
|
||||
out of ideas.
|
||||
|
||||
After reviewing all the uses for the configuration data (there are quite a
|
||||
few), the problem was finally revealed to lie with the filter process
|
||||
cache. Each filter structure stored in the cache stores a pointer to a
|
||||
configuration structure. This belonged to the thread which initially
|
||||
created the filter. But the filter would often be reused by a different
|
||||
thread, with the consequence that the configuration object was now accessed
|
||||
and modified by two unsynchronized threads... Resetting the config pointer
|
||||
at the time of filter reuse was a very simple (almost) single-line fix to
|
||||
this elusive problem.
|
||||
|
||||
Looking at multi-threaded stack dumps is mostly fun for people with several
|
||||
heads, which is unfortunately not my case, so I was quite elated when this
|
||||
was over.
|
||||
|
||||
=== Fork performance issues
|
||||
|
||||
On a quite unrelated note, something that I discovered while evaluating the
|
||||
program performance is that forking a big process like `recollindex` can be
|
||||
quite expensive. Even if the memory space of the forked process is not
|
||||
copied (it's Copy On Write, and we write very little before the following
|
||||
exec), just duplicating the memory maps can be slow when the process uses a
|
||||
few hundred megabytes.
|
||||
|
||||
I modified the single-threaded version of `recollindex` to use *vfork*
|
||||
instead of *fork*, but this can't be used with multiple threads (no
|
||||
modification of the process memory space is allowed in the child between
|
||||
*vfork* and *exec*, so we'd have to have a way to suspend all the threads
|
||||
first).
|
||||
|
||||
I did not implement a solution to this issue, and I don't think
|
||||
that a simple one exists. The workaround is to use modest *Xapian* flush
|
||||
values to prevent the process from becoming too big.
|
||||
|
||||
A longer-term solution would be to implement a small slave process to do
|
||||
the executing of ephemeral external commands.
|
||||
@ -1,138 +0,0 @@
|
||||
= The case of the bad Xapian::Document copy
|
||||
|
||||
== How things were supposed to work
|
||||
|
||||
Coming from the link:threadingRecoll.html[threading *Recoll*] page,
|
||||
you may remember that the third stage of the
|
||||
processing pipeline breaks up text into terms, producing a *Xapian*
|
||||
document (+Xapian::Document+) which is finally processed by the last stage,
|
||||
the index updater.
|
||||
|
||||
What happens in practice is that the main routine in this stage has a local
|
||||
+Xapian::Document+ object, automatically allocated on the stack, which it
|
||||
updates appropriately and then copies into a task object which is placed on
|
||||
the input queue for the last stage.
|
||||
|
||||
The text-splitting routine then returns, and its local +Xapian::Document+
|
||||
object is (implicitly) deleted while the stack unwinds.
|
||||
|
||||
The idea is that the *copy* of the document which is on the queue should be
|
||||
unaffected, it is independent of the original and will further be processed
|
||||
by the index update thread, without interaction with the text-splitting one.
|
||||
|
||||
At no point do multiple threads access the +Xapian::Document+ data, so
|
||||
there should be no problem.
|
||||
|
||||
== The problem
|
||||
|
||||
Most *Xapian* objects are reference-counted, which means that the object
|
||||
itself is a small block of house-keeping variables. The actual data is
|
||||
allocated on the heap through eventual calls to new/malloc, and is shared
|
||||
by multiple copies of the object. This is the case for +Xapian::Document+.
|
||||
|
||||
This is abundantly documented, and users are encouraged to use copies
|
||||
instead of passing pointers around (copies are cheap because only a small
|
||||
block of auxiliary data is actually duplicated). This in general makes
|
||||
memory management easier.
|
||||
|
||||
This is well-known, and it would not appear to be a problem in the above
|
||||
case as the +Xapian::Document+ actual data is never accessed by multiple
|
||||
threads.
|
||||
|
||||
The problem is that the reference counter which keeps track of the object
|
||||
usage and triggers actual deletion when it goes to zero is accessed by two
|
||||
threads:
|
||||
|
||||
- It is decremented while the first local object is destroyed during the
|
||||
stack unwind in the first thread
|
||||
- It is also updated by the last stage thread, incremented if copies are
|
||||
made, then decremented until it finally goes down to 0 when we are done
|
||||
with the object, at which point the document data is deallocated.
|
||||
|
||||
As the counter is not protected in any way against concurrent access, the
|
||||
actual sequence of events is undefined and at least two kinds of problems
|
||||
may occur: double deletion of the data, or accesses to already freed heap
|
||||
data (potentially trashing other threads' allocations, or reading modified
|
||||
data).
|
||||
|
||||
A relatively simple fix for this would be to use atomic test-and-set
|
||||
operations for the counter (which is what the GNU +std::string+ does). But
|
||||
the choice made by *Xapian* to let the application deal with all
|
||||
synchronization issues is legitimate and documented, nothing to complain
|
||||
about here. I just goofed.
|
||||
|
||||
Because the counter test and update operations are very fast, and occur
|
||||
among a lot of processing from the final stage thread, the chances of
|
||||
concurrent access are low, which is why the problem manifests itself very
|
||||
rarely. Depending on thread scheduling and all manners of semi-random
|
||||
conditions, it is basically impossible to reproduce reliably.
|
||||
|
||||
== The fix
|
||||
|
||||
The implemented fix was trivial: the upstream thread allocates the initial
|
||||
+Xapian::Document+ on the heap, copies the pointer to the queue object, and
|
||||
forgets about it. The index-updating thread peruses the object then
|
||||
+delete+'s it. Real easy.
|
||||
|
||||
An alternative solution would have been to try and use locking to protect
|
||||
the counter updates. The only place where such locking operations could
|
||||
reasonably occur is inside the +Xapian::Document+ refcounted pointer
|
||||
object, which we can't modify. Otherwise, we would have to protect the
|
||||
_whole scopes of existence_ of the Xapian::Document object in any routine
|
||||
which creates/copies or (implicitly) deletes it, which would cause many
|
||||
problems and/or contention issues.
|
||||
|
||||
== Why did I miss this ?
|
||||
|
||||
The mechanism of the crashes is simple enough, quasi-obvious.
|
||||
How on earth could I miss this problem while writing the code ?
|
||||
|
||||
For the sake of anecdote, my first brush with atomicity for updates of
|
||||
reference counters was while debugging a System V release 4 kernel VFS file
|
||||
system module, at the time when SVR4 got a preemptive kernel with SVR4-MP,
|
||||
circa 1990... I ended up replacing a +counter+++ with +atomic_add()+ after
|
||||
a set of _interesting_ debugging sessions interspersed with kernel crashes
|
||||
and +fsck+ waits. This should have left some memories. So what went wrong ?
|
||||
Here follows a list of possible reasons:
|
||||
|
||||
- Reasoning by analogy: std::string are safe to use in this way. The other
|
||||
objects used in the indexing pipe are also safe. I just used
|
||||
+Xapian::Document+ in the same way without thinking further.
|
||||
- Probably not how I would do it: faced with designing +Xapian::Document+,
|
||||
(not clever enough to do this anyway), I'd probably conclude that not
|
||||
wanting to deal with full-on concurrency is one thing, not protecting the
|
||||
reference counters is another, and going too far.
|
||||
- The problem was not so easily visible because the object deletion is
|
||||
implicitly performed during the stack unwind: this provides no clue, no
|
||||
specific operation to think about.
|
||||
- Pure laziness.
|
||||
|
||||
|
||||
As a conclusion, a humble request to library designers: when an
|
||||
interface works counter to the reasonable expectations of at least some of
|
||||
the users (for example because it looks like, but works differently from, a
|
||||
standard library interface), it is worth it to be very specific in the
|
||||
documentation and header file comments about the gotchas. Saving people
|
||||
from their own deficiencies is a worthy goal.
|
||||
|
||||
Here, a simple statement that the reference count was not mt-safe
|
||||
(admittedly redundant with the general statement that the *Xapian* library
|
||||
does not deal with threads), would have got me thinking and avoided the
|
||||
error.
|
||||
|
||||
++++
|
||||
<h2 id="comments">Comments</h2>
|
||||
|
||||
<div id="disqus_thread"></div>
|
||||
<script type="text/javascript">
|
||||
var disqus_shortname = 'lesbonscomptes';
|
||||
(function() {
|
||||
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
|
||||
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
|
||||
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
|
||||
})();
|
||||
</script>
|
||||
<noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
|
||||
<a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
|
||||
|
||||
++++
|
||||
@ -1,401 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Recoll text search finds your documents</title>
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Description" content="Recoll is a desktop text search application for Unix, Linux, Microsoft Windows and Mac OS X, based on the Xapian search engine library.">
|
||||
<meta name="Keywords" content="text search, pdf search, document search, full-text search, desktop search, open source,free">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
<link rel="shortcut icon" href="favicon.ico" />
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="http://www.recoll.org">Home</a></li>
|
||||
<li><a href="pics/index.html">Screenshots</a></li>
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
<li><a href="doc.html">Documentation</a></li>
|
||||
<li><a href="support.html">Support</a></li>
|
||||
<li><a href="devel.html">Helping out</a></li>
|
||||
<li><a href="index.html.fr">En Français</a></li>
|
||||
<li><a class="weak" href="../pages/lbc-hosting.html">lesbonscomptes</a></li>
|
||||
</ul>
|
||||
<p class="indexthumb">
|
||||
<a href="pics/index.html"><img width="100" alt=
|
||||
"Thumbnail of recoll main screen" src=
|
||||
"pics/recoll0-thumb.png"></a>
|
||||
</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1><img align="center" src="pics/recoll64.png"/>
|
||||
<a href="http://www.recoll.org/">Recoll</a> is
|
||||
a desktop full-text search tool.</h1>
|
||||
|
||||
<p><span class="application">Recoll</span> finds keywords
|
||||
inside documents as well as file names.</p>
|
||||
<ul>
|
||||
<li>Versions are available for <a href="download.html">Linux</a>
|
||||
and <a href="pages/recoll-windows.html">MS Windows</a>.</li>
|
||||
<li>A
|
||||
<a href="https://github.com/koniu/recoll-webui">WEB
|
||||
front-end</a> with preview and download features can
|
||||
replace or supplement the GUI for remote
|
||||
use.</li>
|
||||
<li>It can search
|
||||
most <span class="important"><a href="features.html#doctypes">document
|
||||
formats</a></span>. <a href="features.html#doctypes">You may
|
||||
need external applications for text extraction</a>.</li>
|
||||
<li>It can reach any storage place: files,
|
||||
archive members, email attachments, transparently
|
||||
handling decompression.</li>
|
||||
<li>One click will open the document inside a native editor or
|
||||
display an even quicker text preview.</li>
|
||||
<li>The software is free, open source,
|
||||
and licensed under the GPL.</li>
|
||||
<li><a href="features.html">Detailed features</a> and
|
||||
application requirements for supported document types.</li>
|
||||
</ul>
|
||||
|
||||
<p>The current <span class="application">Recoll</span> version is
|
||||
<a href="download.html">1.23.2</a>
|
||||
(<a href="release-1.23.html">Release notes</a>,
|
||||
<a href="BUGS.html">known bugs</a>,
|
||||
<a href="release-history.html">Release history</a>).</p>
|
||||
|
||||
|
||||
<p><span class="application">Recoll</span> is based on the very
|
||||
capable <a href="http://www.xapian.org">Xapian</a> search
|
||||
engine library, for which it provides a powerful text
|
||||
extraction layer and a complete, yet easy to use, Qt graphical
|
||||
interface.</p>
|
||||
|
||||
<p class="remark">Recoll will index an <b>MS-Word</b> document
|
||||
stored as an <b>attachment</b> to an <b>e-mail message</b> inside
|
||||
a <b>Thunderbird folder</b> archived in a <b>Zip file</b> (and
|
||||
more...). It will also help you search for it with a friendly and
|
||||
powerful interface, and let you open a copy of a PDF at the right
|
||||
page with two clicks. There is little that will remain
|
||||
hidden on your disk.</p>
|
||||
|
||||
<p>Recoll has extensive <a href="doc.html">
|
||||
documentation</a>. If you run into a problem, or want to
|
||||
propose improvements, you are welcome to use
|
||||
the <a href="support.html">
|
||||
<span class="important">mailing list or problem
|
||||
tracker</span></a>.</p>
|
||||
|
||||
<p><b><i>Recoll user ?</i></b> Maybe there are still a few useful
|
||||
search tricks that you don't know about. A quick look at
|
||||
the <a href="usermanual/RCL.SEARCH.html#RCL.SEARCH.GUI.TIPS">search
|
||||
tips</a> might prove useful ! Also
|
||||
the <a href="faqsandhowtos/index.html">
|
||||
Faqs and Howtos section</a>, and some contributed
|
||||
<a href="custom.html">result list formats</a>.</p>
|
||||
|
||||
<h2>Thanks</h2>
|
||||
<p>Recoll borrows a lot of code
|
||||
from other packages, and welcomes code and ideas from
|
||||
contributors, see some of the
|
||||
<a class="important" href="credits.html">Credits</a>.</p>
|
||||
|
||||
<h2>News</h2>
|
||||
<div class="news">
|
||||
|
||||
<dl>
|
||||
<dt>2017-07-31</dt><dd>Finalizing the move to the new site,
|
||||
I am closing the old BitBucket project. The existing
|
||||
BitBucket issues <a href="bitbucket-issues-recoll/index.html">
|
||||
have been archived</a>.</dd>
|
||||
|
||||
<dt>2017-07-02</dt><dd>The source code repository and issue
|
||||
tracker are moving to a
|
||||
<a href="https://opensourceprojects.eu/p/recoll1/">
|
||||
new place</a>.<br clear="all"></dd>
|
||||
|
||||
<dt>2017-05-23</dt><dd>Release 1.23.2 has gotten much
|
||||
better at <a href="recoll_XMP">processing PDF XMP
|
||||
data</a>.</dd>
|
||||
|
||||
<dt>2017-05-15</dt><dd>Release 1.23.2. This fixes a couple
|
||||
of quite serious bugs. See
|
||||
the <a href="release-1.23.html">Release notes</a></dd>
|
||||
|
||||
<dt>2017-03-09</dt><dd>Release 1.23.1. See
|
||||
the <a href="release-1.23.html">Release notes</a></dd>
|
||||
|
||||
<dt>2016-11-25</dt><dd>Release 1.22.4 is available and fixes
|
||||
an annoying qt5 glitch (advanced search 'start search'
|
||||
button doing nothing). <a href="release-1.22.html">Release
|
||||
notes</a></dd>
|
||||
|
||||
<dt>2016-06-21</dt><dd>Release 1.22.3 is available. This is
|
||||
going to replace 1.21 as the main release. See
|
||||
the <a href="release-1.22.html">release
|
||||
notes</a>. Some input handler dependencies have changed.</dd>
|
||||
|
||||
<dt>2016-05-11</dt><dd>Release 1.21.7 fixes an annoying but
|
||||
benign GUI crash-on-exit bug reported on Fedora 23 (qt5).</dd>
|
||||
|
||||
<dt>2016-04-21</dt><dd>I experimented with installing
|
||||
the <a href="https://github.com/koniu/recoll-webui">Recoll
|
||||
Web UI</a> with Apache, and found out
|
||||
that <a href="pages/recoll-webui-install-wsgi.html">this
|
||||
is really easy</a>, actually both easier to set up and
|
||||
more useful than running it standalone. Recently added:
|
||||
instructions for running with Nginx instead of Apache.</dd>
|
||||
|
||||
<dt>2016-04-18</dt><dd>Found a <a href="BUGS.html#GUIADV">GUI
|
||||
crash bug</a> with a reasonably easy workaround.</dd>
|
||||
|
||||
<dt>2016-04-14</dt><dd>Release 1.22.0 is now available from
|
||||
the download area. The binary packages should wait until
|
||||
enough brave souls have tested it. See
|
||||
the <a href="release-1.22.html">release notes</a>.</dd>
|
||||
|
||||
<dt>2016-04-07</dt><dd>Release 1.21.6 adds KDE5 compatibility
|
||||
for the KIO slave.</dd>
|
||||
|
||||
<dt>2016-01-29</dt><dd>Release 1.21.5 is out. It fixes a
|
||||
relatively nasty bug affecting all previous 1.21 versions:
|
||||
the query language parser processed incorrectly multiple
|
||||
mime type or category specifications, with missing results
|
||||
as a consequence </dd>
|
||||
|
||||
<dt>2016-01-12</dt><dd>It seems that we currently have a
|
||||
relatively frequent problem resulting in damaged indexes. If
|
||||
you are experiencing heavy reindexing (incremental indexing
|
||||
takes longer than it should), or missing search results,
|
||||
please take a look at the top of
|
||||
the <a href="BUGS.html">known bugs page</a></dd>
|
||||
|
||||
<dt>2015-11-09</dt>
|
||||
<dd><a href="pics/windows-recoll.html">
|
||||
<img align="left" width="100" alt="Recoll on MS-Windows"
|
||||
src="pics/windows-recoll-thumb.png"></a>
|
||||
<span class="important">Recoll for
|
||||
MS-Windows</span>. Still a few things missing (like
|
||||
real-time monitoring), but it does work, and it has a proper
|
||||
installer, so you can easily get rid of it if you don't like
|
||||
it. <a href="pages/recoll-windows.html">Have a look</a>.
|
||||
This is an almost-native port, based on Qt and the Windows
|
||||
API, no need for Cygwin. Thanks to Christian Motz for
|
||||
helping with the filter interface (and the rest). I would
|
||||
love some feedback!<br clear="all">
|
||||
</dd>
|
||||
|
||||
<dt>2015-10-17</dt>
|
||||
<dd>A bug in the verification of configuration file path variables
|
||||
generates spurious warnings from recollindex when the
|
||||
skippedPaths variable contains elements with wildcards. This
|
||||
has no consequence except for the spurious error
|
||||
message.</dd>
|
||||
|
||||
<dt>2015-10-01</dt>
|
||||
<dd>Release 1.21.2 is out, and replaces 1.20 as production
|
||||
release. </dd>
|
||||
|
||||
<dt>2015-06-30</dt>
|
||||
<dd>A new rclpdf filter, with improved compatibility with
|
||||
recent poppler pdftotext
|
||||
versions. See <a href="filters/filters.html">rclpdf
|
||||
filter</a>.</dd>
|
||||
|
||||
<dt>2015-06-16</dt>
|
||||
<dd>Recoll 1.21.0 is out. This has a new query parser and
|
||||
should be considered an unstable release, please do not
|
||||
package it (1.20.6 is the one you want for stability). It
|
||||
also <a href="idxthreads/forkingRecoll.html">changes the way
|
||||
filters are executed</a> for better performance. See the
|
||||
<a href="release-1.21.html">release notes</a> for more
|
||||
detail about the few other changes.</dd>
|
||||
|
||||
<dt>2015-04-25</dt>
|
||||
<dd>Recoll 1.20.6 is out, with mostly small fixes to
|
||||
compressed file handling, which may make a big difference in
|
||||
some cases. See the <a href="release-1.20.html">release
|
||||
notes</a>. Of course it also incorporates the Qt 5
|
||||
compatibility from 1.20.5 (Qt
|
||||
5.3.2 ok, 5.2 does not work).</dd>
|
||||
|
||||
<dt>2015-03-30</dt>
|
||||
<dd>Recoll 1.20.4 released. This fixes real time indexing of
|
||||
the web history (when using the Firefox plugin).</dd>
|
||||
|
||||
<dt>2014-12-27</dt>
|
||||
<dd><a href="https://www.gnu.org/software/unrtf/">
|
||||
Unrtf 21.8</a> has been released. This fixes many issues
|
||||
in unrtf, some with possible security implications. You
|
||||
really want to use this version.</dd>
|
||||
|
||||
<dt>2014-12-18</dt> <dd>Recoll 1.20.1 is out and replaces 1.19
|
||||
as the main version. I have been using 1.20 for months
|
||||
(along with a number of fearless builders-from-source), and
|
||||
it's as stable as 1.19, with nice
|
||||
small <a href="release-1.20.html">new features</a>. Packages
|
||||
will follow shortly. It is recommended (but not strictly
|
||||
required, see the notes) to run an index reset when
|
||||
upgrading.</dd>
|
||||
|
||||
<dt>2014-12-10</dt> <dd>The aspell command used for
|
||||
orthographic suggestions is broken on Debian Jessie (because
|
||||
of an aspell packaging issue), and this will not be fixed
|
||||
for the Debian release. See the <a href="BUGS.html#aspelljessie">
|
||||
simple workaround here</a>.</dd>
|
||||
|
||||
<dt>2014-11-09</dt> <dd>If you are still running anything
|
||||
older than 1.19.14p2, <span class="important">YOU SHOULD
|
||||
UPGRADE</span>. In
|
||||
particular, <a href="release-1.19.html#rodb">this index
|
||||
corruption issue</a> leading to repeated reindexing of
|
||||
documents, and possibly query problems too, can be pretty
|
||||
annoying.<br/>
|
||||
GOTO <a href="download.html">download</a> and
|
||||
install 1.19.14p2 or 1.20. <em>Reset your index after
|
||||
upgrading (rm -rf ~/.recoll/xapiandb)</em>.</dd>
|
||||
|
||||
<dt>2014-07-28</dt> <dd>A nice new application to complement
|
||||
Recoll: <a href="https://github.com/pidlug/recollfs">recollfs</a>
|
||||
implements a Fuse filesystem where Recoll queries are
|
||||
represented as directories, the contents of which are links
|
||||
to the result documents.</dd>
|
||||
|
||||
<dt>2014-07-16</dt> <dd>Recoll version 1.19.14p2 fixes more
|
||||
resource management issues in the Python module (only the
|
||||
Python package needs upgrading for this), and the processing
|
||||
of Bengali characters (no more diacritics stripping).</dd>
|
||||
|
||||
<dt>2014-06-24</dt> <dd><a href="filters/filters.html#soff1">An
|
||||
updated filter</a> for Open/LibreOffice documents. The
|
||||
previous version merged words which were tab-separated in
|
||||
the input.</dd>
|
||||
|
||||
<dt>2014-06-17</dt> <dd>The source tarball for version 1.20.0
|
||||
has been released. This version has
|
||||
a <a href="release-1.20.html">number of improvements</a> over
|
||||
1.19, but also some incompatibilities. The first minor
|
||||
releases for 1.20 may contain some functional changes in
|
||||
addition to bug fixes, so they may be slightly less stable
|
||||
than 1.19, and 1.19 packages remain the "safe Recoll" for
|
||||
now. Still, if you build from source, there are a few nice
|
||||
things in 1.20...</dd>
|
||||
|
||||
<dt>2014-06-07</dt> <dd>Version 1.19.14 is out and fixes a
|
||||
handful of minor-to-annoying indexing glitches (see the
|
||||
<a href="release-1.19.html">Release notes</a>).</dd>
|
||||
|
||||
<dt>2014-05-06</dt> <dd>Version 1.19.13 is out and hopefully
|
||||
fixes the remaining (rare) crashes of multithreaded
|
||||
indexing.</dd>
|
||||
|
||||
<dt>2014-04-03</dt> <dd>I have separated the code for the
|
||||
<a href="https://opensourceprojects.eu/p/unityscoperecol/">Recoll
|
||||
Unity Scope</a> from the main body of code, in hope that it may
|
||||
interest someone to work on it. It's Python and simple,
|
||||
mostly depending on the Unity API. The Ubuntu Unity API is
|
||||
apparently going to change *again* for the next version, and
|
||||
I think I've seen enough of it.</dd>
|
||||
|
||||
<dt>2014-04-02</dt> <dd>1.19.12 is out. It's mostly identical
|
||||
to 1.19.11 apart from a new parameter to change the max size
|
||||
of stored attributes. No need to update in general.</dd>
|
||||
|
||||
<dt>2014-02-27</dt> <dd>I hear from time to time about
|
||||
recollindex crashes. These appear to be quite rare, but they
|
||||
do happen, and I think that they are linked to a yet unfound
|
||||
bug in multithread indexing. If you experience such crashes or
|
||||
stalls, you can disable multithreading by adding the following
|
||||
to your recoll.conf:
|
||||
<pre><tt>thrQSizes = -1 -1 -1</tt></pre>
|
||||
</dd>
|
||||
|
||||
<dt>2014-02-27</dt><dd>While working on a
|
||||
<a href="http://www.recoll.org/faqsandhowtos/MuttAndRecoll.html">
|
||||
Recoll-Mutt interface</a> I discovered incidentally that
|
||||
the <a href="https://github.com/koniu/recoll-webui">Recoll
|
||||
Webui Web interface</a> works quite well with the
|
||||
<a href="http://links.twibright.com/">links</a> web browser
|
||||
inside a terminal window. This appears to be an interesting
|
||||
solution for people looking for a search interface usable in
|
||||
a non-GUI environment.</dd>
|
||||
|
||||
<dt>2013-11-19</dt> <dd>A <a href="filters/filters.html">new
|
||||
filter</a> for PowerPoint files. The previous one was
|
||||
based on the ancient <b>catppt</b> from the <b>catdoc</b>
|
||||
utilities and usually extracted nothing from more recent
|
||||
PowerPoint files (this is about .ppt: .pptx is handled by a native
|
||||
Recoll filter).</dd>
|
||||
|
||||
<dt>2013-05-18</dt><dd>Sometimes things
|
||||
<a href="http://www.lesbonscomptes.com/pages/happysearch.html">
|
||||
just work</a>...</dd>
|
||||
|
||||
<dt>2013-04-30</dt><dd>Thanks to some of its users, Recoll now
|
||||
has filters to
|
||||
<a href="http://sourceforge.net/projects/rcollnotesfiltr/">
|
||||
index and retrieve Lotus Notes messages</a>
|
||||
(some
|
||||
<a href="http://richardappleby.wordpress.com/2013/04/11/you-dont-have-to-know-the-answer-to-everything-just-how-to-find-it/">
|
||||
implementation notes from an early user</a>), and there is
|
||||
also now a
|
||||
<a href="https://github.com/koniu/recoll-webui/">
|
||||
Web browser interface</a> for querying your Recoll
|
||||
indexes.</dd>
|
||||
|
||||
<dt>2012-10-25</dt> <dd>A problem with a simple workaround has caused
|
||||
several reported <span class="important">recollindex
|
||||
crashes</span> recently (for 1.17). If you store and index
|
||||
Mozilla/Thunderbird email out of the standard location
|
||||
(~/.thunderbird), you should add the following at the end of
|
||||
your configuration file (e.g.:
|
||||
~/.recoll/recoll.conf): <pre><tt>
|
||||
[/path/to/my/mozilla/mail]
|
||||
mhmboxquirks = tbird
|
||||
</tt></pre> Adjust the path to your local value of course...
|
||||
Without this hint, recollindex has trouble finding the
|
||||
message delimiters inside the folder files, and will
|
||||
possibly use all the computer's memory and crash. Apart from
|
||||
crashes, which only occur for very big folders, this also
|
||||
causes incorrect mail indexing.
|
||||
</dd>
|
||||
|
||||
<dt>2012-09-11</dt> <dd>A new user-contributed script for those who use
|
||||
real-time indexing on laptops: stop or start indexing
|
||||
according to AC power status. See the details on
|
||||
the <a href="http://www.recoll.org/faqsandhowtos/IndexOnAc.html">
|
||||
Wiki</a>. </dd>
|
||||
|
||||
<dt>2012-04-07</dt><dd>We now have a Chinese user manual:
|
||||
Recoll现在有中文手册咯:
|
||||
<a href="http://stupidbeauty.com/Blog/2012/03/recoll%E7%94%A8%E6%88%B6%E6%89%8B%E5%86%8A%E7%BF%BB%E8%AD%AF%EF%BC%8Crecoll-user-manual-2/">
|
||||
Recoll中文手册,HTML</a></dd>
|
||||
|
||||
|
||||
</dl>
|
||||
</div>
|
||||
|
||||
<h2>On the side</h2>
|
||||
|
||||
<div class="news">
|
||||
<blockquote>
|
||||
<p>We rent <a href="http://www.metairie-enbor.com/index.html.en">
|
||||
a big country house</a> in the Aude area, in the south of
|
||||
France (<a href="http://www.metairie-enbor.com/acces.html.en">see
|
||||
map on the site</a>). If you are
|
||||
looking for a wonderful country place with a pool to
|
||||
spend holidays with a big bunch of family and/or
|
||||
friends in a nice historical but very quiet area, this may be it.</p>
|
||||
</blockquote>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@ -1,193 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>RECOLL: un outil personnel de recherche textuelle pour
|
||||
Unix et Linux</title>
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Description" content=
|
||||
"recoll est un logiciel personnel de recherche textuelle pour unix et linux basé sur Xapian, un moteur d'indexation puissant et mature.">
|
||||
<meta name="Keywords" content=
|
||||
"recherche textuelle,desktop,unix,linux,solaris,open
|
||||
source, free, bois de chauffage">
|
||||
<meta http-equiv="Content-language" content="fr">
|
||||
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
<link rel="shortcut icon" href="favicon.ico" />
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="http://www.recoll.org">Base</a></li>
|
||||
<li><a href="pics/index.html">Copies d'écrans</a></li>
|
||||
<li><a href="download.html">Téléchargements</a></li>
|
||||
<li><a href="doc.html">Documentation</a></li>
|
||||
<li><a href="devel.html">Développement</a></li>
|
||||
</ul>
|
||||
<p class="indexthumb">
|
||||
<a href="pics/index.html"><img width="100" alt=
|
||||
"Imagette de l'écran principal" src=
|
||||
"pics/recoll0-thumb.png"></a>
|
||||
</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1><img align="center" src="pics/recoll64.png"/>
|
||||
<a href="http://www.recoll.org/">Recoll</a> est
|
||||
un outil personnel de recherche textuelle pour Unix et Linux</h1>
|
||||
|
||||
<p>Il est basé sur le puissant moteur d'indexation <a href=
|
||||
"http://www.xapian.org">Xapian</a>, pour lequel il offre une
|
||||
interface graphique QT facile d'utilisation, riche, et facile à
|
||||
mettre en oeuvre.</p>
|
||||
|
||||
<p><span class="application">Recoll</span> est un logiciel libre
|
||||
gratuit, dont le code source est disponible sous licence GPL.
|
||||
La dernière version est
|
||||
<a class="important" href="download.html">1.23.1</a>
|
||||
(<a href="release-1.23.html">notes sur la version, en
|
||||
anglais</a>)</p>
|
||||
|
||||
<p>L'interface utilisateur de
|
||||
<span class="application">Recoll</span> est traduite en
|
||||
Français, mais pas encore la documentation, malheureusement,
|
||||
et la plupart des liens de cette page pointent sur des textes
|
||||
en Anglais.</p>
|
||||
|
||||
|
||||
<h2>Caractéristiques: </h2>
|
||||
|
||||
<ul>
|
||||
<li>Installation facile, peu de dépendances. Pas besoin de
|
||||
démon permanent, de serveur http, d'un environnement de bureau
|
||||
particulier ou d'un langage exotique.</li>
|
||||
|
||||
<li>Tourne sur la plupart des
|
||||
<a href="fr/features.html#systems">systèmes</a> fondés sur
|
||||
Unix.</li>
|
||||
|
||||
<li>Interface conçue avec <a href="http://www.trolltech.com">
|
||||
Qt 4 ou 5 selon les plateformes.</a></li>
|
||||
|
||||
<li>Traite la plupart des <a href="fr/features.html#doctypes">
|
||||
types de documents</a> courants, les messages et leurs fichiers
|
||||
attachés. Peut aussi traiter les versions comprimées
|
||||
(gzip ou bzip2) de tous ces documents.
|
||||
<a href="features.html#doctypes">Applications externes pour
|
||||
l'extraction du texte</a>.</li>
|
||||
|
||||
<li>Fonctions de recherche puissantes, avec expressions Booléennes,
|
||||
phrases et proximité, wildcards, filtrage sur les types de fichiers
|
||||
ou l'emplacement.</li>
|
||||
|
||||
<li>Multi-langage et multi-jeu de caractères, utilisant
|
||||
Unicode en interne.</li>
|
||||
|
||||
<li><a class="weak" href="fr/features.html">
|
||||
(plus de détails)</a></li>
|
||||
|
||||
</ul>
|
||||
|
||||
<p><b><i>Déjà utilisateur ?</i></b> Il est possible qu'il
|
||||
y ait encore quelques astuces qui vous aient échappé. Un coup
|
||||
d'oeil rapide sur la page des <a
|
||||
href="usermanual/RCL.SEARCH.html#RCL.SEARCH.GUI.TIPS"> petites
|
||||
recettes de recherche</a> (en anglais) pourrait s'avérer
|
||||
fructueux ! Également, en anglais,
|
||||
la <a href="faqsandhowtos/index.html">section des questions
|
||||
fréquentes et trucs divers</a>.</p>
|
||||
|
||||
<h2>Nouvelles: </h2>
|
||||
|
||||
<dl>
|
||||
<dt>2017-05-15</dt><dd>Version 1.23.2. Corrige quelques bugs
|
||||
sérieux. Voir les <a href="release-1.23.html">Release notes (en
|
||||
anglais).</a></dd>
|
||||
<dt>2017-03-09</dt><dd>Version 1.23.1.
|
||||
Voir les <a href="release-1.23.html">Release notes (en
|
||||
anglais).</a></dd>
|
||||
|
||||
<dt>2016-11-23</dt><dd>Version 1.22.4.</dd>
|
||||
<dt>2016-06-15</dt><dd>La version 1.22.3 est disponible et va
|
||||
progressivement remplacer 1.21 comme version
|
||||
principale. <a href="release-1.22.html">Notes de version</a>
|
||||
(en anglais).</dd>
|
||||
|
||||
<dt>2016-05-11</dt><dd>Release 1.21.7: corrige un crash bénin
|
||||
mais agaçant au moment de quitter l'interface utilisateur
|
||||
(Fedora 23 / qt5).</dd>
|
||||
|
||||
<dt>2015-11-09</dt>
|
||||
<dd>Recoll indexe Windows ! Il y a encore quelques éléments
|
||||
manquants, comme l'indexation temps-réel, et la traduction
|
||||
en Français, mais ça marche suffisamment bien pour être
|
||||
essayé. Il y a un installeur standard, donc si vous n'aimez
|
||||
pas, c'est facile à désinstaller...
|
||||
Pas de traduction Française pour le moment. Il y
|
||||
a <a href="pages/recoll-windows.html"> quelques
|
||||
explications en Anglais sur l'installation </a>.
|
||||
Si vous l'essayez, dites-moi ce que vous en pensez !
|
||||
</dd>
|
||||
|
||||
<dt>2012-10-25</dt><dd> Un problème avec une solution simple
|
||||
peut provoquer
|
||||
des <span class="important">plantages de
|
||||
recollindex</span>.
|
||||
Si vous indexez des messages mail Mozilla/Thunderbird
|
||||
ailleurs qu'à l'endroit standard (~/.thunderbird), vous
|
||||
devriez ajouter les lignes qui suivent à la fin de votre
|
||||
fichier de configuration (~/.recoll/recoll.conf):
|
||||
<pre><tt>
|
||||
[/path/to/my/mozilla/mail]
|
||||
mhmboxquirks = tbird
|
||||
</tt></pre> Changez le chemin d'accès pour le vôtre bien
|
||||
sûr. Sans cette indication, recollindex a des difficultés à
|
||||
déterminer les limites de message dans les fichiers mailbox,
|
||||
et peut arriver à utiliser toute la mémoire de la machine,
|
||||
et à se planter. Dans les cas moins graves (avec des
|
||||
fichiers de taille "raisonnable"), cela provoque aussi une
|
||||
indexation incorrecte des messages.
|
||||
</dd>
|
||||
|
||||
<dt>2010-11-20</dt><dd>Un petit script pour activer/cacher recoll sur un
|
||||
bureau gnome d'un seul coup de clavier:
|
||||
<a href="http://www.recoll.org/faqsandhowtos/HotRecoll.html">
|
||||
recette d'installation</a>.</dd>
|
||||
|
||||
</dl>
|
||||
|
||||
<h2><a name="support">Support</a></h2>
|
||||
|
||||
<p>Si vous avez un problème quelconque avec le logiciel ou son
|
||||
installation, ou une idée de fonctions à ajouter, merci de me
|
||||
<a href=
|
||||
"mailto:jfd@recoll.org">contacter</a>.</p>
|
||||
|
||||
<p>Voir aussi la <a href="devel.html">page sur le
|
||||
développement</a>.</p>
|
||||
<p><a href="BUGS.html">Liste des problèmes connus</a> (en
|
||||
anglais). </p>
|
||||
|
||||
<h2>Remerciements</h2>
|
||||
<p><span class="application">Recoll</span> emprunte beaucoup de code
|
||||
d'autres logiciels libres, et accueille volontiers les
|
||||
contributions en code ou en suggestions, voir la page des
|
||||
<a class="important" href="credits.html">Attributions</a>.</p>
|
||||
|
||||
<h2>Autres</h2>
|
||||
<p>Je loue une
|
||||
<a href="http://www.metairie-enbor.com/index.html.fr">
|
||||
grande maison sympa dans l'Aude</a> :), et nous produisons aussi
|
||||
du <a href="http://www.metairie-enbor.com/bois-de-chauffage.html">
|
||||
bois de chauffage</a>. (Il faut bien que cette page me serve
|
||||
tout de même à <em>quelque chose</em> à moi aussi de temps
|
||||
en temps !).</p>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@ -1,10 +0,0 @@
|
||||
.SUFFIXES: .txt .html
|
||||
|
||||
.txt.html:
|
||||
asciidoc $<
|
||||
|
||||
all: recoll-windows.html recoll-windows-faq.html \
|
||||
recoll-webui-install-wsgi.html
|
||||
|
||||
clean:
|
||||
rm -f *.html
|
||||
@ -1,280 +0,0 @@
|
||||
= Recoll WebUI Apache and nginx installation from scratch
|
||||
|
||||
NOTE: thanks to Michael L. Wilson for the `nginx` part.
|
||||
|
||||
The https://github.com/koniu/recoll-webui[Recoll WebUI] offers an
|
||||
alternative, WEB-based, interface for querying a Recoll index.
|
||||
|
||||
It can be quite useful to extend the use of a shared index to multiple
|
||||
workstations, without the need for a local Recoll installation and shared
|
||||
data storage.
|
||||
|
||||
The Recoll WebUI is based on the
|
||||
http://bottlepy.org/docs/dev/index.html[Bottle Python framework], which has
|
||||
a built-in WEB server, and the simplest deployment approach is to run it
|
||||
standalone. However the built-in server is restricted to handling one
|
||||
request at a time, which is problematic in multi-user situations,
|
||||
especially because some requests, like extracting a result list into a CSV
|
||||
file, can take a significant amount of time.
|
||||
|
||||
The Bottle framework can work with several multi-threading Python HTTP
|
||||
server libraries, but, given the limitations of the Recoll Python module
|
||||
and the Python interpreter itself, this will not yield optimal performance,
|
||||
and, especially, can't efficiently leverage the now ubiquitous
|
||||
multiprocessors.
|
||||
|
||||
In multi-user situations, you can get better performance and ease of use
|
||||
from the Recoll WebUI by running it under Apache or Nginx rather than as a
|
||||
standalone process. With this approach, a few requests per second can
|
||||
easily be handled even in the presence of long-running ones.
|
||||
|
||||
Neither Recoll nor the WebUI are optimized for high multi-user load, and it
|
||||
would be very unwise to use them as the search interface to a busy WEB
|
||||
site.
|
||||
|
||||
The instructions about using the WebUI under Apache as given in the
|
||||
repository README are a bit terse, and are missing a few details,
|
||||
especially ones which impact performance.
|
||||
|
||||
Here follows the synopsis of three WebUI installations on initially
|
||||
Apache-less Ubuntu (14.04) and DragonFly BSD systems, and for
|
||||
Nginx/BSD. The first should extend easily to other Debian-based systems,
|
||||
the second at least to FreeBSD. rpm-based systems are left as an exercise
|
||||
to the reader, at least for now...
|
||||
|
||||
|
||||
CAUTION: THE CONFIGURATIONS DESCRIBED HAVE NO ACCESS CONTROL. ANYONE WITH
|
||||
ACCESS TO THE NETWORK WHERE THE SERVER IS LOCATED CAN RETRIEVE ANY
|
||||
DOCUMENT.
|
||||
|
||||
link:#nginx[Jump to the nginx section].
|
||||
|
||||
[[apache]]
|
||||
== Apache
|
||||
=== On a Debian/Ubuntu system
|
||||
|
||||
==== Install recoll
|
||||
|
||||
sudo apt-get install recoll python-recoll
|
||||
|
||||
Configure the indexing and check that the normal search works (I spent
|
||||
quite a lot of time trying to understand why the WebUI did not work, when
|
||||
in fact it was the normal recoll configuration which was broken and the
|
||||
regular search did not work either).
|
||||
|
||||
Take care to be logged in as the user you want to run the web search as
|
||||
while you do this.
|
||||
|
||||
|
||||
==== Install the WebUI
|
||||
|
||||
Clone the github repository, or extract the master tar installation, and
|
||||
move it to '/var/www/recoll-webui-master/'. Take care that it is read/execute
|
||||
accessible by your user.
|
||||
|
||||
==== Install Apache and mod-wsgi
|
||||
|
||||
|
||||
sudo apt-get install apache2 libapache2-mod-wsgi
|
||||
|
||||
I then got the following message:
|
||||
|
||||
AH00558: apache2: Could not reliably determine the server's fully qualified domain name, using 127.0.1.1. Set the 'ServerName' directive globally to suppress this message
|
||||
|
||||
To clear it, I added a ServerName directive to the Apache config, maybe you
|
||||
won't need it. Edit '/etc/apache2/sites-available/000-default.conf' and add
|
||||
the following at the top (globally). Things work without this fix anyway,
|
||||
this is just to suppress the error message. You probably need to adjust the
|
||||
address or use a real host name:
|
||||
|
||||
ServerName 192.168.4.6
|
||||
|
||||
|
||||
Edit '/etc/apache2/mods-enabled/wsgi.conf', add the following at the end of
|
||||
the "IfModule" section.
|
||||
|
||||
Change the user ('dockes' in the example) taking care that he is the one who
|
||||
owns the index ('.recoll' is in his home directory).
|
||||
|
||||
WSGIDaemonProcess recoll user=dockes group=dockes \
|
||||
threads=1 processes=5 display-name=%{GROUP} \
|
||||
python-path=/var/www/recoll-webui-master
|
||||
WSGIScriptAlias /recoll /var/www/recoll-webui-master/webui-wsgi.py
|
||||
<Directory /var/www/recoll-webui-master>
|
||||
WSGIProcessGroup recoll
|
||||
Order allow,deny
|
||||
allow from all
|
||||
</Directory>
|
||||
|
||||
NOTE: the Recoll WebUI application is mostly single-threaded, so it is of
|
||||
little use (and may actually be counter-productive in some cases) to
|
||||
specify multiple threads on the WSGIDaemonProcess line. Specify multiple
|
||||
processes instead to put multiple CPUs to work on simultaneous requests.
|
||||
|
||||
|
||||
Then run the following to restart Apache:
|
||||
|
||||
sudo apachectl restart
|
||||
|
||||
The Recoll WebUI should now be accessible on 'http://my.server.com/recoll/'
|
||||
|
||||
NOTE: Take care that you need a '/' at the end of the URL used to access
|
||||
the search (use: 'http://my.server.com/recoll/', not
|
||||
'http://my.server.com/recoll'), else files other than the script itself are
|
||||
not found (the page looks weird and the search does not work).
|
||||
|
||||
CAUTION: THERE IS NO ACCESS CONTROL. ANYONE WITH ACCESS TO THE NETWORK
|
||||
WHERE THE SERVER IS LOCATED CAN RETRIEVE ANY DOCUMENT.
|
||||
|
||||
=== Apache Variant for BSD/ports
|
||||
|
||||
==== Packages
|
||||
|
||||
As root:
|
||||
|
||||
pkg install recoll
|
||||
|
||||
|
||||
Do what you need to do to configure the indexing and check that the normal
|
||||
search works.
|
||||
|
||||
Take care to be logged in as the user you want to run the web search as
|
||||
while you do this.
|
||||
|
||||
pkg install apache24
|
||||
|
||||
Add apache24_enable="YES" in /etc/rc.conf
|
||||
|
||||
pkg install ap24-mod_wsgi4
|
||||
pkg install git
|
||||
|
||||
==== Clone the webui repository
|
||||
|
||||
cd /usr/local/www/apache24/
|
||||
git clone https://github.com/koniu/recoll-webui.git recoll-webui-master
|
||||
|
||||
Important: most input handler helper applications (e.g. 'pdftotext') are
|
||||
installed in '/usr/local/bin' which is not in the PATH as seen by Apache
|
||||
(at least on DragonFly). The simplest way to fix this is to modify the
|
||||
launcher module for the webui app so that it fixes the PATH.
|
||||
|
||||
Edit 'recoll-webui-master/webui-wsgi.py' and add the following line after
|
||||
the 'import os' line:
|
||||
|
||||
os.environ['PATH'] = os.environ['PATH'] + ':' + '/usr/local/bin'
|
||||
|
||||
|
||||
|
||||
==== Configure Apache
|
||||
|
||||
Edit /usr/local/etc/apache24/modules.d/270_mod_wsgi.conf
|
||||
|
||||
Uncomment the LoadModule line, and add the directives to alias /recoll/ to
|
||||
the webui script.
|
||||
|
||||
Change the user (dockes in the example) taking care that he is the one who
|
||||
owns the index (.recoll is in his home directory).
|
||||
|
||||
Contents of the file:
|
||||
|
||||
## $FreeBSD$
|
||||
## vim: set filetype=apache:
|
||||
##
|
||||
## module file for mod_wsgi
|
||||
##
|
||||
## PROVIDE: mod_wsgi
|
||||
## REQUIRE:
|
||||
|
||||
LoadModule wsgi_module libexec/apache24/mod_wsgi.so
|
||||
|
||||
WSGIDaemonProcess recoll user=dockes group=dockes \
|
||||
threads=1 processes=5 display-name=%{GROUP} \
|
||||
python-path=/usr/local/www/apache24/recoll-webui-master/
|
||||
WSGIScriptAlias /recoll /usr/local/www/apache24/recoll-webui-master/webui-wsgi.py
|
||||
|
||||
<Directory /usr/local/www/apache24/recoll-webui-master>
|
||||
WSGIProcessGroup recoll
|
||||
Require all granted
|
||||
</Directory>
|
||||
|
||||
==== Restart Apache
|
||||
|
||||
As root:
|
||||
|
||||
apachectl restart
|
||||
|
||||
|
||||
[[nginx]]
|
||||
== Nginx
|
||||
=== Nginx for BSD/ports
|
||||
|
||||
As root:
|
||||
|
||||
pkg install recoll
|
||||
|
||||
Do what you need to do to configure the indexing and check that the normal
|
||||
search works. Take care to be logged in as the user you want to run the web
|
||||
search as while you do this.
|
||||
|
||||
Install required packages:
|
||||
|
||||
pkg install nginx uwsgi git
|
||||
|
||||
=== Nginx: clone the webui repository
|
||||
|
||||
rm /usr/local/www/nginx
|
||||
mkdir /usr/local/www/nginx
|
||||
cd /usr/local/www/nginx
|
||||
git clone https://github.com/koniu/recoll-webui.git recoll-webui-master
|
||||
|
||||
Important: most input handler helper applications (e.g. 'pdftotext') are
|
||||
installed in '/usr/local/bin' which is not in the PATH as seen by Nginx
|
||||
(at least on DragonFly). The simplest way to fix this is to modify the
|
||||
launcher module for the webui app so that it fixes the PATH.
|
||||
|
||||
Edit 'recoll-webui-master/webui-wsgi.py' and add the following line after
|
||||
the 'import os' line:
|
||||
|
||||
os.environ['PATH'] = os.environ['PATH'] + ':' + '/usr/local/bin'
|
||||
|
||||
Also change the following to find the correct path:
|
||||
|
||||
#os.chdir(os.path.dirname(__file__))
|
||||
os.chdir('/usr/local/www/nginx/recoll-webui-master')
|
||||
|
||||
|
||||
=== Nginx: configure uWSGI
|
||||
|
||||
Assuming the user running the search is "dockes" (change it to your user),
|
||||
|
||||
sysrc uwsgi_uid=$(id -u dockes)
|
||||
sysrc uwsgi_gid=$(id -g dockes)
|
||||
sysrc uwsgi_flags="-M -L --wsgi-file /usr/local/www/nginx/recoll-webui-master/webui-wsgi.py"
|
||||
|
||||
(ALTERNATIVELY)
|
||||
|
||||
Add the following to rc.conf
|
||||
|
||||
uwsgi_uid="dockes"
|
||||
uwsgi_gid="dockes"
|
||||
uwsgi_flags="-M -L --wsgi-file /usr/local/www/nginx/recoll-webui-master/webui-wsgi.py"
|
||||
|
||||
|
||||
=== Configure nginx
|
||||
|
||||
Edit /usr/local/etc/nginx/nginx.conf and set up a proxy to uwsgi service:
|
||||
|
||||
location / {
|
||||
include uwsgi_params;
|
||||
uwsgi_pass unix:///tmp/uwsgi.sock;
|
||||
}
|
||||
|
||||
=== Enable and start both services
|
||||
|
||||
As root:
|
||||
|
||||
sysrc uwsgi_enable=YES #Or uwsgi_enable="YES" (in rc.conf)
|
||||
sysrc nginx_enable=YES #Or nginx_enable="YES" (in rc.conf)
|
||||
|
||||
service uwsgi start
|
||||
service nginx start
|
||||
@ -1,88 +0,0 @@
|
||||
= Recoll on Windows tips and tricks
|
||||
Jean-Francois Dockes <jf@dockes.org>
|
||||
:toc:
|
||||
|
||||
== Checking that Python is in the PATH
|
||||
|
||||
Recoll input handlers are the programs which extract the documents text
|
||||
content for indexing. Most of these programs are Python scripts. If Recoll
|
||||
can find documents by file name but not by content, the first thing to
|
||||
check is that you do have the Python interpreter in your PATH.
|
||||
|
||||
NOTE: Only Python 2 is supported at the moment (2.7 and later were
|
||||
tested). This limitation is not caused by the Recoll scripts themselves but
|
||||
by some of the auxiliary libraries (e.g.: the one used for LibreOffice text
|
||||
extraction). If you also have Python 3 installed, you will have to arrange
|
||||
for Recoll to only 'see' the Python 2 version.
|
||||
|
||||
For simple cases, to check that the Python interpreter is in the PATH, the
|
||||
easiest approach is to start a command window and type 'python' in it. You
|
||||
should see messages from the Python interpreter, which you can then
|
||||
exit by typing 'quit()'. If the command interpreter complains about Python
|
||||
not being found, you probably need to adjust the PATH.
|
||||
|
||||
NOTE: To start a command window, type 'command' in the start menu input
|
||||
area and select 'Command Prompt'.
|
||||
|
||||
If the Python interpreter is not found, check that Python 2 is indeed
|
||||
installed. Adding the Python binary to the PATH is an option during
|
||||
installation (so one approach to fix the issue is to just run the
|
||||
installation again).
|
||||
|
||||
You can also edit the environment variable directly:
|
||||
|
||||
- Start the Control Panel
|
||||
- Select 'System and Security'
|
||||
- Select 'System'
|
||||
- Select 'Advanced system settings' in the left panel,
|
||||
- Select 'Environment Variables' at the bottom of the dialog
|
||||
- Edit 'Path' inside 'System variables' and add:
|
||||
`C:\Python27\;C:\Python27\Scripts;` to it.
|
||||
|
||||
== Using an alternate configuration directory
|
||||
|
||||
This tip is useful if you want to manage several configurations, or if you
|
||||
really have some reason to not let the configuration directory stay in its
|
||||
default location ($HOMEDIR/AppData/Local/Recoll). If your concerns are only
|
||||
about storage space, and you do not actually want to manage multiple
|
||||
configuration directories, you can more simply change the index storage
|
||||
location from the GUI 'Index Configuration' panel.
|
||||
|
||||
The easiest approach is to create a shortcut on the desktop and have it
|
||||
start the GUI with a '-c' option. For example, set the shortcut's 'Target'
|
||||
to something like:
|
||||
|
||||
----
|
||||
"C:\Program Files (x86)\Recoll\recoll.exe" -c c:/path/to/my/configdir
|
||||
----
|
||||
|
||||
_Do use forward slashes for the configuration directory path_. This will
|
||||
hopefully be fixed some day.
|
||||
|
||||
You will need to create the configuration directory, Recoll will not do it
|
||||
by itself. You can just leave it empty, Recoll will then propose to start
|
||||
the configuration editor.
|
||||
|
||||
You can find a more complete and general explanation about using shortcuts,
|
||||
for example http://www.rjlsoftware.com/support/faq/sa.cfm?q=6&n=61[on this
|
||||
page].
|
||||
|
||||
|
||||
== File name character case sensitivity
|
||||
|
||||
_This should be fixed as of the November 2016 version. Please report
|
||||
the problem if you still see case sensitivity issues_
|
||||
|
||||
Recoll was born on Unix, on which file names are case-sensitive. At the
|
||||
moment this is also the case for path-related queries on Windows, including
|
||||
the drive letters.
|
||||
|
||||
When filtering results on location (e.g. with a 'dir:' clause), you need to
|
||||
enter all path elements as they appear in the URLs in result lists (and use
|
||||
forward slashes).
|
||||
|
||||
It is also advisable to enter configuration filenames with their actual
|
||||
case (e.g. _topdirs_).
|
||||
|
||||
I am looking into fixing this, but this is made a bit complicated by non-ASCII
|
||||
character set issues.
|
||||
@ -1,191 +0,0 @@
|
||||
= Recoll on Windows
|
||||
Jean-Francois Dockes <jf at dockes.org>
|
||||
:date:
|
||||
|
||||
:recollversion: 1.23.0-2017-01-07-78b8ad
|
||||
:windir: downwin-12e3f
|
||||
|
||||
image:recoll-windows10-thumb.png[link="recoll-windows10.png"]
|
||||
|
||||
Recoll for Windows was built on Windows 7, and tried on Windows 7 and
|
||||
10. It does not work on Windows XP.
|
||||
|
||||
Recoll is free and licensed under the GPL. You will be asked to accept the
|
||||
license during the installation. For a regular user, and in a nutshell, the
|
||||
license means that you are free to do what you want with the program (use,
|
||||
copy, share, etc.). If you are a developer and intend to modify and
|
||||
distribute the program, you probably know the GPL, else you should read it.
|
||||
|
||||
NOTE: As much as I have fun writing software, producing the Windows version is
|
||||
just tedious. If you use Recoll on Windows, please consider contributing to
|
||||
its availability: image:/donations/btn_donate_LG.gif[link="/donations/index.html"]
|
||||
|
||||
Actually I'm tired of nobody ever using the donate button among thousands
|
||||
of downloads, so recoll for windows is gone for now.
|
||||
|
||||
== Note for updating
|
||||
|
||||
Recoll versions 1.23.0-9c5e32-20161216 and 1.23.0-2bfd80-20161115 had been
|
||||
switched to using Xapian 1.4 which has a new and different index
|
||||
format. Due to issues in Xapian 1.4, I have switched back to using Xapian
|
||||
1.2 as of Recoll 1.23.0-2017-01-07-78b8ad.
|
||||
|
||||
This simply means that, if your index was created by one of the above
|
||||
versions, it will have to be recreated from scratch after installing the
|
||||
current Recoll version. I advise explicitly deleting
|
||||
$HOME/AppData/Local/Recoll/xapiandb, as this will avoid leaving around 1.4
|
||||
files which would take space for nothing otherwise.
|
||||
|
||||
== Installation
|
||||
|
||||
- Download and install Python 2.7.10 or 2.7.11 (e.g.
|
||||
https://www.python.org/ftp/python/2.7.11/python-2.7.11.msi[Python
|
||||
2.7.11]). Recoll currently does not work with Python3. *_On the
|
||||
`Customize installation` screen, select "Add python.exe to Path"_*
|
||||
|
||||
- Optional: download and install the 7-zip program from
|
||||
http://www.7-zip.org/. This is only useful if you need to index files
|
||||
compressed with Unix methods (not needed for zip files).
|
||||
|
||||
- Download the
|
||||
http://www.recoll.org/downloads/{windir}/recoll-setup-{recollversion}.exe[Recoll
|
||||
setup file]. - Not possible right now -
|
||||
|
||||
- Execute the setup file. This is a vanilla installer generated by Inno
|
||||
Setup, and it will ask the usual questions.
|
||||
|
||||
//NOTE: The installer needs administrator rights in order to install to
|
||||
//`C:\Program Files`. If you want to install on a machine where you have no
|
||||
//administrator rights, you can use the
|
||||
//http://www.recoll.org/downloads/{windir}/recoll-{recollversion}.7z[installation
|
||||
//directory archive] instead and extract it anywhere, this works just the
|
||||
//same (you will need the free http://www.7-zip.org/[7z] to extract it). If
|
||||
//you are in this case, you can ignore the setup-related steps of the
|
||||
//procedure of course.
|
||||
|
||||
== Configuration
|
||||
|
||||
- Start recoll. It will ask if you want to customize the configuration.
|
||||
The default is to index the content of your user directory. Then start
|
||||
indexing. This can take some time.
|
||||
- The default result list font is particularly ugly. Change it from
|
||||
`Preferences->GUI Configuration->Result List->Result List Font`
|
||||
|
||||
- Have a look at the
|
||||
https://www.lesbonscomptes.com/recoll/usermanual/webhelp/docs/index.html[Recoll
|
||||
manual] !
|
||||
- I have also started a small link:recoll-windows-faq.html[Recoll on
|
||||
MS-Windows FAQ].
|
||||
|
||||
== Support
|
||||
|
||||
Please use the
|
||||
https://opensourceprojects.eu/p/recoll1/tickets/new/[Recoll issues tracker]
|
||||
for reporting problems, or contact me by email: jfd at recoll.org.
|
||||
|
||||
|
||||
== Known problems:
|
||||
|
||||
- Having a drive root (e.g.: c:/) in the topdirs (things to index) list
|
||||
does not work (it indexes nothing). You need to list the sub-directories
|
||||
to index. This will be fixed in a future release.
|
||||
|
||||
- Setting the log level to 4 or higher can cause the GUI to deadlock while
|
||||
displaying results. This will be fixed in a future release.
|
||||
|
||||
- Indexing is very slow, especially when using external commands (e.g. for
|
||||
PDF files). I don't know if this is a case of my doing something stupid,
|
||||
or if the general architecture is really badly suited to Windows. If
|
||||
someone with good Windows programming knowledge reads this, I'd be very
|
||||
interested in a discussion. Windows indexing can be ten times slower than
|
||||
the Linux version. The index formats are compatible, so, if you have
|
||||
shared Linux/Windows data, it's best to process it on Linux.
|
||||
|
||||
- Filtering by directory location ('dir:' clauses) used to be
|
||||
case-sensitive, including drive letters. This is hopefully fixed by the
|
||||
November 2016 version.
|
||||
|
||||
- Also, when filtering the search with a `dir:` clause, an absolute path
|
||||
should be specified as `/c/mydir` instead of `c:/mydir`
|
||||
|
||||
- There is no real-time or scheduled indexing as on Linux. For now, you
|
||||
create and update the index by using the `File` menu (or executing
|
||||
`recollindex.exe` from a command window).
|
||||
|
||||
== Change Log
|
||||
|
||||
Changes in 20161115
|
||||
|
||||
- File path names case sensitivity and other small path issues should be fixed.
|
||||
- Based on Xapian 1.4. New stemming languages are available (e.g. Arabic).
|
||||
- Fixed date display encoding issues.
|
||||
|
||||
Changes in 20160414
|
||||
|
||||
- The setup script has changed back to needing administrator rights,
|
||||
because this is what is convenient for most people. Use the installation
|
||||
directory archive to install in a non-standard location without admin
|
||||
rights.
|
||||
- Fixed a bug which had the whole indexing stop if a script would time out
|
||||
on a specific file (it will very rarely happen that a pathologically bad
|
||||
file can throw an input handler in a loop).
|
||||
|
||||
|
||||
Changes in 20160317
|
||||
|
||||
- Small change to the setup script so that administrative rights are not
|
||||
required.
|
||||
|
||||
Changes/fixes in 20160129
|
||||
|
||||
- Changed the method used for checking that index data is up to date with
|
||||
documents. This will impose a re-indexing of all data, but it was
|
||||
necessary because the previous method was incorrect.
|
||||
- Fixed crash which occurred after changing some configuration parameters.
|
||||
- Warn when editing a temporary copy of a document (e.g. a temp file
|
||||
extracted from a zip archive).
|
||||
|
||||
Changes in 20151202
|
||||
|
||||
- Fixed mbox parsing. This was getting the message separators completely
|
||||
wrong, and taking a lot of time to do it. This should be especially
|
||||
welcome by Thunderbird users.
|
||||
|
||||
- Fixed email attachment processing. A fault in the code which saved
|
||||
attachment data to disk for further processing resulted in a practical
|
||||
fuzzing experiment on the input processors. Especially, frequent crashes
|
||||
in the image tag extractor caused very annoying Windows popups about
|
||||
a Python error.
|
||||
|
||||
Fixed in 20151115 and later
|
||||
|
||||
- A relatively rare crash which seemed to occur mostly on some email
|
||||
messages
|
||||
- Forgotten MIME settings for .cs, .js and .css
|
||||
|
||||
Fixed in 20151112 and later
|
||||
|
||||
- Forgotten dll prevented the unrtf program from working, so no rtf indexing.
|
||||
|
||||
Fixed in 20151109 (hopefully?)
|
||||
|
||||
- The GUI sometimes crashes when you click `Preview` or `Open`. This does
|
||||
not occur often, and usually for one of the first tries after starting
|
||||
the program. Don't despair. This seems to be fixed in the latest version
|
||||
(20151109), but I am not 100% certain that it is gone.
|
||||
|
||||
++++
|
||||
<h2 id="comments">Comments</h2>
|
||||
|
||||
<div id="disqus_thread"></div>
|
||||
<script type="text/javascript">
|
||||
var disqus_shortname = 'lesbonscomptes';
|
||||
(function() {
|
||||
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
|
||||
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
|
||||
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
|
||||
})();
|
||||
</script>
|
||||
<noscript>Please enable JavaScript to view the <a href="http://disqus.com/?ref_noscript">comments powered by Disqus.</a></noscript>
|
||||
<a href="http://disqus.com" class="dsq-brlink">comments powered by <span class="logo-disqus">Disqus</span></a>
|
||||
++++
|
||||
@ -1,416 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>RECOLL indexing performance and index sizes</title>
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search,fulltext,desktop search,unix,linux,solaris,open source,free">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content=
|
||||
"text/html; charset=iso-8859-1">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
<link type="text/css" rel="stylesheet" href="styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="rightlinks">
|
||||
<ul>
|
||||
<li><a href="index.html">Home</a></li>
|
||||
<li><a href="pics/index.html">Screenshots</a></li>
|
||||
<li><a href="download.html">Downloads</a></li>
|
||||
<li><a href="doc.html">Documentation</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1>Recoll: Indexing performance and index sizes</h1>
|
||||
|
||||
<p>The time needed to index a given set of documents, and the
|
||||
resulting index size depend on many factors.</p>
|
||||
|
||||
<p>The index size depends almost only on the size of the
|
||||
uncompressed input text, and you can expect it to be roughly
|
||||
of the same order of magnitude. Depending on the type of file,
|
||||
the proportion of text to file size varies very widely, going
|
||||
from close to 1 for pure text files to a very small factor
|
||||
for, e.g., metadata tags in mp3 files.</p>
|
||||
|
||||
<p>Estimating indexing time is a much more complicated issue,
|
||||
depending on the type and size of input and on system
|
||||
performance. There is no general way to determine what part of
|
||||
the hardware should be optimized. Depending on the type of
|
||||
input, performance may be bound by I/O read or write
|
||||
performance, CPU single-processing speed, or combined
|
||||
multi-processing speed.</p>
|
||||
|
||||
<p>It should be noted that Recoll performance will not be an
|
||||
issue for most people. The indexer can process 1000 typical
|
||||
PDF files per minute, or 500 Wikipedia HTML pages per second
|
||||
on medium-range hardware, meaning that the initial indexing of
|
||||
a typical dataset will need a few dozen minutes at
|
||||
most. Further incremental index updates will be much faster
|
||||
because most files will not need to be processed again.</p>
|
||||
|
||||
<p>However, there are Recoll installations with
|
||||
terabyte-sized datasets, on which indexing can take days. For
|
||||
such operations (or even much smaller ones), it is very
|
||||
important to know what kind of performance can be expected,
|
||||
and what aspects of the hardware should be optimized.</p>
|
||||
|
||||
<p>In order to provide some reference points, I have run a
|
||||
number of benchmarks on medium-sized datasets, using typical
|
||||
mid-range desktop hardware, and varying the indexing
|
||||
configuration parameters to show how they affect the results.</p>
|
||||
|
||||
<p>The following may help you check that you are getting typical
|
||||
performance for your indexing, and give some indications about
|
||||
what to adjust to improve it.</p>
|
||||
|
||||
<p>From time to time, I receive a report about a system becoming
|
||||
unusable during indexing. As far as I know, with the default
|
||||
Recoll configuration, and barring an exceptional issue (bug),
|
||||
this is always due to a system problem (typically bad hardware
|
||||
such as a disk doing retries). The tests below were mostly run
|
||||
while I was using the desktop, which never became
|
||||
unusable. However, some tests rendered it less responsive and
|
||||
this is noted with the results.</p>
|
||||
|
||||
<p>The following text refers to the indexing parameters without
|
||||
further explanation. Here follow links to more explanation about the
|
||||
<a href="http://www.lesbonscomptes.com/recoll/idxthreads/threadingRecoll.html#recoll.idxthreads.multistage">processing
|
||||
model</a> and
|
||||
<a href="https://www.lesbonscomptes.com/recoll/usermanual/webhelp/docs/RCL.INSTALL.CONFIG.RECOLLCONF.PERFS.html">configuration
|
||||
parameters</a>.</p>
|
||||
|
||||
|
||||
<p>All tests were run without generating the stemming database or
|
||||
aspell dictionary. These phases are relatively short and there
|
||||
is nothing which can be optimized about them.</p>
|
||||
|
||||
<h2>Hardware</h2>
|
||||
|
||||
<p>The tests were run on what could be considered a mid-range
|
||||
desktop PC:
|
||||
<ul>
|
||||
<li>Intel Core I7-4770T CPU: 2.5 GHz, 4 physical cores, and
|
||||
hyper-threading for a total of 8 hardware threads</li>
|
||||
<li>8 GBytes of RAM</li>
|
||||
<li>Asus H87I-Plus motherboard, Samsung 850 EVO SSD storage</li>
|
||||
</ul>
|
||||
</p>
|
||||
|
||||
<p>This is usually a fanless PC, but I did run a fan on the
|
||||
external case fins during some of the tests (esp. PDF
|
||||
indexing), because the CPU was running a bit too hot.</p>
|
||||
|
||||
|
||||
<h2>Indexing PDF files</h2>
|
||||
|
||||
|
||||
<p>The tests were run on 18000 random PDFs harvested on
|
||||
Google, with a total size of around 30 GB, using Recoll 1.22.3
|
||||
and Xapian 1.2.22. The resulting index size was 1.2 GB.</p>
|
||||
|
||||
<h3>PDF: storage</h3>
|
||||
|
||||
<p>Typical PDF files have a low text to file size ratio, and a
|
||||
lot of data needs to be read for indexing. With the test
|
||||
configuration, the indexer needs to read around 45 MBytes/s
|
||||
from multiple files. This means that input storage makes a
|
||||
difference and that you need an SSD or a fast array for
|
||||
optimal performance.</p>
|
||||
|
||||
<table border=1>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Storage</th>
|
||||
<th>idxflushmb</th>
|
||||
<th>thrTCounts</th>
|
||||
<th>Real Time</th>
|
||||
</tr>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>NFS drive (gigabit)</td>
|
||||
<td>200</td>
|
||||
<td>6/4/1</td>
|
||||
<td>24m40</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>local SSD</td>
|
||||
<td>200</td>
|
||||
<td>6/4/1</td>
|
||||
<td>11m40</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
|
||||
<h3>PDF: threading</h3>
|
||||
|
||||
<p>Because PDF files are bulky and complicated to process, the
|
||||
dominant step for indexing them is input processing. PDF text
|
||||
extraction is performed by multiple instances of
|
||||
the <i>pdftotext</i> program, and parallelisation works very
|
||||
well.</p>
|
||||
|
||||
<p>The following table shows the indexing times with a variety
|
||||
of threading parameters.</p>
|
||||
|
||||
<table border=1>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>idxflushmb</th>
|
||||
<th>thrQSizes</th>
|
||||
<th>thrTCounts</th>
|
||||
<th>Time R/U/S</th>
|
||||
</tr>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>200</td>
|
||||
<td>2/2/2</td>
|
||||
<td>2/1/1</td>
|
||||
<td>19m21</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>200</td>
|
||||
<td>2/2/2</td>
|
||||
<td>10/10/1</td>
|
||||
<td>10m38</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>200</td>
|
||||
<td>2/2/2</td>
|
||||
<td>100/10/1</td>
|
||||
<td>11m</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>10/10/1 was the best value for thrTCounts for this test. The
|
||||
total CPU time was around 78 mn.</p>
|
||||
|
||||
<p>The last line shows the effect of a ridiculously high thread
|
||||
count value for the input step, which is not much. Using
|
||||
slightly lower values than the optimum does not have much impact
|
||||
either. The only thing which really degrades performance is
|
||||
configuring fewer threads than are available from the hardware.</p>
|
||||
|
||||
<p>With the optimal parameters above, the peak recollindex
|
||||
resident memory size is around 930 MB, to which we should add
|
||||
ten instances of pdftotext (10MB typical), and of the
|
||||
rclpdf.py Python input handler (around 15 MB each). This means
|
||||
that the total resident memory used by indexing is around 1200
|
||||
MB, quite a modest value in 2016.</p>
|
||||
|
||||
|
||||
<h3>PDF: Xapian flushes</h3>
|
||||
|
||||
<p>idxflushmb has practically no influence on the indexing time
|
||||
(tested from 40 to 1000), which is not too surprising because
|
||||
the Xapian index size is very small relative to the input
|
||||
size, so that the cost of Xapian flushes to disk is not very
|
||||
significant. The value of 200 used for the threading tests
|
||||
could be lowered in practice, which would decrease memory
|
||||
usage and not change the indexing time significantly.</p>
|
||||
|
||||
<h3>PDF: conclusion</h3>
|
||||
|
||||
<p>For indexing PDF files, you need many cores and a fast
|
||||
input storage system. Neither single-thread performance nor
|
||||
amount of memory will be critical aspects.</p>
|
||||
|
||||
<p>Running the PDF indexing tests had no influence on the system
|
||||
"feel", I could work on it just as if it were quiescent.</p>
|
||||
|
||||
|
||||
<h2>Indexing HTML files</h2>
|
||||
|
||||
<p>The tests were run on an (old) French Wikipedia dump: 2.9
|
||||
million HTML files stored in 42000 directories, for an
|
||||
approximate total size of 41 GB (average file size
|
||||
14 KB).</p>
|
||||
|
||||
<p>The files are stored on a local SSD. Just reading them with
|
||||
find+cpio takes close to 8 mn.</p>
|
||||
|
||||
<p>The resulting index has a size of around 30 GB.</p>
|
||||
|
||||
<p>I was too lazy to extract a tar file with 3 million entries onto a
|
||||
spinning disk, so all tests were performed with the data
|
||||
stored on a local SSD.</p>
|
||||
|
||||
<p>For this test, the indexing time is dominated by the Xapian
|
||||
index updates. As these are single threaded, only the flush
|
||||
interval has a real influence.</p>
|
||||
|
||||
<table border=1>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>idxflushmb</th>
|
||||
<th>thrQSizes</th>
|
||||
<th>thrTCounts</th>
|
||||
<th>Time R/U/S</th>
|
||||
</tr>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>200</td>
|
||||
<td>2/2/2</td>
|
||||
<td>2/1/1</td>
|
||||
<td>88m</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>200</td>
|
||||
<td>2/2/2</td>
|
||||
<td>6/4/1</td>
|
||||
<td>91m</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>200</td>
|
||||
<td>2/2/2</td>
|
||||
<td>1/1/1</td>
|
||||
<td>96m</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>100</td>
|
||||
<td>2/2/2</td>
|
||||
<td>1/2/1</td>
|
||||
<td>120m</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>100</td>
|
||||
<td>2/2/2</td>
|
||||
<td>6/4/1</td>
|
||||
<td>121m</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>40</td>
|
||||
<td>2/2/2</td>
|
||||
<td>1/2/1</td>
|
||||
<td>173m</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
|
||||
<p>The indexing process becomes quite big (resident size around
|
||||
4GB), and the combination of high I/O load and high memory
|
||||
usage makes the system less responsive at times (but not
|
||||
unusable). As this happens principally when switching
|
||||
applications, my guess would be that some program pages
|
||||
(e.g. from the window manager and X) get flushed out, and take
|
||||
time being read in, during which time the display appears
|
||||
frozen.</p>
|
||||
|
||||
<p>For this kind of data, single-threaded CPU performance and
|
||||
storage write speed can make a difference. Multithreading does
|
||||
not help.</p>
|
||||
|
||||
<h2>Adjusting hardware to improve indexing performance</h2>
|
||||
|
||||
<p>I think that the following multi-step approach has a good
|
||||
chance to improve performance:
|
||||
<ul>
|
||||
<li>Check that multithreading is enabled (it is, by default
|
||||
with recent Recoll versions).</li>
|
||||
<li>Increase the flush threshold until the machine begins to
|
||||
have memory issues. Maybe add memory.</li>
|
||||
<li>Store the index on an SSD. If possible, also store the
|
||||
data on an SSD. Actually, when using many threads, it is
|
||||
probably almost more important to have the data on an
|
||||
SSD.</li>
|
||||
<li>If you have many files which will need temporary copies
|
||||
(email attachments, archive members, compressed files): use
|
||||
a memory temporary directory. Add memory.</li>
|
||||
<li>More CPUs...</li>
|
||||
</ul>
|
||||
</p>
|
||||
|
||||
<p>At some point, the index updating and writing may become the
|
||||
bottleneck (this depends on the data mix, very quickly with
|
||||
HTML or text files). As far as I can think, the only possible
|
||||
approach is then to partition the index. You can query the
|
||||
multiple Xapian indices either by using the Recoll external
|
||||
index capability, or by actually merging the results with
|
||||
xapian-compact.</p>
|
||||
|
||||
|
||||
|
||||
<h5>Old benchmarks</h5>
|
||||
|
||||
<p>To provide a point of comparison for the evolution of
|
||||
hardware and software...</p>
|
||||
|
||||
<p>The following very old data was obtained (around 2007?) on a
|
||||
machine with an 1800 MHz AMD Duron CPU, 768 MB of RAM, and a
|
||||
7200 RPM 160 GBytes IDE disk, running Suse 10.1.</p>
|
||||
|
||||
<p><b>recollindex</b> (version 1.8.2 with xapian 1.0.0) is
|
||||
executed with the default flush threshold value.
|
||||
The process memory usage is the one given by <b>ps</b>.</p>
|
||||
|
||||
<table border=1>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Data</th>
|
||||
<th>Data size</th>
|
||||
<th>Indexing time</th>
|
||||
<th>Index size</th>
|
||||
<th>Peak process memory usage</th>
|
||||
</tr>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Random pdfs harvested on Google</td>
|
||||
<td>1.7 GB, 3564 files</td>
|
||||
<td>27 mn</td>
|
||||
<td>230 MB</td>
|
||||
<td>225 MB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Ietf mailing list archive</td>
|
||||
<td>211 MB, 44,000 messages</td>
|
||||
<td>8 mn</td>
|
||||
<td>350 MB</td>
|
||||
<td>90 MB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Partial Wikipedia dump</td>
|
||||
<td>15 GB, one million files</td>
|
||||
<td>6H30</td>
|
||||
<td>10 GB</td>
|
||||
<td>324 MB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<!-- DB: ndocs 3564 lastdocid 3564 avglength 6460.71 -->
|
||||
<td>Random pdfs harvested on Google<br>
|
||||
Recoll 1.9, <em>idxflushmb</em> set to 10</td>
|
||||
<td>1.7 GB, 3564 files</td>
|
||||
<td>25 mn</td>
|
||||
<td>262 MB</td>
|
||||
<td>65 MB</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>Notice how the index size for the mail archive is bigger than
|
||||
the data size. Myriads of small pure text documents will do
|
||||
this. The expansion factor would be even worse with
|
||||
compressed folders of course (the test was on uncompressed
|
||||
data).</p>
|
||||
|
||||
<p>The last test was performed with Recoll 1.9.0 which has an
|
||||
adjustable flush threshold (<em>idxflushmb</em> parameter), here
|
||||
set to 10 MB. Notice the much lower peak memory usage, with no
|
||||
performance degradation. The resulting index is bigger though,
|
||||
the exact reason is not known to me, possibly because of
|
||||
additional fragmentation.</p>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@ -1,2 +0,0 @@
|
||||
#!/bin/sh
|
||||
onlylist=1 photalb . .
|
||||
@ -1,44 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>Recoll screenshots</title>
|
||||
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux
|
||||
based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search, desktop search, unix, linux">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
|
||||
<link type="text/css" rel="stylesheet" href="../styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1>Recoll Screenshots</h1>
|
||||
<li><a href="../index.html">Back to Recoll home</a></li>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td align="center"><a href="recoll0.html"><img src="recoll0-thumb.png"></a></td>
|
||||
<td align="center"><a href="result-table.html"><img src="result-table-thumb.png"></a></td>
|
||||
<td align="center"><a href="recoll1.html"><img src="recoll1-thumb.png"></a></td>
|
||||
<td align="center"><a href="recoll2.html"><img src="recoll2-thumb.png"></a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center"><a href="recoll3.html"><img src="recoll3-thumb.png"></a></td>
|
||||
<td align="center"><a href="recoll4.html"><img src="recoll4-thumb.png"></a></td>
|
||||
<td align="center"><a href="recoll5.html"><img src="recoll5-thumb.png"></a></td>
|
||||
<td align="center"><a href="recoll_chinese.html"><img src="recoll_chinese-thumb.png"></a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center"><a href="recoll-HTML_search_results.html"><img src="recoll-HTML_search_results-thumb.png"></a></td>
|
||||
</tr></table>
|
||||
</body></html>
|
||||
@ -1,27 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>Recoll screenshots</title>
|
||||
|
||||
<meta name="generator" content="HTML Tidy, see www.w3.org">
|
||||
<meta name="Author" content="Jean-Francois Dockes">
|
||||
<meta name="Description" content=
|
||||
"recoll is a simple full-text search system for unix and linux
|
||||
based on the powerful and mature xapian engine">
|
||||
<meta name="Keywords" content=
|
||||
"full text search, desktop search, unix, linux">
|
||||
<meta http-equiv="Content-language" content="en">
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="robots" content="All,Index,Follow">
|
||||
|
||||
<link type="text/css" rel="stylesheet" href="../styles/style.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
<div class="content">
|
||||
|
||||
<h1>Recoll Screenshots</h1>
|
||||
<li><a href="../index.html">Back to Recoll home</a></li>
|
||||
|
||||
|
Before Width: | Height: | Size: 5.6 KiB |
@ -1,13 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Photo</title>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href="recoll-HTML_search_results.html">Prev</a> <a href="../index.html">Up</a>
|
||||
<a href="smile.html">Next</a>
|
||||
<a href="mario.png">Image</a></p>
|
||||
<p></p>
|
||||
<p><img height="90%" src="mario.png"></p>
|
||||
</body>
|
||||
</html>
|
||||
|
Before Width: | Height: | Size: 1.8 KiB |
@ -1,9 +0,0 @@
|
||||
recoll0.png
|
||||
result-table.png
|
||||
recoll1.png
|
||||
recoll2.png
|
||||
recoll3.png
|
||||
recoll4.png
|
||||
recoll5.png
|
||||
recoll_chinese.png
|
||||
recoll-HTML_search_results.png
|
||||
|
Before Width: | Height: | Size: 178 KiB |
@ -1,40 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Photo</title>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href="recoll_chinese.html">Prev</a> <a href=".">Up</a>
|
||||
<a href="recoll0.html">Next</a>
|
||||
<a href="recoll-HTML_search_results.png">Image</a></p>
|
||||
<p>A customized result list, thanks to Michael Croes. The html code follows,
|
||||
it should be pasted into the
|
||||
<i>Preferences->Query Configuration->Result paragraph format string</i> entry.
|
||||
|
||||
<pre>
|
||||
<table border="1" bgcolor="lightyellow">
|
||||
<tr>
|
||||
<td rowspan="4" width="40px" align="center"
|
||||
valign="center">
|
||||
<img src="%I" width="32" height="32">
|
||||
<p><b>%R</b></p>
|
||||
<p><a href="P%N">Aperçu</a></p>
|
||||
</td>
|
||||
<th colspan="3" bgcolor="lightgrey">%T</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center">%M</td>
|
||||
<td align="center">%D</td>
|
||||
<td align="center">%S</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3"><a href="E%N">%U</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3">%A</td>
|
||||
</tr>
|
||||
</table>
|
||||
</pre></p>
|
||||
<p><img height="90%" src="recoll-HTML_search_results.png"></p>
|
||||
</body>
|
||||
</html>
|
||||
|
Before Width: | Height: | Size: 62 KiB |
@ -1,28 +0,0 @@
|
||||
A customized result list, thanks to Michael Croes. The html code follows,
|
||||
it should be pasted into the
|
||||
<i>Preferences->Query Configuration->Result paragraph format string</i> entry.
|
||||
|
||||
<pre>
|
||||
<table border="1" bgcolor="lightyellow">
|
||||
<tr>
|
||||
<td rowspan="4" width="40px" align="center"
|
||||
valign="center">
|
||||
<img src="%I" width="32" height="32">
|
||||
<p><b>%R</b></p>
|
||||
<p><a href="P%N">Aperçu</a></p>
|
||||
</td>
|
||||
<th colspan="3" bgcolor="lightgrey">%T</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center">%M</td>
|
||||
<td align="center">%D</td>
|
||||
<td align="center">%S</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3"><a href="E%N">%U</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="3">%A</td>
|
||||
</tr>
|
||||
</table>
|
||||
</pre>
|
||||
|
Before Width: | Height: | Size: 124 KiB |
@ -1,13 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Photo</title>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href=".">Prev</a> <a href=".">Up</a>
|
||||
<a href="result-table.html">Next</a>
|
||||
<a href="recoll0.png">Image</a></p>
|
||||
<p>Search results.</p>
|
||||
<p><img height="90%" src="recoll0.png"></p>
|
||||
</body>
|
||||
</html>
|
||||
|
Before Width: | Height: | Size: 128 KiB |
@ -1,2 +0,0 @@
|
||||
Search results.
|
||||
|
||||
|
Before Width: | Height: | Size: 154 KiB |
@ -1,13 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Photo</title>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href="result-table.html">Prev</a> <a href=".">Up</a>
|
||||
<a href="recoll2.html">Next</a>
|
||||
<a href="recoll1.png">Image</a></p>
|
||||
<p>A result list with a preview window open.</p>
|
||||
<p><img height="90%" src="recoll1.png"></p>
|
||||
</body>
|
||||
</html>
|
||||
|
Before Width: | Height: | Size: 181 KiB |
@ -1,4 +0,0 @@
|
||||
A result list with a preview window open.
|
||||
|
||||
|
||||
|
||||
|
Before Width: | Height: | Size: 25 KiB |
@ -1,13 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Photo</title>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href="recoll1.html">Prev</a> <a href=".">Up</a>
|
||||
<a href="recoll3.html">Next</a>
|
||||
<a href="recoll2.png">Image</a></p>
|
||||
<p>The two tabs in the advanced search dialog.</p>
|
||||
<p><img height="90%" src="recoll2.png"></p>
|
||||
</body>
|
||||
</html>
|
||||
|
Before Width: | Height: | Size: 46 KiB |
@ -1 +0,0 @@
|
||||
The two tabs in the advanced search dialog.
|
||||
|
Before Width: | Height: | Size: 55 KiB |
@ -1,14 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Photo</title>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href="recoll2.html">Prev</a> <a href=".">Up</a>
|
||||
<a href="recoll4.html">Next</a>
|
||||
<a href="recoll3.png">Image</a></p>
|
||||
<p>A result list from which the native application (firefox)
|
||||
was started by clicking the Edit link.</p>
|
||||
<p><img height="90%" src="recoll3.png"></p>
|
||||
</body>
|
||||
</html>
|
||||
|
Before Width: | Height: | Size: 166 KiB |
@ -1,2 +0,0 @@
|
||||
A result list from which the native application (firefox)
|
||||
was started by clicking the Edit link.
|
||||
|
Before Width: | Height: | Size: 65 KiB |
@ -1,14 +0,0 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html>
|
||||
<head>
|
||||
<title>Photo</title>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href="recoll3.html">Prev</a> <a href=".">Up</a>
|
||||
<a href="recoll5.html">Next</a>
|
||||
<a href="recoll4.png">Image</a></p>
|
||||
<p>The document history window looks a little like a result list
|
||||
I'm afraid...</p>
|
||||
<p><img height="90%" src="recoll4.png"></p>
|
||||
</body>
|
||||
</html>
|
||||
|
Before Width: | Height: | Size: 54 KiB |