Movable datasets support

This commit is contained in:
Jean-Francois Dockes 2017-12-06 11:34:04 +01:00
parent 329ab7b90d
commit 09acb5687c
5 changed files with 4611 additions and 4270 deletions

View File

@ -1318,17 +1318,85 @@ string RclConfig::getPidfile() const
return path_cat(getCacheDir(), "index.pid"); return path_cat(getCacheDir(), "index.pid");
} }
/* Compute the differing leading parts ("stems") of two file paths which
 * end with the same sequence of elements. Example:
 *   /mnt1/common/part /mnt2/common/part -> /mnt1 /mnt2
 * The stems are used to derive a path translation when a dataset has
 * been moved as a whole. Splitting both paths into element vectors is
 * not the cheapest way to do this, but the inputs are short.
 *
 * @param p1 first path.
 * @param p2 second path.
 * @param r1 output: elements of p1 which precede the common trailing
 *     part, each one prefixed with '/'. Always cleared on entry.
 * @param r2 output: same thing for p2. Always cleared on entry.
 * @return an empty string on success, else an error message (the paths
 *     share no trailing element, which includes the case where one of
 *     them yields no elements). r1 and r2 are left empty in that case.
 */
static string path_diffstems(const string& p1, const string& p2,
                             string& r1, string& r2)
{
    r1.clear();
    r2.clear();

    vector<string> elts1, elts2;
    stringToTokens(p1, elts1, "/");
    stringToTokens(p2, elts2, "/");
    const vector<string>::size_type n1 = elts1.size();
    const vector<string>::size_type n2 = elts2.size();
    const vector<string>::size_type shortest = n1 < n2 ? n1 : n2;

    // Walk both element lists backwards from the leaf, counting the
    // elements which are identical on both sides.
    vector<string>::size_type ncommon = 0;
    while (ncommon < shortest &&
           elts1[n1 - 1 - ncommon] == elts2[n2 - 1 - ncommon]) {
        ncommon++;
    }
    if (ncommon == 0) {
        return string("Input paths are empty or have no common part");
    }

    // Whatever precedes the common part on each side is that side's stem.
    for (vector<string>::size_type idx = 0; idx < n1 - ncommon; idx++) {
        r1.append("/").append(elts1[idx]);
    }
    for (vector<string>::size_type idx = 0; idx < n2 - ncommon; idx++) {
        r2.append("/").append(elts2[idx]);
    }
    return string();
}
void RclConfig::urlrewrite(const string& dbdir, string& url) const void RclConfig::urlrewrite(const string& dbdir, string& url) const
{ {
LOGDEB2("RclConfig::urlrewrite: dbdir [" << dbdir << "] url [" << url << LOGDEB("RclConfig::urlrewrite: dbdir [" << dbdir << "] url [" << url <<
"]\n"); "]\n");
// If orgidxconfdir is set, we assume that this index is for a
// movable dataset, with the configuration directory stored inside
// the dataset tree. This allows computing automatic path
// translations if the dataset has been moved.
string orig_confdir;
string cur_confdir;
string confstemorg, confstemrep;
if (m_conf->get("orgidxconfdir", orig_confdir, "")) {
if (!m_conf->get("curidxconfdir", cur_confdir, "")) {
cur_confdir = m_confdir;
}
LOGDEB("RclConfig::urlrewrite: orgidxconfdir: " << orig_confdir <<
" cur_confdir " << cur_confdir << endl);
string reason = path_diffstems(orig_confdir, cur_confdir,
confstemorg, confstemrep);
if (!reason.empty()) {
LOGERR("urlrewrite: path_diffstems failed: " << reason <<
" : orig_confdir [" << orig_confdir <<
"] cur_confdir [" << cur_confdir << endl);
confstemorg = confstemrep = "";
}
}
// Do path translations exist for this index ? // Do path translations exist for this index ?
bool needptrans = true;
if (m_ptrans == 0 || !m_ptrans->hasSubKey(dbdir)) { if (m_ptrans == 0 || !m_ptrans->hasSubKey(dbdir)) {
LOGDEB2("RclConfig::urlrewrite: no paths translations (m_ptrans " << LOGDEB2("RclConfig::urlrewrite: no paths translations (m_ptrans " <<
m_ptrans << ")\n"); m_ptrans << ")\n");
needptrans = false;
}
if (!needptrans && confstemorg.empty()) {
return; return;
} }
bool computeurl = false;
string path = fileurltolocalpath(url); string path = fileurltolocalpath(url);
if (path.empty()) { if (path.empty()) {
@ -1336,21 +1404,33 @@ void RclConfig::urlrewrite(const string& dbdir, string& url) const
return; return;
} }
// Do the movable volume thing.
if (!confstemorg.empty() && confstemorg.size() <= path.size() &&
!path.compare(0, confstemorg.size(), confstemorg)) {
path = path.replace(0, confstemorg.size(), confstemrep);
computeurl = true;
}
if (needptrans) {
// For each translation check if the prefix matches the input path, // For each translation check if the prefix matches the input path,
// replace and return the result if it does. // replace and return the result if it does.
vector<string> opaths = m_ptrans->getNames(dbdir); vector<string> opaths = m_ptrans->getNames(dbdir);
for (vector<string>::const_iterator it = opaths.begin(); for (const auto& opath: opaths) {
it != opaths.end(); it++) { if (opath.size() <= path.size() &&
if (it->size() <= path.size() && !path.compare(0, it->size(), *it)) { !path.compare(0, opath.size(), opath)) {
string npath; string npath;
// This call always succeeds because the key comes from getNames() // Key comes from getNames()=> call must succeed
if (m_ptrans->get(*it, npath, dbdir)) { if (m_ptrans->get(opath, npath, dbdir)) {
path = path.replace(0, it->size(), npath); path = path.replace(0, opath.size(), npath);
url = path_pathtofileurl(path); computeurl = true;
} }
break; break;
} }
} }
}
if (computeurl) {
url = path_pathtofileurl(path);
}
} }
bool RclConfig::sourceChanged() const bool RclConfig::sourceChanged() const

View File

@ -471,6 +471,25 @@ the log... values.</para></listitem></varlistentry>
<listitem><para>Override logfilename for the indexer in real time <listitem><para>Override logfilename for the indexer in real time
mode. The default is to use the idx... values if set, else mode. The default is to use the idx... values if set, else
the log... values.</para></listitem></varlistentry> the log... values.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ORGIDXCONFDIR">
<term><varname>orgidxconfdir</varname></term>
<listitem><para>Original location of the configuration directory. This is used exclusively for movable datasets. Locating the
configuration directory inside the directory tree makes it possible to
provide automatic query time path translations once the data set has
moved (for example, because it has been mounted on another
location).</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.CURIDXCONFDIR">
<term><varname>curidxconfdir</varname></term>
<listitem><para>Current location of the configuration directory. Complements orgidxconfdir for movable datasets. This should be used
if the configuration directory has been copied from the dataset to
another location, either because the dataset is readonly and an r/w copy
is desired, or for performance reasons. This records the original moved
location before copy, to allow path translation computations. For
example, if a dataset originally indexed at '/home/me/mydata', with its
configuration directory in '/home/me/mydata/config', has been mounted on
'/media/me/mydata', and the GUI is running from a copied
configuration, orgidxconfdir would be '/home/me/mydata/config', and
curidxconfdir (as set in the copied configuration) would be
'/media/me/mydata/config'.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXRUNDIR"> <varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXRUNDIR">
<term><varname>idxrundir</varname></term> <term><varname>idxrundir</varname></term>
<listitem><para>Indexing process current directory. The input <listitem><para>Indexing process current directory. The input

File diff suppressed because it is too large Load Diff

View File

@ -498,12 +498,12 @@
indexed (no others will be indexed), by setting indexed (no others will be indexed), by setting
the <link linkend="RCL.INSTALL.CONFIG.RECOLLCONF.INDEXEDMIMETYPES"> the <link linkend="RCL.INSTALL.CONFIG.RECOLLCONF.INDEXEDMIMETYPES">
indexedmimetypes</link> configuration variable. Example:<programlisting> indexedmimetypes</link> configuration variable. Example:<programlisting>
indexedmimetypes = text/html application/pdf indexedmimetypes = text/html application/pdf
</programlisting> </programlisting>
It is possible to redefine this parameter for It is possible to redefine this parameter for
subdirectories. Example:<programlisting> subdirectories. Example:<programlisting>
[/path/to/my/dir] [/path/to/my/dir]
indexedmimetypes = application/pdf indexedmimetypes = application/pdf
</programlisting> </programlisting>
(When using sections like this, don't forget that they remain (When using sections like this, don't forget that they remain
in effect until the end of the file or another section in effect until the end of the file or another section
@ -920,10 +920,10 @@ indexedmimetypes = application/pdf
processing their text, and one to update the index. This was processing their text, and one to update the index. This was
tested to be the best configuration on the test system tested to be the best configuration on the test system
(quadri-processor with multiple disks). (quadri-processor with multiple disks).
<programlisting> <programlisting>
thrQSizes = 2 2 2 thrQSizes = 2 2 2
thrTCounts = 4 2 1 thrTCounts = 4 2 1
</programlisting> </programlisting>
</para> </para>
<para>The following example would use a single queue, and the <para>The following example would use a single queue, and the
@ -936,18 +936,18 @@ thrTCounts = 4 2 1
would be performed purely sequentially), so the previous would be performed purely sequentially), so the previous
approach is preferred. YMMV... The 2 last values for approach is preferred. YMMV... The 2 last values for
thrTCounts are ignored. thrTCounts are ignored.
<programlisting> <programlisting>
thrQSizes = 2 -1 -1 thrQSizes = 2 -1 -1
thrTCounts = 6 1 1 thrTCounts = 6 1 1
</programlisting> </programlisting>
</para> </para>
<para>The following example would disable <para>The following example would disable
multithreading. Indexing will be performed by a single multithreading. Indexing will be performed by a single
thread. thread.
<programlisting> <programlisting>
thrQSizes = -1 -1 -1 thrQSizes = -1 -1 -1
</programlisting> </programlisting>
</para> </para>
</sect2> </sect2>
@ -1113,7 +1113,7 @@ thrQSizes = -1 -1 -1
configuration file:</para> configuration file:</para>
<programlisting>[/some/area/of/the/fs] <programlisting>[/some/area/of/the/fs]
metadatacmds = ; tags = tmsu tags %f metadatacmds = ; tags = tmsu tags %f
</programlisting> </programlisting>
<note><para>Depending on the <application>tmsu</application> version, <note><para>Depending on the <application>tmsu</application> version,
@ -1154,7 +1154,7 @@ metadatacmds = ; tags = tmsu tags %f
couple the tag update with a <literal>recollindex -e -i couple the tag update with a <literal>recollindex -e -i
filename.</literal></para> filename.</literal></para>
</sect1> </sect1>
<sect1 id="RCL.INDEXING.PDF"> <sect1 id="RCL.INDEXING.PDF">
@ -1216,9 +1216,9 @@ metadatacmds = ; tags = tmsu tags %f
the metadata fields (available for &RCL; 1.23.3 and later. 1.23.2 the metadata fields (available for &RCL; 1.23.3 and later. 1.23.2
has equivalent code inside the handler script). Example:</para> has equivalent code inside the handler script). Example:</para>
<programlisting>import sys <programlisting>import sys
import re import re
class MetaFixer(object): class MetaFixer(object):
def __init__(self): def __init__(self):
pass pass
@ -1367,13 +1367,13 @@ class MetaFixer(object):
PATH): PATH):
<screen><![CDATA[ <screen><![CDATA[
30 3 * * * recollindex > /some/tmp/dir/recolltrace 2>&1 30 3 * * * recollindex > /some/tmp/dir/recolltrace 2>&1
]]></screen> ]]></screen>
Or, using <command>anacron</command>: Or, using <command>anacron</command>:
<screen><![CDATA[ <screen><![CDATA[
1 15 su mylogin -c "recollindex recollindex > /tmp/rcltraceme 2>&1" 1 15 su mylogin -c "recollindex recollindex > /tmp/rcltraceme 2>&1"
]]></screen> ]]></screen>
</para> </para>
<para>As of version 1.17 the &RCL; GUI has dialogs to manage <para>As of version 1.17 the &RCL; GUI has dialogs to manage
@ -1435,12 +1435,12 @@ class MetaFixer(object):
at the end:</para> at the end:</para>
<programlisting>recollconf=$HOME/.recoll-home <programlisting>recollconf=$HOME/.recoll-home
recolldata=/usr/local/share/recoll recolldata=/usr/local/share/recoll
RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh start RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh start
fvwm fvwm
</programlisting> </programlisting>
<para>The indexing daemon gets started, then the window manager, <para>The indexing daemon gets started, then the window manager,
for which the session waits.</para> <para>By default the for which the session waits.</para> <para>By default the
@ -1487,17 +1487,17 @@ fvwm
increasing the resources available to inotify, which are increasing the resources available to inotify, which are
normally defined in <filename>/etc/sysctl.conf</filename>. normally defined in <filename>/etc/sysctl.conf</filename>.
<programlisting> <programlisting>
### inotify ### inotify
# #
# cat /proc/sys/fs/inotify/max_queued_events - 16384 # cat /proc/sys/fs/inotify/max_queued_events - 16384
# cat /proc/sys/fs/inotify/max_user_instances - 128 # cat /proc/sys/fs/inotify/max_user_instances - 128
# cat /proc/sys/fs/inotify/max_user_watches - 16384 # cat /proc/sys/fs/inotify/max_user_watches - 16384
# #
# -- Change to: # -- Change to:
# #
fs.inotify.max_queued_events=32768 fs.inotify.max_queued_events=32768
fs.inotify.max_user_instances=256 fs.inotify.max_user_instances=256
fs.inotify.max_user_watches=32768 fs.inotify.max_user_watches=32768
</programlisting> </programlisting>
</para> </para>
@ -1915,11 +1915,11 @@ fs.inotify.max_user_watches=32768
<filename>~/.recoll/scripts/myscript.desktop</filename> (the exact <filename>~/.recoll/scripts/myscript.desktop</filename> (the exact
file name inside the directory is irrelevant): file name inside the directory is irrelevant):
<programlisting> <programlisting>
[Desktop Entry] [Desktop Entry]
Type=Application Type=Application
Name=MyFirstScript Name=MyFirstScript
Exec=/home/me/bin/tryscript %F Exec=/home/me/bin/tryscript %F
MimeType=*/* MimeType=*/*
</programlisting> </programlisting>
The <literal>Name</literal> attribute defines the label which will The <literal>Name</literal> attribute defines the label which will
appear inside the <guilabel>Run Script</guilabel> menu. The appear inside the <guilabel>Run Script</guilabel> menu. The
@ -2084,10 +2084,10 @@ MimeType=*/*
history.</para> history.</para>
<para>Here follows an example: <para>Here follows an example:
<programlisting> <programlisting>
&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt; &lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
&lt;fragbuts version=&quot;1.0&quot;&gt; &lt;fragbuts version=&quot;1.0&quot;&gt;
&lt;radiobuttons&gt; &lt;radiobuttons&gt;
@ -2121,8 +2121,8 @@ MimeType=*/*
&lt;/fragbut&gt; &lt;/fragbut&gt;
&lt;/buttons&gt; &lt;/buttons&gt;
&lt;/fragbuts&gt; &lt;/fragbuts&gt;
</programlisting> </programlisting>
</para> </para>
<para>Each <literal>radiobuttons</literal> or <para>Each <literal>radiobuttons</literal> or
@ -3162,27 +3162,27 @@ MimeType=*/*
"<span style='white-space:nowrap'><i>%M</i>&nbsp;%D</span>&nbsp;&nbsp;&nbsp; <i>%U</i>&nbsp;%i<br>\n" "<span style='white-space:nowrap'><i>%M</i>&nbsp;%D</span>&nbsp;&nbsp;&nbsp; <i>%U</i>&nbsp;%i<br>\n"
"%A %K</td>\n" "%A %K</td>\n"
"</tr></table>\n" "</tr></table>\n"
]]></screen> ]]></screen>
You may, for example, try the following for a more web-like You may, for example, try the following for a more web-like
experience: experience:
<screen><![CDATA[ <screen><![CDATA[
<u><b><a href="P%N">%T</a></b></u><br> <u><b><a href="P%N">%T</a></b></u><br>
%A<font color=#008000>%U - %S</font> - %L %A<font color=#008000>%U - %S</font> - %L
]]></screen> ]]></screen>
Note that the P%N link in the above paragraph makes the title a Note that the P%N link in the above paragraph makes the title a
preview link. Or the clean looking: preview link. Or the clean looking:
<screen><![CDATA[ <screen><![CDATA[
<img src="%I" align="left">%L <font color="#900000">%R</font> <img src="%I" align="left">%L <font color="#900000">%R</font>
&nbsp;&nbsp;<b>%T&</b><br>%S&nbsp; &nbsp;&nbsp;<b>%T&</b><br>%S&nbsp;
<font color="#808080"><i>%U</i></font> <font color="#808080"><i>%U</i></font>
<table bgcolor="#e0e0e0"> <table bgcolor="#e0e0e0">
<tr><td><div>%A</div></td></tr> <tr><td><div>%A</div></td></tr>
</table>%K </table>%K
]]></screen> ]]></screen>
</para> </para>
<para>These samples, and some others are <para>These samples, and some others are
@ -3258,11 +3258,11 @@ MimeType=*/*
window.location.href = 'recoll://search/query?qtp=a&amp;p=0&amp;q=' + window.location.href = 'recoll://search/query?qtp=a&amp;p=0&amp;q=' +
encodeURIComponent(t); encodeURIComponent(t);
} }
&lt;/script> &lt;/script>
.... ....
&lt;body ondblclick="recollsearch()"> &lt;body ondblclick="recollsearch()">
</programlisting> </programlisting>
</sect2> </sect2>
</sect1> </sect1>
@ -3303,8 +3303,8 @@ MimeType=*/*
<para><command>recollq</command> has a man page (not installed by <para><command>recollq</command> has a man page (not installed by
default, look in the <filename>doc/man</filename> directory). The default, look in the <filename>doc/man</filename> directory). The
Usage string is as follows:</para> Usage string is as follows:</para>
<programlisting> <programlisting>
recollq: usage: recollq: usage:
-P: Show the date span for all the documents present in the index -P: Show the date span for all the documents present in the index
[-o|-a|-f] [-q] &lt;query string&gt; [-o|-a|-f] [-q] &lt;query string&gt;
Runs a recoll query and displays result lines. Runs a recoll query and displays result lines.
@ -3317,7 +3317,7 @@ recollq: usage:
-a Emulate the GUI simple search in ALL TERMS mode -a Emulate the GUI simple search in ALL TERMS mode
-f Emulate the GUI simple search in filename mode -f Emulate the GUI simple search in filename mode
-q is just ignored (compatibility with the recoll GUI command line) -q is just ignored (compatibility with the recoll GUI command line)
Common options: Common options:
-c &lt;configdir&gt; : specify config directory, overriding $RECOLL_CONFDIR -c &lt;configdir&gt; : specify config directory, overriding $RECOLL_CONFDIR
-d also dump file contents -d also dump file contents
-n [first-]&lt;cnt&gt; define the result slice. The default value for [first] -n [first-]&lt;cnt&gt; define the result slice. The default value for [first]
@ -3338,18 +3338,18 @@ Common options:
separated by one space character. This is the recommended format separated by one space character. This is the recommended format
for use by other programs. Use a normal query with option -m to for use by other programs. Use a normal query with option -m to
see the field names. see the field names.
</programlisting> </programlisting>
<para>Sample execution:</para> <para>Sample execution:</para>
<programlisting>recollq 'ilur -nautique mime:text/html' <programlisting>recollq 'ilur -nautique mime:text/html'
Recoll query: ((((ilur:(wqf=11) OR ilurs) AND_NOT (nautique:(wqf=11) Recoll query: ((((ilur:(wqf=11) OR ilurs) AND_NOT (nautique:(wqf=11)
OR nautiques OR nautiqu OR nautiquement)) FILTER Ttext/html)) OR nautiques OR nautiqu OR nautiquement)) FILTER Ttext/html))
4 results 4 results
text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/comptes.html] [comptes.html] 18593 bytes text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/comptes.html] [comptes.html] 18593 bytes
text/html [file:///Users/uncrypted-dockes/projets/nautique/webnautique/articles/ilur1/index.html] [Constructio... text/html [file:///Users/uncrypted-dockes/projets/nautique/webnautique/articles/ilur1/index.html] [Constructio...
text/html [file:///Users/uncrypted-dockes/projets/pagepers/index.html] [psxtcl/writemime/recoll]... text/html [file:///Users/uncrypted-dockes/projets/pagepers/index.html] [psxtcl/writemime/recoll]...
text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/recu-chasse-maree.... text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/recu-chasse-maree....
</programlisting> </programlisting>
</sect1> </sect1>
<sect1 id="RCL.SEARCH.SYNONYMS"> <sect1 id="RCL.SEARCH.SYNONYMS">
@ -3380,10 +3380,10 @@ text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r
<para>Example: <para>Example:
<programlisting> <programlisting>
hi hello "good morning" hi hello "good morning"
# not sure about "au revoir" though. Is this english ? # not sure about "au revoir" though. Is this english ?
bye goodbye "see you" \ bye goodbye "see you" \
"au revoir" "au revoir"
</programlisting> </programlisting>
</para> </para>
@ -3680,7 +3680,7 @@ bye goodbye "see you" \
<para>Several <literal>dir</literal> clauses can be specified, <para>Several <literal>dir</literal> clauses can be specified,
both positive and negative. For example the following makes sense: both positive and negative. For example the following makes sense:
<programlisting> <programlisting>
dir:recoll dir:src -dir:utils -dir:common dir:recoll dir:src -dir:utils -dir:common
</programlisting> This would select results which have both </programlisting> This would select results which have both
<filename>recoll</filename> and <filename>src</filename> in the <filename>recoll</filename> and <filename>src</filename> in the
path (in any order), and which have not either path (in any order), and which have not either
@ -4118,6 +4118,88 @@ dir:recoll dir:src -dir:utils -dir:common
</chapter> <!-- Search --> </chapter> <!-- Search -->
<chapter id="RCL.MOVABLE">
<title>Movable datasets</title>
<para>As of &RCL; 1.24, it has become easy to build self-contained
datasets including a &RCL; configuration directory and index together
with the indexed documents, and to move such a dataset around (for
example copying it to a USB drive), without having to adjust the
configuration for querying the index.</para>
<note><para>This is a query-time feature only. The index must only be
updated in its original location. If an update is necessary in a
different location, the index must be reset.</para></note>
<para>The examples below will assume that you have a dataset under
<filename>/home/me/mydata/</filename>, with the index configuration and
data stored inside
<filename>/home/me/mydata/recoll-confdir</filename>.</para>
<para>In order to be able to run queries after the dataset has been
moved, you must ensure the following:
<itemizedlist>
<listitem><para>The main configuration file must define the <link
linkend="RCL.INSTALL.CONFIG.RECOLLCONF.ORGIDXCONFDIR">orgidxconfdir</link>
variable to be the original location of the configuration directory
(<filename>orgidxconfdir=/home/me/mydata/recoll-confdir</filename>
must be set inside
<filename>/home/me/mydata/recoll-confdir/recoll.conf</filename> in
the example above).</para></listitem>
<listitem><para>The configuration directory must exist with the
documents, somewhere under the directory which will be
moved. E.g. if you are moving <filename>/home/me/mydata</filename>
around, the configuration directory must exist somewhere below this
point, for example
<filename>/home/me/mydata/recoll-confdir</filename>, or
<filename>/home/me/mydata/sub/recoll-confdir</filename>.</para></listitem>
<listitem><para>You should keep the default locations for the index
elements (they are relative to the configuration directory by
default). Only the paths referring to the documents themselves
(e.g. <literal>topdirs</literal> values) should be
absolute (in general, they are only used when indexing
anyway).</para></listitem>
</itemizedlist>
</para>
<para>Only the first point needs an explicit user action, the &RCL;
defaults are compatible with the second one, and the third is
natural.</para>
<para>If, after the move, the configuration directory needs to be
copied out of the dataset (for example because the thumb drive is too
slow), you can set the <link
linkend="RCL.INSTALL.CONFIG.RECOLLCONF.CURIDXCONFDIR">curidxconfdir</link>
variable inside the copied configuration to
define the location of the moved one. For example if
<filename>/home/me/mydata</filename> is now mounted onto
<filename>/media/me/somelabel</filename>, but the configuration
directory and index have been copied to
<filename>/tmp/tempconfig</filename>, you would set
<literal>curidxconfdir</literal> to
<filename>/media/me/somelabel/recoll-confdir</filename> inside
<filename>/tmp/tempconfig/recoll.conf</filename>.
<literal>orgidxconfdir</literal> would still be
<filename>/home/me/mydata/recoll-confdir</filename> in the original and
the copy.</para>
<para>If you are regularly copying the configuration out of the
dataset, it will be useful to write a script to automate the
procedure. This can't really be done inside &RCL; because there are
probably many possible variants. One example would be to copy the
configuration to make it writable, but keep the index data on the
medium because it is too big - in this case, the script would also need
to set <literal>dbdir</literal> in the copied configuration.</para>
<para>The same set of modifications (&RCL; 1.24) has also made it
possible to run queries from a readonly configuration directory (with
slightly reduced functionality of course, such as not recording the query
history).</para>
</chapter>
<chapter id="RCL.PROGRAM"> <chapter id="RCL.PROGRAM">
<title>Programming interface</title> <title>Programming interface</title>
@ -4329,10 +4411,10 @@ dir:recoll dir:src -dir:utils -dir:common
name suffixes. The types are defined inside the name suffixes. The types are defined inside the
<link linkend="RCL.INSTALL.CONFIG.MIMEMAP"> <link linkend="RCL.INSTALL.CONFIG.MIMEMAP">
<filename>mimemap</filename> file</link>. Example: <filename>mimemap</filename> file</link>. Example:
<programlisting> <programlisting>
.doc = application/msword .doc = application/msword
</programlisting> </programlisting>
If no suffix association is found for the file name, &RCL; will try If no suffix association is found for the file name, &RCL; will try
to execute a system command (typically <command>file -i</command> or to execute a system command (typically <command>file -i</command> or
<command>xdg-mime</command>) to determine a MIME type.</para> <command>xdg-mime</command>) to determine a MIME type.</para>
@ -4341,18 +4423,18 @@ dir:recoll dir:src -dir:utils -dir:common
in the <link linkend="RCL.INSTALL.CONFIG.MIMECONF"> in the <link linkend="RCL.INSTALL.CONFIG.MIMECONF">
<filename>mimeconf</filename> file</link>. A sample will probably be <filename>mimeconf</filename> file</link>. A sample will probably be
better than a long explanation:</para> better than a long explanation:</para>
<programlisting> <programlisting>
[index] [index]
application/msword = exec antiword -t -i 1 -m UTF-8;\ application/msword = exec antiword -t -i 1 -m UTF-8;\
mimetype = text/plain ; charset=utf-8 mimetype = text/plain ; charset=utf-8
application/ogg = exec rclogg application/ogg = exec rclogg
text/rtf = exec unrtf --nopict --html; charset=iso-8859-1; mimetype=text/html text/rtf = exec unrtf --nopict --html; charset=iso-8859-1; mimetype=text/html
application/x-chm = execm rclchm application/x-chm = execm rclchm
</programlisting> </programlisting>
<para>The fragment specifies that: <para>The fragment specifies that:
@ -4409,14 +4491,14 @@ application/x-chm = execm rclchm
<para>For filters producing HTML, the output could be very minimal <para>For filters producing HTML, the output could be very minimal
like the following example: like the following example:
<programlisting> <programlisting>
&lt;html> &lt;html>
&lt;head> &lt;head>
&lt;meta http-equiv="Content-Type" content="text/html;charset=UTF-8"> &lt;meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
&lt;/head> &lt;/head>
&lt;body> &lt;body>
Some text content Some text content
&lt;/body> &lt;/body>
&lt;/html> &lt;/html>
</programlisting> </programlisting>
</para> </para>
@ -4460,13 +4542,13 @@ application/x-chm = execm rclchm
date (for display and sorting), in preference to the file date (for display and sorting), in preference to the file
modification date. The date format should be as follows: modification date. The date format should be as follows:
<programlisting> <programlisting>
&lt;meta name="date" content="YYYY-mm-dd HH:MM:SS"> &lt;meta name="date" content="YYYY-mm-dd HH:MM:SS">
or or
&lt;meta name="date" content="YYYY-mm-ddTHH:MM:SS"> &lt;meta name="date" content="YYYY-mm-ddTHH:MM:SS">
</programlisting> </programlisting>
Example: Example:
<programlisting> <programlisting>
&lt;meta name="date" content="2013-02-24 17:50:00"> &lt;meta name="date" content="2013-02-24 17:50:00">
</programlisting> </programlisting>
</para> </para>
@ -4474,8 +4556,8 @@ or
names. This should also be output as meta tags:</para> names. This should also be output as meta tags:</para>
<programlisting> <programlisting>
&lt;meta name="somefield" content="Some textual data" /&gt; &lt;meta name="somefield" content="Some textual data" /&gt;
</programlisting> </programlisting>
<para>You can embed HTML markup inside the content of custom <para>You can embed HTML markup inside the content of custom
fields, for improving the display inside result lists. In this fields, for improving the display inside result lists. In this
@ -4484,8 +4566,8 @@ or
be escaped for display.</para> be escaped for display.</para>
<programlisting> <programlisting>
&lt;meta name="somefield" markup="html" content="Some &lt;i>textual&lt;/i> data" /&gt; &lt;meta name="somefield" markup="html" content="Some &lt;i>textual&lt;/i> data" /&gt;
</programlisting> </programlisting>
<para>As written above, the processing of fields is described <para>As written above, the processing of fields is described
in a <link linkend="RCL.PROGRAM.FIELDS">further in a <link linkend="RCL.PROGRAM.FIELDS">further
@ -4677,17 +4759,17 @@ or
features.</para> features.</para>
<programlisting><![CDATA[ <programlisting><![CDATA[
#!/usr/bin/env python #!/usr/bin/env python
from recoll import recoll from recoll import recoll
db = recoll.connect() db = recoll.connect()
query = db.query() query = db.query()
nres = query.execute("some query") nres = query.execute("some query")
results = query.fetchmany(20) results = query.fetchmany(20)
for doc in results: for doc in results:
print(doc.url, doc.title) print(doc.url, doc.title)
]]></programlisting> ]]></programlisting>
</sect2> </sect2>
@ -5145,12 +5227,12 @@ for doc in results:
text/html according to doc.mimetype. The typical use text/html according to doc.mimetype. The typical use
would be as follows: would be as follows:
<programlisting> <programlisting>
qdoc = query.fetchone() qdoc = query.fetchone()
extractor = recoll.Extractor(qdoc) extractor = recoll.Extractor(qdoc)
doc = extractor.textextract(qdoc.ipath) doc = extractor.textextract(qdoc.ipath)
# use doc.text, e.g. for previewing # use doc.text, e.g. for previewing
</programlisting> </programlisting>
</para></listitem> </para></listitem>
</varlistentry> </varlistentry>
<varlistentry> <varlistentry>
<term>Extractor.idoctofile(ipath, targetmtype, outfile='')</term> <term>Extractor.idoctofile(ipath, targetmtype, outfile='')</term>
@ -5158,11 +5240,11 @@ doc = extractor.textextract(qdoc.ipath)
which can be given explicitly or will be created as a which can be given explicitly or will be created as a
temporary file to be deleted by the caller. Typical use: temporary file to be deleted by the caller. Typical use:
<programlisting> <programlisting>
qdoc = query.fetchone() qdoc = query.fetchone()
extractor = recoll.Extractor(qdoc) extractor = recoll.Extractor(qdoc)
filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</programlisting> filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</programlisting>
</para></listitem> </para></listitem>
</varlistentry> </varlistentry>
</variablelist> </variablelist>
@ -5182,9 +5264,9 @@ filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</programlisting>
highlighting and data extraction functions.</para> highlighting and data extraction functions.</para>
<programlisting> <programlisting>
#!/usr/bin/env python #!/usr/bin/env python
<![CDATA[ <![CDATA[
from recoll import recoll from recoll import recoll
db = recoll.connect() db = recoll.connect()
db.setAbstractParams(maxchars=80, contextwords=4) db.setAbstractParams(maxchars=80, contextwords=4)
@ -5193,18 +5275,18 @@ query = db.query()
nres = query.execute("some user question") nres = query.execute("some user question")
print "Result count: ", nres print "Result count: ", nres
if nres > 5: if nres > 5:
nres = 5 nres = 5
for i in range(nres): for i in range(nres):
doc = query.fetchone() doc = query.fetchone()
print "Result #%d" % (query.rownumber,) print "Result #%d" % (query.rownumber,)
for k in ("title", "size"): for k in ("title", "size"):
print k, ":", getattr(doc, k).encode('utf-8') print k, ":", getattr(doc, k).encode('utf-8')
abs = db.makeDocAbstract(doc, query).encode('utf-8') abs = db.makeDocAbstract(doc, query).encode('utf-8')
print abs print abs
print print
]]> ]]>
</programlisting> </programlisting>
</sect3> </sect3>
</sect2> </sect2>
@ -5348,8 +5430,8 @@ for i in range(nres):
indexing sample found in the Recoll source (which sets indexing sample found in the Recoll source (which sets
<literal>rclbes="MBOX"</literal>):</para> <literal>rclbes="MBOX"</literal>):</para>
<programlisting>[MBOX] <programlisting>[MBOX]
fetch = /path/to/recoll/src/python/samples/rclmbox.py fetch fetch = /path/to/recoll/src/python/samples/rclmbox.py fetch
makesig = /path/to/recoll/src/python/samples/rclmbox.py makesig makesig = /path/to/recoll/src/python/samples/rclmbox.py makesig
</programlisting> </programlisting>
<para><literal>fetch</literal> and <literal>makesig</literal> <para><literal>fetch</literal> and <literal>makesig</literal>
define two commands to execute to respectively retrieve the define two commands to execute to respectively retrieve the
@ -5390,15 +5472,15 @@ makesig = path/to/recoll/src/python/samples/rclmbox.py makesig
<para>Adapting to the new package structure:</para> <para>Adapting to the new package structure:</para>
<programlisting> <programlisting>
<![CDATA[ <![CDATA[
try: try:
from recoll import recoll from recoll import recoll
from recoll import rclextract from recoll import rclextract
hasextract = True hasextract = True
except: except:
import recoll import recoll
hasextract = False hasextract = False
]]> ]]>
</programlisting> </programlisting>
<para>Adapting to the change of nature of <para>Adapting to the change of nature of
@ -5408,10 +5490,10 @@ except:
the <literal>next</literal> value (old).</para> the <literal>next</literal> value (old).</para>
<programlisting> <programlisting>
<![CDATA[ <![CDATA[
rownum = query.next if type(query.next) == int else \ rownum = query.next if type(query.next) == int else \
query.rownumber query.rownumber
]]> ]]>
</programlisting> </programlisting>
</sect2> <!-- compat with previous version --> </sect2> <!-- compat with previous version -->
@ -5719,7 +5801,8 @@ except:
very much welcome patches</ulink>.</para> very much welcome patches</ulink>.</para>
<formalpara><title>Configure options:</title> <formalpara>
<title>Configure options:</title>
<para> <para>
<itemizedlist> <itemizedlist>
@ -5983,9 +6066,9 @@ except:
character. Long lines can be continued by escaping the character. Long lines can be continued by escaping the
physical newline with backslash, even inside quoted strings.</para> physical newline with backslash, even inside quoted strings.</para>
<programlisting> <programlisting>
astringlist = "some string \ astringlist = "some string \
with spaces" with spaces"
thesame = "some string with spaces" thesame = "some string with spaces"
</programlisting> </programlisting>
<para>Parameters which are not part of string lists can't be <para>Parameters which are not part of string lists can't be
@ -6166,25 +6249,25 @@ thesame = "some string with spaces"
only plain ascii headers can be indexed, and only the only plain ascii headers can be indexed, and only the
first occurrence will be used for headers that occur several times). first occurrence will be used for headers that occur several times).
<programlisting>[prefixes] <programlisting>[prefixes]
# Index mailmytag contents (with the given prefix) # Index mailmytag contents (with the given prefix)
mailmytag = XMTAG mailmytag = XMTAG
[stored] [stored]
# Store mailmytag inside the document data record (so that it can be # Store mailmytag inside the document data record (so that it can be
# displayed - as %(mailmytag) - in result lists). # displayed - as %(mailmytag) - in result lists).
mailmytag = mailmytag =
[queryaliases] [queryaliases]
filename = fn filename = fn
containerfilename = cfn containerfilename = cfn
[mail] [mail]
# Extract the X-My-Tag mail header, and use it internally with the # Extract the X-My-Tag mail header, and use it internally with the
# mailmytag field name # mailmytag field name
x-my-tag = mailmytag x-my-tag = mailmytag
</programlisting> </programlisting>
</para> </para>
<sect3 id="RCL.INSTALL.CONFIG.FIELDS.XATTR"> <sect3 id="RCL.INSTALL.CONFIG.FIELDS.XATTR">
@ -6231,7 +6314,7 @@ x-my-tag = mailmytag
should be handled specially, which is possible because they should be handled specially, which is possible because they
are usually all located in one place. Example: are usually all located in one place. Example:
<programlisting>[~/.kde/share/apps/okular/docdata] <programlisting>[~/.kde/share/apps/okular/docdata]
.xml = application/x-okular-notes</programlisting></para> .xml = application/x-okular-notes</programlisting></para>
<para>The <varname>recoll_noindex</varname> <para>The <varname>recoll_noindex</varname>
<filename>mimemap</filename> variable has been moved to <filename>mimemap</filename> variable has been moved to
@ -6305,7 +6388,7 @@ x-my-tag = mailmytag
application tag to specialize the choice for an area of the application tag to specialize the choice for an area of the
filesystem (using a <varname>localfields</varname> specification filesystem (using a <varname>localfields</varname> specification
in <filename>mimeconf</filename>). The syntax for the key is in <filename>mimeconf</filename>). The syntax for the key is
<replaceable>mimetype</replaceable><literal>|</literal><replaceable>tag</replaceable></para> <replaceable>mimetype</replaceable><literal>|</literal><replaceable>tag</replaceable></para>
<para>The <varname>nouncompforviewmts</varname> entry, (placed at <para>The <varname>nouncompforviewmts</varname> entry, (placed at
the top level, outside of the <literal>[view]</literal> section), the top level, outside of the <literal>[view]</literal> section),
@ -6415,8 +6498,8 @@ x-my-tag = mailmytag
<listitem><para>In <filename>$RECOLL_CONFDIR/mimemap</filename> <listitem><para>In <filename>$RECOLL_CONFDIR/mimemap</filename>
(typically <filename>~/.recoll/mimemap</filename>), add the (typically <filename>~/.recoll/mimemap</filename>), add the
following line:<programlisting> following line:<programlisting>
.blob = application/x-blobapp .blob = application/x-blobapp
</programlisting> </programlisting>
Note that the MIME type is made up here, and you could Note that the MIME type is made up here, and you could
call it <replaceable>diesel/oil</replaceable> just the call it <replaceable>diesel/oil</replaceable> just the
same.</para> same.</para>
@ -6424,8 +6507,8 @@ x-my-tag = mailmytag
<listitem><para>In <filename>$RECOLL_CONFDIR/mimeview</filename> <listitem><para>In <filename>$RECOLL_CONFDIR/mimeview</filename>
under the <literal>[view]</literal> section, add:</para> under the <literal>[view]</literal> section, add:</para>
<programlisting> <programlisting>
application/x-blobapp = blobviewer %f application/x-blobapp = blobviewer %f
</programlisting> </programlisting>
<para>We are supposing <para>We are supposing
that <replaceable>blobviewer</replaceable> wants a file that <replaceable>blobviewer</replaceable> wants a file
name parameter here, you would use <literal>%u</literal> if name parameter here, you would use <literal>%u</literal> if
@ -6458,8 +6541,8 @@ application/x-blobapp = blobviewer %f
section, add the following line (more about the section, add the following line (more about the
<replaceable>rclblob</replaceable> indexing script <replaceable>rclblob</replaceable> indexing script
later):<programlisting> later):<programlisting>
application/x-blobapp = exec rclblob application/x-blobapp = exec rclblob
</programlisting></para> </programlisting></para>
</listitem> </listitem>
<listitem><para>Under the <literal>[icons]</literal> <listitem><para>Under the <literal>[icons]</literal>
section, you should choose an icon to be displayed for the section, you should choose an icon to be displayed for the
@ -6489,4 +6572,3 @@ application/x-blobapp = exec rclblob
</sect1> </sect1>
</chapter> </chapter>
</book> </book>

View File

@ -571,6 +571,31 @@ logfilename = stderr
# the log... values.</descr></var> # the log... values.</descr></var>
#daemlogfilename = /dev/null #daemlogfilename = /dev/null
# <var name="orgidxconfdir" type="dfn">
#
# <brief>Original location of the configuration directory.</brief>
# <descr>This is used exclusively for movable datasets. Locating the
# configuration directory inside the dataset directory tree makes it
# possible to provide automatic query-time path translations once the
# dataset has moved (for example, because it has been mounted at another
# location).</descr></var>
#orgidxconfdir =
# <var name="curidxconfdir" type="dfn">
#
# <brief>Current location of the configuration directory.</brief>
# <descr>Complements orgidxconfdir for movable datasets. This should be used
# if the configuration directory has been copied from the dataset to
# another location, either because the dataset is read-only and a
# read-write copy is desired, or for performance reasons. It records the
# location of the configuration directory inside the moved dataset, before
# it was copied, to allow path translation computations. For example, if a
# dataset originally indexed with its configuration in
# '/home/me/mydata/config' has been mounted on '/media/me/mydata', and the
# GUI is running from a copied configuration, orgidxconfdir would be
# '/home/me/mydata/config', and curidxconfdir (as set in the copied
# configuration) would be '/media/me/mydata/config'.</descr></var>
#curidxconfdir =
# <var name="idxrundir" type="dfn"> # <var name="idxrundir" type="dfn">
# #
# <brief>Indexing process current directory.</brief> <descr>The input # <brief>Indexing process current directory.</brief> <descr>The input