Movable datasets support

This commit is contained in:
Jean-Francois Dockes 2017-12-06 11:34:04 +01:00
parent 329ab7b90d
commit 09acb5687c
5 changed files with 4611 additions and 4270 deletions

View File

@ -1318,17 +1318,85 @@ string RclConfig::getPidfile() const
return path_cat(getCacheDir(), "index.pid");
}
/* Strip the trailing path elements shared by p1 and p2, and return what
 * precedes them in r1 and r2. For example, /mnt1/common/part and
 * /mnt2/common/part yield /mnt1 and /mnt2. The result is used to derive
 * a path translation after a dataset has been moved.
 *
 * Both paths are split on "/" and compared element by element starting
 * from the end. Going through vectors is not the fastest possible
 * approach, but performance is not a concern here.
 *
 * r1 and r2 are always cleared first. The return value is an empty
 * string on success, or an explanatory message if no common trailing
 * element was found (r1 and r2 are then left empty). */
static string path_diffstems(const string& p1, const string& p2,
                             string& r1, string& r2)
{
    r1.clear();
    r2.clear();

    vector<string> elts1, elts2;
    stringToTokens(p1, elts1, "/");
    stringToTokens(p2, elts2, "/");
    const unsigned int n1 = elts1.size();
    const unsigned int n2 = elts2.size();
    const unsigned int maxcommon = MIN(n1, n2);

    // Count the identical elements, walking backwards from the leaves.
    unsigned int ncommon = 0;
    while (ncommon < maxcommon &&
           elts1[n1 - 1 - ncommon] == elts2[n2 - 1 - ncommon]) {
        ncommon++;
    }
    //cerr << "Common length = " << ncommon << endl;

    if (ncommon == 0) {
        return "Input paths are empty or have no common part";
    }

    // Rebuild each stem from the elements which precede the common part.
    for (unsigned int i = 0; i + ncommon < n1; i++) {
        r1 += "/";
        r1 += elts1[i];
    }
    for (unsigned int i = 0; i + ncommon < n2; i++) {
        r2 += "/";
        r2 += elts2[i];
    }
    return string();
}
void RclConfig::urlrewrite(const string& dbdir, string& url) const
{
LOGDEB2("RclConfig::urlrewrite: dbdir [" << dbdir << "] url [" << url <<
LOGDEB("RclConfig::urlrewrite: dbdir [" << dbdir << "] url [" << url <<
"]\n");
// If orgidxconfdir is set, we assume that this index is for a
// movable dataset, with the configuration directory stored inside
// the dataset tree. This allows computing automatic path
// translations if the dataset has been moved.
string orig_confdir;
string cur_confdir;
string confstemorg, confstemrep;
if (m_conf->get("orgidxconfdir", orig_confdir, "")) {
if (!m_conf->get("curidxconfdir", cur_confdir, "")) {
cur_confdir = m_confdir;
}
LOGDEB("RclConfig::urlrewrite: orgidxconfdir: " << orig_confdir <<
" cur_confdir " << cur_confdir << endl);
string reason = path_diffstems(orig_confdir, cur_confdir,
confstemorg, confstemrep);
if (!reason.empty()) {
LOGERR("urlrewrite: path_diffstems failed: " << reason <<
" : orig_confdir [" << orig_confdir <<
"] cur_confdir [" << cur_confdir << endl);
confstemorg = confstemrep = "";
}
}
// Do path translations exist for this index ?
bool needptrans = true;
if (m_ptrans == 0 || !m_ptrans->hasSubKey(dbdir)) {
LOGDEB2("RclConfig::urlrewrite: no paths translations (m_ptrans " <<
m_ptrans << ")\n");
needptrans = false;
}
if (!needptrans && confstemorg.empty()) {
return;
}
bool computeurl = false;
string path = fileurltolocalpath(url);
if (path.empty()) {
@ -1336,21 +1404,33 @@ void RclConfig::urlrewrite(const string& dbdir, string& url) const
return;
}
// Do the movable volume thing.
if (!confstemorg.empty() && confstemorg.size() <= path.size() &&
!path.compare(0, confstemorg.size(), confstemorg)) {
path = path.replace(0, confstemorg.size(), confstemrep);
computeurl = true;
}
if (needptrans) {
// For each translation check if the prefix matches the input path,
// replace and return the result if it does.
vector<string> opaths = m_ptrans->getNames(dbdir);
for (vector<string>::const_iterator it = opaths.begin();
it != opaths.end(); it++) {
if (it->size() <= path.size() && !path.compare(0, it->size(), *it)) {
for (const auto& opath: opaths) {
if (opath.size() <= path.size() &&
!path.compare(0, opath.size(), opath)) {
string npath;
// This call always succeeds because the key comes from getNames()
if (m_ptrans->get(*it, npath, dbdir)) {
path = path.replace(0, it->size(), npath);
url = path_pathtofileurl(path);
// Key comes from getNames()=> call must succeed
if (m_ptrans->get(opath, npath, dbdir)) {
path = path.replace(0, opath.size(), npath);
computeurl = true;
}
break;
}
}
}
if (computeurl) {
url = path_pathtofileurl(path);
}
}
bool RclConfig::sourceChanged() const

View File

@ -471,6 +471,25 @@ the log... values.</para></listitem></varlistentry>
<listitem><para>Override logfilename for the indexer in real time
mode. The default is to use the idx... values if set, else
the log... values.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.ORGIDXCONFDIR">
<term><varname>orgidxconfdir</varname></term>
<listitem><para>Original location of the configuration directory. This is used exclusively for movable datasets. Locating the
configuration directory inside the directory tree makes it possible to
provide automatic query time path translations once the data set has
moved (for example, because it has been mounted on another
location).</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.CURIDXCONFDIR">
<term><varname>curidxconfdir</varname></term>
<listitem><para>Current location of the configuration directory. Complements orgidxconfdir for movable datasets. This should be used
if the configuration directory has been copied from the dataset to
another location, either because the dataset is readonly and an r/w copy
is desired, or for performance reasons. This records the location of the
moved configuration directory before the copy, to allow path translation
computations. For example if a dataset originally indexed with its
configuration in '/home/me/mydata/config' has been mounted to
'/media/me/mydata', and the GUI is running from a copied
configuration, orgidxconfdir would be '/home/me/mydata/config', and
curidxconfdir (as set in the copied configuration) would be
'/media/me/mydata/config'.</para></listitem></varlistentry>
<varlistentry id="RCL.INSTALL.CONFIG.RECOLLCONF.IDXRUNDIR">
<term><varname>idxrundir</varname></term>
<listitem><para>Indexing process current directory. The input

File diff suppressed because it is too large Load Diff

View File

@ -498,12 +498,12 @@
indexed (no others will be indexed), by setting
the <link linkend="RCL.INSTALL.CONFIG.RECOLLCONF.INDEXEDMIMETYPES">
indexedmimetypes</link> configuration variable. Example:<programlisting>
indexedmimetypes = text/html application/pdf
indexedmimetypes = text/html application/pdf
</programlisting>
It is possible to redefine this parameter for
subdirectories. Example:<programlisting>
[/path/to/my/dir]
indexedmimetypes = application/pdf
[/path/to/my/dir]
indexedmimetypes = application/pdf
</programlisting>
(When using sections like this, don't forget that they remain
in effect until the end of the file or another section
@ -920,10 +920,10 @@ indexedmimetypes = application/pdf
processing their text, and one to update the index. This was
tested to be the best configuration on the test system
(quadri-processor with multiple disks).
<programlisting>
thrQSizes = 2 2 2
thrTCounts = 4 2 1
</programlisting>
<programlisting>
thrQSizes = 2 2 2
thrTCounts = 4 2 1
</programlisting>
</para>
<para>The following example would use a single queue, and the
@ -936,18 +936,18 @@ thrTCounts = 4 2 1
would be performed purely sequentially), so the previous
approach is preferred. YMMV... The 2 last values for
thrTCounts are ignored.
<programlisting>
thrQSizes = 2 -1 -1
thrTCounts = 6 1 1
</programlisting>
<programlisting>
thrQSizes = 2 -1 -1
thrTCounts = 6 1 1
</programlisting>
</para>
<para>The following example would disable
multithreading. Indexing will be performed by a single
thread.
<programlisting>
thrQSizes = -1 -1 -1
</programlisting>
<programlisting>
thrQSizes = -1 -1 -1
</programlisting>
</para>
</sect2>
@ -1113,7 +1113,7 @@ thrQSizes = -1 -1 -1
configuration file:</para>
<programlisting>[/some/area/of/the/fs]
metadatacmds = ; tags = tmsu tags %f
metadatacmds = ; tags = tmsu tags %f
</programlisting>
<note><para>Depending on the <application>tmsu</application> version,
@ -1154,7 +1154,7 @@ metadatacmds = ; tags = tmsu tags %f
couple the tag update with a <literal>recollindex -e -i
filename.</literal></para>
</sect1>
</sect1>
<sect1 id="RCL.INDEXING.PDF">
@ -1216,9 +1216,9 @@ metadatacmds = ; tags = tmsu tags %f
the metadata fields (available for &RCL; 1.23.3 and later. 1.23.2
has equivalent code inside the handler script). Example:</para>
<programlisting>import sys
import re
import re
class MetaFixer(object):
class MetaFixer(object):
def __init__(self):
pass
@ -1367,13 +1367,13 @@ class MetaFixer(object):
PATH):
<screen><![CDATA[
30 3 * * * recollindex > /some/tmp/dir/recolltrace 2>&1
]]></screen>
30 3 * * * recollindex > /some/tmp/dir/recolltrace 2>&1
]]></screen>
Or, using <command>anacron</command>:
<screen><![CDATA[
1 15 su mylogin -c "recollindex recollindex > /tmp/rcltraceme 2>&1"
]]></screen>
<screen><![CDATA[
1 15 su mylogin -c "recollindex recollindex > /tmp/rcltraceme 2>&1"
]]></screen>
</para>
<para>As of version 1.17 the &RCL; GUI has dialogs to manage
@ -1435,12 +1435,12 @@ class MetaFixer(object):
at the end:</para>
<programlisting>recollconf=$HOME/.recoll-home
recolldata=/usr/local/share/recoll
RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh start
recolldata=/usr/local/share/recoll
RECOLL_CONFDIR=$recollconf $recolldata/examples/rclmon.sh start
fvwm
fvwm
</programlisting>
</programlisting>
<para>The indexing daemon gets started, then the window manager,
for which the session waits.</para> <para>By default the
@ -1487,17 +1487,17 @@ fvwm
increasing the resources available to inotify, which are
normally defined in <filename>/etc/sysctl.conf</filename>.
<programlisting>
### inotify
#
# cat /proc/sys/fs/inotify/max_queued_events - 16384
# cat /proc/sys/fs/inotify/max_user_instances - 128
# cat /proc/sys/fs/inotify/max_user_watches - 16384
#
# -- Change to:
#
fs.inotify.max_queued_events=32768
fs.inotify.max_user_instances=256
fs.inotify.max_user_watches=32768
### inotify
#
# cat /proc/sys/fs/inotify/max_queued_events - 16384
# cat /proc/sys/fs/inotify/max_user_instances - 128
# cat /proc/sys/fs/inotify/max_user_watches - 16384
#
# -- Change to:
#
fs.inotify.max_queued_events=32768
fs.inotify.max_user_instances=256
fs.inotify.max_user_watches=32768
</programlisting>
</para>
@ -1915,11 +1915,11 @@ fs.inotify.max_user_watches=32768
<filename>~/.recoll/scripts/myscript.desktop</filename> (the exact
file name inside the directory is irrelevant):
<programlisting>
[Desktop Entry]
Type=Application
Name=MyFirstScript
Exec=/home/me/bin/tryscript %F
MimeType=*/*
[Desktop Entry]
Type=Application
Name=MyFirstScript
Exec=/home/me/bin/tryscript %F
MimeType=*/*
</programlisting>
The <literal>Name</literal> attribute defines the label which will
appear inside the <guilabel>Run Script</guilabel> menu. The
@ -2084,10 +2084,10 @@ MimeType=*/*
history.</para>
<para>Here follows an example:
<programlisting>
&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
<programlisting>
&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
&lt;fragbuts version=&quot;1.0&quot;&gt;
&lt;fragbuts version=&quot;1.0&quot;&gt;
&lt;radiobuttons&gt;
@ -2121,8 +2121,8 @@ MimeType=*/*
&lt;/fragbut&gt;
&lt;/buttons&gt;
&lt;/fragbuts&gt;
</programlisting>
&lt;/fragbuts&gt;
</programlisting>
</para>
<para>Each <literal>radiobuttons</literal> or
@ -3162,27 +3162,27 @@ MimeType=*/*
"<span style='white-space:nowrap'><i>%M</i>&nbsp;%D</span>&nbsp;&nbsp;&nbsp; <i>%U</i>&nbsp;%i<br>\n"
"%A %K</td>\n"
"</tr></table>\n"
]]></screen>
]]></screen>
You may, for example, try the following for a more web-like
experience:
<screen><![CDATA[
<u><b><a href="P%N">%T</a></b></u><br>
%A<font color=#008000>%U - %S</font> - %L
]]></screen>
<u><b><a href="P%N">%T</a></b></u><br>
%A<font color=#008000>%U - %S</font> - %L
]]></screen>
Note that the P%N link in the above paragraph makes the title a
preview link. Or the clean looking:
<screen><![CDATA[
<img src="%I" align="left">%L <font color="#900000">%R</font>
&nbsp;&nbsp;<b>%T&</b><br>%S&nbsp;
<font color="#808080"><i>%U</i></font>
<table bgcolor="#e0e0e0">
<tr><td><div>%A</div></td></tr>
</table>%K
]]></screen>
<img src="%I" align="left">%L <font color="#900000">%R</font>
&nbsp;&nbsp;<b>%T&</b><br>%S&nbsp;
<font color="#808080"><i>%U</i></font>
<table bgcolor="#e0e0e0">
<tr><td><div>%A</div></td></tr>
</table>%K
]]></screen>
</para>
<para>These samples, and some others are
@ -3258,11 +3258,11 @@ MimeType=*/*
window.location.href = 'recoll://search/query?qtp=a&amp;p=0&amp;q=' +
encodeURIComponent(t);
}
&lt;/script>
&lt;/script>
....
&lt;body ondblclick="recollsearch()">
&lt;body ondblclick="recollsearch()">
</programlisting>
</programlisting>
</sect2>
</sect1>
@ -3303,8 +3303,8 @@ MimeType=*/*
<para><command>recollq</command> has a man page (not installed by
default, look in the <filename>doc/man</filename> directory). The
Usage string is as follows:</para>
<programlisting>
recollq: usage:
<programlisting>
recollq: usage:
-P: Show the date span for all the documents present in the index
[-o|-a|-f] [-q] &lt;query string&gt;
Runs a recoll query and displays result lines.
@ -3317,7 +3317,7 @@ recollq: usage:
-a Emulate the GUI simple search in ALL TERMS mode
-f Emulate the GUI simple search in filename mode
-q is just ignored (compatibility with the recoll GUI command line)
Common options:
Common options:
-c &lt;configdir&gt; : specify config directory, overriding $RECOLL_CONFDIR
-d also dump file contents
-n [first-]&lt;cnt&gt; define the result slice. The default value for [first]
@ -3338,18 +3338,18 @@ Common options:
separated by one space character. This is the recommended format
for use by other programs. Use a normal query with option -m to
see the field names.
</programlisting>
</programlisting>
<para>Sample execution:</para>
<programlisting>recollq 'ilur -nautique mime:text/html'
Recoll query: ((((ilur:(wqf=11) OR ilurs) AND_NOT (nautique:(wqf=11)
<programlisting>recollq 'ilur -nautique mime:text/html'
Recoll query: ((((ilur:(wqf=11) OR ilurs) AND_NOT (nautique:(wqf=11)
OR nautiques OR nautiqu OR nautiquement)) FILTER Ttext/html))
4 results
text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/comptes.html] [comptes.html] 18593 bytes
text/html [file:///Users/uncrypted-dockes/projets/nautique/webnautique/articles/ilur1/index.html] [Constructio...
text/html [file:///Users/uncrypted-dockes/projets/pagepers/index.html] [psxtcl/writemime/recoll]...
text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/recu-chasse-maree....
</programlisting>
4 results
text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/comptes.html] [comptes.html] 18593 bytes
text/html [file:///Users/uncrypted-dockes/projets/nautique/webnautique/articles/ilur1/index.html] [Constructio...
text/html [file:///Users/uncrypted-dockes/projets/pagepers/index.html] [psxtcl/writemime/recoll]...
text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/recu-chasse-maree....
</programlisting>
</sect1>
<sect1 id="RCL.SEARCH.SYNONYMS">
@ -3380,10 +3380,10 @@ text/html [file:///Users/uncrypted-dockes/projets/bateaux/ilur/factEtCie/r
<para>Example:
<programlisting>
hi hello "good morning"
hi hello "good morning"
# not sure about "au revoir" though. Is this english ?
bye goodbye "see you" \
# not sure about "au revoir" though. Is this english ?
bye goodbye "see you" \
"au revoir"
</programlisting>
</para>
@ -3680,7 +3680,7 @@ bye goodbye "see you" \
<para>Several <literal>dir</literal> clauses can be specified,
both positive and negative. For example the following makes sense:
<programlisting>
dir:recoll dir:src -dir:utils -dir:common
dir:recoll dir:src -dir:utils -dir:common
</programlisting> This would select results which have both
<filename>recoll</filename> and <filename>src</filename> in the
path (in any order), and which have not either
@ -4118,6 +4118,88 @@ dir:recoll dir:src -dir:utils -dir:common
</chapter> <!-- Search -->
<chapter id="RCL.MOVABLE">
<title>Movable datasets</title>
<para>As of &RCL; 1.24, it has become easy to build self-contained
datasets including a &RCL; configuration directory and index together
with the indexed documents, and to move such a dataset around (for
example copying it to a USB drive), without having to adjust the
configuration for querying the index.</para>
<note><para>This is a query-time feature only. The index must only be
updated in its original location. If an update is necessary in a
different location, the index must be reset.</para></note>
<para>The examples below will assume that you have a dataset under
<filename>/home/me/mydata/</filename>, with the index configuration and
data stored inside
<filename>/home/me/mydata/recoll-confdir</filename>.</para>
<para>In order to be able to run queries after the dataset has been
moved, you must ensure the following:
<itemizedlist>
<listitem><para>The main configuration file must define the <link
linkend="RCL.INSTALL.CONFIG.RECOLLCONF.ORGIDXCONFDIR">orgidxconfdir</link>
variable to be the original location of the configuration directory
(<filename>orgidxconfdir=/home/me/mydata/recoll-confdir</filename>
must be set inside
<filename>/home/me/mydata/recoll-confdir/recoll.conf</filename> in
the example above).</para></listitem>
<listitem><para>The configuration directory must exist with the
documents, somewhere under the directory which will be
moved. E.g. if you are moving <filename>/home/me/mydata</filename>
around, the configuration directory must exist somewhere below this
point, for example
<filename>/home/me/mydata/recoll-confdir</filename>, or
<filename>/home/me/mydata/sub/recoll-confdir</filename>.</para></listitem>
<listitem><para>You should keep the default locations for the index
elements (they are relative to the configuration directory by
default). Only the paths referring to the documents themselves
(e.g. <literal>topdirs</literal> values) should be
absolute (in general, they are only used when indexing
anyway).</para></listitem>
</itemizedlist>
</para>
<para>Only the first point needs an explicit user action, the &RCL;
defaults are compatible with the second one, and the third is
natural.</para>
<para>If, after the move, the configuration directory needs to be
copied out of the dataset (for example because the thumb drive is too
slow), you can set the <link
linkend="RCL.INSTALL.CONFIG.RECOLLCONF.CURIDXCONFDIR">
curidxconfdir</link> variable inside the copied configuration to
define the location of the moved one. For example if
<filename>/home/me/mydata</filename> is now mounted onto
<filename>/media/me/somelabel</filename>, but the configuration
directory and index have been copied to
<filename>/tmp/tempconfig</filename>, you would set
<literal>curidxconfdir</literal> to
<filename>/media/me/somelabel/recoll-confdir</filename> inside
<filename>/tmp/tempconfig/recoll.conf</filename>.
<literal>orgidxconfdir</literal> would still be
<filename>/home/me/mydata/recoll-confdir</filename> in the original and
the copy.</para>
<para>If you are regularly copying the configuration out of the
dataset, it will be useful to write a script to automate the
procedure. This can't really be done inside &RCL; because there are
probably many possible variants. One example would be to copy the
configuration to make it writable, but keep the index data on the
medium because it is too big - in this case, the script would also need
to set <literal>dbdir</literal> in the copied configuration.</para>
<para>The same set of modifications (&RCL; 1.24) has also made it
possible to run queries from a readonly configuration directory (with
slightly reduced function of course, such as not recording the query
history).</para>
</chapter>
<chapter id="RCL.PROGRAM">
<title>Programming interface</title>
@ -4329,10 +4411,10 @@ dir:recoll dir:src -dir:utils -dir:common
name suffixes. The types are defined inside the
<link linkend="RCL.INSTALL.CONFIG.MIMEMAP">
<filename>mimemap</filename> file</link>. Example:
<programlisting>
<programlisting>
.doc = application/msword
</programlisting>
.doc = application/msword
</programlisting>
If no suffix association is found for the file name, &RCL; will try
to execute a system command (typically <command>file -i</command> or
<command>xdg-mime</command>) to determine a MIME type.</para>
@ -4341,18 +4423,18 @@ dir:recoll dir:src -dir:utils -dir:common
in the <link linkend="RCL.INSTALL.CONFIG.MIMECONF">
<filename>mimeconf</filename> file</link>. A sample will probably be
better than a long explanation:</para>
<programlisting>
<programlisting>
[index]
application/msword = exec antiword -t -i 1 -m UTF-8;\
[index]
application/msword = exec antiword -t -i 1 -m UTF-8;\
mimetype = text/plain ; charset=utf-8
application/ogg = exec rclogg
application/ogg = exec rclogg
text/rtf = exec unrtf --nopict --html; charset=iso-8859-1; mimetype=text/html
text/rtf = exec unrtf --nopict --html; charset=iso-8859-1; mimetype=text/html
application/x-chm = execm rclchm
</programlisting>
application/x-chm = execm rclchm
</programlisting>
<para>The fragment specifies that:
@ -4409,14 +4491,14 @@ application/x-chm = execm rclchm
<para>For filters producing HTML, the output could be very minimal
like the following example:
<programlisting>
&lt;html>
&lt;html>
&lt;head>
&lt;meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
&lt;/head>
&lt;body>
Some text content
&lt;/body>
&lt;/html>
&lt;/html>
</programlisting>
</para>
@ -4460,13 +4542,13 @@ application/x-chm = execm rclchm
date (for display and sorting), in preference to the file
modification date. The date format should be as follows:
<programlisting>
&lt;meta name="date" content="YYYY-mm-dd HH:MM:SS">
or
&lt;meta name="date" content="YYYY-mm-ddTHH:MM:SS">
&lt;meta name="date" content="YYYY-mm-dd HH:MM:SS">
or
&lt;meta name="date" content="YYYY-mm-ddTHH:MM:SS">
</programlisting>
Example:
<programlisting>
&lt;meta name="date" content="2013-02-24 17:50:00">
&lt;meta name="date" content="2013-02-24 17:50:00">
</programlisting>
</para>
@ -4474,8 +4556,8 @@ or
names. This should also be output as meta tags:</para>
<programlisting>
&lt;meta name="somefield" content="Some textual data" /&gt;
</programlisting>
&lt;meta name="somefield" content="Some textual data" /&gt;
</programlisting>
<para>You can embed HTML markup inside the content of custom
fields, for improving the display inside result lists. In this
@ -4484,8 +4566,8 @@ or
be escaped for display.</para>
<programlisting>
&lt;meta name="somefield" markup="html" content="Some &lt;i>textual&lt;/i> data" /&gt;
</programlisting>
&lt;meta name="somefield" markup="html" content="Some &lt;i>textual&lt;/i> data" /&gt;
</programlisting>
<para>As written above, the processing of fields is described
in a <link linkend="RCL.PROGRAM.FIELDS">further
@ -4677,17 +4759,17 @@ or
features.</para>
<programlisting><![CDATA[
#!/usr/bin/env python
#!/usr/bin/env python
from recoll import recoll
from recoll import recoll
db = recoll.connect()
query = db.query()
nres = query.execute("some query")
results = query.fetchmany(20)
for doc in results:
db = recoll.connect()
query = db.query()
nres = query.execute("some query")
results = query.fetchmany(20)
for doc in results:
print(doc.url, doc.title)
]]></programlisting>
]]></programlisting>
</sect2>
@ -5145,12 +5227,12 @@ for doc in results:
text/html according to doc.mimetype. The typical use
would be as follows:
<programlisting>
qdoc = query.fetchone()
extractor = recoll.Extractor(qdoc)
doc = extractor.textextract(qdoc.ipath)
# use doc.text, e.g. for previewing
</programlisting>
</para></listitem>
qdoc = query.fetchone()
extractor = recoll.Extractor(qdoc)
doc = extractor.textextract(qdoc.ipath)
# use doc.text, e.g. for previewing
</programlisting>
</para></listitem>
</varlistentry>
<varlistentry>
<term>Extractor.idoctofile(ipath, targetmtype, outfile='')</term>
@ -5158,11 +5240,11 @@ doc = extractor.textextract(qdoc.ipath)
which can be given explicitly or will be created as a
temporary file to be deleted by the caller. Typical use:
<programlisting>
qdoc = query.fetchone()
extractor = recoll.Extractor(qdoc)
filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</programlisting>
qdoc = query.fetchone()
extractor = recoll.Extractor(qdoc)
filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</programlisting>
</para></listitem>
</para></listitem>
</varlistentry>
</variablelist>
@ -5182,9 +5264,9 @@ filename = extractor.idoctofile(qdoc.ipath, qdoc.mimetype)</programlisting>
highlighting and data extraction functions.</para>
<programlisting>
#!/usr/bin/env python
<![CDATA[
from recoll import recoll
#!/usr/bin/env python
<![CDATA[
from recoll import recoll
db = recoll.connect()
db.setAbstractParams(maxchars=80, contextwords=4)
@ -5193,18 +5275,18 @@ query = db.query()
nres = query.execute("some user question")
print "Result count: ", nres
if nres > 5:
nres = 5
nres = 5
for i in range(nres):
doc = query.fetchone()
print "Result #%d" % (query.rownumber,)
for k in ("title", "size"):
print k, ":", getattr(doc, k).encode('utf-8')
abs = db.makeDocAbstract(doc, query).encode('utf-8')
print abs
print
doc = query.fetchone()
print "Result #%d" % (query.rownumber,)
for k in ("title", "size"):
print k, ":", getattr(doc, k).encode('utf-8')
abs = db.makeDocAbstract(doc, query).encode('utf-8')
print abs
print
]]>
</programlisting>
]]>
</programlisting>
</sect3>
</sect2>
@ -5348,8 +5430,8 @@ for i in range(nres):
indexing sample found in the Recoll source (which sets
<literal>rclbes="MBOX"</literal>):</para>
<programlisting>[MBOX]
fetch = /path/to/recoll/src/python/samples/rclmbox.py fetch
makesig = path/to/recoll/src/python/samples/rclmbox.py makesig
fetch = /path/to/recoll/src/python/samples/rclmbox.py fetch
makesig = path/to/recoll/src/python/samples/rclmbox.py makesig
</programlisting>
<para><literal>fetch</literal> and <literal>makesig</literal>
define two commands to execute to respectively retrieve the
@ -5390,15 +5472,15 @@ makesig = path/to/recoll/src/python/samples/rclmbox.py makesig
<para>Adapting to the new package structure:</para>
<programlisting>
<![CDATA[
try:
<![CDATA[
try:
from recoll import recoll
from recoll import rclextract
hasextract = True
except:
except:
import recoll
hasextract = False
]]>
]]>
</programlisting>
<para>Adapting to the change of nature of
@ -5408,10 +5490,10 @@ except:
the <literal>next</literal> value (old).</para>
<programlisting>
<![CDATA[
<![CDATA[
rownum = query.next if type(query.next) == int else \
query.rownumber
]]>
]]>
</programlisting>
</sect2> <!-- compat with previous version -->
@ -5719,7 +5801,8 @@ except:
very much welcome patches</ulink>.</para>
<formalpara><title>Configure options:</title>
<formalpara>
<title>Configure options:</title>
<para>
<itemizedlist>
@ -5983,9 +6066,9 @@ except:
character. Long lines can be continued by escaping the
physical newline with backslash, even inside quoted strings.</para>
<programlisting>
astringlist = "some string \
with spaces"
thesame = "some string with spaces"
astringlist = "some string \
with spaces"
thesame = "some string with spaces"
</programlisting>
<para>Parameters which are not part of string lists can't be
@ -6166,25 +6249,25 @@ thesame = "some string with spaces"
only plain ascii headers can be indexed, and only the
first occurrence will be used for headers that occur several times).
<programlisting>[prefixes]
# Index mailmytag contents (with the given prefix)
mailmytag = XMTAG
<programlisting>[prefixes]
# Index mailmytag contents (with the given prefix)
mailmytag = XMTAG
[stored]
# Store mailmytag inside the document data record (so that it can be
# displayed - as %(mailmytag) - in result lists).
mailmytag =
[stored]
# Store mailmytag inside the document data record (so that it can be
# displayed - as %(mailmytag) - in result lists).
mailmytag =
[queryaliases]
filename = fn
containerfilename = cfn
[queryaliases]
filename = fn
containerfilename = cfn
[mail]
# Extract the X-My-Tag mail header, and use it internally with the
# mailmytag field name
x-my-tag = mailmytag
</programlisting>
</para>
[mail]
# Extract the X-My-Tag mail header, and use it internally with the
# mailmytag field name
x-my-tag = mailmytag
</programlisting>
</para>
<sect3 id="RCL.INSTALL.CONFIG.FIELDS.XATTR">
@ -6231,7 +6314,7 @@ x-my-tag = mailmytag
should be handled specially, which is possible because they
are usually all located in one place. Example:
<programlisting>[~/.kde/share/apps/okular/docdata]
.xml = application/x-okular-notes</programlisting></para>
.xml = application/x-okular-notes</programlisting></para>
<para>The <varname>recoll_noindex</varname>
<filename>mimemap</filename> variable has been moved to
@ -6305,7 +6388,7 @@ x-my-tag = mailmytag
application tag to specialize the choice for an area of the
filesystem (using a <varname>localfields</varname> specification
in <filename>mimeconf</filename>). The syntax for the key is
<replaceable>mimetype</replaceable><literal>|</literal><replaceable>tag</replaceable></para>
<replaceable>mimetype</replaceable><literal>|</literal><replaceable>tag</replaceable></para>
<para>The <varname>nouncompforviewmts</varname> entry, (placed at
the top level, outside of the <literal>[view]</literal> section),
@ -6415,8 +6498,8 @@ x-my-tag = mailmytag
<listitem><para>In <filename>$RECOLL_CONFDIR/mimemap</filename>
(typically <filename>~/.recoll/mimemap</filename>), add the
following line:<programlisting>
.blob = application/x-blobapp
</programlisting>
.blob = application/x-blobapp
</programlisting>
Note that the MIME type is made up here, and you could
call it <replaceable>diesel/oil</replaceable> just the
same.</para>
@ -6424,8 +6507,8 @@ x-my-tag = mailmytag
<listitem><para>In <filename>$RECOLL_CONFDIR/mimeview</filename>
under the <literal>[view]</literal> section, add:</para>
<programlisting>
application/x-blobapp = blobviewer %f
</programlisting>
application/x-blobapp = blobviewer %f
</programlisting>
<para>We are supposing
that <replaceable>blobviewer</replaceable> wants a file
name parameter here, you would use <literal>%u</literal> if
@ -6458,8 +6541,8 @@ application/x-blobapp = blobviewer %f
section, add the following line (more about the
<replaceable>rclblob</replaceable> indexing script
later):<programlisting>
application/x-blobapp = exec rclblob
</programlisting></para>
application/x-blobapp = exec rclblob
</programlisting></para>
</listitem>
<listitem><para>Under the <literal>[icons]</literal>
section, you should choose an icon to be displayed for the
@ -6489,4 +6572,3 @@ application/x-blobapp = exec rclblob
</sect1>
</chapter>
</book>

View File

@ -571,6 +571,31 @@ logfilename = stderr
# the log... values.</descr></var>
#daemlogfilename = /dev/null
# <var name="orgidxconfdir" type="dfn">
#
# <brief>Original location of the configuration directory.</brief>
# <descr>This is used exclusively for movable datasets. Locating the
# configuration directory inside the directory tree makes it possible to
# provide automatic query time path translations once the data set has
# moved (for example, because it has been mounted on another
# location).</descr></var>
#orgidxconfdir =
# <var name="curidxconfdir" type="dfn">
#
# <brief>Current location of the configuration directory.</brief>
# <descr>Complements orgidxconfdir for movable datasets. This should be used
# if the configuration directory has been copied from the dataset to
# another location, either because the dataset is readonly and an r/w copy
# is desired, or for performance reasons. This records the location of the
# moved configuration directory before the copy, to allow path translation
# computations. For example if a dataset originally indexed with its
# configuration in '/home/me/mydata/config' has been mounted to
# '/media/me/mydata', and the GUI is running from a copied
# configuration, orgidxconfdir would be '/home/me/mydata/config', and
# curidxconfdir (as set in the copied configuration) would be
# '/media/me/mydata/config'.</descr></var>
#curidxconfdir =
# <var name="idxrundir" type="dfn">
#
# <brief>Indexing process current directory.</brief> <descr>The input